diff --git "a/examples/zduSFxRajkE_transcript.json" "b/examples/zduSFxRajkE_transcript.json" new file mode 100644--- /dev/null +++ "b/examples/zduSFxRajkE_transcript.json" @@ -0,0 +1,13690 @@ +[ + { + "start": 0.04, + "text": "hi everyone so in this video I'd like us" + }, + { + "start": 2.04, + "text": "to cover the process of tokenization in" + }, + { + "start": 4.08, + "text": "large language models now you see here" + }, + { + "start": 6.44, + "text": "that I have a set face and that's" + }, + { + "start": 8.28, + "text": "because uh tokenization is my least" + }, + { + "start": 10.32, + "text": "favorite part of working with large" + }, + { + "start": 11.679, + "text": "language models but unfortunately it is" + }, + { + "start": 13.48, + "text": "necessary to understand in some detail" + }, + { + "start": 15.519, + "text": "because it it is fairly hairy gnarly and" + }, + { + "start": 17.6, + "text": "there's a lot of hidden foot guns to be" + }, + { + "start": 19.48, + "text": "aware of and a lot of oddness with large" + }, + { + "start": 21.84, + "text": "language models typically traces back to" + }, + { + "start": 24.599, + "text": "tokenization so what is" + }, + { + "start": 26.64, + "text": "tokenization now in my previous video" + }, + { + "start": 28.92, + "text": "Let's Build GPT from scratch uh we" + }, + { + "start": 31.56, + "text": "actually already did tokenization but we" + }, + { + "start": 33.48, + "text": "did a very naive simple version of" + }, + { + "start": 35.8, + "text": "tokenization so when you go to the" + }, + { + "start": 37.48, + "text": "Google colab for that video uh you see" + }, + { + "start": 40.559, + "text": "here that we loaded our training set and" + }, + { + "start": 43.2, + "text": "our training set was this uh Shakespeare" + }, + { + "start": 45.52, + "text": "uh data set now in the beginning the" + }, + { + "start": 48.12, + "text": "Shakespeare data set is just a large" + }, + { + "start": 49.76, + "text": "string 
in Python it's just text and so" + }, + { + "start": 52.44, + "text": "the question is how do we plug text into" + }, + { + "start": 54.84, + "text": "large language models and in this case" + }, + { + "start": 58.079, + "text": "here we created a vocabulary of 65" + }, + { + "start": 61.44, + "text": "possible characters that we saw occur in" + }, + { + "start": 63.96, + "text": "this string these were the possible" + }, + { + "start": 65.799, + "text": "characters and we saw that there are 65" + }, + { + "start": 67.96, + "text": "of them and then we created a a lookup" + }, + { + "start": 70.64, + "text": "table for converting from every possible" + }, + { + "start": 73.4, + "text": "character a little string piece into a" + }, + { + "start": 76.32, + "text": "token an" + }, + { + "start": 77.759, + "text": "integer so here for example we tokenized" + }, + { + "start": 80.52, + "text": "the string High there and we received" + }, + { + "start": 83.28, + "text": "this sequence of" + }, + { + "start": 84.72, + "text": "tokens and here we took the first 1,000" + }, + { + "start": 87.6, + "text": "characters of our data set and we" + }, + { + "start": 89.92, + "text": "encoded it into tokens and because it is" + }, + { + "start": 92.56, + "text": "this is character level we received" + }, + { + "start": 94.64, + "text": "1,000 tokens in a sequence so token 18" + }, + { + "start": 98.96, + "text": "47" + }, + { + "start": 100.119, + "text": "Etc now later we saw that the way we" + }, + { + "start": 103.439, + "text": "plug these tokens into the language" + }, + { + "start": 105.64, + "text": "model is by using an embedding" + }, + { + "start": 108.479, + "text": "table and so basically if we have 65" + }, + { + "start": 111.479, + "text": "possible tokens then this embedding" + }, + { + "start": 113.479, + "text": "table is going to have 65 rows and" + }, + { + "start": 116.439, + "text": "roughly speaking we're taking the" + }, + { + "start": 118.159, + "text": 
"integer associated with every single" + }, + { + "start": 119.799, + "text": "sing Le token we're using that as a" + }, + { + "start": 121.52, + "text": "lookup into this table and we're" + }, + { + "start": 124.039, + "text": "plucking out the corresponding row and" + }, + { + "start": 126.479, + "text": "this row is a uh is trainable parameters" + }, + { + "start": 129.36, + "text": "that we're going to train using back" + }, + { + "start": 130.479, + "text": "propagation and this is the vector that" + }, + { + "start": 132.879, + "text": "then feeds into the Transformer um and" + }, + { + "start": 135.36, + "text": "that's how the Transformer Ser of" + }, + { + "start": 136.56, + "text": "perceives every single" + }, + { + "start": 138.12, + "text": "token so here we had a very naive" + }, + { + "start": 141.28, + "text": "tokenization process that was a" + }, + { + "start": 143.12, + "text": "character level tokenizer but in" + }, + { + "start": 145.239, + "text": "practice in state-ofthe-art uh language" + }, + { + "start": 147.28, + "text": "models people use a lot more complicated" + }, + { + "start": 148.959, + "text": "schemes unfortunately" + }, + { + "start": 150.44, + "text": "uh for constructing these uh token" + }, + { + "start": 154.36, + "text": "vocabularies so we're not dealing on the" + }, + { + "start": 156.64, + "text": "Character level we're dealing on chunk" + }, + { + "start": 158.64, + "text": "level and the way these um character" + }, + { + "start": 161.519, + "text": "chunks are constructed is using" + }, + { + "start": 163.879, + "text": "algorithms such as for example the bik" + }, + { + "start": 165.48, + "text": "pair in coding algorithm which we're" + }, + { + "start": 166.959, + "text": "going to go into in detail um and cover" + }, + { + "start": 171.0, + "text": "in this video I'd like to briefly show" + }, + { + "start": 172.879, + "text": "you the paper that introduced a bite" + }, + { + "start": 174.84, + "text": "level 
encoding as a mechanism for" + }, + { + "start": 176.92, + "text": "tokenization in the context of large" + }, + { + "start": 178.44, + "text": "language models and I would say that" + }, + { + "start": 180.599, + "text": "that's probably the gpt2 paper and if" + }, + { + "start": 182.72, + "text": "you scroll down here to the section" + }, + { + "start": 185.56, + "text": "input representation this is where they" + }, + { + "start": 187.72, + "text": "cover tokenization the kinds of" + }, + { + "start": 189.48, + "text": "properties that you'd like the" + }, + { + "start": 190.56, + "text": "tokenization to have and they conclude" + }, + { + "start": 193.0, + "text": "here that they're going to have a" + }, + { + "start": 194.959, + "text": "tokenizer where you have a vocabulary of" + }, + { + "start": 197.599, + "text": "50,2 57 possible" + }, + { + "start": 200.68, + "text": "tokens and the context size is going to" + }, + { + "start": 204.4, + "text": "be 1,24 tokens so in the in in the" + }, + { + "start": 207.36, + "text": "attention layer of the Transformer" + }, + { + "start": 209.239, + "text": "neural network" + }, + { + "start": 210.48, + "text": "every single token is attending to the" + }, + { + "start": 212.319, + "text": "previous tokens in the sequence and it's" + }, + { + "start": 214.08, + "text": "going to see up to 1,24 tokens so tokens" + }, + { + "start": 217.92, + "text": "are this like fundamental unit um the" + }, + { + "start": 220.68, + "text": "atom of uh large language models if you" + }, + { + "start": 223.12, + "text": "will and everything is in units of" + }, + { + "start": 224.799, + "text": "tokens everything is about tokens and" + }, + { + "start": 227.08, + "text": "tokenization is the process for" + }, + { + "start": 228.36, + "text": "translating strings or text into" + }, + { + "start": 231.08, + "text": "sequences of tokens and uh vice versa" + }, + { + "start": 234.879, + "text": "when you go into the Llama 2 paper as" + }, 
+ { + "start": 236.879, + "text": "well I can show you that when you search" + }, + { + "start": 238.28, + "text": "token you're going to get get 63 hits um" + }, + { + "start": 241.72, + "text": "and that's because tokens are again" + }, + { + "start": 243.319, + "text": "pervasive so here they mentioned that" + }, + { + "start": 245.12, + "text": "they trained on two trillion tokens of" + }, + { + "start": 246.879, + "text": "data and so" + }, + { + "start": 248.439, + "text": "on so we're going to build our own" + }, + { + "start": 251.079, + "text": "tokenizer luckily the bite be encoding" + }, + { + "start": 253.04, + "text": "algorithm is not uh that super" + }, + { + "start": 255.12, + "text": "complicated and we can build it from" + }, + { + "start": 256.959, + "text": "scratch ourselves and we'll see exactly" + }, + { + "start": 258.519, + "text": "how this works before we dive into code" + }, + { + "start": 260.72, + "text": "I'd like to give you a brief Taste of" + }, + { + "start": 262.56, + "text": "some of the complexities that come from" + }, + { + "start": 264.12, + "text": "the tokenization because I just want to" + }, + { + "start": 266.12, + "text": "make sure that we motivate it" + }, + { + "start": 267.199, + "text": "sufficiently for why we are doing all" + }, + { + "start": 269.479, + "text": "this and why this is so gross so" + }, + { + "start": 272.639, + "text": "tokenization is at the heart of a lot of" + }, + { + "start": 274.199, + "text": "weirdness in large language models and I" + }, + { + "start": 276.12, + "text": "would advise that you do not brush it" + }, + { + "start": 277.759, + "text": "off a lot of the issues that may look" + }, + { + "start": 280.6, + "text": "like just issues with the new network" + }, + { + "start": 282.32, + "text": "architecture or the large language model" + }, + { + "start": 284.52, + "text": "itself are actually issues with the" + }, + { + "start": 286.6, + "text": "tokenization and fundamentally 
Trace uh" + }, + { + "start": 289.16, + "text": "back to it so if you've noticed any" + }, + { + "start": 291.759, + "text": "issues with large language models can't" + }, + { + "start": 294.24, + "text": "you know not able to do spelling tasks" + }, + { + "start": 296.16, + "text": "very easily that's usually due to" + }, + { + "start": 297.96, + "text": "tokenization simple string processing" + }, + { + "start": 300.16, + "text": "can be difficult for the large language" + }, + { + "start": 302.28, + "text": "model to perform" + }, + { + "start": 303.6, + "text": "natively uh non-english languages can" + }, + { + "start": 306.08, + "text": "work much worse and to a large extent" + }, + { + "start": 308.24, + "text": "this is due to" + }, + { + "start": 309.44, + "text": "tokenization sometimes llms are bad at" + }, + { + "start": 311.759, + "text": "simple arithmetic also can trace be" + }, + { + "start": 314.08, + "text": "traced to" + }, + { + "start": 315.479, + "text": "tokenization uh gbt2 specifically would" + }, + { + "start": 317.759, + "text": "have had quite a bit more issues with" + }, + { + "start": 319.639, + "text": "python than uh future versions of it due" + }, + { + "start": 322.16, + "text": "to tokenization there's a lot of other" + }, + { + "start": 324.4, + "text": "issues maybe you've seen weird warnings" + }, + { + "start": 325.88, + "text": "about a trailing whites space this is a" + }, + { + "start": 327.44, + "text": "tokenization issue um" + }, + { + "start": 330.68, + "text": "if you had asked GPT earlier about solid" + }, + { + "start": 333.52, + "text": "gold Magikarp and what it is you would" + }, + { + "start": 335.24, + "text": "see the llm go totally crazy and it" + }, + { + "start": 337.52, + "text": "would start going off about a completely" + }, + { + "start": 339.56, + "text": "unrelated tangent topic maybe you've" + }, + { + "start": 341.919, + "text": "been told to use yl over Json in" + }, + { + "start": 343.72, + "text": 
"structure data all of that has to do" + }, + { + "start": 345.44, + "text": "with tokenization so basically" + }, + { + "start": 347.639, + "text": "tokenization is at the heart of many" + }, + { + "start": 349.4, + "text": "issues I will look back around to these" + }, + { + "start": 351.88, + "text": "at the end of the video but for now let" + }, + { + "start": 354.08, + "text": "me just um skip over it a little bit and" + }, + { + "start": 356.919, + "text": "let's go to this web app um the Tik" + }, + { + "start": 359.96, + "text": "tokenizer bell.app so I have it loaded" + }, + { + "start": 362.919, + "text": "here and what I like about this web app" + }, + { + "start": 364.68, + "text": "is that tokenization is running a sort" + }, + { + "start": 366.56, + "text": "of live in your browser in JavaScript so" + }, + { + "start": 369.52, + "text": "you can just type here stuff hello world" + }, + { + "start": 371.96, + "text": "and the whole string" + }, + { + "start": 374.199, + "text": "rokenes so here what we see on uh the" + }, + { + "start": 378.479, + "text": "left is a string that you put in on the" + }, + { + "start": 380.36, + "text": "right we're currently using the gpt2" + }, + { + "start": 382.199, + "text": "tokenizer we see that this string that I" + }, + { + "start": 384.56, + "text": "pasted here is currently tokenizing into" + }, + { + "start": 387.08, + "text": "300 tokens and here they are sort of uh" + }, + { + "start": 390.52, + "text": "shown explicitly in different colors for" + }, + { + "start": 392.68, + "text": "every single token so for example uh" + }, + { + "start": 395.52, + "text": "this word tokenization became two tokens" + }, + { + "start": 398.88, + "text": "the token" + }, + { + "start": 400.72, + "text": "3,642 and" + }, + { + "start": 404.0, + "text": "1,634 the token um space is is token 318" + }, + { + "start": 410.16, + "text": "so be careful on the bottom you can show" + }, + { + "start": 411.919, + "text": "white space 
and keep in mind that there" + }, + { + "start": 414.599, + "text": "are spaces and uh sln new line" + }, + { + "start": 417.36, + "text": "characters in here but you can hide them" + }, + { + "start": 419.72, + "text": "for" + }, + { + "start": 421.599, + "text": "clarity the token space at is token 379" + }, + { + "start": 426.0, + "text": "the to the Token space the is 262 Etc so" + }, + { + "start": 431.08, + "text": "you notice here that the space is part" + }, + { + "start": 432.96, + "text": "of that uh token" + }, + { + "start": 435.96, + "text": "chunk now so this is kind of like how" + }, + { + "start": 438.639, + "text": "our English sentence broke up and that" + }, + { + "start": 441.16, + "text": "seems all well and good now now here I" + }, + { + "start": 444.039, + "text": "put in some arithmetic so we see that uh" + }, + { + "start": 446.919, + "text": "the token 127 Plus and then token six" + }, + { + "start": 451.8, + "text": "space 6 followed by 77 so what's" + }, + { + "start": 454.24, + "text": "happening here is that 127 is feeding in" + }, + { + "start": 456.639, + "text": "as a single token into the large" + }, + { + "start": 458.16, + "text": "language model but the um number 677" + }, + { + "start": 462.68, + "text": "will actually feed in as two separate" + }, + { + "start": 464.84, + "text": "tokens and so the large language model" + }, + { + "start": 467.0, + "text": "has to sort of um take account of that" + }, + { + "start": 470.72, + "text": "and process it correctly in its Network" + }, + { + "start": 473.879, + "text": "and see here 804 will be broken up into" + }, + { + "start": 476.199, + "text": "two tokens and it's is all completely" + }, + { + "start": 477.96, + "text": "arbitrary and here I have another" + }, + { + "start": 479.8, + "text": "example of four-digit numbers and they" + }, + { + "start": 482.039, + "text": "break up in a way that they break up and" + }, + { + "start": 483.919, + "text": "it's totally arbitrary 
sometimes you" + }, + { + "start": 485.28, + "text": "have um multiple digits single token" + }, + { + "start": 488.36, + "text": "sometimes you have individual digits as" + }, + { + "start": 490.36, + "text": "many tokens and it's all kind of pretty" + }, + { + "start": 492.24, + "text": "arbitrary and coming out of the" + }, + { + "start": 494.68, + "text": "tokenizer here's another example we have" + }, + { + "start": 497.479, + "text": "the string egg and you see here that" + }, + { + "start": 501.039, + "text": "this became two" + }, + { + "start": 502.36, + "text": "tokens but for some reason when I say I" + }, + { + "start": 504.759, + "text": "have an egg you see when it's a space" + }, + { + "start": 507.72, + "text": "egg it's two token it's sorry it's a" + }, + { + "start": 510.84, + "text": "single token so just egg by itself in" + }, + { + "start": 513.24, + "text": "the beginning of a sentence is two" + }, + { + "start": 514.76, + "text": "tokens but here as a space egg is" + }, + { + "start": 517.68, + "text": "suddenly a single token uh for the exact" + }, + { + "start": 520.519, + "text": "same string okay here lowercase egg" + }, + { + "start": 524.2, + "text": "turns out to be a single token and in" + }, + { + "start": 526.24, + "text": "particular notice that the color is" + }, + { + "start": 527.48, + "text": "different so this is a different token" + }, + { + "start": 529.36, + "text": "so this is case sensitive and of course" + }, + { + "start": 531.76, + "text": "a capital egg would also be different" + }, + { + "start": 534.56, + "text": "tokens and again um this would be two" + }, + { + "start": 537.44, + "text": "tokens arbitrarily so so for the same" + }, + { + "start": 540.079, + "text": "concept egg depending on if it's in the" + }, + { + "start": 542.32, + "text": "beginning of a sentence at the end of a" + }, + { + "start": 543.8, + "text": "sentence lowercase uppercase or mixed" + }, + { + "start": 546.24, + "text": "all this will 
be uh basically very" + }, + { + "start": 548.079, + "text": "different tokens and different IDs and" + }, + { + "start": 550.32, + "text": "the language model has to learn from raw" + }, + { + "start": 552.04, + "text": "data from all the internet text that" + }, + { + "start": 553.56, + "text": "it's going to be training on that these" + }, + { + "start": 555.16, + "text": "are actually all the exact same concept" + }, + { + "start": 557.44, + "text": "and it has to sort of group them in the" + }, + { + "start": 559.279, + "text": "parameters of the neural network and" + }, + { + "start": 561.32, + "text": "understand just based on the data" + }, + { + "start": 562.48, + "text": "patterns that these are all very similar" + }, + { + "start": 564.76, + "text": "but maybe not almost exactly similar but" + }, + { + "start": 567.399, + "text": "but very very similar" + }, + { + "start": 570.16, + "text": "um after the EG demonstration here I" + }, + { + "start": 572.8, + "text": "have um an introduction from open a eyes" + }, + { + "start": 575.64, + "text": "chbt in Korean so manaso Pang uh Etc uh" + }, + { + "start": 581.959, + "text": "so this is in Korean and the reason I" + }, + { + "start": 584.079, + "text": "put this here is because you'll notice" + }, + { + "start": 587.76, + "text": "that um non-english languages work" + }, + { + "start": 591.0, + "text": "slightly worse in Chachi part of this is" + }, + { + "start": 594.32, + "text": "because of course the training data set" + }, + { + "start": 595.64, + "text": "for Chachi is much larger for English" + }, + { + "start": 598.079, + "text": "and for everything else but the same is" + }, + { + "start": 599.959, + "text": "true not just for the large language" + }, + { + "start": 601.68, + "text": "model itself but also for the tokenizer" + }, + { + "start": 604.32, + "text": "so when we train the tokenizer we're" + }, + { + "start": 605.88, + "text": "going to see that there's a training set" + }, + { + 
"start": 607.24, + "text": "as well and there's a lot more English" + }, + { + "start": 609.24, + "text": "than non-english and what ends up" + }, + { + "start": 611.32, + "text": "happening is that we're going to have a" + }, + { + "start": 613.48, + "text": "lot more longer tokens for" + }, + { + "start": 616.6, + "text": "English so how do I put this if you have" + }, + { + "start": 619.6, + "text": "a single sentence in English and you" + }, + { + "start": 621.399, + "text": "tokenize it you might see that it's 10" + }, + { + "start": 623.56, + "text": "tokens or something like that but if you" + }, + { + "start": 625.48, + "text": "translate that sentence into say Korean" + }, + { + "start": 627.36, + "text": "or Japanese or something else you'll" + }, + { + "start": 629.44, + "text": "typically see that the number of tokens" + }, + { + "start": 630.839, + "text": "used is much larger and that's because" + }, + { + "start": 633.399, + "text": "the chunks here are a lot more broken up" + }, + { + "start": 636.76, + "text": "so we're using a lot more tokens for the" + }, + { + "start": 638.519, + "text": "exact same thing and what this does is" + }, + { + "start": 641.36, + "text": "it bloats up the sequence length of all" + }, + { + "start": 643.76, + "text": "the documents so you're using up more" + }, + { + "start": 646.24, + "text": "tokens and then in the attention of the" + }, + { + "start": 648.399, + "text": "Transformer when these tokens try to" + }, + { + "start": 649.92, + "text": "attend each other you are running out of" + }, + { + "start": 651.92, + "text": "context um in the maximum context length" + }, + { + "start": 655.12, + "text": "of that Transformer and so basically all" + }, + { + "start": 657.959, + "text": "the non-english text is stretched out" + }, + { + "start": 661.279, + "text": "from the perspective of the Transformer" + }, + { + "start": 663.44, + "text": "and this just has to do with the um" + }, + { + "start": 665.68, + "text": 
"trainings that used for the tokenizer" + }, + { + "start": 667.48, + "text": "and the tokenization itself so it will" + }, + { + "start": 670.04, + "text": "create a lot bigger tokens and a lot" + }, + { + "start": 672.079, + "text": "larger groups in English and it will" + }, + { + "start": 674.2, + "text": "have a lot of little boundaries for all" + }, + { + "start": 676.16, + "text": "the other non-english text um so if we" + }, + { + "start": 679.76, + "text": "translated this into English it would be" + }, + { + "start": 681.92, + "text": "significantly fewer" + }, + { + "start": 683.32, + "text": "tokens the final example I have here is" + }, + { + "start": 685.639, + "text": "a little snippet of python for doing FS" + }, + { + "start": 688.079, + "text": "buuz and what I'd like you to notice is" + }, + { + "start": 691.0, + "text": "look all these individual spaces are all" + }, + { + "start": 694.04, + "text": "separate tokens they are token" + }, + { + "start": 697.0, + "text": "220 so uh 220 220 220 220 and then space" + }, + { + "start": 702.76, + "text": "if is a single token and so what's going" + }, + { + "start": 705.32, + "text": "on here is that when the Transformer is" + }, + { + "start": 706.72, + "text": "going to consume or try to uh create" + }, + { + "start": 709.32, + "text": "this text it needs to um handle all" + }, + { + "start": 712.639, + "text": "these spaces individually they all feed" + }, + { + "start": 714.48, + "text": "in one by one into the entire" + }, + { + "start": 716.56, + "text": "Transformer in the sequence and so this" + }, + { + "start": 719.12, + "text": "is being extremely wasteful tokenizing" + }, + { + "start": 721.279, + "text": "it in this way and so as a result of" + }, + { + "start": 724.44, + "text": "that gpt2 is not very good with python" + }, + { + "start": 727.04, + "text": "and it's not anything to do with coding" + }, + { + "start": 728.68, + "text": "or the language model itself it's just" + }, + { + 
"start": 730.68, + "text": "that if he use a lot of indentation" + }, + { + "start": 732.079, + "text": "using space in Python like we usually do" + }, + { + "start": 735.399, + "text": "uh you just end up bloating out all the" + }, + { + "start": 737.399, + "text": "text and it's separated across way too" + }, + { + "start": 739.36, + "text": "much of the sequence and we are running" + }, + { + "start": 741.04, + "text": "out of the context length in the" + }, + { + "start": 742.76, + "text": "sequence uh that's roughly speaking" + }, + { + "start": 744.44, + "text": "what's what's happening we're being way" + }, + { + "start": 745.639, + "text": "too wasteful we're taking up way too" + }, + { + "start": 747.399, + "text": "much token space now we can also scroll" + }, + { + "start": 749.68, + "text": "up here and we can change the tokenizer" + }, + { + "start": 751.6, + "text": "so note here that gpt2 tokenizer creates" + }, + { + "start": 754.04, + "text": "a token count of 300 for this string" + }, + { + "start": 756.72, + "text": "here we can change it to CL 100K base" + }, + { + "start": 759.519, + "text": "which is the GPT for tokenizer and we" + }, + { + "start": 761.839, + "text": "see that the token count drops to 185 so" + }, + { + "start": 764.56, + "text": "for the exact same string we are now" + }, + { + "start": 766.8, + "text": "roughly having the number of tokens and" + }, + { + "start": 769.8, + "text": "roughly speaking this is because uh the" + }, + { + "start": 771.76, + "text": "number of tokens in the GPT 4 tokenizer" + }, + { + "start": 774.36, + "text": "is roughly double that of the number of" + }, + { + "start": 776.72, + "text": "tokens in the gpt2 tokenizer so we went" + }, + { + "start": 778.839, + "text": "went from roughly 50k to roughly 100K" + }, + { + "start": 781.639, + "text": "now you can imagine that this is a good" + }, + { + "start": 783.0, + "text": "thing because the same text is now" + }, + { + "start": 786.0, + "text": 
"squished into half as many tokens so uh" + }, + { + "start": 790.199, + "text": "this is a lot denser input to the" + }, + { + "start": 792.76, + "text": "Transformer and in the Transformer every" + }, + { + "start": 795.44, + "text": "single token has a finite number of" + }, + { + "start": 797.04, + "text": "tokens before it that it's going to pay" + }, + { + "start": 798.399, + "text": "attention to and so what this is doing" + }, + { + "start": 800.44, + "text": "is we're roughly able to see twice as" + }, + { + "start": 803.48, + "text": "much text as a context for what token to" + }, + { + "start": 806.519, + "text": "predict next uh because of this change" + }, + { + "start": 809.279, + "text": "but of course just increasing the number" + }, + { + "start": 810.8, + "text": "of tokens is uh not strictly better" + }, + { + "start": 813.399, + "text": "infinitely uh because as you increase" + }, + { + "start": 815.16, + "text": "the number of tokens now your embedding" + }, + { + "start": 816.92, + "text": "table is um sort of getting a lot larger" + }, + { + "start": 819.88, + "text": "and also at the output we are trying to" + }, + { + "start": 821.48, + "text": "predict the next token and there's the" + }, + { + "start": 822.88, + "text": "soft Max there and that grows as well" + }, + { + "start": 825.12, + "text": "we're going to go into more detail later" + }, + { + "start": 826.399, + "text": "on this but there's some kind of a Sweet" + }, + { + "start": 828.44, + "text": "Spot somewhere where you have a just" + }, + { + "start": 831.0, + "text": "right number of tokens in your" + }, + { + "start": 832.279, + "text": "vocabulary where everything is" + }, + { + "start": 833.88, + "text": "appropriately dense and still fairly" + }, + { + "start": 836.519, + "text": "efficient now one thing I would like you" + }, + { + "start": 838.36, + "text": "to note specifically for the gp4" + }, + { + "start": 840.16, + "text": "tokenizer is that the handling of the" 
+ }, + { + "start": 843.56, + "text": "white space for python has improved a" + }, + { + "start": 845.44, + "text": "lot you see that here these four spaces" + }, + { + "start": 848.36, + "text": "are represented as one single token for" + }, + { + "start": 850.24, + "text": "the three spaces here and then the token" + }, + { + "start": 853.759, + "text": "SPF and here seven spaces were all" + }, + { + "start": 856.759, + "text": "grouped into a single token so we're" + }, + { + "start": 858.8, + "text": "being a lot more efficient in how we" + }, + { + "start": 860.199, + "text": "represent Python and this was a" + }, + { + "start": 861.92, + "text": "deliberate Choice made by open aai when" + }, + { + "start": 863.759, + "text": "they designed the gp4 tokenizer and they" + }, + { + "start": 867.56, + "text": "group a lot more space into a single" + }, + { + "start": 869.68, + "text": "character what this does is this" + }, + { + "start": 872.079, + "text": "densifies Python and therefore we can" + }, + { + "start": 875.199, + "text": "attend to more code before it when we're" + }, + { + "start": 878.12, + "text": "trying to predict the next token in the" + }, + { + "start": 879.72, + "text": "sequence and so the Improvement in the" + }, + { + "start": 882.04, + "text": "python coding ability from gbt2 to gp4" + }, + { + "start": 885.399, + "text": "is not just a matter of the language" + }, + { + "start": 887.079, + "text": "model and the architecture and the" + }, + { + "start": 888.839, + "text": "details of the optimization but a lot of" + }, + { + "start": 890.759, + "text": "the Improvement here is also coming from" + }, + { + "start": 892.24, + "text": "the design of the tokenizer and how it" + }, + { + "start": 894.24, + "text": "groups characters into tokens okay so" + }, + { + "start": 896.959, + "text": "let's now start writing some code" + }, + { + "start": 899.399, + "text": "so remember what we want to do we want" + }, + { + "start": 901.44, + 
"text": "to take strings and feed them into" + }, + { + "start": 903.72, + "text": "language models for that we need to" + }, + { + "start": 905.959, + "text": "somehow tokenize strings into some" + }, + { + "start": 908.8, + "text": "integers in some fixed vocabulary and" + }, + { + "start": 912.36, + "text": "then we will use those integers to make" + }, + { + "start": 914.24, + "text": "a look up into a lookup table of vectors" + }, + { + "start": 916.759, + "text": "and feed those vectors into the" + }, + { + "start": 918.0, + "text": "Transformer as an input now the reason" + }, + { + "start": 921.36, + "text": "this gets a little bit tricky of course" + }, + { + "start": 922.72, + "text": "is that we don't just want to support" + }, + { + "start": 924.0, + "text": "the simple English alphabet we want to" + }, + { + "start": 926.12, + "text": "support different kinds of languages so" + }, + { + "start": 928.12, + "text": "this is anango in Korean which is hello" + }, + { + "start": 931.639, + "text": "and we also want to support many kinds" + }, + { + "start": 933.0, + "text": "of special characters that we might find" + }, + { + "start": 934.72, + "text": "on the internet for example" + }, + { + "start": 937.319, + "text": "Emoji so how do we feed this text into" + }, + { + "start": 941.48, + "text": "uh" + }, + { + "start": 942.199, + "text": "Transformers well how's the what is this" + }, + { + "start": 944.48, + "text": "text anyway in Python so if you go to" + }, + { + "start": 946.56, + "text": "the documentation of a string in Python" + }, + { + "start": 949.6, + "text": "you can see that strings are immutable" + }, + { + "start": 951.519, + "text": "sequences of Unicode code" + }, + { + "start": 954.12, + "text": "points okay what are Unicode code points" + }, + { + "start": 957.88, + "text": "we can go to PDF so Unicode code points" + }, + { + "start": 961.48, + "text": "are defined by the Unicode Consortium as" + }, + { + "start": 964.68, + "text": 
"part of the Unicode standard and what" + }, + { + "start": 967.56, + "text": "this is really is that it's just a" + }, + { + "start": 969.0, + "text": "definition of roughly 150,000 characters" + }, + { + "start": 971.839, + "text": "right now and roughly speaking what they" + }, + { + "start": 974.72, + "text": "look like and what integers um represent" + }, + { + "start": 977.56, + "text": "those characters so it says 150,000" + }, + { + "start": 979.72, + "text": "characters across 161 scripts as of" + }, + { + "start": 982.639, + "text": "right now so if you scroll down here you" + }, + { + "start": 984.72, + "text": "can see that the standard is very much" + }, + { + "start": 986.279, + "text": "alive the latest standard 15.1 in" + }, + { + "start": 988.72, + "text": "September" + }, + { + "start": 990.199, + "text": "2023 and basically this is just a way to" + }, + { + "start": 993.92, + "text": "define lots of types of" + }, + { + "start": 996.92, + "text": "characters like for example all these" + }, + { + "start": 999.16, + "text": "characters across different scripts so" + }, + { + "start": 1001.88, + "text": "the way we can access the unic code code" + }, + { + "start": 1004.04, + "text": "Point given Single Character is by using" + }, + { + "start": 1005.959, + "text": "the or function in Python so for example" + }, + { + "start": 1008.199, + "text": "I can pass in Ord of H and I can see" + }, + { + "start": 1011.279, + "text": "that for the Single Character H the unic" + }, + { + "start": 1014.72, + "text": "code code point is" + }, + { + "start": 1016.48, + "text": "104 okay um but this can be arbitr" + }, + { + "start": 1020.399, + "text": "complicated so we can take for example" + }, + { + "start": 1022.16, + "text": "our Emoji here and we can see that the" + }, + { + "start": 1024.16, + "text": "code point for this one is" + }, + { + "start": 1026.4, + "text": "128,000 or we can take" + }, + { + "start": 1030.36, + "text": "un and this is 50,000 
now keep in mind" + }, + { + "start": 1033.72, + "text": "you can't plug in strings here because" + }, + { + "start": 1036.72, + "text": "you uh this doesn't have a single code" + }, + { + "start": 1038.439, + "text": "point it only takes a single Unicode" + }, + { + "start": 1040.679, + "text": "code Point character and tells you its" + }, + { + "start": 1043.959, + "text": "integer so in this way we can look" + }, + { + "start": 1046.799, + "text": "up all the um characters of this" + }, + { + "start": 1050.08, + "text": "specific string and their code points so" + }, + { + "start": 1052.16, + "text": "ord of x for x in this string and we get" + }, + { + "start": 1056.76, + "text": "this encoding here now see here we've" + }, + { + "start": 1060.36, + "text": "already turned the raw code points" + }, + { + "start": 1062.2, + "text": "already have integers so why can't we" + }, + { + "start": 1064.44, + "text": "simply just use these integers and not" + }, + { + "start": 1066.84, + "text": "have any tokenization at all why can't" + }, + { + "start": 1068.559, + "text": "we just use this natively as is and just" + }, + { + "start": 1070.64, + "text": "use the code Point well one reason for" + }, + { + "start": 1072.88, + "text": "that of course is that the vocabulary in" + }, + { + "start": 1074.36, + "text": "that case would be quite long so in this" + }, + { + "start": 1076.799, + "text": "case for Unicode the this is a" + }, + { + "start": 1078.679, + "text": "vocabulary of" + }, + { + "start": 1079.799, + "text": "150,000 different code points but more" + }, + { + "start": 1082.64, + "text": "worryingly than that I think the Unicode" + }, + { + "start": 1085.039, + "text": "standard is very much alive and it keeps" + }, + { + "start": 1087.039, + "text": "changing and so it's not kind of a" + }, + { + "start": 1089.24, + "text": "stable representation necessarily that" + }, + { + "start": 1091.08, + "text": "we may want to use directly so for those" + }, + { +
"start": 1093.88, + "text": "reasons we need something a bit better" + }, + { + "start": 1095.76, + "text": "so to find something better we turn to" + }, + { + "start": 1097.64, + "text": "encodings so if we go to the Wikipedia" + }, + { + "start": 1099.76, + "text": "page here we see that the Unicode" + }, + { + "start": 1101.28, + "text": "consortion defines three types of" + }, + { + "start": 1103.799, + "text": "encodings utf8 UTF 16 and UTF 32 these" + }, + { + "start": 1107.96, + "text": "encoding are the way by which we can" + }, + { + "start": 1110.72, + "text": "take Unicode text and translate it into" + }, + { + "start": 1113.48, + "text": "binary data or by streams utf8 is by far" + }, + { + "start": 1117.2, + "text": "the most common uh so this is the utf8" + }, + { + "start": 1119.96, + "text": "page now this Wikipedia page is actually" + }, + { + "start": 1122.0, + "text": "quite long but what's important for our" + }, + { + "start": 1124.4, + "text": "purposes is that utf8 takes every single" + }, + { + "start": 1126.44, + "text": "Cod point and it translates it to a by" + }, + { + "start": 1129.64, + "text": "stream and this by stream is between one" + }, + { + "start": 1132.36, + "text": "to four bytes so it's a variable length" + }, + { + "start": 1134.36, + "text": "encoding so depending on the Unicode" + }, + { + "start": 1136.48, + "text": "Point according to the schema you're" + }, + { + "start": 1138.039, + "text": "going to end up with between 1 to four" + }, + { + "start": 1139.76, + "text": "bytes for each code point on top of that" + }, + { + "start": 1143.0, + "text": "there's utf8 uh" + }, + { + "start": 1145.12, + "text": "utf16 and UTF 32 UTF 32 is nice because" + }, + { + "start": 1148.84, + "text": "it is fixed length instead of variable" + }, + { + "start": 1150.559, + "text": "length but it has many other downsides" + }, + { + "start": 1152.48, + "text": "as well so the full kind of spectrum of" + }, + { + "start": 1157.0, + 
"text": "pros and cons of all these different" + }, + { + "start": 1158.32, + "text": "three encodings are beyond the scope of" + }, + { + "start": 1160.48, + "text": "this video I just like to point out that" + }, + { + "start": 1162.52, + "text": "I enjoyed this block post and this block" + }, + { + "start": 1165.24, + "text": "post at the end of it also has a number" + }, + { + "start": 1167.039, + "text": "of references that can be quite useful" + }, + { + "start": 1169.24, + "text": "uh one of them is uh utf8 everywhere" + }, + { + "start": 1172.039, + "text": "Manifesto um and this Manifesto" + }, + { + "start": 1174.32, + "text": "describes the reason why utf8 is" + }, + { + "start": 1176.64, + "text": "significantly preferred and a lot nicer" + }, + { + "start": 1179.88, + "text": "than the other encodings and why it is" + }, + { + "start": 1181.799, + "text": "used a lot more prominently um on the" + }, + { + "start": 1185.48, + "text": "internet one of the major advantages" + }, + { + "start": 1188.08, + "text": "just just to give you a sense is that" + }, + { + "start": 1189.559, + "text": "utf8 is the only one of these that is" + }, + { + "start": 1192.0, + "text": "backwards compatible to the much simpler" + }, + { + "start": 1194.2, + "text": "asky encoding of text um but I'm not" + }, + { + "start": 1197.08, + "text": "going to go into the full detail in this" + }, + { + "start": 1198.48, + "text": "video so suffice to say that we like the" + }, + { + "start": 1201.0, + "text": "utf8 encoding and uh let's try to take" + }, + { + "start": 1203.84, + "text": "the string and see what we get if we" + }, + { + "start": 1206.039, + "text": "encoded into" + }, + { + "start": 1208.0, + "text": "utf8 the string class in Python actually" + }, + { + "start": 1210.76, + "text": "has do encode and you can give it the" + }, + { + "start": 1212.36, + "text": "encoding which is say utf8 now we get" + }, + { + "start": 1215.559, + "text": "out of this is not very 
nice because" + }, + { + "start": 1217.84, + "text": "this is the bytes is a bytes object and" + }, + { + "start": 1220.96, + "text": "it's not very nice in the way that it's" + }, + { + "start": 1222.76, + "text": "printed so I personally like to take it" + }, + { + "start": 1225.039, + "text": "through list because then we actually" + }, + { + "start": 1226.84, + "text": "get the raw bytes" + }, + { + "start": 1228.72, + "text": "of this uh encoding so this is the raw" + }, + { + "start": 1232.4, + "text": "bytes that represent this string" + }, + { + "start": 1235.6, + "text": "according to the utf8 encoding we can" + }, + { + "start": 1238.08, + "text": "also look at utf16 we get a slightly" + }, + { + "start": 1240.559, + "text": "different byte stream and we here we start" + }, + { + "start": 1243.24, + "text": "to see one of the disadvantages of utf16" + }, + { + "start": 1245.48, + "text": "you see how we have zero zero something zero" + }, + { + "start": 1247.96, + "text": "something zero something we're starting to" + }, + { + "start": 1249.679, + "text": "get a sense that this is a bit of a" + }, + { + "start": 1250.84, + "text": "wasteful encoding and indeed for simple" + }, + { + "start": 1253.919, + "text": "ASCII characters or English characters" + }, + { + "start": 1256.28, + "text": "here uh we just have the structure of 0" + }, + { + "start": 1258.559, + "text": "something zero something and it's not" + }, + { + "start": 1260.76, + "text": "exactly nice same for UTF 32 when we" + }, + { + "start": 1264.24, + "text": "expand this we can start to get a sense" + }, + { + "start": 1266.08, + "text": "of the wastefulness of this encoding for" + }, + { + "start": 1268.0, + "text": "our purposes you see a lot of zeros" + }, + { + "start": 1270.4, + "text": "followed by" + }, + { + "start": 1271.4, + "text": "something and so uh this is not" + }, + { + "start": 1274.84, + "text": "desirable so suffice it to say that we" + }, + { + "start": 1277.84, + "text": "would like to
stick with utf8 for our" + }, + { + "start": 1280.88, + "text": "purposes however if we just use utf8" + }, + { + "start": 1283.88, + "text": "naively these are by streams so that" + }, + { + "start": 1286.4, + "text": "would imply a vocabulary length of only" + }, + { + "start": 1289.24, + "text": "256 possible tokens uh but this this" + }, + { + "start": 1293.12, + "text": "vocabulary size is very very small what" + }, + { + "start": 1295.32, + "text": "this is going to do if we just were to" + }, + { + "start": 1296.679, + "text": "use it naively is that all of our text" + }, + { + "start": 1299.88, + "text": "would be stretched out over very very" + }, + { + "start": 1301.919, + "text": "long sequences of bytes and so" + }, + { + "start": 1306.159, + "text": "um what what this does is that certainly" + }, + { + "start": 1309.32, + "text": "the embeding table is going to be tiny" + }, + { + "start": 1311.0, + "text": "and the prediction at the top at the" + }, + { + "start": 1312.32, + "text": "final layer is going to be very tiny but" + }, + { + "start": 1314.159, + "text": "our sequences are very long and remember" + }, + { + "start": 1316.44, + "text": "that we have pretty finite um context" + }, + { + "start": 1319.32, + "text": "length and the attention that we can" + }, + { + "start": 1321.0, + "text": "support in a transformer for" + }, + { + "start": 1322.76, + "text": "computational reasons and so we only" + }, + { + "start": 1325.52, + "text": "have as much context length but now we" + }, + { + "start": 1327.48, + "text": "have very very long sequences and this" + }, + { + "start": 1329.44, + "text": "is just inefficient and it's not going" + }, + { + "start": 1330.799, + "text": "to allow us to attend to sufficiently" + }, + { + "start": 1332.799, + "text": "long text uh before us for the purposes" + }, + { + "start": 1335.64, + "text": "of the next token prediction task so we" + }, + { + "start": 1338.36, + "text": "don't want to use the raw bytes of 
the" + }, + { + "start": 1341.6, + "text": "utf8 encoding we want to be able to" + }, + { + "start": 1344.2, + "text": "support larger vocabulary size that we" + }, + { + "start": 1346.919, + "text": "can tune as a hyperparameter" + }, + { + "start": 1348.64, + "text": "but we want to stick with the utf8" + }, + { + "start": 1350.84, + "text": "encoding of these strings so what do we" + }, + { + "start": 1353.559, + "text": "do well the answer of course is we turn" + }, + { + "start": 1355.48, + "text": "to the byte pair encoding algorithm" + }, + { + "start": 1357.44, + "text": "which will allow us to compress these" + }, + { + "start": 1359.08, + "text": "byte sequences um to a variable amount" + }, + { + "start": 1362.6, + "text": "so we'll get to that in a bit but I just" + }, + { + "start": 1364.679, + "text": "want to briefly speak to the fact that I" + }, + { + "start": 1367.12, + "text": "would love nothing more than to be able" + }, + { + "start": 1369.279, + "text": "to feed raw byte sequences into uh" + }, + { + "start": 1372.96, + "text": "language models in fact there's a paper" + }, + { + "start": 1374.88, + "text": "about how this could potentially be done" + }, + { + "start": 1377.08, + "text": "uh from Summer last last year now the" + }, + { + "start": 1379.279, + "text": "problem is you actually have to go in" + }, + { + "start": 1380.96, + "text": "and you have to modify the Transformer" + }, + { + "start": 1382.279, + "text": "architecture because as I mentioned" + }, + { + "start": 1384.48, + "text": "you're going to have a problem where the" + }, + { + "start": 1386.64, + "text": "attention will start to become extremely" + }, + { + "start": 1388.24, + "text": "expensive because the sequences are so" + }, + { + "start": 1390.36, + "text": "long and so in this paper they propose" + }, + { + "start": 1393.44, + "text": "kind of a hierarchical structuring of" + }, + { + "start": 1395.76, + "text": "the Transformer that could allow you to" + }, + { +
"start": 1397.64, + "text": "just feed in raw bites and so at the end" + }, + { + "start": 1400.36, + "text": "they say together these results" + }, + { + "start": 1401.919, + "text": "establish the viability of tokenization" + }, + { + "start": 1403.64, + "text": "free autor regressive sequence modeling" + }, + { + "start": 1405.32, + "text": "at scale so tokenization free would" + }, + { + "start": 1407.4, + "text": "indeed be amazing we would just feed B" + }, + { + "start": 1410.279, + "text": "streams directly into our models but" + }, + { + "start": 1412.279, + "text": "unfortunately I don't know that this has" + }, + { + "start": 1414.159, + "text": "really been proven out yet by" + }, + { + "start": 1416.08, + "text": "sufficiently many groups and a" + }, + { + "start": 1417.24, + "text": "sufficient scale uh but something like" + }, + { + "start": 1419.24, + "text": "this at one point would be amazing and I" + }, + { + "start": 1420.679, + "text": "hope someone comes up with it but for" + }, + { + "start": 1422.32, + "text": "now we have to come back and we can't" + }, + { + "start": 1424.44, + "text": "feed this directly into language models" + }, + { + "start": 1426.44, + "text": "and we have to compress it using the B" + }, + { + "start": 1428.279, + "text": "paare encoding algorithm so let's see" + }, + { + "start": 1429.84, + "text": "how that works so as I mentioned the B" + }, + { + "start": 1431.64, + "text": "paare encoding algorithm is not all that" + }, + { + "start": 1433.52, + "text": "complicated and the Wikipedia page is" + }, + { + "start": 1435.52, + "text": "actually quite instructive as far as the" + }, + { + "start": 1437.159, + "text": "basic idea goes go what we're doing is" + }, + { + "start": 1439.48, + "text": "we have some kind of a input sequence uh" + }, + { + "start": 1441.76, + "text": "like for example here we have only four" + }, + { + "start": 1443.64, + "text": "elements in our vocabulary a b c and d" + }, + { + "start": 
1446.32, + "text": "and we have a sequence of them so" + }, + { + "start": 1448.0, + "text": "instead of bytes let's say we just have" + }, + { + "start": 1449.76, + "text": "four a vocab size of" + }, + { + "start": 1452.039, + "text": "four the sequence is too long and we'd" + }, + { + "start": 1454.12, + "text": "like to compress it so what we do is" + }, + { + "start": 1456.159, + "text": "that we iteratively find the pair of uh" + }, + { + "start": 1460.159, + "text": "tokens that occur the most" + }, + { + "start": 1463.44, + "text": "frequently and then once we've" + }, + { + "start": 1465.279, + "text": "identified that pair we repl replace" + }, + { + "start": 1468.48, + "text": "that pair with just a single new token" + }, + { + "start": 1470.88, + "text": "that we append to our vocabulary so for" + }, + { + "start": 1473.559, + "text": "example here the bite pair AA occurs" + }, + { + "start": 1476.279, + "text": "most often so we mint a new token let's" + }, + { + "start": 1478.919, + "text": "call it capital Z and we replace every" + }, + { + "start": 1481.679, + "text": "single occurrence of AA by Z so now we" + }, + { + "start": 1486.0, + "text": "have two Z's here so here we took a" + }, + { + "start": 1488.919, + "text": "sequence of 11 characters with" + }, + { + "start": 1491.799, + "text": "vocabulary size four and we've converted" + }, + { + "start": 1494.44, + "text": "it to a um sequence of only nine tokens" + }, + { + "start": 1498.64, + "text": "but now with a vocabulary of five" + }, + { + "start": 1500.559, + "text": "because we have a fifth vocabulary" + }, + { + "start": 1502.399, + "text": "element that we just created and it's Z" + }, + { + "start": 1504.96, + "text": "standing for concatination of AA and we" + }, + { + "start": 1507.52, + "text": "can again repeat this process so we" + }, + { + "start": 1510.24, + "text": "again look at the sequence and identify" + }, + { + "start": 1512.88, + "text": "the pair of tokens that are 
most" + }, + { + "start": 1515.64, + "text": "frequent let's say that that is now AB" + }, + { + "start": 1519.159, + "text": "well we are going to replace AB with a" + }, + { + "start": 1520.76, + "text": "new token that we mint call Y so y" + }, + { + "start": 1523.76, + "text": "becomes ab and then every single" + }, + { + "start": 1525.24, + "text": "occurrence of ab is now replaced with y" + }, + { + "start": 1528.039, + "text": "so we end up with this so now we only" + }, + { + "start": 1531.44, + "text": "have 1 2 3 4 5 6 seven characters in our" + }, + { + "start": 1535.159, + "text": "sequence but we have not just um four" + }, + { + "start": 1540.12, + "text": "vocabulary elements or five but now we" + }, + { + "start": 1542.32, + "text": "have six and for the final round we" + }, + { + "start": 1545.799, + "text": "again look through the sequence find" + }, + { + "start": 1547.64, + "text": "that the phrase zy or the pair zy is" + }, + { + "start": 1550.559, + "text": "most common and replace it one more time" + }, + { + "start": 1553.32, + "text": "with another um character let's say x so" + }, + { + "start": 1556.64, + "text": "X is z y and we replace all occurrences of zy" + }, + { + "start": 1559.919, + "text": "and we get this following sequence so" + }, + { + "start": 1562.12, + "text": "basically after we have gone through" + }, + { + "start": 1563.6, + "text": "this process instead of having a um" + }, + { + "start": 1568.48, + "text": "sequence of" + }, + { + "start": 1569.76, + "text": "11 uh tokens with a vocabulary length of" + }, + { + "start": 1573.64, + "text": "four we now have a sequence of 1 2 3" + }, + { + "start": 1578.159, + "text": "four five tokens but our vocabulary" + }, + { + "start": 1581.48, + "text": "length now is seven and so in this way" + }, + { + "start": 1585.159, + "text": "we can iteratively compress our sequence" + }, + { + "start": 1587.44, + "text": "as we mint new tokens so in the in the" + }, + { + "start": 1590.279,
+ "text": "exact same way we start we start out" + }, + { + "start": 1592.399, + "text": "with bite sequences so we have 256" + }, + { + "start": 1596.24, + "text": "vocabulary size but we're now going to" + }, + { + "start": 1598.2, + "text": "go through these and find the bite pairs" + }, + { + "start": 1600.64, + "text": "that occur the most and we're going to" + }, + { + "start": 1602.559, + "text": "iteratively start minting new tokens" + }, + { + "start": 1604.84, + "text": "appending them to our vocabulary and" + }, + { + "start": 1606.76, + "text": "replacing things and in this way we're" + }, + { + "start": 1608.88, + "text": "going to end up with a compressed" + }, + { + "start": 1610.24, + "text": "training data set and also an algorithm" + }, + { + "start": 1612.96, + "text": "for taking any arbitrary sequence and" + }, + { + "start": 1615.279, + "text": "encoding it using this uh vocabul" + }, + { + "start": 1618.24, + "text": "and also decoding it back to Strings so" + }, + { + "start": 1621.0, + "text": "let's now Implement all that so here's" + }, + { + "start": 1623.24, + "text": "what I did I went to this block post" + }, + { + "start": 1625.679, + "text": "that I enjoyed and I took the first" + }, + { + "start": 1627.32, + "text": "paragraph and I copy pasted it here into" + }, + { + "start": 1630.0, + "text": "text so this is one very long line" + }, + { + "start": 1633.279, + "text": "here now to get the tokens as I" + }, + { + "start": 1635.96, + "text": "mentioned we just take our text and we" + }, + { + "start": 1637.36, + "text": "encode it into utf8 the tokens here at" + }, + { + "start": 1640.159, + "text": "this point will be a raw bites single" + }, + { + "start": 1642.76, + "text": "stream of bytes and just so that it's" + }, + { + "start": 1645.6, + "text": "easier to work with instead of just a" + }, + { + "start": 1647.64, + "text": "bytes object I'm going to convert all" + }, + { + "start": 1649.96, + "text": "those bytes to 
integers and then create" + }, + { + "start": 1652.64, + "text": "a list of it just so it's easier for us" + }, + { + "start": 1654.279, + "text": "to manipulate and work with in Python" + }, + { + "start": 1655.88, + "text": "and visualize and here I'm printing all" + }, + { + "start": 1658.0, + "text": "of that so this is the original um this" + }, + { + "start": 1662.08, + "text": "is the original paragraph and its length" + }, + { + "start": 1665.0, + "text": "is" + }, + { + "start": 1665.799, + "text": "533 uh code points and then here are the" + }, + { + "start": 1669.799, + "text": "bytes encoded in ut utf8 and we see that" + }, + { + "start": 1673.32, + "text": "this has a length of 616 bytes at this" + }, + { + "start": 1676.32, + "text": "point or 616 tokens and the reason this" + }, + { + "start": 1679.039, + "text": "is more is because a lot of these simple" + }, + { + "start": 1681.84, + "text": "ASCII characters or simple characters" + }, + { + "start": 1684.6, + "text": "they just become a single byte but a lot" + }, + { + "start": 1686.44, + "text": "of these Unicode more complex characters" + }, + { + "start": 1688.76, + "text": "become multiple bytes up to four and so" + }, + { + "start": 1691.08, + "text": "we are expanding that" + }, + { + "start": 1692.76, + "text": "size so now what we'd like to do as a" + }, + { + "start": 1694.799, + "text": "first step of the algorithm is we'd like" + }, + { + "start": 1696.24, + "text": "to iterate over here and find the pair" + }, + { + "start": 1698.919, + "text": "of bytes that occur most frequently" + }, + { + "start": 1702.0, + "text": "because we're then going to merge it so" + }, + { + "start": 1704.12, + "text": "if you are working along on a notebook on" + }, + { + "start": 1705.799, + "text": "a side then I encourage you to basically" + }, + { + "start": 1707.76, + "text": "click on the link find this notebook and" + }, + { + "start": 1709.919, + "text": "try to write that function yourself" + }, +
{ + "start": 1711.88, + "text": "otherwise I'm going to come here and" + }, + { + "start": 1712.96, + "text": "Implement first the function that finds" + }, + { + "start": 1714.96, + "text": "the most common pair okay so here's what" + }, + { + "start": 1716.919, + "text": "I came up with there are many different" + }, + { + "start": 1718.399, + "text": "ways to implement this but I'm calling" + }, + { + "start": 1720.32, + "text": "the function get stats it expects a list" + }, + { + "start": 1722.159, + "text": "of integers I'm using a dictionary to" + }, + { + "start": 1724.48, + "text": "keep track of basically the counts and" + }, + { + "start": 1726.88, + "text": "then this is a pythonic way to iterate" + }, + { + "start": 1728.84, + "text": "consecutive elements of this list uh" + }, + { + "start": 1731.44, + "text": "which we covered in the previous video" + }, + { + "start": 1733.72, + "text": "and then here I'm just keeping track of" + }, + { + "start": 1735.919, + "text": "just incrementing by one um for all the" + }, + { + "start": 1738.559, + "text": "pairs so if I call this on all the" + }, + { + "start": 1740.399, + "text": "tokens here then the stats comes out" + }, + { + "start": 1743.399, + "text": "here so this is the dictionary the keys" + }, + { + "start": 1746.159, + "text": "are these topples of consecutive" + }, + { + "start": 1748.919, + "text": "elements and this is the count so just" + }, + { + "start": 1751.6, + "text": "to uh print it in a slightly better way" + }, + { + "start": 1754.679, + "text": "this is one way that I like to do that" + }, + { + "start": 1757.6, + "text": "where you it's a little bit compound" + }, + { + "start": 1760.559, + "text": "here so you can pause if you like but we" + }, + { + "start": 1762.36, + "text": "iterate all all the items the items" + }, + { + "start": 1765.039, + "text": "called on dictionary returns pairs of" + }, + { + "start": 1767.399, + "text": "key value and instead I create a list" + }, + 
{ + "start": 1771.799, + "text": "here of value key because if it's a" + }, + { + "start": 1775.12, + "text": "value key list then I can call sort on" + }, + { + "start": 1777.279, + "text": "it and by default python will uh use the" + }, + { + "start": 1781.36, + "text": "first element which in this case will be" + }, + { + "start": 1783.559, + "text": "value to sort by if it's given tuples and" + }, + { + "start": 1786.64, + "text": "then reverse so it's descending and" + }, + { + "start": 1788.72, + "text": "print that so basically it looks like" + }, + { + "start": 1790.88, + "text": "101 comma 32 was the most commonly" + }, + { + "start": 1793.96, + "text": "occurring consecutive pair and it" + }, + { + "start": 1795.72, + "text": "occurred 20 times we can double check" + }, + { + "start": 1798.2, + "text": "that that makes reasonable sense so if I" + }, + { + "start": 1800.44, + "text": "just search" + }, + { + "start": 1802.08, + "text": "10132 then you see that these are the 20" + }, + { + "start": 1805.2, + "text": "occurrences of that um pair and if we'd" + }, + { + "start": 1810.12, + "text": "like to take a look at what exactly that" + }, + { + "start": 1811.519, + "text": "pair is we can use chr which is the" + }, + { + "start": 1814.279, + "text": "opposite of ord in Python so we give it a" + }, + { + "start": 1817.84, + "text": "um Unicode code point so 101 and of 32" + }, + { + "start": 1822.039, + "text": "and we see that this is e and space so" + }, + { + "start": 1825.0, + "text": "basically there's a lot of E space here" + }, + { + "start": 1828.08, + "text": "meaning that a lot of these words seem" + }, + { + "start": 1829.48, + "text": "to end with e so here's e space as an" + }, + { + "start": 1832.12, + "text": "example so there's a lot of that going" + }, + { + "start": 1834.039, + "text": "on here and this is the most common pair" + }, + { + "start": 1836.72, + "text": "so now that we've identified the most" + }, + { + "start": 1838.24, +
"text": "common pair we would like to iterate" + }, + { + "start": 1840.36, + "text": "over this sequence we're going to Mint a" + }, + { + "start": 1842.679, + "text": "new token with the ID of" + }, + { + "start": 1844.799, + "text": "256 right because these tokens currently" + }, + { + "start": 1847.84, + "text": "go from Z to 255 so when we create a new" + }, + { + "start": 1850.64, + "text": "token it will have an ID of" + }, + { + "start": 1852.84, + "text": "256 and we're going to iterate over this" + }, + { + "start": 1856.0, + "text": "entire um list and every every time we" + }, + { + "start": 1859.84, + "text": "see 101 comma 32 we're going to swap" + }, + { + "start": 1862.72, + "text": "that out for" + }, + { + "start": 1863.919, + "text": "256 so let's Implement that now and feel" + }, + { + "start": 1867.24, + "text": "free to uh do that yourself as well so" + }, + { + "start": 1869.96, + "text": "first I commented uh this just so we" + }, + { + "start": 1871.96, + "text": "don't pollute uh the notebook too much" + }, + { + "start": 1874.96, + "text": "this is a nice way of in Python" + }, + { + "start": 1877.96, + "text": "obtaining the highest ranking pair so" + }, + { + "start": 1880.399, + "text": "we're basically calling the Max on this" + }, + { + "start": 1883.08, + "text": "dictionary stats and this will return" + }, + { + "start": 1886.32, + "text": "the maximum" + }, + { + "start": 1887.679, + "text": "key and then the question is how does it" + }, + { + "start": 1890.159, + "text": "rank keys so you can provide it with a" + }, + { + "start": 1892.84, + "text": "function that ranks keys and that" + }, + { + "start": 1895.2, + "text": "function is just stats. getet uh stats." 
+ }, + { + "start": 1898.2, + "text": "getet would basically return the value" + }, + { + "start": 1901.12, + "text": "and so we're ranking by the value and" + }, + { + "start": 1902.799, + "text": "getting the maximum key so it's 101" + }, + { + "start": 1905.48, + "text": "comma 32 as we saw now to actually merge" + }, + { + "start": 1909.2, + "text": "10132 um this is the function that I" + }, + { + "start": 1911.88, + "text": "wrote but again there are many different" + }, + { + "start": 1913.279, + "text": "versions of it so we're going to take a" + }, + { + "start": 1915.72, + "text": "list of IDs and the the pair that we" + }, + { + "start": 1917.72, + "text": "want to replace and that pair will be" + }, + { + "start": 1919.76, + "text": "replaced with the new index" + }, + { + "start": 1922.24, + "text": "idx so iterating through IDs if we find" + }, + { + "start": 1925.559, + "text": "the pair swap it out for idx so we" + }, + { + "start": 1928.44, + "text": "create this new list and then we start" + }, + { + "start": 1930.519, + "text": "at zero and then we go through this" + }, + { + "start": 1932.76, + "text": "entire list sequentially from left to" + }, + { + "start": 1934.84, + "text": "right and here we are checking for" + }, + { + "start": 1937.12, + "text": "equality at the current position with" + }, + { + "start": 1939.639, + "text": "the" + }, + { + "start": 1940.88, + "text": "pair um so here we are checking that the" + }, + { + "start": 1943.399, + "text": "pair matches now here is a bit of a" + }, + { + "start": 1945.48, + "text": "tricky condition that you have to append" + }, + { + "start": 1947.24, + "text": "if you're trying to be careful and that" + }, + { + "start": 1949.08, + "text": "is that um you don't want this here to" + }, + { + "start": 1951.679, + "text": "be out of Bounds at the very last" + }, + { + "start": 1953.76, + "text": "position when you're on the rightmost" + }, + { + "start": 1955.399, + "text": "element of this 
list otherwise this" + }, + { + "start": 1957.12, + "text": "would uh give you an out of bounds error" + }, + { + "start": 1959.279, + "text": "so we have to make sure that we're not" + }, + { + "start": 1960.679, + "text": "at the very very last element so uh this" + }, + { + "start": 1964.039, + "text": "would be false for that so if we find a" + }, + { + "start": 1966.6, + "text": "match we append to this new list that" + }, + { + "start": 1971.08, + "text": "replacement index and we increment the" + }, + { + "start": 1973.32, + "text": "position by two so we skip over that" + }, + { + "start": 1974.799, + "text": "entire pair but otherwise if we we" + }, + { + "start": 1977.12, + "text": "haven't found a matching pair we just" + }, + { + "start": 1979.08, + "text": "sort of copy over the um element at that" + }, + { + "start": 1982.12, + "text": "position and increment by one then" + }, + { + "start": 1985.24, + "text": "return this so here's a very small toy" + }, + { + "start": 1987.36, + "text": "example if we have a list 566 791 and we" + }, + { + "start": 1990.36, + "text": "want to replace the occurrences of 67" + }, + { + "start": 1992.36, + "text": "with 99 then calling this on that will" + }, + { + "start": 1996.36, + "text": "give us what we're asking for so here" + }, + { + "start": 1998.919, + "text": "the 67 is replaced with" + }, + { + "start": 2001.519, + "text": "99 so now I'm going to uncomment this" + }, + { + "start": 2003.76, + "text": "for our actual use case where we want to" + }, + { + "start": 2007.279, + "text": "take our tokens we want to take the top" + }, + { + "start": 2009.519, + "text": "pair here and replace it with 256 to get" + }, + { + "start": 2013.12, + "text": "tokens2 if we run this we get the" + }, + { + "start": 2017.24, + "text": "following so recall that previously we" + }, + { + "start": 2020.88, + "text": "had a length 616 in this list and now we" + }, + { + "start": 2025.12, + "text": "have a length 596 right so
this" + }, + { + "start": 2028.44, + "text": "decreased by 20 which makes sense" + }, + { + "start": 2030.159, + "text": "because there are 20 occurrences" + }, + { + "start": 2032.36, + "text": "moreover we can try to find 256 here and" + }, + { + "start": 2035.48, + "text": "we see plenty of occurrences of it" + }, + { + "start": 2038.44, + "text": "and moreover just double check there" + }, + { + "start": 2039.76, + "text": "should be no occurrence of 10132 so this" + }, + { + "start": 2042.519, + "text": "is the original array plenty of them and" + }, + { + "start": 2045.0, + "text": "in the second array there are no" + }, + { + "start": 2046.159, + "text": "occurrences of 10132 so we've" + }, + { + "start": 2048.52, + "text": "successfully merged this single pair and" + }, + { + "start": 2051.599, + "text": "now we just uh iterate this so we are" + }, + { + "start": 2053.919, + "text": "going to go over the sequence again find" + }, + { + "start": 2055.48, + "text": "the most common pair and replace it so" + }, + { + "start": 2057.8, + "text": "let me now write a while loop that uses" + }, + { + "start": 2059.48, + "text": "these functions to do this um sort of" + }, + { + "start": 2061.8, + "text": "iteratively and how many times do we do" + }, + { + "start": 2064.28, + "text": "it for well that's totally up to us as" + }, + { + "start": 2066.28, + "text": "a hyper parameter" + }, + { + "start": 2067.399, + "text": "the more um steps we take the larger" + }, + { + "start": 2070.919, + "text": "will be our vocabulary and the shorter" + }, + { + "start": 2073.04, + "text": "will be our sequence and there is some" + }, + { + "start": 2075.119, + "text": "sweet spot that we usually find works" + }, + { + "start": 2077.24, + "text": "the best in practice and so this is kind" + }, + { + "start": 2079.919, + "text": "of a hyperparameter and we tune it and" + }, + { + "start": 2081.639, + "text": "we find good vocabulary sizes as an" + }, + { + "start": 2084.2, +
"text": "example gp4 currently uses roughly" + }, + { + "start": 2086.0, + "text": "100,000 tokens and um bpark that those" + }, + { + "start": 2089.879, + "text": "are reasonable numbers currently instead" + }, + { + "start": 2091.8, + "text": "the are large language models so let me" + }, + { + "start": 2093.919, + "text": "now write uh putting putting it all" + }, + { + "start": 2095.96, + "text": "together and uh iterating these steps" + }, + { + "start": 2098.68, + "text": "okay now before we dive into the Y loop" + }, + { + "start": 2100.52, + "text": "I wanted to add one more cell here where" + }, + { + "start": 2103.28, + "text": "I went to the block post and instead of" + }, + { + "start": 2104.96, + "text": "grabbing just the first paragraph or two" + }, + { + "start": 2107.0, + "text": "I took the entire block post and I" + }, + { + "start": 2108.8, + "text": "stretched it out in a single line and" + }, + { + "start": 2110.96, + "text": "basically just using longer text will" + }, + { + "start": 2112.48, + "text": "allow us to have more representative" + }, + { + "start": 2113.88, + "text": "statistics for the bite Pairs and we'll" + }, + { + "start": 2116.28, + "text": "just get a more sensible results out of" + }, + { + "start": 2118.04, + "text": "it because it's longer text um so here" + }, + { + "start": 2121.76, + "text": "we have the raw text we encode it into" + }, + { + "start": 2124.359, + "text": "bytes using the utf8 encoding" + }, + { + "start": 2127.64, + "text": "and then here as before we are just" + }, + { + "start": 2130.079, + "text": "changing it into a list of integers in" + }, + { + "start": 2131.839, + "text": "Python just so it's easier to work with" + }, + { + "start": 2133.96, + "text": "instead of the raw byes objects and then" + }, + { + "start": 2136.68, + "text": "this is the code that I came up with uh" + }, + { + "start": 2140.76, + "text": "to actually do the merging in Loop these" + }, + { + "start": 2144.0, + "text": 
"two functions here are identical to what" + }, + { + "start": 2145.839, + "text": "we had above I only included them here" + }, + { + "start": 2148.119, + "text": "just so that you have the point of" + }, + { + "start": 2149.88, + "text": "reference here so uh these two are" + }, + { + "start": 2153.359, + "text": "identical and then this is the new code" + }, + { + "start": 2155.0, + "text": "that I added so the first first thing we" + }, + { + "start": 2157.079, + "text": "want to do is we want to decide on the" + }, + { + "start": 2158.56, + "text": "final vocabulary size that we want our" + }, + { + "start": 2161.04, + "text": "tokenizer to have and as I mentioned" + }, + { + "start": 2162.96, + "text": "this is a hyper parameter and you set it" + }, + { + "start": 2164.52, + "text": "in some way depending on your best" + }, + { + "start": 2166.44, + "text": "performance so let's say for us we're" + }, + { + "start": 2168.48, + "text": "going to use 276 because that way we're" + }, + { + "start": 2170.839, + "text": "going to be doing exactly 20" + }, + { + "start": 2173.079, + "text": "merges and uh 20 merges because we" + }, + { + "start": 2175.72, + "text": "already have" + }, + { + "start": 2176.88, + "text": "256 tokens for the raw bytes and to" + }, + { + "start": 2180.88, + "text": "reach 276 we have to do 20 merges uh to" + }, + { + "start": 2183.68, + "text": "add 20 new" + }, + { + "start": 2185.48, + "text": "tokens here uh this is uh one way in" + }, + { + "start": 2188.2, + "text": "Python to just create a copy of a list" + }, + { + "start": 2191.48, + "text": "so I'm taking the tokens list and by" + }, + { + "start": 2193.52, + "text": "wrapping it in a list python will" + }, + { + "start": 2195.839, + "text": "construct a new list of all the" + }, + { + "start": 2197.16, + "text": "individual elements so this is just a" + }, + { + "start": 2198.64, + "text": "copy" + }, + { + "start": 2199.92, + "text": "operation then here I'm creating a" + }, 
+ { + "start": 2202.079, + "text": "merges uh dictionary so this merges" + }, + { + "start": 2204.839, + "text": "dictionary is going to maintain" + }, + { + "start": 2206.119, + "text": "basically the child one child two" + }, + { + "start": 2209.4, + "text": "mapping to a new uh token and so what" + }, + { + "start": 2212.52, + "text": "we're going to be building up here is a" + }, + { + "start": 2213.92, + "text": "binary tree of merges but actually it's" + }, + { + "start": 2216.92, + "text": "not exactly a tree because a tree would" + }, + { + "start": 2219.28, + "text": "have a single root node with a bunch of" + }, + { + "start": 2221.44, + "text": "leaves for us we're starting with the" + }, + { + "start": 2223.44, + "text": "leaves on the bottom which are the" + }, + { + "start": 2225.0, + "text": "individual bites those are the starting" + }, + { + "start": 2226.92, + "text": "256 tokens and then we're starting to" + }, + { + "start": 2229.52, + "text": "like merge two of them at a time and so" + }, + { + "start": 2231.52, + "text": "it's not a tree it's more like a forest" + }, + { + "start": 2234.96, + "text": "um uh as we merge these elements" + }, + { + "start": 2238.92, + "text": "so for 20 merges we're going to find the" + }, + { + "start": 2242.88, + "text": "most commonly occurring pair we're going" + }, + { + "start": 2245.079, + "text": "to Mint a new token integer for it so I" + }, + { + "start": 2248.48, + "text": "here will start at zero so we'll going" + }, + { + "start": 2250.079, + "text": "to start at 256 we're going to print" + }, + { + "start": 2252.359, + "text": "that we're merging it and we're going to" + }, + { + "start": 2254.44, + "text": "replace all of the occurrences of that" + }, + { + "start": 2256.2, + "text": "pair with the new new lied token and" + }, + { + "start": 2259.56, + "text": "we're going to record that this pair of" + }, + { + "start": 2262.16, + "text": "integers merged into this new" + }, + { + "start": 
2265.52, + "text": "integer so running this gives us the" + }, + { + "start": 2269.079, + "text": "following" + }, + { + "start": 2271.16, + "text": "output so we did 20 merges and for" + }, + { + "start": 2274.48, + "text": "example the first merge was exactly as" + }, + { + "start": 2276.839, + "text": "before the" + }, + { + "start": 2278.839, + "text": "101, 32 um tokens merging into a new token" + }, + { + "start": 2281.8, + "text": "256 now keep in mind that the" + }, + { + "start": 2284.0, + "text": "individual uh tokens 101 and 32 can" + }, + { + "start": 2286.599, + "text": "still occur in the sequence after" + }, + { + "start": 2288.44, + "text": "merging it's only when they occur" + }, + { + "start": 2290.359, + "text": "exactly consecutively that that becomes" + }, + { + "start": 2292.599, + "text": "256" + }, + { + "start": 2293.88, + "text": "now um and in particular the other thing" + }, + { + "start": 2296.92, + "text": "to notice here is that the token 256" + }, + { + "start": 2299.16, + "text": "which is the newly minted token is also" + }, + { + "start": 2301.4, + "text": "eligible for merging so here on the" + }, + { + "start": 2303.4, + "text": "bottom the 20th merge was a merge of 256" + }, + { + "start": 2306.839, + "text": "and 259 becoming" + }, + { + "start": 2308.88, + "text": "275 so every time we replace these" + }, + { + "start": 2311.8, + "text": "tokens they become eligible for merging" + }, + { + "start": 2313.64, + "text": "in the next round of the iteration so" + }, + { + "start": 2315.92, + "text": "that's why we're building up a small" + }, + { + "start": 2317.119, + "text": "sort of binary Forest instead of a" + }, + { + "start": 2318.8, + "text": "single individual" + }, + { + "start": 2320.2, + "text": "tree one thing we can take a look at as" + }, + { + "start": 2322.319, + "text": "well is we can take a look at the" + }, + { + "start": 2324.0, + "text": "compression ratio that we've achieved so" + }, + { + "start": 2326.16, +
"text": "in particular we started off with this" + }, + { + "start": 2328.359, + "text": "tokens list um so we started off with" + }, + { + "start": 2331.4, + "text": "24,000 bytes and after merging 20 times" + }, + { + "start": 2336.28, + "text": "uh we now have only" + }, + { + "start": 2338.52, + "text": "19,000 um tokens and so therefore the" + }, + { + "start": 2341.92, + "text": "compression ratio simply just dividing" + }, + { + "start": 2343.64, + "text": "the two is roughly 1.27 so that's the" + }, + { + "start": 2346.8, + "text": "amount of compression we were able to" + }, + { + "start": 2347.96, + "text": "achieve of this text with only 20" + }, + { + "start": 2350.8, + "text": "merges um and of course the more" + }, + { + "start": 2353.119, + "text": "vocabulary elements you add uh the" + }, + { + "start": 2355.599, + "text": "greater the compression ratio here would" + }, + { + "start": 2359.24, + "text": "be finally so that's kind of like um the" + }, + { + "start": 2363.76, + "text": "training of the tokenizer if you will" + }, + { + "start": 2365.72, + "text": "now 1 Point I wanted to make is that and" + }, + { + "start": 2368.28, + "text": "maybe this is a diagram that can help um" + }, + { + "start": 2371.28, + "text": "kind of illustrate is that tokenizer is" + }, + { + "start": 2373.079, + "text": "a completely separate object from the" + }, + { + "start": 2374.92, + "text": "large language model itself so" + }, + { + "start": 2377.0, + "text": "everything in this lecture we're not" + }, + { + "start": 2378.04, + "text": "really touching the llm itself uh we're" + }, + { + "start": 2380.119, + "text": "just training the tokenizer this is a" + }, + { + "start": 2381.839, + "text": "completely separate pre-processing stage" + }, + { + "start": 2383.92, + "text": "usually so the tokenizer will have its" + }, + { + "start": 2386.24, + "text": "own training set just like a large" + }, + { + "start": 2387.96, + "text": "language model has a 
potentially" + }, + { + "start": 2389.8, + "text": "different training set so the tokenizer" + }, + { + "start": 2392.04, + "text": "has a training set of documents on which" + }, + { + "start": 2393.4, + "text": "you're going to train the" + }, + { + "start": 2394.76, + "text": "tokenizer and then and um we're" + }, + { + "start": 2397.76, + "text": "performing The Bite pair encoding" + }, + { + "start": 2398.96, + "text": "algorithm as we saw above to train the" + }, + { + "start": 2401.079, + "text": "vocabulary of this" + }, + { + "start": 2402.64, + "text": "tokenizer so it has its own training set" + }, + { + "start": 2404.96, + "text": "it is a pre-processing stage that you" + }, + { + "start": 2406.52, + "text": "would run a single time in the beginning" + }, + { + "start": 2409.24, + "text": "um and the tokenizer is trained using" + }, + { + "start": 2411.96, + "text": "bipar coding algorithm once you have the" + }, + { + "start": 2414.359, + "text": "tokenizer once it's trained and you have" + }, + { + "start": 2416.319, + "text": "the vocabulary and you have the merges" + }, + { + "start": 2419.04, + "text": "uh we can do both encoding and decoding" + }, + { + "start": 2422.28, + "text": "so these two arrows here so the" + }, + { + "start": 2424.52, + "text": "tokenizer is a translation layer between" + }, + { + "start": 2427.0, + "text": "raw text which is as we saw the sequence" + }, + { + "start": 2430.04, + "text": "of Unicode code points it can take raw" + }, + { + "start": 2432.52, + "text": "text and turn it into a token sequence" + }, + { + "start": 2435.44, + "text": "and vice versa it can take a token" + }, + { + "start": 2437.0, + "text": "sequence and translate it back into raw" + }, + { + "start": 2440.76, + "text": "text so now that we have trained uh" + }, + { + "start": 2443.359, + "text": "tokenizer and we have these merges we" + }, + { + "start": 2445.96, + "text": "are going to turn to how we can do the" + }, + { + "start": 2447.44, + 
"text": "encoding and the decoding step if you" + }, + { + "start": 2449.48, + "text": "give me text here are the tokens and" + }, + { + "start": 2451.24, + "text": "vice versa if you give me tokens here's" + }, + { + "start": 2453.0, + "text": "the text once we have that we can" + }, + { + "start": 2455.28, + "text": "translate between these two Realms and" + }, + { + "start": 2457.52, + "text": "then the language model is going to be" + }, + { + "start": 2458.76, + "text": "trained as a step two afterwards and" + }, + { + "start": 2461.64, + "text": "typically in a in a sort of a" + }, + { + "start": 2463.64, + "text": "state-of-the-art application you might" + }, + { + "start": 2465.48, + "text": "take all of your training data for the" + }, + { + "start": 2466.839, + "text": "language model and you might run it" + }, + { + "start": 2468.359, + "text": "through the tokenizer and sort of" + }, + { + "start": 2470.4, + "text": "translate everything into a massive" + }, + { + "start": 2471.92, + "text": "token sequence and then you can throw" + }, + { + "start": 2473.64, + "text": "away the raw text you're just left with" + }, + { + "start": 2475.44, + "text": "the tokens themselves and those are" + }, + { + "start": 2477.72, + "text": "stored on disk and that is what the" + }, + { + "start": 2479.72, + "text": "large language model is actually reading" + }, + { + "start": 2481.319, + "text": "when it's training on them so this one" + }, + { + "start": 2483.24, + "text": "approach that you can take as a single" + }, + { + "start": 2484.8, + "text": "massive pre-processing step a" + }, + { + "start": 2486.88, + "text": "stage um so yeah basically I think the" + }, + { + "start": 2490.4, + "text": "most important thing I want to get" + }, + { + "start": 2491.4, + "text": "across is that this is completely" + }, + { + "start": 2492.599, + "text": "separate stage it usually has its own" + }, + { + "start": 2494.4, + "text": "entire uh training set you may want to" + }, 
+ { + "start": 2496.839, + "text": "have those training sets be different" + }, + { + "start": 2498.359, + "text": "between the tokenizer and the logge" + }, + { + "start": 2499.599, + "text": "language model so for example when" + }, + { + "start": 2501.28, + "text": "you're training the tokenizer as I" + }, + { + "start": 2503.319, + "text": "mentioned we don't just care about the" + }, + { + "start": 2505.079, + "text": "performance of English text we care" + }, + { + "start": 2506.76, + "text": "about uh multi many different languages" + }, + { + "start": 2509.44, + "text": "and we also care about code or not code" + }, + { + "start": 2511.52, + "text": "so you may want to look into different" + }, + { + "start": 2513.24, + "text": "kinds of mixtures of different kinds of" + }, + { + "start": 2515.2, + "text": "languages and different amounts of code" + }, + { + "start": 2517.359, + "text": "and things like that because the amount" + }, + { + "start": 2520.24, + "text": "of different language that you have in" + }, + { + "start": 2521.96, + "text": "your tokenizer training set will" + }, + { + "start": 2523.76, + "text": "determine how many merges of it there" + }, + { + "start": 2526.119, + "text": "will be and therefore that determines" + }, + { + "start": 2528.24, + "text": "the density with which uh this type of" + }, + { + "start": 2531.319, + "text": "data is um sort of has in the token" + }, + { + "start": 2535.2, + "text": "space and so roughly speaking" + }, + { + "start": 2537.76, + "text": "intuitively if you add some amount of" + }, + { + "start": 2539.72, + "text": "data like say you have a ton of Japanese" + }, + { + "start": 2541.359, + "text": "data in your uh tokenizer training set" + }, + { + "start": 2544.04, + "text": "then that means that more Japanese" + }, + { + "start": 2545.359, + "text": "tokens will get merged" + }, + { + "start": 2546.839, + "text": "and therefore Japanese will have shorter" + }, + { + "start": 2548.92, + "text": 
"sequences uh and that's going to be" + }, + { + "start": 2550.64, + "text": "beneficial for the large language model" + }, + { + "start": 2552.4, + "text": "which has a finite context length on" + }, + { + "start": 2554.359, + "text": "which it can work on in in the token" + }, + { + "start": 2556.599, + "text": "space uh so hopefully that makes sense" + }, + { + "start": 2559.24, + "text": "so we're now going to turn to encoding" + }, + { + "start": 2561.2, + "text": "and decoding now that we have trained a" + }, + { + "start": 2563.079, + "text": "tokenizer so we have our merges and now" + }, + { + "start": 2566.4, + "text": "how do we do encoding and decoding okay" + }, + { + "start": 2568.44, + "text": "so let's begin with decoding which is" + }, + { + "start": 2570.44, + "text": "this Arrow over here so given a token" + }, + { + "start": 2572.72, + "text": "sequence let's go through the tokenizer" + }, + { + "start": 2574.92, + "text": "to get back a python string object so" + }, + { + "start": 2577.52, + "text": "the raw text so this is the function" + }, + { + "start": 2579.88, + "text": "that we' like to implement um we're" + }, + { + "start": 2581.88, + "text": "given the list of integers and we want" + }, + { + "start": 2583.44, + "text": "to return a python string if you'd like" + }, + { + "start": 2585.68, + "text": "uh try to implement this function" + }, + { + "start": 2586.839, + "text": "yourself it's a fun exercise otherwise" + }, + { + "start": 2588.839, + "text": "I'm going to start uh pasting in my own" + }, + { + "start": 2591.28, + "text": "solution so there are many different" + }, + { + "start": 2593.52, + "text": "ways to do it um here's one way I will" + }, + { + "start": 2596.88, + "text": "create an uh kind of pre-processing" + }, + { + "start": 2598.88, + "text": "variable that I will call" + }, + { + "start": 2601.04, + "text": "vocab and vocab is a mapping or a" + }, + { + "start": 2604.68, + "text": "dictionary in Python for from 
the token" + }, + { + "start": 2607.559, + "text": "uh ID to the bytes object for that token" + }, + { + "start": 2611.52, + "text": "so we begin with the raw bytes for" + }, + { + "start": 2613.8, + "text": "tokens from 0 to 255 and then we go in" + }, + { + "start": 2616.839, + "text": "order of all the merges and we sort of" + }, + { + "start": 2619.76, + "text": "uh populate this vocab list by doing an" + }, + { + "start": 2622.28, + "text": "addition here so this is the basically" + }, + { + "start": 2625.72, + "text": "the bytes representation of the first" + }, + { + "start": 2627.76, + "text": "child followed by the second one and" + }, + { + "start": 2630.04, + "text": "remember these are bytes objects so this" + }, + { + "start": 2632.079, + "text": "addition here is an addition of two" + }, + { + "start": 2634.2, + "text": "bytes objects just concatenation" + }, + { + "start": 2637.04, + "text": "so that's what we get" + }, + { + "start": 2638.76, + "text": "here one tricky thing to be careful with" + }, + { + "start": 2641.2, + "text": "by the way is that I'm iterating a" + }, + { + "start": 2642.88, + "text": "dictionary in Python using a DOT items" + }, + { + "start": 2646.0, + "text": "and uh it really matters that this runs" + }, + { + "start": 2648.72, + "text": "in the order in which we inserted items" + }, + { + "start": 2651.48, + "text": "into the merous dictionary luckily" + }, + { + "start": 2653.559, + "text": "starting with python 3.7 this is" + }, + { + "start": 2655.4, + "text": "guaranteed to be the case but before" + }, + { + "start": 2657.04, + "text": "python 3.7 this iteration may have been" + }, + { + "start": 2659.16, + "text": "out of order with respect to how we" + }, + { + "start": 2660.96, + "text": "inserted elements into merges and this" + }, + { + "start": 2663.16, + "text": "may not have worked but we are using an" + }, + { + "start": 2665.8, + "text": "um modern python so we're okay and then" + }, + { + "start": 2668.8, + 
"text": "here uh given the IDS the first thing" + }, + { + "start": 2671.599, + "text": "we're going to do is get the" + }, + { + "start": 2675.04, + "text": "tokens so the way I implemented this" + }, + { + "start": 2677.24, + "text": "here is I'm taking I'm iterating over" + }, + { + "start": 2679.599, + "text": "all the IDS I'm using vocap to look up" + }, + { + "start": 2681.88, + "text": "their bytes and then here this is one" + }, + { + "start": 2684.119, + "text": "way in Python to concatenate all these" + }, + { + "start": 2686.64, + "text": "bytes together to create our tokens and" + }, + { + "start": 2689.72, + "text": "then these tokens here at this point are" + }, + { + "start": 2691.72, + "text": "raw bytes so I have to decode using UTF" + }, + { + "start": 2696.0, + "text": "F now back into python strings so" + }, + { + "start": 2699.2, + "text": "previously we called that encode on a" + }, + { + "start": 2701.16, + "text": "string object to get the bytes and now" + }, + { + "start": 2703.2, + "text": "we're doing it Opposite we're taking the" + }, + { + "start": 2705.2, + "text": "bytes and calling a decode on the bytes" + }, + { + "start": 2707.8, + "text": "object to get a string in Python and" + }, + { + "start": 2711.0, + "text": "then we can return" + }, + { + "start": 2713.319, + "text": "text so um this is how we can do it now" + }, + { + "start": 2716.96, + "text": "this actually has a um issue um in the" + }, + { + "start": 2720.8, + "text": "way I implemented it and this could" + }, + { + "start": 2722.119, + "text": "actually throw an error so try to think" + }, + { + "start": 2724.119, + "text": "figure out why this code could actually" + }, + { + "start": 2726.48, + "text": "result in an error if we plug in um uh" + }, + { + "start": 2730.24, + "text": "some sequence of IDs that is" + }, + { + "start": 2732.599, + "text": "unlucky so let me demonstrate the issue" + }, + { + "start": 2735.24, + "text": "when I try to decode just something 
like" + }, + { + "start": 2737.16, + "text": "97 I am going to get letter A here back" + }, + { + "start": 2741.079, + "text": "so nothing too crazy happening but when" + }, + { + "start": 2744.4, + "text": "I try to decode 128 as a single element" + }, + { + "start": 2748.24, + "text": "the token 128 is what in string or in" + }, + { + "start": 2751.319, + "text": "Python object UnicodeDecodeError utf8 can't" + }, + { + "start": 2755.119, + "text": "decode byte um 0x80 which is this in HEX in" + }, + { + "start": 2760.119, + "text": "position zero invalid start byte what" + }, + { + "start": 2761.92, + "text": "does that mean well to understand what" + }, + { + "start": 2763.64, + "text": "this means we have to go back to our" + }, + { + "start": 2764.76, + "text": "utf8 page uh that I briefly showed" + }, + { + "start": 2767.92, + "text": "earlier and this is Wikipedia utf8 and" + }, + { + "start": 2770.76, + "text": "basically there's a specific schema that" + }, + { + "start": 2773.559, + "text": "utf8 bytes take so in particular if you" + }, + { + "start": 2776.92, + "text": "have a multi-byte object for some of the" + }, + { + "start": 2779.839, + "text": "Unicode characters they have to have" + }, + { + "start": 2781.52, + "text": "this special sort of envelope in how the" + }, + { + "start": 2784.16, + "text": "encoding works and so what's happening" + }, + { + "start": 2786.52, + "text": "here is that invalid start byte that's" + }, + { + "start": 2790.0, + "text": "because" + }, + { + "start": 2791.0, + "text": "128 the binary representation of it is" + }, + { + "start": 2793.88, + "text": "one followed by all zeros so we have one" + }, + { + "start": 2797.359, + "text": "and then all zero and we see here that" + }, + { + "start": 2799.559, + "text": "that doesn't conform to the format" + }, + { + "start": 2801.04, + "text": "because one followed by all zero just" + }, + { + "start": 2802.68, + "text": "doesn't fit any of these rules so to" + }, + { + "start":
2804.96, + "text": "speak so it's an invalid start bite" + }, + { + "start": 2807.64, + "text": "which is byte one this one must have a" + }, + { + "start": 2810.599, + "text": "one following it and then a zero" + }, + { + "start": 2812.76, + "text": "following it and then the content of" + }, + { + "start": 2814.48, + "text": "your uni codee in x here so basically we" + }, + { + "start": 2817.68, + "text": "don't um exactly follow the utf8" + }, + { + "start": 2819.96, + "text": "standard and this cannot be decoded and" + }, + { + "start": 2822.52, + "text": "so the way to fix this um is to" + }, + { + "start": 2826.28, + "text": "use this errors equals in bytes. decode" + }, + { + "start": 2831.04, + "text": "function of python and by default errors" + }, + { + "start": 2833.839, + "text": "is strict so we will throw an error if" + }, + { + "start": 2837.16, + "text": "um it's not valid utf8 bytes encoding" + }, + { + "start": 2840.28, + "text": "but there are many different things that" + }, + { + "start": 2841.68, + "text": "you could put here on error handling" + }, + { + "start": 2843.68, + "text": "this is the full list of all the errors" + }, + { + "start": 2845.359, + "text": "that you can use and in particular" + }, + { + "start": 2847.359, + "text": "instead of strict let's change it to" + }, + { + "start": 2849.359, + "text": "replace and that will replace uh with" + }, + { + "start": 2852.28, + "text": "this special marker this replacement" + }, + { + "start": 2855.8, + "text": "character so errors equals replace and" + }, + { + "start": 2860.52, + "text": "now we just get that character" + }, + { + "start": 2863.16, + "text": "back so basically not every single by" + }, + { + "start": 2866.96, + "text": "sequence is valid" + }, + { + "start": 2868.52, + "text": "utf8 and if it happens that your large" + }, + { + "start": 2871.48, + "text": "language model for example predicts your" + }, + { + "start": 2873.88, + "text": "tokens in a bad manner then 
they might" + }, + { + "start": 2876.64, + "text": "not fall into valid utf8 and then we" + }, + { + "start": 2880.24, + "text": "won't be able to decode them so the" + }, + { + "start": 2882.88, + "text": "standard practice is to basically uh use" + }, + { + "start": 2885.64, + "text": "errors equals replace and this is what" + }, + { + "start": 2887.52, + "text": "you will also find in the openai um code" + }, + { + "start": 2890.319, + "text": "that they released as well but basically" + }, + { + "start": 2892.72, + "text": "whenever you see um this kind of a" + }, + { + "start": 2894.2, + "text": "character in your output in that case uh" + }, + { + "start": 2896.0, + "text": "something went wrong and the LM output" + }, + { + "start": 2898.16, + "text": "not was not valid uh sort of sequence of" + }, + { + "start": 2901.52, + "text": "tokens okay and now we're going to go" + }, + { + "start": 2903.48, + "text": "the other way so we are going to" + }, + { + "start": 2905.319, + "text": "implement" + }, + { + "start": 2906.24, + "text": "this Arrow right here where we are going" + }, + { + "start": 2907.96, + "text": "to be given a string and we want to" + }, + { + "start": 2909.64, + "text": "encode it into" + }, + { + "start": 2911.16, + "text": "tokens so this is the signature of the" + }, + { + "start": 2913.72, + "text": "function that we're interested in and um" + }, + { + "start": 2916.92, + "text": "this should basically print a list of" + }, + { + "start": 2918.16, + "text": "integers of the tokens so again uh try" + }, + { + "start": 2921.76, + "text": "to maybe implement this yourself if" + }, + { + "start": 2923.04, + "text": "you'd like a fun exercise uh and pause" + }, + { + "start": 2925.559, + "text": "here otherwise I'm going to start" + }, + { + "start": 2926.52, + "text": "putting in my" + }, + { + "start": 2927.96, + "text": "solution so again there are many ways to" + }, + { + "start": 2930.28, + "text": "do this so um this is one of the 
ways" + }, + { + "start": 2933.64, + "text": "that sort of I came came up with so the" + }, + { + "start": 2937.599, + "text": "first thing we're going to do is we are" + }, + { + "start": 2939.16, + "text": "going" + }, + { + "start": 2940.119, + "text": "to uh take our text encode it into utf8" + }, + { + "start": 2943.44, + "text": "to get the raw bytes and then as before" + }, + { + "start": 2945.799, + "text": "we're going to call list on the bytes" + }, + { + "start": 2947.28, + "text": "object to get a list of integers of" + }, + { + "start": 2950.079, + "text": "those bytes so those are the starting" + }, + { + "start": 2952.76, + "text": "tokens those are the raw bytes of our" + }, + { + "start": 2954.599, + "text": "sequence but now of course according to" + }, + { + "start": 2956.96, + "text": "the merges dictionary above and recall" + }, + { + "start": 2959.559, + "text": "this was the" + }, + { + "start": 2961.079, + "text": "merges some of the bytes may be merged" + }, + { + "start": 2963.96, + "text": "according to this lookup in addition to" + }, + { + "start": 2966.559, + "text": "that remember that the merges was built" + }, + { + "start": 2968.16, + "text": "from top to bottom and this is sort of" + }, + { + "start": 2969.92, + "text": "the order in which we inserted stuff" + }, + { + "start": 2971.359, + "text": "into merges and so we prefer to do all" + }, + { + "start": 2974.28, + "text": "these merges in the beginning before we" + }, + { + "start": 2976.119, + "text": "do these merges later because um for" + }, + { + "start": 2979.2, + "text": "example this merge over here relies on" + }, + { + "start": 2980.96, + "text": "the 256 which got merged here so we have" + }, + { + "start": 2984.64, + "text": "to go in the order from top to bottom" + }, + { + "start": 2986.92, + "text": "sort of if we are going to be merging" + }, + { + "start": 2988.92, + "text": "anything now we expect to be doing a few" + }, + { + "start": 2991.44, + "text": 
"merges so we're going to be doing W" + }, + { + "start": 2994.52, + "text": "true um and now we want to find a pair" + }, + { + "start": 2998.079, + "text": "of byes that is consecutive that we are" + }, + { + "start": 3000.72, + "text": "allowed to merge according to this in" + }, + { + "start": 3003.599, + "text": "order to reuse some of the functionality" + }, + { + "start": 3005.0, + "text": "that we've already written I'm going to" + }, + { + "start": 3006.559, + "text": "reuse the function uh get" + }, + { + "start": 3009.079, + "text": "stats so recall that get stats uh will" + }, + { + "start": 3012.079, + "text": "give us the we'll basically count up how" + }, + { + "start": 3014.24, + "text": "many times every single pair occurs in" + }, + { + "start": 3016.599, + "text": "our sequence of tokens and return that" + }, + { + "start": 3018.92, + "text": "as a dictionary and the dictionary was a" + }, + { + "start": 3022.079, + "text": "mapping from all the different uh by" + }, + { + "start": 3025.599, + "text": "pairs to the number of times that they" + }, + { + "start": 3027.4, + "text": "occur right um at this point we don't" + }, + { + "start": 3030.28, + "text": "actually care how many times they occur" + }, + { + "start": 3032.359, + "text": "in the sequence we only care what the" + }, + { + "start": 3034.359, + "text": "raw pairs are in that sequence and so" + }, + { + "start": 3036.839, + "text": "I'm only going to be using basically the" + }, + { + "start": 3038.28, + "text": "keys of the dictionary I only care about" + }, + { + "start": 3040.44, + "text": "the set of possible merge candidates if" + }, + { + "start": 3042.92, + "text": "that makes" + }, + { + "start": 3043.76, + "text": "sense now we want to identify the pair" + }, + { + "start": 3046.16, + "text": "that we're going to be merging at this" + }, + { + "start": 3047.72, + "text": "stage of the loop so what do we want we" + }, + { + "start": 3050.24, + "text": "want to find the pair or 
like the a key" + }, + { + "start": 3053.24, + "text": "inside stats that has the lowest index" + }, + { + "start": 3057.079, + "text": "in the merges uh dictionary because we" + }, + { + "start": 3059.64, + "text": "want to do all the early merges before" + }, + { + "start": 3061.28, + "text": "we work our way to the late" + }, + { + "start": 3063.079, + "text": "merges so again there are many different" + }, + { + "start": 3065.319, + "text": "ways to implement this but I'm going to" + }, + { + "start": 3067.72, + "text": "do something a little bit fancy" + }, + { + "start": 3071.28, + "text": "here so I'm going to be using the Min" + }, + { + "start": 3074.2, + "text": "over an iterator in Python when you call" + }, + { + "start": 3076.799, + "text": "Min on an iterator and stats here as a" + }, + { + "start": 3078.96, + "text": "dictionary we're going to be iterating" + }, + { + "start": 3080.839, + "text": "the keys of this dictionary in Python so" + }, + { + "start": 3084.119, + "text": "we're looking at all the pairs inside" + }, + { + "start": 3087.079, + "text": "stats um which are all the consecutive" + }, + { + "start": 3089.359, + "text": "Pairs and we're going to be taking the" + }, + { + "start": 3092.079, + "text": "consecutive pair inside tokens that has" + }, + { + "start": 3094.44, + "text": "the minimum what the Min takes a key" + }, + { + "start": 3098.88, + "text": "which gives us the function that is" + }, + { + "start": 3100.319, + "text": "going to return a value over which we're" + }, + { + "start": 3102.359, + "text": "going to do the Min and the one we care" + }, + { + "start": 3104.96, + "text": "about is we're we care about taking" + }, + { + "start": 3106.44, + "text": "merges and basically getting um that" + }, + { + "start": 3110.92, + "text": "pairs" + }, + { + "start": 3112.839, + "text": "index so basically for any pair inside" + }, + { + "start": 3117.16, + "text": "stats we are going to be looking into" + }, + { + "start": 
3119.72, + "text": "merges at what index it has and we want" + }, + { + "start": 3123.079, + "text": "to get the pair with the Min number so" + }, + { + "start": 3125.839, + "text": "as an example if there's a pair 101 and" + }, + { + "start": 3127.559, + "text": "32 we definitely want to get that pair" + }, + { + "start": 3130.44, + "text": "uh we want to identify it here and" + }, + { + "start": 3131.92, + "text": "return it and pair would become 10132 if" + }, + { + "start": 3135.04, + "text": "it" + }, + { + "start": 3135.76, + "text": "occurs and the reason that I'm putting a" + }, + { + "start": 3137.96, + "text": "float INF here as a fall back is that in" + }, + { + "start": 3141.4, + "text": "the get function when we call uh when we" + }, + { + "start": 3144.2, + "text": "basically consider a pair that doesn't" + }, + { + "start": 3146.599, + "text": "occur in the merges then that pair is" + }, + { + "start": 3149.0, + "text": "not eligible to be merged right so if in" + }, + { + "start": 3151.88, + "text": "the token sequence there's some pair" + }, + { + "start": 3153.48, + "text": "that is not a merging pair it cannot be" + }, + { + "start": 3155.559, + "text": "merged then uh it doesn't actually occur" + }, + { + "start": 3158.119, + "text": "here and it doesn't have an index and uh" + }, + { + "start": 3160.839, + "text": "it cannot be merged which we will denote" + }, + { + "start": 3162.599, + "text": "as float INF and the reason Infinity is" + }, + { + "start": 3165.079, + "text": "nice here is because for sure we're" + }, + { + "start": 3166.599, + "text": "guaranteed that it's not going to" + }, + { + "start": 3168.079, + "text": "participate in the list of candidates" + }, + { + "start": 3170.04, + "text": "when we do the men so uh so this is one" + }, + { + "start": 3173.44, + "text": "way to do it so B basically long story" + }, + { + "start": 3175.88, + "text": "short this Returns the most eligible" + }, + { + "start": 3178.28, + "text": 
"merging candidate pair uh that occurs in" + }, + { + "start": 3181.119, + "text": "the tokens now one thing to be careful" + }, + { + "start": 3184.079, + "text": "with here is this uh function here might" + }, + { + "start": 3187.48, + "text": "fail in the following way if there's" + }, + { + "start": 3189.88, + "text": "nothing to merge then uh uh then there's" + }, + { + "start": 3193.599, + "text": "nothing in merges um that satisfi that" + }, + { + "start": 3196.92, + "text": "is satisfied anymore there's nothing to" + }, + { + "start": 3198.559, + "text": "merge everything just returns float imps" + }, + { + "start": 3201.72, + "text": "and then the pair I think will just" + }, + { + "start": 3203.68, + "text": "become the very first element of stats" + }, + { + "start": 3206.96, + "text": "um but this pair is not actually a" + }, + { + "start": 3208.359, + "text": "mergeable pair it just becomes the first" + }, + { + "start": 3211.16, + "text": "pair inside stats arbitrarily because" + }, + { + "start": 3213.28, + "text": "all of these pairs evaluate to float in" + }, + { + "start": 3216.319, + "text": "for the merging Criterion so basically" + }, + { + "start": 3218.559, + "text": "it could be that this this doesn't look" + }, + { + "start": 3220.359, + "text": "succeed because there's no more merging" + }, + { + "start": 3221.64, + "text": "pairs so if this pair is not in merges" + }, + { + "start": 3224.64, + "text": "that was returned then this is a signal" + }, + { + "start": 3226.839, + "text": "for us that actually there was nothing" + }, + { + "start": 3228.4, + "text": "to merge no single pair can be merged" + }, + { + "start": 3230.72, + "text": "anymore in that case we will break" + }, + { + "start": 3233.079, + "text": "out um nothing else can be" + }, + { + "start": 3237.88, + "text": "merged you may come up with a different" + }, + { + "start": 3239.839, + "text": "implementation by the way this is kind" + }, + { + "start": 3241.04, + "text": 
"of like really trying hard in" + }, + { + "start": 3243.88, + "text": "Python um but really we're just trying" + }, + { + "start": 3245.96, + "text": "to find a pair that can be merged with" + }, + { + "start": 3247.799, + "text": "the lowest index" + }, + { + "start": 3249.599, + "text": "here now if we did find a pair that is" + }, + { + "start": 3253.88, + "text": "inside merges with the lowest index then" + }, + { + "start": 3256.28, + "text": "we can merge it" + }, + { + "start": 3259.839, + "text": "so we're going to look into the merger" + }, + { + "start": 3262.04, + "text": "dictionary for that pair to look up the" + }, + { + "start": 3264.28, + "text": "index and we're going to now merge that" + }, + { + "start": 3267.28, + "text": "into that index so we're going to do" + }, + { + "start": 3269.24, + "text": "tokens equals and we're going to" + }, + { + "start": 3272.24, + "text": "replace the original tokens we're going" + }, + { + "start": 3274.64, + "text": "to be replacing the pair pair and we're" + }, + { + "start": 3276.76, + "text": "going to be replacing it with index idx" + }, + { + "start": 3278.96, + "text": "and this returns a new list of tokens" + }, + { + "start": 3281.64, + "text": "where every occurrence of pair is" + }, + { + "start": 3283.16, + "text": "replaced with idx so we're doing a merge" + }, + { + "start": 3286.28, + "text": "and we're going to be continuing this" + }, + { + "start": 3287.599, + "text": "until eventually nothing can be merged" + }, + { + "start": 3289.28, + "text": "we'll come out here and we'll break out" + }, + { + "start": 3291.28, + "text": "and here we just return" + }, + { + "start": 3293.319, + "text": "tokens and so that that's the" + }, + { + "start": 3295.839, + "text": "implementation I think so hopefully this" + }, + { + "start": 3297.44, + "text": "runs okay cool um yeah and this looks uh" + }, + { + "start": 3302.44, + "text": "reasonable so for example 32 is a space" + }, + { + "start": 3304.88, + 
"text": "in asky so that's here um so this looks" + }, + { + "start": 3309.2, + "text": "like it worked great okay so let's wrap" + }, + { + "start": 3311.48, + "text": "up this section of the video at least I" + }, + { + "start": 3313.48, + "text": "wanted to point out that this is not" + }, + { + "start": 3314.88, + "text": "quite the right implementation just yet" + }, + { + "start": 3316.359, + "text": "because we are leaving out a special" + }, + { + "start": 3317.96, + "text": "case so in particular if uh we try to do" + }, + { + "start": 3320.68, + "text": "this this would give us an error and the" + }, + { + "start": 3323.559, + "text": "issue is that um if we only have a" + }, + { + "start": 3325.64, + "text": "single character or an empty string then" + }, + { + "start": 3328.039, + "text": "stats is empty and that causes an issue" + }, + { + "start": 3329.839, + "text": "inside Min so one way to fight this is" + }, + { + "start": 3332.96, + "text": "if L of tokens is at least two because" + }, + { + "start": 3336.359, + "text": "if it's less than two it's just a single" + }, + { + "start": 3337.839, + "text": "token or no tokens then let's just uh" + }, + { + "start": 3340.079, + "text": "there's nothing to merge so we just" + }, + { + "start": 3341.52, + "text": "return so that would fix uh that" + }, + { + "start": 3344.64, + "text": "case Okay and then second I have a few" + }, + { + "start": 3348.079, + "text": "test cases here for us as well so first" + }, + { + "start": 3350.44, + "text": "let's make sure uh about or let's note" + }, + { + "start": 3353.359, + "text": "the following if we take a string and we" + }, + { + "start": 3356.44, + "text": "try to encode it and then decode it back" + }, + { + "start": 3358.64, + "text": "you'd expect to get the same string back" + }, + { + "start": 3360.24, + "text": "right is that true for all" + }, + { + "start": 3364.68, + "text": "strings so I think uh so here it is the" + }, + { + "start": 3367.16, + 
"text": "case and I think in general this is" + }, + { + "start": 3368.72, + "text": "probably the case um but notice that" + }, + { + "start": 3372.039, + "text": "going backwards is not is not you're not" + }, + { + "start": 3374.64, + "text": "going to have an identity going" + }, + { + "start": 3375.92, + "text": "backwards because as I mentioned us not" + }, + { + "start": 3379.2, + "text": "all token sequences are valid utf8 uh" + }, + { + "start": 3382.96, + "text": "sort of by streams and so so therefore" + }, + { + "start": 3385.44, + "text": "you're some of them can't even be" + }, + { + "start": 3387.2, + "text": "decodable um so this only goes in One" + }, + { + "start": 3390.48, + "text": "Direction but for that one direction we" + }, + { + "start": 3392.92, + "text": "can check uh here if we take the" + }, + { + "start": 3394.76, + "text": "training text which is the text that we" + }, + { + "start": 3396.319, + "text": "train to tokenizer around we can make" + }, + { + "start": 3398.0, + "text": "sure that when we encode and decode we" + }, + { + "start": 3399.44, + "text": "get the same thing back which is true" + }, + { + "start": 3401.96, + "text": "and here I took some validation data so" + }, + { + "start": 3403.839, + "text": "I went to I think this web page and I" + }, + { + "start": 3405.599, + "text": "grabbed some text so this is text that" + }, + { + "start": 3407.76, + "text": "the tokenizer has not seen and we can" + }, + { + "start": 3409.68, + "text": "make sure that this also works um okay" + }, + { + "start": 3412.72, + "text": "so that gives us some confidence that" + }, + { + "start": 3413.92, + "text": "this was correctly implemented" + }, + { + "start": 3416.0, + "text": "so those are the basics of the bite pair" + }, + { + "start": 3418.039, + "text": "encoding algorithm we saw how we can uh" + }, + { + "start": 3420.72, + "text": "take some training set train a tokenizer" + }, + { + "start": 3423.68, + "text": "the parameters 
of this tokenizer really" + }, + { + "start": 3425.44, + "text": "are just this dictionary of merges and" + }, + { + "start": 3428.119, + "text": "that basically creates the little binary" + }, + { + "start": 3429.599, + "text": "Forest on top of raw" + }, + { + "start": 3431.559, + "text": "bites once we have this the merges table" + }, + { + "start": 3434.68, + "text": "we can both encode and decode between" + }, + { + "start": 3436.799, + "text": "raw text and token sequences so that's" + }, + { + "start": 3439.4, + "text": "the the simplest setting of The" + }, + { + "start": 3441.28, + "text": "tokenizer what we're going to do now" + }, + { + "start": 3443.2, + "text": "though is we're going to look at some of" + }, + { + "start": 3444.48, + "text": "the St the art lar language models and" + }, + { + "start": 3446.559, + "text": "the kinds of tokenizers that they use" + }, + { + "start": 3448.359, + "text": "and we're going to see that this picture" + }, + { + "start": 3449.559, + "text": "complexifies very quickly so we're going" + }, + { + "start": 3451.64, + "text": "to go through the details of this comp" + }, + { + "start": 3454.599, + "text": "complexification one at a time so let's" + }, + { + "start": 3457.52, + "text": "kick things off by looking at the GPD" + }, + { + "start": 3459.039, + "text": "Series so in particular I have the gpt2" + }, + { + "start": 3461.64, + "text": "paper here um and this paper is from" + }, + { + "start": 3464.64, + "text": "2019 or so so 5 years ago and let's" + }, + { + "start": 3468.359, + "text": "scroll down to input representation this" + }, + { + "start": 3471.28, + "text": "is where they talk about the tokenizer" + }, + { + "start": 3472.68, + "text": "that they're using for gpd2 now this is" + }, + { + "start": 3475.64, + "text": "all fairly readable so I encourage you" + }, + { + "start": 3477.039, + "text": "to pause and um read this yourself but" + }, + { + "start": 3480.039, + "text": "this is where they 
motivate the use of" + }, + { + "start": 3482.0, + "text": "the byte pair encoding algorithm on the" + }, + { + "start": 3484.68, + "text": "byte level representation of utf8" + }, + { + "start": 3487.52, + "text": "encoding so this is where they motivate" + }, + { + "start": 3489.52, + "text": "it and they talk about the vocabulary" + }, + { + "start": 3491.079, + "text": "sizes and everything now everything here" + }, + { + "start": 3493.839, + "text": "is exactly as we've covered it so far" + }, + { + "start": 3495.92, + "text": "but things start to depart around here" + }, + { + "start": 3498.559, + "text": "so what they mention is that they don't" + }, + { + "start": 3500.44, + "text": "just apply the naive algorithm as we" + }, + { + "start": 3502.28, + "text": "have done it and in particular here's a" + }, + { + "start": 3505.16, + "text": "example suppose that you have common" + }, + { + "start": 3507.0, + "text": "words like dog what will happen is that" + }, + { + "start": 3509.48, + "text": "dog of course occurs very frequently in" + }, + { + "start": 3511.64, + "text": "the text and it occurs right next to all" + }, + { + "start": 3514.28, + "text": "kinds of punctuation as an example so" + }, + { + "start": 3516.4, + "text": "dog dot dog exclamation mark dog" + }, + { + "start": 3519.16, + "text": "question mark Etc and naively you might" + }, + { + "start": 3522.24, + "text": "imagine that the BPE algorithm could" + }, + { + "start": 3523.64, + "text": "merge these to be single tokens and then" + }, + { + "start": 3525.76, + "text": "you end up with lots of tokens that are" + }, + { + "start": 3527.44, + "text": "just like dog with a slightly different" + }, + { + "start": 3529.0, + "text": "punctuation and so it feels like you're" + }, + { + "start": 3530.88, + "text": "clustering things that shouldn't be" + }, + { + "start": 3532.039, + "text": "clustered you're combining kind of" + }, + { + "start": 3533.64, + "text": "semantics with" + }, + { + 
"start": 3535.559, + "text": "punctuation and this uh feels suboptimal and" + }, + { + "start": 3538.92, + "text": "indeed they also say that this is" + }, + { + "start": 3540.96, + "text": "suboptimal according to some of the" + }, + { + "start": 3542.359, + "text": "experiments so what they want to do is" + }, + { + "start": 3544.2, + "text": "they want to top down in a manual way" + }, + { + "start": 3546.319, + "text": "enforce that some types of um characters" + }, + { + "start": 3549.599, + "text": "should never be merged together um so" + }, + { + "start": 3552.76, + "text": "they want to enforce these merging rules" + }, + { + "start": 3554.799, + "text": "on top of the byte pair encoding algorithm" + }, + { + "start": 3557.68, + "text": "so let's take a look um at their code" + }, + { + "start": 3559.88, + "text": "and see how they actually enforce this" + }, + { + "start": 3561.48, + "text": "and what kinds of merges they actually do" + }, + { + "start": 3563.2, + "text": "perform so I have the tab open here" + }, + { + "start": 3565.839, + "text": "for gpt2 under open AI on GitHub and" + }, + { + "start": 3569.64, + "text": "when we go to" + }, + { + "start": 3570.68, + "text": "Source there is an encoder.py now I" + }, + { + "start": 3574.28, + "text": "don't personally love that they call it" + }, + { + "start": 3575.599, + "text": "encoder.py because this is the" + }, + { + "start": 3577.079, + "text": "tokenizer and the tokenizer can do both" + }, + { + "start": 3579.359, + "text": "encode and decode uh so it feels kind of" + }, + { + "start": 3581.88, + "text": "awkward to me that it's called encoder" + }, + { + "start": 3583.2, + "text": "but that is the tokenizer and there's a" + }, + { + "start": 3585.92, + "text": "lot going on here and we're going to" + }, + { + "start": 3587.0, + "text": "step through it in detail at one point" + }, + { + "start": 3589.24, + "text": "for now I just want to focus on this" + }, + { + "start": 3591.599, + "text": 
"part here the create a rigix pattern" + }, + { + "start": 3594.359, + "text": "here that looks very complicated and" + }, + { + "start": 3596.24, + "text": "we're going to go through it in a bit uh" + }, + { + "start": 3598.68, + "text": "but this is the core part that allows" + }, + { + "start": 3600.28, + "text": "them to enforce rules uh for what parts" + }, + { + "start": 3604.0, + "text": "of the text Will Never Be merged for" + }, + { + "start": 3605.96, + "text": "sure now notice that re. compile here is" + }, + { + "start": 3608.64, + "text": "a little bit misleading because we're" + }, + { + "start": 3610.76, + "text": "not just doing import re which is the" + }, + { + "start": 3612.44, + "text": "python re module we're doing import reex" + }, + { + "start": 3614.64, + "text": "as re and reex is a python package that" + }, + { + "start": 3617.72, + "text": "you can install P install r x and it's" + }, + { + "start": 3620.4, + "text": "basically an extension of re so it's a" + }, + { + "start": 3622.079, + "text": "bit more powerful" + }, + { + "start": 3623.24, + "text": "re um" + }, + { + "start": 3626.0, + "text": "so let's take a look at this pattern and" + }, + { + "start": 3628.88, + "text": "what it's doing and why this is actually" + }, + { + "start": 3630.799, + "text": "doing the separation that they are" + }, + { + "start": 3632.64, + "text": "looking for okay so I've copy pasted the" + }, + { + "start": 3634.92, + "text": "pattern here to our jupit notebook where" + }, + { + "start": 3637.119, + "text": "we left off and let's take this pattern" + }, + { + "start": 3639.24, + "text": "for a spin so in the exact same way that" + }, + { + "start": 3642.119, + "text": "their code does we're going to call an" + }, + { + "start": 3644.079, + "text": "re. 
findall for this pattern on any" + }, + { + "start": 3647.28, + "text": "arbitrary string that we are interested" + }, + { + "start": 3649.359, + "text": "so this is the string that we want to" + }, + { + "start": 3650.599, + "text": "encode into tokens um to feed into an llm" + }, + { + "start": 3655.24, + "text": "like gpt2 so what exactly is this doing" + }, + { + "start": 3659.039, + "text": "well re. findall will take this pattern" + }, + { + "start": 3661.039, + "text": "and try to match it against a" + }, + { + "start": 3662.839, + "text": "string um the way this works is that you" + }, + { + "start": 3666.119, + "text": "are going from left to right in the" + }, + { + "start": 3667.96, + "text": "string and you're trying to match the" + }, + { + "start": 3670.28, + "text": "pattern and re. findall will get all" + }, + { + "start": 3673.799, + "text": "the occurrences and organize them into a" + }, + { + "start": 3676.319, + "text": "list now when you look at the um when" + }, + { + "start": 3679.16, + "text": "you look at this pattern first of all" + }, + { + "start": 3680.88, + "text": "notice that this is a raw string um and" + }, + { + "start": 3683.96, + "text": "then these are three double quotes just" + }, + { + "start": 3686.319, + "text": "to start the string so really the string" + }, + { + "start": 3688.839, + "text": "itself this is the pattern itself" + }, + { + "start": 3691.319, + "text": "right and notice that it's made up of a" + }, + { + "start": 3694.079, + "text": "lot of ORs so see these vertical bars" + }, + { + "start": 3696.48, + "text": "those are ORs in regex and so you go" + }, + { + "start": 3700.2, + "text": "from left to right in this pattern and" + }, + { + "start": 3701.48, + "text": "try to match it against the string" + }, + { + "start": 3703.16, + "text": "wherever you are so we have hello and" + }, + { + "start": 3706.44, + "text": "we're going to try to match it well it's" + }, + { + "start": 3708.24, + "text": "not 
apostrophe s it's not apostrophe t" + }, + { + "start": 3710.799, + "text": "or any of these but it is an optional" + }, + { + "start": 3713.96, + "text": "space followed by- P of uh sorry slash p of" + }, + { + "start": 3718.119, + "text": "L one or more times what is slash p of L it" + }, + { + "start": 3722.319, + "text": "is according to some documentation that I" + }, + { + "start": 3724.72, + "text": "found um there might be other sources as" + }, + { + "start": 3728.0, + "text": "well uh slash p is a letter any kind of" + }, + { + "start": 3731.599, + "text": "letter from any language and hello is" + }, + { + "start": 3735.039, + "text": "made up of letters h e l Etc so optional" + }, + { + "start": 3739.52, + "text": "space followed by a bunch of letters one" + }, + { + "start": 3741.559, + "text": "or more letters is going to match hello" + }, + { + "start": 3744.72, + "text": "but then the match ends because a white" + }, + { + "start": 3747.079, + "text": "space is not a letter so from there on" + }, + { + "start": 3751.079, + "text": "begins a new sort of attempt to match" + }, + { + "start": 3753.64, + "text": "against the string again and starting in" + }, + { + "start": 3756.44, + "text": "here we're going to skip over all of" + }, + { + "start": 3758.079, + "text": "these again until we get to the exact" + }, + { + "start": 3760.16, + "text": "same Point again and we see that there's" + }, + { + "start": 3762.319, + "text": "an optional space this is the optional" + }, + { + "start": 3764.279, + "text": "space followed by a bunch of letters one" + }, + { + "start": 3766.24, + "text": "or more of them and so that matches so" + }, + { + "start": 3768.72, + "text": "when we run this we get a list of two" + }, + { + "start": 3772.0, + "text": "elements hello and then space world" + }, + { + "start": 3775.72, + "text": "so how are you if we add more letters we" + }, + { + "start": 3778.88, + "text": "would just get them like this now what" + }, + { + "start": 3781.599, 
+ "text": "is this doing and why is this important" + }, + { + "start": 3783.64, + "text": "we are taking our string and instead of" + }, + { + "start": 3785.92, + "text": "directly encoding it um for" + }, + { + "start": 3789.0, + "text": "tokenization we are first splitting it" + }, + { + "start": 3791.4, + "text": "up and when you actually step through" + }, + { + "start": 3793.48, + "text": "the code and we'll do that in a bit more" + }, + { + "start": 3795.319, + "text": "detail what really is doing on a high" + }, + { + "start": 3797.359, + "text": "level is that it first splits your text" + }, + { + "start": 3800.92, + "text": "into a list of texts just like this one" + }, + { + "start": 3804.64, + "text": "and all these elements of this list are" + }, + { + "start": 3806.559, + "text": "processed independently by the tokenizer" + }, + { + "start": 3809.279, + "text": "and all of the results of that" + }, + { + "start": 3810.76, + "text": "processing are simply" + }, + { + "start": 3812.279, + "text": "concatenated so hello world oh I I" + }, + { + "start": 3815.92, + "text": "missed how hello world how are you we" + }, + { + "start": 3819.64, + "text": "have five elements of list all of these" + }, + { + "start": 3821.599, + "text": "will independent" + }, + { + "start": 3824.4, + "text": "independently go from text to a token" + }, + { + "start": 3827.0, + "text": "sequence and then that token sequence is" + }, + { + "start": 3829.2, + "text": "going to be concatenated it's all going" + }, + { + "start": 3830.799, + "text": "to be joined up and roughly speaking" + }, + { + "start": 3834.359, + "text": "what that does is you're only ever" + }, + { + "start": 3836.119, + "text": "finding merges between the elements of" + }, + { + "start": 3838.44, + "text": "this list so you can only ever consider" + }, + { + "start": 3840.359, + "text": "merges within every one of these" + }, + { + "start": 3841.72, + "text": "elements in" + }, + { + "start": 3843.24, + 
"text": "individually and um after you've done" + }, + { + "start": 3846.319, + "text": "all the possible merging for all of" + }, + { + "start": 3847.92, + "text": "these elements individually the results" + }, + { + "start": 3849.88, + "text": "of all that will be joined um by" + }, + { + "start": 3853.64, + "text": "concatenation and so you are basically" + }, + { + "start": 3856.24, + "text": "what what you're doing effectively is" + }, + { + "start": 3858.4, + "text": "you are never going to be merging this e" + }, + { + "start": 3861.0, + "text": "with this space because they are now" + }, + { + "start": 3863.2, + "text": "parts of the separate elements of this" + }, + { + "start": 3865.079, + "text": "list and so you are saying we are never" + }, + { + "start": 3867.72, + "text": "going to merge" + }, + { + "start": 3868.92, + "text": "eace um because we're breaking it up in" + }, + { + "start": 3872.039, + "text": "this way so basically using this regx" + }, + { + "start": 3875.72, + "text": "pattern to Chunk Up the text is just one" + }, + { + "start": 3877.96, + "text": "way of enforcing that some merges are" + }, + { + "start": 3881.72, + "text": "not to happen and we're going to go into" + }, + { + "start": 3883.76, + "text": "more of this text and we'll see that" + }, + { + "start": 3885.2, + "text": "what this is trying to do on a high" + }, + { + "start": 3886.24, + "text": "level is we're trying to not merge" + }, + { + "start": 3888.0, + "text": "across letters across numbers across" + }, + { + "start": 3890.64, + "text": "punctuation and so on so let's see in" + }, + { + "start": 3893.2, + "text": "more detail how that works so let's" + }, + { + "start": 3894.72, + "text": "continue now we have/ P ofn if you go to" + }, + { + "start": 3898.0, + "text": "the documentation SLP of n is any kind" + }, + { + "start": 3901.839, + "text": "of numeric character in any script so" + }, + { + "start": 3904.44, + "text": "it's numbers so we have an optional" 
+ }, + { + "start": 3906.599, + "text": "space followed by numbers and those" + }, + { + "start": 3908.119, + "text": "would be separated out so letters and" + }, + { + "start": 3910.359, + "text": "numbers are being separated so if I do" + }, + { + "start": 3912.559, + "text": "Hello World 123 how are you then world" + }, + { + "start": 3915.839, + "text": "will stop matching here because one is" + }, + { + "start": 3917.96, + "text": "not a letter anymore but one is a number" + }, + { + "start": 3920.64, + "text": "so this group will match for that and" + }, + { + "start": 3922.52, + "text": "we'll get it as a separate entity" + }, + { + "start": 3926.559, + "text": "uh let's see how these apostrophes work" + }, + { + "start": 3928.359, + "text": "so here if we have" + }, + { + "start": 3931.0, + "text": "um uh Slash V or I mean apostrophe V as" + }, + { + "start": 3935.079, + "text": "an example then apostrophe here is not a" + }, + { + "start": 3938.359, + "text": "letter or a" + }, + { + "start": 3939.52, + "text": "number so hello will stop matching and" + }, + { + "start": 3942.44, + "text": "then we will exactly match this with" + }, + { + "start": 3944.96, + "text": "that so that will come out as a separate" + }, + { + "start": 3948.2, + "text": "thing so why are they doing the" + }, + { + "start": 3950.24, + "text": "apostrophes here honestly I think that" + }, + { + "start": 3952.24, + "text": "these are just like very common" + }, + { + "start": 3953.599, + "text": "apostrophes p uh that are used um" + }, + { + "start": 3956.96, + "text": "typically I don't love that they've done" + }, + { + "start": 3959.359, + "text": "this" + }, + { + "start": 3960.599, + "text": "because uh let me show you what happens" + }, + { + "start": 3963.319, + "text": "when you have uh some Unicode" + }, + { + "start": 3965.44, + "text": "apostrophes like for example you can" + }, + { + "start": 3967.359, + "text": "have if you have house then this will be" + }, + { + 
"start": 3970.559, + "text": "separated out because of this matching" + }, + { + "start": 3973.039, + "text": "but if you use the Unicode apostrophe" + }, + { + "start": 3975.319, + "text": "like" + }, + { + "start": 3976.16, + "text": "this then suddenly this does not work" + }, + { + "start": 3979.839, + "text": "and so this apostrophe will actually" + }, + { + "start": 3981.559, + "text": "become its own thing now and so so um" + }, + { + "start": 3984.92, + "text": "it's basically hardcoded for this" + }, + { + "start": 3986.359, + "text": "specific kind of apostrophe and uh" + }, + { + "start": 3989.68, + "text": "otherwise they become completely" + }, + { + "start": 3991.319, + "text": "separate tokens in addition to this you" + }, + { + "start": 3994.039, + "text": "can go to the gpt2 docs and here when" + }, + { + "start": 3998.48, + "text": "they Define the pattern they say should" + }, + { + "start": 4000.2, + "text": "have added re. ignore case so BP merges" + }, + { + "start": 4003.0, + "text": "can happen for capitalized versions of" + }, + { + "start": 4004.559, + "text": "contractions so what they're pointing" + }, + { + "start": 4006.52, + "text": "out is that you see how this is" + }, + { + "start": 4007.72, + "text": "apostrophe and then lowercase letters" + }, + { + "start": 4010.839, + "text": "well because they didn't do re. 
ignore" + }, + { + "start": 4012.92, + "text": "case then then um these rules will not" + }, + { + "start": 4016.44, + "text": "separate out the apostrophes if it's" + }, + { + "start": 4018.88, + "text": "uppercase so" + }, + { + "start": 4021.44, + "text": "house would be like this but if I did" + }, + { + "start": 4026.64, + "text": "house if I'm uppercase then notice" + }, + { + "start": 4030.24, + "text": "suddenly the apostrophe comes by" + }, + { + "start": 4032.279, + "text": "itself so the tokenization will work" + }, + { + "start": 4035.48, + "text": "differently in uppercase and lower case" + }, + { + "start": 4037.44, + "text": "inconsistently separating out these" + }, + { + "start": 4039.039, + "text": "apostrophes so it feels extremely gnarly" + }, + { + "start": 4041.119, + "text": "and slightly gross um but that's that's" + }, + { + "start": 4044.52, + "text": "how that works okay so let's come back" + }, + { + "start": 4047.24, + "text": "after trying to match a bunch of" + }, + { + "start": 4048.44, + "text": "apostrophe Expressions by the way the" + }, + { + "start": 4050.279, + "text": "other issue here is that these are quite" + }, + { + "start": 4052.079, + "text": "language specific probably so I don't" + }, + { + "start": 4054.559, + "text": "know that all the languages for example" + }, + { + "start": 4055.799, + "text": "use or don't use apostrophes but that" + }, + { + "start": 4057.48, + "text": "would be inconsistently tokenized as a" + }, + { + "start": 4059.96, + "text": "result then we try to match letters then" + }, + { + "start": 4062.52, + "text": "we try to match numbers and then if that" + }, + { + "start": 4064.88, + "text": "doesn't work we fall back to here and" + }, + { + "start": 4067.559, + "text": "what this is saying is again optional" + }, + { + "start": 4069.16, + "text": "space followed by something that is not" + }, + { + "start": 4070.839, + "text": "a letter number or a space in one or" + }, + { + "start": 
4073.96, + "text": "more of that so what this is doing" + }, + { + "start": 4075.799, + "text": "effectively is this is trying to match" + }, + { + "start": 4077.559, + "text": "punctuation roughly speaking not letters" + }, + { + "start": 4079.52, + "text": "and not numbers so this group will try" + }, + { + "start": 4082.279, + "text": "to trigger for that so if I do something" + }, + { + "start": 4084.2, + "text": "like this then these parts here are not" + }, + { + "start": 4088.48, + "text": "letters or numbers but they will" + }, + { + "start": 4089.96, + "text": "actually they are uh they will actually" + }, + { + "start": 4092.039, + "text": "get caught here and so they become its" + }, + { + "start": 4094.48, + "text": "own group so we've separated out the" + }, + { + "start": 4097.4, + "text": "punctuation and finally this um this is" + }, + { + "start": 4100.08, + "text": "also a little bit confusing so this is" + }, + { + "start": 4102.159, + "text": "matching white space but this is using a" + }, + { + "start": 4105.359, + "text": "negative look ahead assertion in regex" + }, + { + "start": 4109.04, + "text": "so what this is doing is it's matching" + }, + { + "start": 4110.92, + "text": "white space up to but not including the" + }, + { + "start": 4113.279, + "text": "last white space" + }, + { + "start": 4115.0, + "text": "character why is this important um this" + }, + { + "start": 4117.92, + "text": "is pretty subtle I think so you see how" + }, + { + "start": 4120.279, + "text": "the white space is always included at" + }, + { + "start": 4121.719, + "text": "the beginning of the word so um space r" + }, + { + "start": 4125.52, + "text": "space u Etc suppose we have a lot of" + }, + { + "start": 4128.08, + "text": "spaces" + }, + { + "start": 4129.4, + "text": "here what's going to happen here is that" + }, + { + "start": 4132.359, + "text": "these spaces up to not including the" + }, + { + "start": 4134.6, + "text": "last character will get caught by 
this" + }, + { + "start": 4137.92, + "text": "and what that will do is it will" + }, + { + "start": 4139.719, + "text": "separate out the spaces up to but not" + }, + { + "start": 4141.88, + "text": "including the last character so that the" + }, + { + "start": 4143.679, + "text": "last character can come here and join" + }, + { + "start": 4145.92, + "text": "with the um space you and the reason" + }, + { + "start": 4149.239, + "text": "that's nice is because space you is the" + }, + { + "start": 4151.44, + "text": "common token so if I didn't have these" + }, + { + "start": 4153.799, + "text": "Extra Spaces here you would just have" + }, + { + "start": 4155.44, + "text": "space you and if I add tokens if I add" + }, + { + "start": 4158.159, + "text": "spaces we still have a space you but" + }, + { + "start": 4160.719, + "text": "now we have all this extra white space" + }, + { + "start": 4162.96, + "text": "so basically the gpt2 tokenizer really" + }, + { + "start": 4164.719, + "text": "likes to have a space letters or numbers" + }, + { + "start": 4167.44, + "text": "um and it it prepends these spaces and" + }, + { + "start": 4170.44, + "text": "this is just something that it is" + }, + { + "start": 4171.4, + "text": "consistent about so that's what that is" + }, + { + "start": 4173.679, + "text": "for and then finally we have all the the" + }, + { + "start": 4176.4, + "text": "last fallback is um white space" + }, + { + "start": 4178.64, + "text": "characters uh so um that would be" + }, + { + "start": 4182.719, + "text": "just um if that doesn't get caught then" + }, + { + "start": 4186.679, + "text": "this thing will catch any trailing" + }, + { + "start": 4188.52, + "text": "spaces and so on I wanted to show one" + }, + { + "start": 4190.759, + "text": "more real world example here so if we" + }, + { + "start": 4193.159, + "text": "have this string which is a piece of" + }, + { + "start": 4194.44, + "text": "python code and then we try to split it" + }, + { + 
"start": 4196.36, + "text": "up then this is the kind of output we" + }, + { + "start": 4198.4, + "text": "get so you'll notice that the list has" + }, + { + "start": 4200.56, + "text": "many elements here and that's because we" + }, + { + "start": 4202.48, + "text": "are splitting up fairly often uh every" + }, + { + "start": 4205.12, + "text": "time sort of a category" + }, + { + "start": 4207.12, + "text": "changes um so there will never be any" + }, + { + "start": 4209.36, + "text": "merges Within These" + }, + { + "start": 4210.96, + "text": "elements and um that's what you are" + }, + { + "start": 4213.48, + "text": "seeing here now you might think that in" + }, + { + "start": 4216.44, + "text": "order to train the" + }, + { + "start": 4217.76, + "text": "tokenizer uh open AI has used this to" + }, + { + "start": 4221.12, + "text": "split up text into chunks and then run" + }, + { + "start": 4223.88, + "text": "just a BP algorithm within all the" + }, + { + "start": 4225.8, + "text": "chunks but that is not exactly what" + }, + { + "start": 4227.96, + "text": "happened and the reason is the following" + }, + { + "start": 4230.28, + "text": "notice that we have the spaces here uh" + }, + { + "start": 4233.32, + "text": "those Spaces end up being entire" + }, + { + "start": 4235.44, + "text": "elements but these spaces never actually" + }, + { + "start": 4238.36, + "text": "end up being merged by by open Ai and" + }, + { + "start": 4240.64, + "text": "the way you can tell is that if you copy" + }, + { + "start": 4242.48, + "text": "paste the exact same chunk here into Tik" + }, + { + "start": 4244.199, + "text": "token U Tik tokenizer you see that all" + }, + { + "start": 4247.28, + "text": "the spaces are kept independent and" + }, + { + "start": 4249.28, + "text": "they're all token" + }, + { + "start": 4251.0, + "text": "220 so I think opena at some point Point" + }, + { + "start": 4253.84, + "text": "en Force some rule that these spaces" + }, + { + "start": 
4256.04, + "text": "would never be merged and so um there's" + }, + { + "start": 4259.4, + "text": "some additional rules on top of just" + }, + { + "start": 4261.28, + "text": "chunking and bpe that openai is not uh" + }, + { + "start": 4264.199, + "text": "clear about now the training code for" + }, + { + "start": 4266.32, + "text": "the gpt2 tokenizer was never released so" + }, + { + "start": 4268.679, + "text": "all we have is uh the code that I've" + }, + { + "start": 4270.8, + "text": "already shown you but this code here" + }, + { + "start": 4273.28, + "text": "that they've released is only the" + }, + { + "start": 4274.4, + "text": "inference code for the tokens so this is" + }, + { + "start": 4277.679, + "text": "not the training code you can't give it" + }, + { + "start": 4279.08, + "text": "a piece of text and train a tokenizer" + }, + { + "start": 4281.52, + "text": "this is just the inference code which" + }, + { + "start": 4283.32, + "text": "takes the merges that we have up" + }, + { + "start": 4285.6, + "text": "above and applies them to a new piece of" + }, + { + "start": 4288.32, + "text": "text and so we don't know exactly how" + }, + { + "start": 4290.56, + "text": "openai trained um train the" + }, + { + "start": 4292.48, + "text": "tokenizer but it wasn't as simple as" + }, + { + "start": 4294.64, + "text": "chunk it up and bpe it uh whatever it was" + }, + { + "start": 4298.36, + "text": "next I wanted to introduce you to the" + }, + { + "start": 4300.239, + "text": "Tik token library from openai which is" + }, + { + "start": 4302.48, + "text": "the official library for tokenization" + }, + { + "start": 4304.8, + "text": "from openai so this is Tik token pip" + }, + { + "start": 4308.36, + "text": "install tiktoken and then um you" + }, + { + "start": 4311.44, + "text": "can do the tokenization in inference" + }, + { + "start": 4314.36, + "text": "this is again not training code this is" + }, + { + "start": 4315.88, + "text": 
"only inference code for" + }, + { + "start": 4317.92, + "text": "tokenization um I wanted to show you how" + }, + { + "start": 4320.36, + "text": "you would use it quite simple and" + }, + { + "start": 4322.48, + "text": "running this just gives us the gpt2" + }, + { + "start": 4324.36, + "text": "tokens or the GPT 4 tokens so this is" + }, + { + "start": 4326.92, + "text": "the tokenizer use for GPT 4 and so in" + }, + { + "start": 4329.679, + "text": "particular we see that the Whit space in" + }, + { + "start": 4331.239, + "text": "gpt2 remains unmerged but in GPT 4 uh" + }, + { + "start": 4334.48, + "text": "these Whit spaces merge as we also saw" + }, + { + "start": 4337.32, + "text": "in this one where here they're all" + }, + { + "start": 4339.44, + "text": "unmerged but if we go down to GPT 4 uh" + }, + { + "start": 4342.639, + "text": "they become merged" + }, + { + "start": 4345.239, + "text": "um now in the" + }, + { + "start": 4347.76, + "text": "gp4 uh tokenizer they changed the" + }, + { + "start": 4351.04, + "text": "regular expression that they use to" + }, + { + "start": 4353.12, + "text": "Chunk Up text so the way to see this is" + }, + { + "start": 4355.639, + "text": "that if you come to your the Tik token" + }, + { + "start": 4358.0, + "text": "uh library and then you go to this file" + }, + { + "start": 4361.08, + "text": "Tik token X openi public this is where" + }, + { + "start": 4364.12, + "text": "sort of like the definition of all these" + }, + { + "start": 4365.639, + "text": "different tokenizers that openi" + }, + { + "start": 4366.96, + "text": "maintains is and so uh necessarily to do" + }, + { + "start": 4370.56, + "text": "the inference they had to publish some" + }, + { + "start": 4371.76, + "text": "of the details about the strings" + }, + { + "start": 4373.96, + "text": "so this is the string that we already" + }, + { + "start": 4375.36, + "text": "saw for gpt2 it is slightly different" + }, + { + "start": 4378.36, + "text": 
"but it is actually equivalent uh to what" + }, + { + "start": 4380.36, + "text": "we discussed here so this pattern that" + }, + { + "start": 4382.84, + "text": "we discussed is equivalent to this" + }, + { + "start": 4384.96, + "text": "pattern this one just executes a little" + }, + { + "start": 4387.0, + "text": "bit faster so here you see a little bit" + }, + { + "start": 4389.239, + "text": "of a slightly different definition but" + }, + { + "start": 4390.719, + "text": "otherwise it's the same we're going to" + }, + { + "start": 4392.719, + "text": "go into special tokens in a bit and then" + }, + { + "start": 4395.32, + "text": "if you scroll down to CL 100k this is" + }, + { + "start": 4398.6, + "text": "the GPT 4 tokenizer you see that the" + }, + { + "start": 4400.76, + "text": "pattern has changed um and this is kind" + }, + { + "start": 4403.96, + "text": "of like the main the major change in" + }, + { + "start": 4406.08, + "text": "addition to a bunch of other special" + }, + { + "start": 4407.36, + "text": "tokens which I'll go into in a bit again" + }, + { + "start": 4410.4, + "text": "now some I'm not going to actually go" + }, + { + "start": 4411.84, + "text": "into the full detail of the pattern" + }, + { + "start": 4413.28, + "text": "change because honestly this is my" + }, + { + "start": 4415.44, + "text": "numbing uh I would just advise that you" + }, + { + "start": 4417.44, + "text": "pull out chat GPT and the regex" + }, + { + "start": 4419.88, + "text": "documentation and just step through it" + }, + { + "start": 4422.159, + "text": "but really the major changes are number" + }, + { + "start": 4424.52, + "text": "one you see this eye here that means" + }, + { + "start": 4428.08, + "text": "that the um case sensitivity this is" + }, + { + "start": 4431.08, + "text": "case insensitive match and so the" + }, + { + "start": 4433.679, + "text": "comment that we saw earlier on oh we" + }, + { + "start": 4436.12, + "text": "should have used re. 
uppercase uh" + }, + { + "start": 4438.4, + "text": "basically we're now going to be matching" + }, + { + "start": 4441.8, + "text": "these apostrophe s apostrophe D" + }, + { + "start": 4444.6, + "text": "apostrophe M Etc uh we're going to be" + }, + { + "start": 4446.92, + "text": "matching them both in lowercase and in" + }, + { + "start": 4448.6, + "text": "uppercase so that's fixed there's a" + }, + { + "start": 4451.32, + "text": "bunch of different like handling of the" + }, + { + "start": 4452.76, + "text": "whites space that I'm not going to go" + }, + { + "start": 4454.08, + "text": "into the full details of and then one" + }, + { + "start": 4456.48, + "text": "more thing here is you will notice that" + }, + { + "start": 4458.639, + "text": "when they match the numbers they only" + }, + { + "start": 4460.679, + "text": "match one to three numbers so so they" + }, + { + "start": 4463.56, + "text": "will never merge" + }, + { + "start": 4466.12, + "text": "numbers that are in low in more than" + }, + { + "start": 4468.88, + "text": "three digits only up to three digits of" + }, + { + "start": 4471.159, + "text": "numbers will ever be merged and uh" + }, + { + "start": 4474.679, + "text": "that's one change that they made as well" + }, + { + "start": 4476.32, + "text": "to prevent uh tokens that are very very" + }, + { + "start": 4478.6, + "text": "long number" + }, + { + "start": 4480.0, + "text": "sequences uh but again we don't really" + }, + { + "start": 4482.08, + "text": "know why they do any of this stuff uh" + }, + { + "start": 4484.199, + "text": "because none of this is documented and" + }, + { + "start": 4486.28, + "text": "uh it's just we just get the pattern so" + }, + { + "start": 4489.52, + "text": "um yeah it is what it is but those are" + }, + { + "start": 4491.76, + "text": "some of the changes that gp4 has made" + }, + { + "start": 4494.36, + "text": "and of course the vocabulary size went" + }, + { + "start": 4496.36, + "text": "from 
roughly 50k to roughly" + }, + { + "start": 4498.4, + "text": "100K the next thing I would like to do" + }, + { + "start": 4500.4, + "text": "very briefly is to take you through the" + }, + { + "start": 4502.32, + "text": "gpt2 encoder.py that openai has" + }, + { + "start": 4505.4, + "text": "released uh this is the file that I" + }, + { + "start": 4507.36, + "text": "already mentioned to you briefly now" + }, + { + "start": 4509.639, + "text": "this file is uh fairly short and should" + }, + { + "start": 4512.84, + "text": "be relatively understandable to you at" + }, + { + "start": 4514.639, + "text": "this point um starting at the bottom" + }, + { + "start": 4517.96, + "text": "here they are loading two files encoder" + }, + { + "start": 4521.48, + "text": "Json and vocab bpe and they do some" + }, + { + "start": 4524.159, + "text": "light processing on it and then they" + }, + { + "start": 4525.4, + "text": "call this encoder object which is the" + }, + { + "start": 4527.719, + "text": "tokenizer now if you'd like to inspect" + }, + { + "start": 4530.12, + "text": "these two files which together" + }, + { + "start": 4531.96, + "text": "constitute their saved tokenizer then" + }, + { + "start": 4534.56, + "text": "you can do that with a piece of code" + }, + { + "start": 4536.12, + "text": "like" + }, + { + "start": 4536.84, + "text": "this um this is where you can download" + }, + { + "start": 4539.32, + "text": "these two files and you can inspect them" + }, + { + "start": 4540.8, + "text": "if you'd like and what you will find is" + }, + { + "start": 4542.88, + "text": "that this encoder as they call it in" + }, + { + "start": 4545.08, + "text": "their code is exactly equivalent to our" + }, + { + "start": 4547.639, + "text": "vocab so remember here where we have" + }, + { + "start": 4551.8, + "text": "this vocab object which allowed us to" + }, + { + "start": 4553.48, + "text": "decode very efficiently and basically it" + }, + { + "start": 4556.0, + 
"text": "took us from the integer to the byes uh" + }, + { + "start": 4560.12, + "text": "for that integer so our vocab is exactly" + }, + { + "start": 4563.32, + "text": "their encoder and then their vocab bpe" + }, + { + "start": 4567.76, + "text": "confusingly is actually are merges so" + }, + { + "start": 4571.159, + "text": "their BP merges which is based on the" + }, + { + "start": 4574.0, + "text": "data inside vocab bpe ends up being" + }, + { + "start": 4576.679, + "text": "equivalent to our merges so uh basically" + }, + { + "start": 4580.679, + "text": "they are saving and loading the two uh" + }, + { + "start": 4584.36, + "text": "variables that for us are also critical" + }, + { + "start": 4586.239, + "text": "the merges variable and the vocab" + }, + { + "start": 4588.32, + "text": "variable using just these two variables" + }, + { + "start": 4591.12, + "text": "you can represent a tokenizer and you" + }, + { + "start": 4592.56, + "text": "can both do encoding and decoding once" + }, + { + "start": 4594.52, + "text": "you've trained this" + }, + { + "start": 4596.0, + "text": "tokenizer now the only thing that um is" + }, + { + "start": 4600.0, + "text": "actually slightly confusing inside what" + }, + { + "start": 4602.56, + "text": "opening ey does here is that in addition" + }, + { + "start": 4604.52, + "text": "to this encoder and a decoder they also" + }, + { + "start": 4606.88, + "text": "have something called a bite encoder and" + }, + { + "start": 4608.52, + "text": "a bite decoder and this is actually" + }, + { + "start": 4611.28, + "text": "unfortunately just" + }, + { + "start": 4613.96, + "text": "kind of a spirous implementation detail" + }, + { + "start": 4615.88, + "text": "and isn't actually deep or interesting" + }, + { + "start": 4617.719, + "text": "in any way so I'm going to skip the" + }, + { + "start": 4619.08, + "text": "discussion of it but what opening ey" + }, + { + "start": 4621.04, + "text": "does here for reasons that I 
don't fully" + }, + { + "start": 4622.8, + "text": "understand is that not only have they" + }, + { + "start": 4625.0, + "text": "this tokenizer which can encode and" + }, + { + "start": 4626.44, + "text": "decode but they have a whole separate" + }, + { + "start": 4628.159, + "text": "layer here in addition that is used" + }, + { + "start": 4630.0, + "text": "serially with the tokenizer and so you" + }, + { + "start": 4632.639, + "text": "first do um byte encode and then encode" + }, + { + "start": 4636.08, + "text": "and then you do decode and then byte" + }, + { + "start": 4637.679, + "text": "decode so that's the loop and they are" + }, + { + "start": 4640.239, + "text": "just stacked serial on top of each other" + }, + { + "start": 4642.84, + "text": "and and it's not that interesting so I" + }, + { + "start": 4644.719, + "text": "won't cover it and you can step through" + }, + { + "start": 4645.96, + "text": "it if you'd like otherwise this file if" + }, + { + "start": 4648.639, + "text": "you ignore the byte encoder and the byte" + }, + { + "start": 4650.239, + "text": "decoder will be algorithmically very" + }, + { + "start": 4651.88, + "text": "familiar with you and the meat of it" + }, + { + "start": 4653.96, + "text": "here is the what they call bpe function" + }, + { + "start": 4657.04, + "text": "and you should recognize this Loop here" + }, + { + "start": 4659.639, + "text": "which is very similar to our own while Loop" + }, + { + "start": 4661.96, + "text": "where they're trying to identify the" + }, + { + "start": 4663.52, + "text": "bigram uh a pair that they should be" + }, + { + "start": 4666.96, + "text": "merging next and then here just like we" + }, + { + "start": 4669.159, + "text": "had they have a for Loop trying to merge" + }, + { + "start": 4670.96, + "text": "this pair uh so they will go over all of" + }, + { + "start": 4673.6, + "text": "the sequence and they will merge the" + }, + { + "start": 4675.12, + "text": "pair whenever they find it 
and they keep" + }, + { + "start": 4677.84, + "text": "repeating that until they run out of" + }, + { + "start": 4679.8, + "text": "possible merges in the in the text so" + }, + { + "start": 4682.36, + "text": "that's the meat of this file and uh" + }, + { + "start": 4684.56, + "text": "there's an encode and a decode function" + }, + { + "start": 4686.04, + "text": "just like we have implemented it so long" + }, + { + "start": 4688.159, + "text": "story short what I want you to take away" + }, + { + "start": 4689.719, + "text": "at this point is that unfortunately it's" + }, + { + "start": 4691.639, + "text": "a little bit of a messy code that they" + }, + { + "start": 4693.0, + "text": "have but algorithmically it is identical" + }, + { + "start": 4695.12, + "text": "to what we've built up above and what" + }, + { + "start": 4697.719, + "text": "we've built up above if you understand" + }, + { + "start": 4699.159, + "text": "it is algorithmically what is necessary" + }, + { + "start": 4701.32, + "text": "to actually build a bpe tokenizer" + }, + { + "start": 4703.719, + "text": "train it and then both encode and decode" + }, + { + "start": 4706.84, + "text": "the next topic I would like to turn to" + }, + { + "start": 4708.28, + "text": "is that of special tokens so in addition" + }, + { + "start": 4710.92, + "text": "to tokens that are coming from you know" + }, + { + "start": 4712.6, + "text": "raw bytes and the bpe merges we can" + }, + { + "start": 4715.239, + "text": "insert all kinds of tokens that we are" + }, + { + "start": 4716.8, + "text": "going to use to delimit different parts" + }, + { + "start": 4718.96, + "text": "of the data or introduced to create a" + }, + { + "start": 4721.04, + "text": "special structure of the token streams" + }, + { + "start": 4724.8, + "text": "so in uh if you look at this encoder" + }, + { + "start": 4727.48, + "text": "object from openai's gpt2 right here we" + }, + { + "start": 4730.88, + "text": "mentioned this is very 
similar to our" + }, + { + "start": 4732.159, + "text": "vocab you'll notice that the length of" + }, + { + "start": 4734.84, + "text": "this is" + }, + { + "start": 4738.88, + "text": "50257 and as I mentioned it's mapping uh" + }, + { + "start": 4741.84, + "text": "and it's inverted from the mapping of" + }, + { + "start": 4743.36, + "text": "our vocab our vocab goes from integer to" + }, + { + "start": 4746.12, + "text": "string and they go the other way around" + }, + { + "start": 4748.08, + "text": "for no amazing reason um but the thing" + }, + { + "start": 4751.84, + "text": "to note here is that this the mapping" + }, + { + "start": 4753.28, + "text": "table here is" + }, + { + "start": 4755.0, + "text": "50257 where does that number come from" + }, + { + "start": 4758.6, + "text": "where what are the tokens as I mentioned" + }, + { + "start": 4760.8, + "text": "there are 256 raw bite token" + }, + { + "start": 4764.4, + "text": "tokens and then opena actually did" + }, + { + "start": 4767.199, + "text": "50,000" + }, + { + "start": 4768.639, + "text": "merges so those become the other tokens" + }, + { + "start": 4772.0, + "text": "but this would have been" + }, + { + "start": 4774.04, + "text": "50256 so what is the 57th token and" + }, + { + "start": 4777.679, + "text": "there is basically one special" + }, + { + "start": 4780.52, + "text": "token and that one special token you can" + }, + { + "start": 4783.239, + "text": "see is called end of text so this is a" + }, + { + "start": 4787.04, + "text": "special token and it's the very last" + }, + { + "start": 4789.56, + "text": "token and this token is used to delimit" + }, + { + "start": 4792.48, + "text": "documents ments in the training set so" + }, + { + "start": 4795.76, + "text": "when we're creating the training data we" + }, + { + "start": 4797.32, + "text": "have all these documents and we tokenize" + }, + { + "start": 4799.199, + "text": "them and we get a stream of tokens those" + }, + { + 
"start": 4801.8, + "text": "tokens only range from Z to" + }, + { + "start": 4805.28, + "text": "50256 and then in between those" + }, + { + "start": 4807.4, + "text": "documents we put special end of text" + }, + { + "start": 4810.4, + "text": "token and we insert that token in" + }, + { + "start": 4812.8, + "text": "between documents and we are using this" + }, + { + "start": 4815.639, + "text": "as a signal to the language model that" + }, + { + "start": 4818.4, + "text": "the document has ended and what follows" + }, + { + "start": 4820.719, + "text": "is going to be unrelated to the document" + }, + { + "start": 4823.28, + "text": "previously that said the language model" + }, + { + "start": 4825.199, + "text": "has to learn this from data it it needs" + }, + { + "start": 4827.199, + "text": "to learn that this token usually means" + }, + { + "start": 4829.719, + "text": "that it should wipe its sort of memory" + }, + { + "start": 4831.92, + "text": "of what came before and what came before" + }, + { + "start": 4834.04, + "text": "this token is not actually informative" + }, + { + "start": 4835.56, + "text": "to what comes next but we are expecting" + }, + { + "start": 4837.56, + "text": "the language model to just like learn" + }, + { + "start": 4839.0, + "text": "this but we're giving it the Special" + }, + { + "start": 4840.92, + "text": "sort of the limiter of these documents" + }, + { + "start": 4844.08, + "text": "we can go here to Tech tokenizer and um" + }, + { + "start": 4846.679, + "text": "this the gpt2 tokenizer uh our code that" + }, + { + "start": 4849.48, + "text": "we've been playing with before so we can" + }, + { + "start": 4851.44, + "text": "add here right hello world world how are" + }, + { + "start": 4853.679, + "text": "you and we're getting different tokens" + }, + { + "start": 4855.84, + "text": "but now you can see what if what happens" + }, + { + "start": 4858.239, + "text": "if I put end of text you see how until I" + }, + { + 
"start": 4862.199, + "text": "finished it these are all different" + }, + { + "start": 4863.92, + "text": "tokens end of" + }, + { + "start": 4866.36, + "text": "text still set different tokens and now" + }, + { + "start": 4868.8, + "text": "when I finish it suddenly we get token" + }, + { + "start": 4873.28, + "text": "50256 and the reason this works is" + }, + { + "start": 4875.88, + "text": "because this didn't actually go through" + }, + { + "start": 4878.239, + "text": "the bpe merges instead the code that" + }, + { + "start": 4881.92, + "text": "actually outposted tokens has special" + }, + { + "start": 4885.0, + "text": "case instructions for handling special" + }, + { + "start": 4888.04, + "text": "tokens um we did not see these special" + }, + { + "start": 4890.76, + "text": "instructions for handling special tokens" + }, + { + "start": 4892.84, + "text": "in the encoder dopy it's absent there" + }, + { + "start": 4896.36, + "text": "but if you go to Tech token Library" + }, + { + "start": 4898.0, + "text": "which is uh implemented in Rust you will" + }, + { + "start": 4900.92, + "text": "find all kinds of special case handling" + }, + { + "start": 4902.639, + "text": "for these special tokens that you can" + }, + { + "start": 4904.52, + "text": "register uh create adds to the" + }, + { + "start": 4907.12, + "text": "vocabulary and then it looks for them" + }, + { + "start": 4909.0, + "text": "and it uh whenever it sees these special" + }, + { + "start": 4910.92, + "text": "tokens like this it will actually come" + }, + { + "start": 4913.44, + "text": "in and swap in that special token so" + }, + { + "start": 4916.08, + "text": "these things are outside of the typical" + }, + { + "start": 4918.12, + "text": "algorithm of uh B PA en" + }, + { + "start": 4920.56, + "text": "coding so these special tokens are used" + }, + { + "start": 4922.92, + "text": "pervasively uh not just in uh basically" + }, + { + "start": 4925.639, + "text": "base language modeling 
of predicting the" + }, + { + "start": 4927.4, + "text": "next token in the sequence but" + }, + { + "start": 4929.08, + "text": "especially when it gets to later to the" + }, + { + "start": 4930.679, + "text": "fine tuning stage and all of the chat uh" + }, + { + "start": 4933.239, + "text": "gpt sort of aspects of it uh because we" + }, + { + "start": 4935.679, + "text": "don't just want to delimit documents" + }, + { + "start": 4936.88, + "text": "we want to delimit entire conversations" + }, + { + "start": 4938.719, + "text": "between an assistant and a user so if I" + }, + { + "start": 4941.56, + "text": "refresh this Tik tokenizer page the" + }, + { + "start": 4944.239, + "text": "default example that they have here is" + }, + { + "start": 4946.44, + "text": "using not sort of base model encoders" + }, + { + "start": 4950.12, + "text": "but fine tuned model uh sort of tokenizers" + }, + { + "start": 4953.6, + "text": "um so for example using the GPT 3.5" + }, + { + "start": 4955.84, + "text": "turbo scheme these here are all special" + }, + { + "start": 4958.96, + "text": "tokens im_start im_end Etc uh this is" + }, + { + "start": 4963.239, + "text": "short for imaginary monologue underscore start by the" + }, + { + "start": 4966.84, + "text": "way but you can see here that there's a" + }, + { + "start": 4969.6, + "text": "sort of start and end of every single" + }, + { + "start": 4971.199, + "text": "message and there can be many other" + }, + { + "start": 4972.56, + "text": "other tokens lots of tokens um in use to" + }, + { + "start": 4976.52, + "text": "delimit these conversations and kind of" + }, + { + "start": 4978.719, + "text": "keep track of the flow of the messages" + }, + { + "start": 4980.84, + "text": "here now we can go back to the Tik token" + }, + { + "start": 4983.8, + "text": "library and here when you scroll to the" + }, + { + "start": 4986.239, + "text": "bottom they talk about how you can" + }, + { + "start": 4988.159, + "text": "extend Tik token and I 
can you can" + }, + { + "start": 4990.239, + "text": "create basically you can Fork uh the um" + }, + { + "start": 4993.679, + "text": "cl100k base tokenizers in GPT 4 and for" + }, + { + "start": 4997.32, + "text": "example you can extend it by adding more" + }, + { + "start": 4998.92, + "text": "special tokens and these are totally up" + }, + { + "start": 5000.36, + "text": "to you you can come up with any" + }, + { + "start": 5001.36, + "text": "arbitrary tokens and add them with the" + }, + { + "start": 5003.76, + "text": "new ID afterwards and the Tik token library" + }, + { + "start": 5006.52, + "text": "will uh correctly swap them out uh when" + }, + { + "start": 5009.88, + "text": "it sees this in the" + }, + { + "start": 5011.76, + "text": "strings now we can also go back to this" + }, + { + "start": 5014.96, + "text": "file which we've looked at previously" + }, + { + "start": 5017.08, + "text": "and I mentioned that the gpt2 in Tik" + }, + { + "start": 5019.679, + "text": "token openai" + }, + { + "start": 5021.44, + "text": "public.py we have the vocabulary we have the" + }, + { + "start": 5024.0, + "text": "pattern for splitting and then here we" + }, + { + "start": 5026.28, + "text": "are registering the single special token" + }, + { + "start": 5028.04, + "text": "in gpt2 which was the end of text token" + }, + { + "start": 5030.32, + "text": "and we saw that it has this ID" + }, + { + "start": 5033.0, + "text": "in GPT 4 when they define this here you" + }, + { + "start": 5036.4, + "text": "see that the pattern has changed as" + }, + { + "start": 5037.6, + "text": "we've discussed but also the special" + }, + { + "start": 5039.36, + "text": "tokens have changed in this tokenizer so" + }, + { + "start": 5041.8, + "text": "we of course have the end of text just" + }, + { + "start": 5043.719, + "text": "like in gpt2 but we also see three sorry" + }, + { + "start": 5046.88, + "text": "four additional tokens here fim prefix" + }, + { + "start": 5049.52, + "text": 
"middle and suffix what is fim fim is" + }, + { + "start": 5052.36, + "text": "short for fill in the middle and if" + }, + { + "start": 5054.88, + "text": "you'd like to learn more about this idea" + }, + { + "start": 5057.0, + "text": "it comes from this paper um and I'm not" + }, + { + "start": 5060.0, + "text": "going to go into detail in this video" + }, + { + "start": 5061.199, + "text": "it's beyond this video and then there's" + }, + { + "start": 5063.44, + "text": "one additional uh serve token here so" + }, + { + "start": 5067.04, + "text": "that's that encoding as well so it's" + }, + { + "start": 5069.92, + "text": "very common basically to train a" + }, + { + "start": 5071.6, + "text": "language model and then if you'd like uh" + }, + { + "start": 5074.719, + "text": "you can add special tokens now when you" + }, + { + "start": 5077.52, + "text": "add special tokens you of course have to" + }, + { + "start": 5079.8, + "text": "um do some model surgery to the" + }, + { + "start": 5081.719, + "text": "Transformer and all the parameters" + }, + { + "start": 5083.44, + "text": "involved in that Transformer because you" + }, + { + "start": 5085.159, + "text": "are basically adding an integer and you" + }, + { + "start": 5087.119, + "text": "want to make sure that for example your" + }, + { + "start": 5088.56, + "text": "embedding Matrix for the vocabulary" + }, + { + "start": 5090.639, + "text": "tokens has to be extended by adding a" + }, + { + "start": 5093.04, + "text": "row and typically this row would be" + }, + { + "start": 5094.88, + "text": "initialized uh with small random numbers" + }, + { + "start": 5096.88, + "text": "or something like that because we need" + }, + { + "start": 5098.8, + "text": "to have a vector that now stands for" + }, + { + "start": 5101.199, + "text": "that token in addition to that you have" + }, + { + "start": 5103.28, + "text": "to go to the final layer of the" + }, + { + "start": 5104.28, + "text": "Transformer and you 
have to make sure" + }, + { + "start": 5105.679, + "text": "that that projection at the very end" + }, + { + "start": 5107.52, + "text": "into the classifier uh is extended by" + }, + { + "start": 5109.679, + "text": "one as well so basically there's some" + }, + { + "start": 5111.8, + "text": "model surgery involved that you have to" + }, + { + "start": 5113.48, + "text": "couple with the tokenization changes if" + }, + { + "start": 5116.52, + "text": "you are going to add special tokens but" + }, + { + "start": 5118.92, + "text": "this is a very common operation that" + }, + { + "start": 5120.199, + "text": "people do especially if they'd like to" + }, + { + "start": 5121.8, + "text": "fine tune the model for example taking" + }, + { + "start": 5123.719, + "text": "it from a base model to a chat model" + }, + { + "start": 5126.239, + "text": "like chat" + }, + { + "start": 5127.88, + "text": "GPT okay so at this point you should" + }, + { + "start": 5129.84, + "text": "have everything you need in order to" + }, + { + "start": 5131.04, + "text": "build your own GPT 4 tokenizer now in the" + }, + { + "start": 5133.719, + "text": "process of developing this lecture I've" + }, + { + "start": 5135.36, + "text": "done that and I published the code under" + }, + { + "start": 5137.239, + "text": "this repository" + }, + { + "start": 5138.92, + "text": "minbpe so minbpe looks like this right now as" + }, + { + "start": 5142.52, + "text": "I'm recording but uh the minbpe repository" + }, + { + "start": 5145.36, + "text": "will probably change quite a bit because" + }, + { + "start": 5146.719, + "text": "I intend to continue working on it um in" + }, + { + "start": 5149.84, + "text": "addition to the minbpe repository I've" + }, + { + "start": 5151.76, + "text": "published the this uh exercise" + }, + { + "start": 5153.44, + "text": "progression that you can follow so if" + }, + { + "start": 5155.36, + "text": "you go to exercise. 
MD here uh this is" + }, + { + "start": 5158.36, + "text": "sort of me breaking up the task ahead of" + }, + { + "start": 5161.159, + "text": "you into four steps that sort of uh" + }, + { + "start": 5163.4, + "text": "build up to what can be a gp4 tokenizer" + }, + { + "start": 5166.639, + "text": "and so feel free to follow these steps" + }, + { + "start": 5168.4, + "text": "exactly and follow a little bit of the" + }, + { + "start": 5170.4, + "text": "guidance that I've laid out here and" + }, + { + "start": 5172.48, + "text": "anytime you feel stuck just reference" + }, + { + "start": 5174.639, + "text": "the MBP repository here so either the" + }, + { + "start": 5177.96, + "text": "tests could be useful or the MBP" + }, + { + "start": 5180.08, + "text": "repository itself I try to keep the code" + }, + { + "start": 5182.6, + "text": "fairly clean and understandable and so" + }, + { + "start": 5186.159, + "text": "um feel free to reference it whenever um" + }, + { + "start": 5188.92, + "text": "you get" + }, + { + "start": 5190.159, + "text": "stuck uh in addition to that basically" + }, + { + "start": 5192.56, + "text": "once you write it you should be able to" + }, + { + "start": 5194.679, + "text": "reproduce this behavior from Tech token" + }, + { + "start": 5196.84, + "text": "so getting the gb4 tokenizer you can" + }, + { + "start": 5199.32, + "text": "take uh you can encode the string and" + }, + { + "start": 5201.32, + "text": "you should get these tokens and then you" + }, + { + "start": 5203.239, + "text": "can encode and decode the exact same" + }, + { + "start": 5204.679, + "text": "string to recover it and in addition to" + }, + { + "start": 5207.239, + "text": "all that you should be able to implement" + }, + { + "start": 5208.4, + "text": "your own train function uh which Tik" + }, + { + "start": 5210.719, + "text": "token Library does not provide it's it's" + }, + { + "start": 5212.48, + "text": "again only inference code but you could" + }, + { 
+ "start": 5214.6, + "text": "write your own train MBP does it as well" + }, + { + "start": 5217.88, + "text": "and that will allow you to train your" + }, + { + "start": 5219.32, + "text": "own token" + }, + { + "start": 5220.719, + "text": "vocabularies so here are some of the" + }, + { + "start": 5222.4, + "text": "code inside M be mean bpe uh shows the" + }, + { + "start": 5226.04, + "text": "token vocabularies that you might obtain" + }, + { + "start": 5228.719, + "text": "so on the left uh here we have the GPT 4" + }, + { + "start": 5232.4, + "text": "merges uh so the first 256 are raw" + }, + { + "start": 5235.84, + "text": "individual bytes and then here I am" + }, + { + "start": 5237.719, + "text": "visualizing the merges that gp4" + }, + { + "start": 5239.56, + "text": "performed during its training so the" + }, + { + "start": 5241.76, + "text": "very first merge that gp4 did was merge" + }, + { + "start": 5244.92, + "text": "two spaces into a single token for you" + }, + { + "start": 5247.6, + "text": "know two spaces and that is a token 256" + }, + { + "start": 5250.84, + "text": "and so this is the order in which things" + }, + { + "start": 5252.239, + "text": "merged during gb4 training and this is" + }, + { + "start": 5254.679, + "text": "the merge order that um we obtain in MBP" + }, + { + "start": 5259.08, + "text": "by training a tokenizer and in this case" + }, + { + "start": 5261.199, + "text": "I trained it on a Wikipedia page of" + }, + { + "start": 5263.239, + "text": "Taylor Swift uh not because I'm a Swifty" + }, + { + "start": 5265.6, + "text": "but because that is one of the longest" + }, + { + "start": 5267.8, + "text": "um Wikipedia Pages apparently that's" + }, + { + "start": 5269.639, + "text": "available but she is pretty cool and" + }, + { + "start": 5274.04, + "text": "um what was I going to say yeah so you" + }, + { + "start": 5276.639, + "text": "can compare these two uh vocabularies" + }, + { + "start": 5279.08, + "text": "and 
so as an example um here GPT for" + }, + { + "start": 5284.0, + "text": "merged I in to become in and we've done" + }, + { + "start": 5286.8, + "text": "the exact same thing on this token 259" + }, + { + "start": 5290.0, + "text": "here space t becomes space t and that" + }, + { + "start": 5293.28, + "text": "happened for us a little bit later as" + }, + { + "start": 5294.639, + "text": "well so the difference here is again to" + }, + { + "start": 5296.719, + "text": "my understanding only a difference of" + }, + { + "start": 5298.4, + "text": "the training set so as an example" + }, + { + "start": 5300.28, + "text": "because I see a lot of white space I" + }, + { + "start": 5302.08, + "text": "supect that gp4 probably had a lot of" + }, + { + "start": 5303.76, + "text": "python code in its training set I'm not" + }, + { + "start": 5305.48, + "text": "sure uh for the" + }, + { + "start": 5307.6, + "text": "tokenizer and uh here we see much less" + }, + { + "start": 5310.08, + "text": "of that of course in the Wikipedia page" + }, + { + "start": 5312.96, + "text": "so roughly speaking they look the same" + }, + { + "start": 5314.679, + "text": "and they look the same because they're" + }, + { + "start": 5315.96, + "text": "running the same algorithm and when you" + }, + { + "start": 5318.08, + "text": "train your own you're probably going to" + }, + { + "start": 5319.199, + "text": "get something similar depending on what" + }, + { + "start": 5321.199, + "text": "you train it on okay so we are now going" + }, + { + "start": 5323.28, + "text": "to move on from tick token and the way" + }, + { + "start": 5325.08, + "text": "that open AI tokenizes its strings and" + }, + { + "start": 5327.6, + "text": "we're going to discuss one more very" + }, + { + "start": 5329.199, + "text": "commonly used library for working with" + }, + { + "start": 5331.0, + "text": "tokenization inlm" + }, + { + "start": 5332.719, + "text": "and that is sentence piece so sentence" + }, + { + 
"start": 5335.36, + "text": "piece is very commonly used in language" + }, + { + "start": 5338.159, + "text": "models because unlike Tik token it can" + }, + { + "start": 5340.119, + "text": "do both training and inference and is" + }, + { + "start": 5342.36, + "text": "quite efficient at both it supports a" + }, + { + "start": 5344.84, + "text": "number of algorithms for training uh" + }, + { + "start": 5346.76, + "text": "vocabularies but one of them is the B" + }, + { + "start": 5349.199, + "text": "pair en coding algorithm that we've been" + }, + { + "start": 5350.44, + "text": "looking at so it supports it now" + }, + { + "start": 5353.639, + "text": "sentence piece is used both by llama and" + }, + { + "start": 5355.719, + "text": "mistal series and many other models as" + }, + { + "start": 5358.199, + "text": "well it is on GitHub under Google" + }, + { + "start": 5360.76, + "text": "sentence piece" + }, + { + "start": 5362.76, + "text": "and the big difference with sentence" + }, + { + "start": 5364.4, + "text": "piece and we're going to look at example" + }, + { + "start": 5366.199, + "text": "because this is kind of hard and subtle" + }, + { + "start": 5367.92, + "text": "to explain is that they think different" + }, + { + "start": 5371.04, + "text": "about the order of operations here so in" + }, + { + "start": 5375.48, + "text": "the case of Tik token we first take our" + }, + { + "start": 5378.56, + "text": "code points in the string we encode them" + }, + { + "start": 5381.0, + "text": "using mutf to bytes and then we're" + }, + { + "start": 5382.88, + "text": "merging bytes it's fairly" + }, + { + "start": 5384.96, + "text": "straightforward for sentence piece um it" + }, + { + "start": 5388.88, + "text": "works directly on the level of the code" + }, + { + "start": 5390.4, + "text": "points themselves so so it looks at" + }, + { + "start": 5392.52, + "text": "whatever code points are available in" + }, + { + "start": 5393.92, + "text": "your 
training set and then it starts" + }, + { + "start": 5395.88, + "text": "merging those code points and um the bpe" + }, + { + "start": 5399.76, + "text": "is running on the level of code" + }, + { + "start": 5401.6, + "text": "points and if you happen to run out of" + }, + { + "start": 5404.239, + "text": "code points so there are maybe some rare" + }, + { + "start": 5406.76, + "text": "uh code points that just don't come up" + }, + { + "start": 5408.04, + "text": "too often and the Rarity is determined" + }, + { + "start": 5409.719, + "text": "by this character coverage hyper" + }, + { + "start": 5411.199, + "text": "parameter then these uh code points will" + }, + { + "start": 5414.36, + "text": "either get mapped to a special unknown" + }, + { + "start": 5416.28, + "text": "token like ank or if you have the bite" + }, + { + "start": 5419.52, + "text": "foldback option turned on then that will" + }, + { + "start": 5422.119, + "text": "take those rare Cod points it will" + }, + { + "start": 5423.96, + "text": "encode them using utf8 and then the" + }, + { + "start": 5426.08, + "text": "individual bytes of that encoding will" + }, + { + "start": 5427.76, + "text": "be translated into tokens and there are" + }, + { + "start": 5430.119, + "text": "these special bite tokens that basically" + }, + { + "start": 5432.199, + "text": "get added to the vocabulary so it uses" + }, + { + "start": 5435.52, + "text": "BP on on the code points and then it" + }, + { + "start": 5438.239, + "text": "falls back to bytes for rare Cod points" + }, + { + "start": 5441.8, + "text": "um and so that's kind of like difference" + }, + { + "start": 5444.08, + "text": "personally I find the Tik token we" + }, + { + "start": 5445.52, + "text": "significantly cleaner uh but it's kind" + }, + { + "start": 5447.48, + "text": "of like a subtle but pretty major" + }, + { + "start": 5448.84, + "text": "difference between the way they approach" + }, + { + "start": 5450.32, + "text": "tokenization 
let's work with with a" + }, + { + "start": 5452.04, + "text": "concrete example because otherwise this" + }, + { + "start": 5454.0, + "text": "is kind of hard to um to get your head" + }, + { + "start": 5456.719, + "text": "around so let's work with a concrete" + }, + { + "start": 5459.119, + "text": "example this is how we can import" + }, + { + "start": 5461.119, + "text": "sentence piece and then here we're going" + }, + { + "start": 5463.6, + "text": "to take I think I took like the" + }, + { + "start": 5465.199, + "text": "description of sentence piece and I just" + }, + { + "start": 5466.76, + "text": "created like a little toy data set it" + }, + { + "start": 5468.679, + "text": "really likes to have a file so I created" + }, + { + "start": 5470.4, + "text": "a toy. txt file with this" + }, + { + "start": 5473.08, + "text": "content now what's kind of a little bit" + }, + { + "start": 5475.52, + "text": "crazy about sentence piece is that" + }, + { + "start": 5476.76, + "text": "there's a ton of options and" + }, + { + "start": 5478.679, + "text": "configurations and the reason this is so" + }, + { + "start": 5480.8, + "text": "is because sentence piece has been" + }, + { + "start": 5482.199, + "text": "around I think for a while and it really" + }, + { + "start": 5483.84, + "text": "tries to handle a large diversity of" + }, + { + "start": 5485.76, + "text": "things and um because it's been around I" + }, + { + "start": 5488.44, + "text": "think it has quite a bit of accumulated" + }, + { + "start": 5490.52, + "text": "historical baggage uh as well and so in" + }, + { + "start": 5493.679, + "text": "particular there's like a ton of" + }, + { + "start": 5495.56, + "text": "configuration arguments this is not even" + }, + { + "start": 5496.96, + "text": "all of it you can go to here to see all" + }, + { + "start": 5499.8, + "text": "the training" + }, + { + "start": 5500.96, + "text": "options um and uh there's also quite" + }, + { + "start": 5504.4, + 
"text": "useful documentation when you look at" + }, + { + "start": 5505.719, + "text": "the raw Proto buff uh that is used to" + }, + { + "start": 5508.6, + "text": "represent the trainer spec and so on um" + }, + { + "start": 5512.44, + "text": "many of these options are irrelevant to" + }, + { + "start": 5514.52, + "text": "us so maybe to point out one example Das" + }, + { + "start": 5516.96, + "text": "Das shrinking Factor uh this shrinking" + }, + { + "start": 5519.84, + "text": "factor is not used in the B pair en" + }, + { + "start": 5521.28, + "text": "coding algorithm so this is just an" + }, + { + "start": 5523.159, + "text": "argument that is irrelevant to us um it" + }, + { + "start": 5525.92, + "text": "applies to a different training" + }, + { + "start": 5529.52, + "text": "algorithm now what I tried to do here is" + }, + { + "start": 5531.92, + "text": "I tried to set up sentence piece in a" + }, + { + "start": 5533.88, + "text": "way that is very very similar as far as" + }, + { + "start": 5535.719, + "text": "I can tell to maybe identical hopefully" + }, + { + "start": 5538.88, + "text": "to the way that llama 2 was strained so" + }, + { + "start": 5542.08, + "text": "the way they trained their own um their" + }, + { + "start": 5545.04, + "text": "own tokenizer and the way I did this was" + }, + { + "start": 5547.119, + "text": "basically you can take the tokenizer" + }, + { + "start": 5548.719, + "text": "model file that meta released and you" + }, + { + "start": 5551.4, + "text": "can um open it using the Proto protuff" + }, + { + "start": 5555.199, + "text": "uh sort of file that you can generate" + }, + { + "start": 5558.36, + "text": "and then you can inspect all the options" + }, + { + "start": 5559.719, + "text": "and I tried to copy over all the options" + }, + { + "start": 5561.36, + "text": "that looked relevant so here we set up" + }, + { + "start": 5563.679, + "text": "the input it's raw text in this file" + }, + { + "start": 5566.6, + 
"text": "here's going to be the output so it's" + }, + { + "start": 5568.08, + "text": "going to be for talk 400. model and" + }, + { + "start": 5570.76, + "text": "vocab" + }, + { + "start": 5572.44, + "text": "we're saying that we're going to use the" + }, + { + "start": 5573.4, + "text": "BP algorithm and we want to Bap size of" + }, + { + "start": 5576.04, + "text": "400 then there's a ton of configurations" + }, + { + "start": 5578.6, + "text": "here" + }, + { + "start": 5581.08, + "text": "for um for basically pre-processing and" + }, + { + "start": 5585.08, + "text": "normalization rules as they're called" + }, + { + "start": 5587.08, + "text": "normalization used to be very prevalent" + }, + { + "start": 5589.48, + "text": "I would say before llms in natural" + }, + { + "start": 5591.159, + "text": "language processing so in machine" + }, + { + "start": 5592.8, + "text": "translation and uh text classification" + }, + { + "start": 5594.88, + "text": "and so on you want to normalize and" + }, + { + "start": 5596.719, + "text": "simplify the text and you want to turn" + }, + { + "start": 5598.0, + "text": "it all lowercase and you want to remove" + }, + { + "start": 5599.52, + "text": "all double whites space Etc" + }, + { + "start": 5602.199, + "text": "and in language models we prefer not to" + }, + { + "start": 5603.76, + "text": "do any of it or at least that is my" + }, + { + "start": 5605.28, + "text": "preference as a deep learning person you" + }, + { + "start": 5606.96, + "text": "want to not touch your data you want to" + }, + { + "start": 5608.84, + "text": "keep the raw data as much as possible um" + }, + { + "start": 5611.679, + "text": "in a raw" + }, + { + "start": 5613.119, + "text": "form so you're basically trying to turn" + }, + { + "start": 5615.159, + "text": "off a lot of this if you can the other" + }, + { + "start": 5618.0, + "text": "thing that sentence piece does is that" + }, + { + "start": 5619.52, + "text": "it has this concept of 
sentences so" + }, + { + "start": 5623.04, + "text": "sentence piece it's back it's kind of" + }, + { + "start": 5625.48, + "text": "like was developed I think early in the" + }, + { + "start": 5626.84, + "text": "days where there was um an idea that" + }, + { + "start": 5630.159, + "text": "they you're training a tokenizer on a" + }, + { + "start": 5631.96, + "text": "bunch of independent sentences so it has" + }, + { + "start": 5634.199, + "text": "a lot of like how many sentences you're" + }, + { + "start": 5636.36, + "text": "going to train on what is the maximum" + }, + { + "start": 5638.0, + "text": "sentence length" + }, + { + "start": 5640.679, + "text": "um shuffling sentences and so for it" + }, + { + "start": 5643.719, + "text": "sentences are kind of like the" + }, + { + "start": 5644.8, + "text": "individual training examples but again" + }, + { + "start": 5646.88, + "text": "in the context of llms I find that this" + }, + { + "start": 5648.719, + "text": "is like a very spous and weird" + }, + { + "start": 5650.44, + "text": "distinction like sentences are just like" + }, + { + "start": 5653.92, + "text": "don't touch the raw data sentences" + }, + { + "start": 5655.6, + "text": "happen to exist but in raw data sets" + }, + { + "start": 5658.679, + "text": "there are a lot of like inet like what" + }, + { + "start": 5660.6, + "text": "exactly is a sentence what isn't a" + }, + { + "start": 5662.44, + "text": "sentence um and so I think like it's" + }, + { + "start": 5665.0, + "text": "really hard to Define what an actual" + }, + { + "start": 5666.48, + "text": "sentence is if you really like dig into" + }, + { + "start": 5668.639, + "text": "it and there could be different concepts" + }, + { + "start": 5670.92, + "text": "of it in different languages or" + }, + { + "start": 5672.119, + "text": "something like that so why even" + }, + { + "start": 5673.719, + "text": "introduce the concept it it doesn't" + }, + { + "start": 5675.56, + "text": "honestly 
make sense to me I would just" + }, + { + "start": 5676.92, + "text": "prefer to treat a file as a giant uh" + }, + { + "start": 5679.199, + "text": "stream of" + }, + { + "start": 5680.36, + "text": "bytes it has a lot of treatment around" + }, + { + "start": 5682.8, + "text": "rare word characters and when I say word" + }, + { + "start": 5685.119, + "text": "I mean code points we're going to come" + }, + { + "start": 5686.48, + "text": "back to this in a second and it has a" + }, + { + "start": 5688.679, + "text": "lot of other rules for um basically" + }, + { + "start": 5691.679, + "text": "splitting digits splitting white space" + }, + { + "start": 5694.48, + "text": "and numbers and how you deal with that" + }, + { + "start": 5696.56, + "text": "so these are some kind of like merge" + }, + { + "start": 5698.199, + "text": "rules so I think this is a little bit" + }, + { + "start": 5700.08, + "text": "equivalent to tick token using the" + }, + { + "start": 5702.92, + "text": "regular expression to split up" + }, + { + "start": 5704.52, + "text": "categories there's like kind of" + }, + { + "start": 5707.04, + "text": "equivalence of it if you squint T it in" + }, + { + "start": 5709.239, + "text": "sentence piece where you can also for" + }, + { + "start": 5710.639, + "text": "example split up split up the digits uh" + }, + { + "start": 5714.199, + "text": "and uh so" + }, + { + "start": 5715.84, + "text": "on there's a few more things here that" + }, + { + "start": 5718.199, + "text": "I'll come back to in a bit and then" + }, + { + "start": 5719.36, + "text": "there are some special tokens that you" + }, + { + "start": 5720.48, + "text": "can indicate and it hardcodes the UN" + }, + { + "start": 5723.36, + "text": "token the beginning of sentence end of" + }, + { + "start": 5725.56, + "text": "sentence and a pad token um and the UN" + }, + { + "start": 5729.32, + "text": "token must exist for my understanding" + }, + { + "start": 5732.52, + "text": "and then 
some some things so we can" + }, + { + "start": 5734.719, + "text": "train and when when I press train it's" + }, + { + "start": 5737.28, + "text": "going to create this file talk 400." + }, + { + "start": 5740.119, + "text": "model and talk 400. wab I can then load" + }, + { + "start": 5743.159, + "text": "the model file and I can inspect the" + }, + { + "start": 5745.56, + "text": "vocabulary off it and so we trained" + }, + { + "start": 5748.56, + "text": "vocab size 400 on this text here and" + }, + { + "start": 5753.32, + "text": "these are the individual pieces the" + }, + { + "start": 5755.0, + "text": "individual tokens that sentence piece" + }, + { + "start": 5756.88, + "text": "will create so in the beginning we see" + }, + { + "start": 5758.8, + "text": "that we have the an token uh with the ID" + }, + { + "start": 5762.08, + "text": "zero then we have the beginning of" + }, + { + "start": 5764.04, + "text": "sequence end of sequence one and two and" + }, + { + "start": 5767.8, + "text": "then we said that the pad ID is negative" + }, + { + "start": 5769.32, + "text": "1 so we chose not to use it so there's" + }, + { + "start": 5772.08, + "text": "no pad ID" + }, + { + "start": 5773.48, + "text": "here then these are individual bite" + }, + { + "start": 5776.84, + "text": "tokens so here we saw that bite fallback" + }, + { + "start": 5780.159, + "text": "in llama was turned on so it's true so" + }, + { + "start": 5783.56, + "text": "what follows are going to be the 256" + }, + { + "start": 5786.159, + "text": "bite" + }, + { + "start": 5787.199, + "text": "tokens and these are their" + }, + { + "start": 5791.719, + "text": "IDs and then at the bottom after the" + }, + { + "start": 5795.04, + "text": "bite tokens come the" + }, + { + "start": 5797.679, + "text": "merges and these are the parent nodes in" + }, + { + "start": 5800.56, + "text": "the merges so we're not seeing the" + }, + { + "start": 5802.199, + "text": "children we're just seeing the 
parents" + }, + { + "start": 5803.719, + "text": "and their" + }, + { + "start": 5804.6, + "text": "ID and then after the" + }, + { + "start": 5807.04, + "text": "merges comes eventually the individual" + }, + { + "start": 5810.719, + "text": "tokens and their IDs and so these are" + }, + { + "start": 5813.56, + "text": "the individual tokens so these are the" + }, + { + "start": 5815.32, + "text": "individual code Point tokens if you will" + }, + { + "start": 5818.239, + "text": "and they come at the end so that is the" + }, + { + "start": 5820.28, + "text": "ordering with which sentence piece sort" + }, + { + "start": 5821.76, + "text": "of like represents its vocabularies it" + }, + { + "start": 5823.92, + "text": "starts with special tokens then the bike" + }, + { + "start": 5826.119, + "text": "tokens then the merge tokens and then" + }, + { + "start": 5828.159, + "text": "the individual codo tokens and all these" + }, + { + "start": 5831.639, + "text": "raw codepoint to tokens are the ones" + }, + { + "start": 5834.04, + "text": "that it encountered in the training" + }, + { + "start": 5836.119, + "text": "set so those individual code points are" + }, + { + "start": 5839.8, + "text": "all the the entire set of code points" + }, + { + "start": 5842.159, + "text": "that occurred" + }, + { + "start": 5844.4, + "text": "here so those all get put in there and" + }, + { + "start": 5847.48, + "text": "then those that are extremely rare as" + }, + { + "start": 5849.28, + "text": "determined by character coverage so if a" + }, + { + "start": 5851.119, + "text": "code Point occurred only a single time" + }, + { + "start": 5852.52, + "text": "out of like a million um sentences or" + }, + { + "start": 5855.159, + "text": "something like that then it would be" + }, + { + "start": 5857.08, + "text": "ignored and it would not be added to our" + }, + { + "start": 5860.199, + "text": "uh" + }, + { + "start": 5861.04, + "text": "vocabulary once we have a vocabulary we" + }, + 
{ + "start": 5863.36, + "text": "can encode into IDs and we can um sort" + }, + { + "start": 5866.48, + "text": "of get a" + }, + { + "start": 5867.4, + "text": "list and then here I am also decoding" + }, + { + "start": 5870.679, + "text": "the indiv idual tokens back into little" + }, + { + "start": 5874.32, + "text": "pieces as they call it so let's take a" + }, + { + "start": 5876.96, + "text": "look at what happened here hello space" + }, + { + "start": 5881.08, + "text": "on so these are the token IDs we got" + }, + { + "start": 5884.679, + "text": "back and when we look here uh a few" + }, + { + "start": 5887.48, + "text": "things sort of uh jump to mind number" + }, + { + "start": 5891.52, + "text": "one take a look at these characters the" + }, + { + "start": 5894.159, + "text": "Korean characters of course were not" + }, + { + "start": 5895.52, + "text": "part of the training set so sentence" + }, + { + "start": 5898.0, + "text": "piece is encountering code points that" + }, + { + "start": 5899.599, + "text": "it has not seen during training time and" + }, + { + "start": 5902.199, + "text": "those code points do not have a token" + }, + { + "start": 5904.56, + "text": "associated with them so suddenly these" + }, + { + "start": 5906.4, + "text": "are un tokens unknown tokens but because" + }, + { + "start": 5910.56, + "text": "bite fall back as true instead sentence" + }, + { + "start": 5913.84, + "text": "piece falls back to bytes and so it" + }, + { + "start": 5916.44, + "text": "takes this it encodes it with utf8 and" + }, + { + "start": 5919.84, + "text": "then it uses these tokens to represent" + }, + { + "start": 5923.28, + "text": "uh those bytes and that's what we are" + }, + { + "start": 5925.8, + "text": "getting sort of here this is the utf8 uh" + }, + { + "start": 5929.719, + "text": "encoding and in this shifted by three uh" + }, + { + "start": 5932.88, + "text": "because of these um special tokens here" + }, + { + "start": 5936.239, + 
"text": "that have IDs earlier on so that's what" + }, + { + "start": 5938.84, + "text": "happened here now one more thing that um" + }, + { + "start": 5942.92, + "text": "well first before I go on with respect" + }, + { + "start": 5945.52, + "text": "to the bitef back let me remove bite" + }, + { + "start": 5948.239, + "text": "foldback if this is false what's going" + }, + { + "start": 5950.84, + "text": "to happen let's" + }, + { + "start": 5952.52, + "text": "retrain so the first thing that happened" + }, + { + "start": 5954.44, + "text": "is all the bite tokens disappeared right" + }, + { + "start": 5957.28, + "text": "and now we just have the merges and we" + }, + { + "start": 5959.0, + "text": "have a lot more merges now because we" + }, + { + "start": 5960.48, + "text": "have a lot more space because we're not" + }, + { + "start": 5961.8, + "text": "taking up space in the wab size uh with" + }, + { + "start": 5965.04, + "text": "all the" + }, + { + "start": 5965.96, + "text": "bytes and now if we encode" + }, + { + "start": 5969.08, + "text": "this we get a zero so this entire string" + }, + { + "start": 5973.239, + "text": "here suddenly there's no bitef back so" + }, + { + "start": 5975.119, + "text": "this is unknown and unknown is an and so" + }, + { + "start": 5979.4, + "text": "this is zero because the an token is" + }, + { + "start": 5982.04, + "text": "token zero and you have to keep in mind" + }, + { + "start": 5984.92, + "text": "that this would feed into your uh" + }, + { + "start": 5986.88, + "text": "language model so what is a language" + }, + { + "start": 5988.4, + "text": "model supposed to do when all kinds of" + }, + { + "start": 5989.92, + "text": "different things that are unrecognized" + }, + { + "start": 5992.159, + "text": "because they're rare just end up mapping" + }, + { + "start": 5994.0, + "text": "into Unk it's not exactly the property" + }, + { + "start": 5996.119, + "text": "that you want so that's why I think" + }, + { + 
"start": 5997.76, + "text": "llama correctly uh used by fallback true" + }, + { + "start": 6002.04, + "text": "uh because we definitely want to feed" + }, + { + "start": 6003.719, + "text": "these um unknown or rare code points" + }, + { + "start": 6006.04, + "text": "into the model and some uh some manner" + }, + { + "start": 6008.56, + "text": "the next thing I want to show you is the" + }, + { + "start": 6010.679, + "text": "following notice here when we are" + }, + { + "start": 6012.48, + "text": "decoding all the individual tokens you" + }, + { + "start": 6014.719, + "text": "see how spaces uh space here ends up" + }, + { + "start": 6018.04, + "text": "being this um bold underline I'm not" + }, + { + "start": 6021.239, + "text": "100% sure by the way why sentence piece" + }, + { + "start": 6023.08, + "text": "switches whites space into these bold" + }, + { + "start": 6025.36, + "text": "underscore characters maybe it's for" + }, + { + "start": 6027.639, + "text": "visualization I'm not 100% sure why that" + }, + { + "start": 6029.88, + "text": "happens uh but notice this why do we" + }, + { + "start": 6032.44, + "text": "have an extra space in the front of" + }, + { + "start": 6037.44, + "text": "hello um what where is this coming from" + }, + { + "start": 6040.48, + "text": "well it's coming from this option" + }, + { + "start": 6043.159, + "text": "here" + }, + { + "start": 6045.04, + "text": "um add dummy prefix is true and when you" + }, + { + "start": 6048.36, + "text": "go to the" + }, + { + "start": 6049.56, + "text": "documentation add D whites space at the" + }, + { + "start": 6051.88, + "text": "beginning of text in order to treat" + }, + { + "start": 6053.36, + "text": "World in world and hello world in the" + }, + { + "start": 6055.92, + "text": "exact same way so what this is trying to" + }, + { + "start": 6057.96, + "text": "do is the" + }, + { + "start": 6059.239, + "text": "following if we go back to our tick" + }, + { + "start": 6062.04, + 
"text": "tokenizer world as uh token by itself" + }, + { + "start": 6066.32, + "text": "has a different ID than space world so" + }, + { + "start": 6070.239, + "text": "we have this is 1917 but this is 14 Etc" + }, + { + "start": 6074.599, + "text": "so these are two different tokens for" + }, + { + "start": 6076.0, + "text": "the language model and the language" + }, + { + "start": 6077.4, + "text": "model has to learn from data that they" + }, + { + "start": 6078.88, + "text": "are actually kind of like a very similar" + }, + { + "start": 6080.32, + "text": "concept so to the language model in the" + }, + { + "start": 6083.0, + "text": "Tik token World um basically words in" + }, + { + "start": 6086.0, + "text": "the beginning of sentences and words in" + }, + { + "start": 6087.639, + "text": "the middle of sentences actually look" + }, + { + "start": 6089.04, + "text": "completely different um and it has to" + }, + { + "start": 6092.04, + "text": "learned that they are roughly the same" + }, + { + "start": 6094.44, + "text": "so this add dami prefix is trying to" + }, + { + "start": 6096.92, + "text": "fight that a little bit and the way that" + }, + { + "start": 6098.96, + "text": "works is that it basically" + }, + { + "start": 6101.719, + "text": "uh adds a dummy prefix so for as a as a" + }, + { + "start": 6106.76, + "text": "part of pre-processing it will take the" + }, + { + "start": 6109.08, + "text": "string and it will add a space it will" + }, + { + "start": 6111.32, + "text": "do this and that's done in an effort to" + }, + { + "start": 6114.92, + "text": "make this world and that world the same" + }, + { + "start": 6117.52, + "text": "they will both be space world so that's" + }, + { + "start": 6120.28, + "text": "one other kind of pre-processing option" + }, + { + "start": 6122.159, + "text": "that is turned on and llama 2 also uh" + }, + { + "start": 6125.28, + "text": "uses this option and that's I think" + }, + { + "start": 6127.4, + "text": 
"everything that I want to say for my" + }, + { + "start": 6128.639, + "text": "preview of sentence piece and how it is" + }, + { + "start": 6130.44, + "text": "different um maybe here what I've done" + }, + { + "start": 6133.119, + "text": "is I just uh put in the Raw protocol" + }, + { + "start": 6136.719, + "text": "buffer representation basically of the" + }, + { + "start": 6139.84, + "text": "tokenizer the too trained so feel free" + }, + { + "start": 6142.88, + "text": "to sort of Step through this and if you" + }, + { + "start": 6144.76, + "text": "would like uh your tokenization to look" + }, + { + "start": 6147.0, + "text": "identical to that of the meta uh llama 2" + }, + { + "start": 6150.32, + "text": "then you would be copy pasting these" + }, + { + "start": 6151.679, + "text": "settings as I tried to do up above and" + }, + { + "start": 6154.76, + "text": "uh yeah that's I think that's it for" + }, + { + "start": 6156.96, + "text": "this section I think my summary for" + }, + { + "start": 6158.88, + "text": "sentence piece from all of this is" + }, + { + "start": 6160.8, + "text": "number one I think that there's a lot of" + }, + { + "start": 6162.44, + "text": "historical baggage in sentence piece a" + }, + { + "start": 6164.28, + "text": "lot of Concepts that I think are" + }, + { + "start": 6165.679, + "text": "slightly confusing and I think" + }, + { + "start": 6167.239, + "text": "potentially um contain foot guns like" + }, + { + "start": 6169.4, + "text": "this concept of a sentence and it's" + }, + { + "start": 6170.8, + "text": "maximum length and stuff like that um" + }, + { + "start": 6173.719, + "text": "otherwise it is fairly commonly used in" + }, + { + "start": 6175.88, + "text": "the industry um because it is efficient" + }, + { + "start": 6178.88, + "text": "and can do both training and inference" + }, + { + "start": 6181.0, + "text": "uh it has a few quirks like for example" + }, + { + "start": 6182.76, + "text": "un token must exist 
and the way the byte" + }, + { + "start": 6185.08, + "text": "fallbacks are done and so on I don't" + }, + { + "start": 6186.56, + "text": "find particularly elegant and" + }, + { + "start": 6188.36, + "text": "unfortunately I have to say it's not" + }, + { + "start": 6189.56, + "text": "very well documented so it took me a lot" + }, + { + "start": 6191.44, + "text": "of time working with this myself um and" + }, + { + "start": 6194.76, + "text": "just visualizing things and trying to" + }, + { + "start": 6196.159, + "text": "really understand what is happening here" + }, + { + "start": 6197.8, + "text": "because uh the documentation" + }, + { + "start": 6199.28, + "text": "unfortunately is in my opinion not not" + }, + { + "start": 6201.44, + "text": "super amazing but it is a very nice repo" + }, + { + "start": 6204.679, + "text": "that is available to you if you'd like" + }, + { + "start": 6206.159, + "text": "to train your own tokenizer right now" + }, + { + "start": 6208.199, + "text": "okay let me now switch gears again as" + }, + { + "start": 6209.639, + "text": "we're starting to slowly wrap up here I" + }, + { + "start": 6211.719, + "text": "want to revisit this issue in a bit more" + }, + { + "start": 6213.36, + "text": "detail of how we should set the vocab" + }, + { + "start": 6215.32, + "text": "size and what are some of the" + }, + { + "start": 6216.199, + "text": "considerations around it so for this I'd" + }, + { + "start": 6219.639, + "text": "like to go back to the model" + }, + { + "start": 6220.84, + "text": "architecture that we developed in the" + }, + { + "start": 6222.159, + "text": "last video when we built the GPT from" + }, + { + "start": 6224.679, + "text": "scratch so this here was uh the file" + }, + { + "start": 6227.4, + "text": "that we built in the previous video and" + }, + { + "start": 6229.08, + "text": "we defined the Transformer model and and" + }, + { + "start": 6231.32, + "text": "let's specifically look at vocab size and" + },
+ { + "start": 6232.88, + "text": "where it appears in this file so here we" + }, + { + "start": 6235.199, + "text": "Define the vocab size uh at this time it" + }, + { + "start": 6238.159, + "text": "was 65 or something like that extremely" + }, + { + "start": 6239.96, + "text": "small number so this will grow much" + }, + { + "start": 6242.08, + "text": "larger you'll see that vocab size doesn't" + }, + { + "start": 6244.28, + "text": "come up too much in most of these layers" + }, + { + "start": 6246.159, + "text": "the only place that it comes up to is in" + }, + { + "start": 6248.52, + "text": "exactly these two places here so when we" + }, + { + "start": 6251.48, + "text": "Define the language model there's the" + }, + { + "start": 6253.56, + "text": "token embedding table which is this" + }, + { + "start": 6255.8, + "text": "two-dimensional array where the vocab" + }, + { + "start": 6258.08, + "text": "size is basically the number of rows and" + }, + { + "start": 6261.199, + "text": "uh each vocabulary element each token" + }, + { + "start": 6263.92, + "text": "has a vector that we're going to train" + }, + { + "start": 6265.92, + "text": "using back propagation that Vector is of" + }, + { + "start": 6267.96, + "text": "size n_embd which is number of" + }, + { + "start": 6269.44, + "text": "channels in the Transformer and" + }, + { + "start": 6271.599, + "text": "basically as vocab size increases this" + }, + { + "start": 6273.679, + "text": "embedding table as I mentioned earlier" + }, + { + "start": 6275.679, + "text": "is going to also grow we're going to be" + }, + { + "start": 6277.0, + "text": "adding rows in addition to that at the" + }, + { + "start": 6279.719, + "text": "end of the Transformer there's this LM" + }, + { + "start": 6281.88, + "text": "head layer which is a linear layer and" + }, + { + "start": 6284.239, + "text": "you'll notice that that layer is used at" + }, + { + "start": 6286.28, + "text": "the very end to produce the logits uh" +
}, + { + "start": 6288.639, + "text": "which become the probabilities for the" + }, + { + "start": 6289.96, + "text": "next token in sequence and so" + }, + { + "start": 6291.76, + "text": "intuitively we're trying to produce a" + }, + { + "start": 6293.92, + "text": "probability for every single token that" + }, + { + "start": 6296.239, + "text": "might come next at every point in time" + }, + { + "start": 6298.84, + "text": "of that Transformer and if we have more" + }, + { + "start": 6301.08, + "text": "and more tokens we need to produce more" + }, + { + "start": 6302.679, + "text": "and more probabilities so every single" + }, + { + "start": 6304.92, + "text": "token is going to introduce an" + }, + { + "start": 6306.199, + "text": "additional dot product that we have to" + }, + { + "start": 6308.159, + "text": "do here in this linear layer for this" + }, + { + "start": 6310.199, + "text": "final layer in a" + }, + { + "start": 6311.44, + "text": "Transformer so why can't vocab size be" + }, + { + "start": 6314.56, + "text": "infinite why can't we grow to Infinity" + }, + { + "start": 6316.52, + "text": "well number one your token embedding" + }, + { + "start": 6318.199, + "text": "table is going to grow uh your linear" + }, + { + "start": 6321.56, + "text": "layer is going to grow so we're going to" + }, + { + "start": 6323.599, + "text": "be doing a lot more computation here" + }, + { + "start": 6325.119, + "text": "because this LM head layer will become" + }, + { + "start": 6326.56, + "text": "more computationally expensive number two" + }, + { + "start": 6329.119, + "text": "because we have more parameters we could" + }, + { + "start": 6330.84, + "text": "be worried that we are going to be under" + }, + { + "start": 6333.44, + "text": "training some of these" + }, + { + "start": 6335.199, + "text": "parameters so intuitively if you have a" + }, + { + "start": 6337.4, + "text": "very large vocabulary size say we have a" + }, + { + "start": 6338.96, + "text": 
"million uh tokens then every one of" + }, + { + "start": 6341.32, + "text": "these tokens is going to come up more" + }, + { + "start": 6342.679, + "text": "and more rarely in the training data" + }, + { + "start": 6345.04, + "text": "because there's a lot more other tokens" + }, + { + "start": 6346.52, + "text": "all over the place and so we're going to" + }, + { + "start": 6348.56, + "text": "be seeing fewer and fewer examples uh" + }, + { + "start": 6351.0, + "text": "for each individual token and you might" + }, + { + "start": 6353.28, + "text": "be worried that basically the vectors" + }, + { + "start": 6355.0, + "text": "associated with every token will be" + }, + { + "start": 6356.28, + "text": "undertrained as a result because they" + }, + { + "start": 6358.28, + "text": "just don't come up too often and they" + }, + { + "start": 6359.92, + "text": "don't participate in the forward" + }, + { + "start": 6360.96, + "text": "backward pass in addition to that as" + }, + { + "start": 6363.199, + "text": "your vocab size grows you're going to" + }, + { + "start": 6364.88, + "text": "start shrinking your sequences a lot" + }, + { + "start": 6367.04, + "text": "right and that's really nice because" + }, + { + "start": 6369.32, + "text": "that means that we're going to be" + }, + { + "start": 6370.119, + "text": "attending to more and more text so" + }, + { + "start": 6372.0, + "text": "that's nice but also you might be" + }, + { + "start": 6373.599, + "text": "worrying that two large of chunks are" + }, + { + "start": 6375.92, + "text": "being squished into single tokens and so" + }, + { + "start": 6378.56, + "text": "the model just doesn't have as much of" + }, + { + "start": 6380.719, + "text": "time to think per sort of um some number" + }, + { + "start": 6385.08, + "text": "of characters in the text or you can" + }, + { + "start": 6386.679, + "text": "think about it that way right so" + }, + { + "start": 6388.08, + "text": "basically we're squishing too much" 
+ }, + { + "start": 6389.48, + "text": "information into a single token and then" + }, + { + "start": 6391.639, + "text": "the forward pass of the Transformer is" + }, + { + "start": 6393.04, + "text": "not enough to actually process that" + }, + { + "start": 6394.4, + "text": "information appropriately and so these" + }, + { + "start": 6396.44, + "text": "are some of the considerations you're" + }, + { + "start": 6397.48, + "text": "thinking about when you're designing the" + }, + { + "start": 6398.639, + "text": "vocab size as I mentioned this is mostly" + }, + { + "start": 6400.639, + "text": "an empirical hyperparameter and it seems" + }, + { + "start": 6402.88, + "text": "like in state-of-the-art architectures" + }, + { + "start": 6404.239, + "text": "today this is usually in the high 10,000" + }, + { + "start": 6406.76, + "text": "or somewhere around 100,000 today and" + }, + { + "start": 6409.36, + "text": "the next consideration I want to briefly" + }, + { + "start": 6410.88, + "text": "talk about is what if we want to take a" + }, + { + "start": 6413.0, + "text": "pre-trained model and we want to extend" + }, + { + "start": 6415.199, + "text": "the vocab size and this is done fairly" + }, + { + "start": 6417.36, + "text": "commonly actually so for example when" + }, + { + "start": 6418.88, + "text": "you're doing fine-tuning for chat GPT um" + }, + { + "start": 6422.159, + "text": "a lot more new special tokens get" + }, + { + "start": 6423.76, + "text": "introduced on top of the base model to" + }, + { + "start": 6425.8, + "text": "maintain the metadata and all the" + }, + { + "start": 6428.04, + "text": "structure of conversation objects" + }, + { + "start": 6429.88, + "text": "between a user and an assistant so that" + }, + { + "start": 6431.92, + "text": "takes a lot of special tokens you might" + }, + { + "start": 6434.04, + "text": "also try to throw in more special tokens" + }, + { + "start": 6435.88, + "text": "for example for using the browser or 
any" + }, + { + "start": 6437.8, + "text": "other tool and so it's very tempting to" + }, + { + "start": 6440.639, + "text": "add a lot of tokens for all kinds of" + }, + { + "start": 6442.159, + "text": "special functionality so if you want to" + }, + { + "start": 6444.52, + "text": "be adding a token that's totally" + }, + { + "start": 6445.8, + "text": "possible Right all we have to do is we" + }, + { + "start": 6447.719, + "text": "have to resize this embedding so we have" + }, + { + "start": 6449.88, + "text": "to add rows we would initialize these uh" + }, + { + "start": 6452.48, + "text": "parameters from scratch to be small" + }, + { + "start": 6454.44, + "text": "random numbers and then we have to" + }, + { + "start": 6456.119, + "text": "extend the weight inside this linear uh" + }, + { + "start": 6459.28, + "text": "so we have to start making dot products" + }, + { + "start": 6461.44, + "text": "um with the associated parameters as" + }, + { + "start": 6463.199, + "text": "well to basically calculate the" + }, + { + "start": 6464.56, + "text": "probabilities for these new tokens so" + }, + { + "start": 6466.76, + "text": "both of these are just a resizing" + }, + { + "start": 6468.639, + "text": "operation it's a very mild" + }, + { + "start": 6470.84, + "text": "model surgery and can be done fairly" + }, + { + "start": 6472.599, + "text": "easily and it's quite common that" + }, + { + "start": 6474.04, + "text": "basically you would freeze the base" + }, + { + "start": 6475.36, + "text": "model you introduce these new parameters" + }, + { + "start": 6477.44, + "text": "and then you only train these new" + }, + { + "start": 6478.639, + "text": "parameters to introduce new tokens into" + }, + { + "start": 6480.56, + "text": "the architecture um and so you can" + }, + { + "start": 6483.119, + "text": "freeze arbitrary parts of it or you can" + }, + { + "start": 6484.96, + "text": "train arbitrary parts of it and that's" + }, + { + "start": 6486.4, + 
"text": "totally up to you but basically minor" + }, + { + "start": 6488.32, + "text": "surgery required if you'd like to" + }, + { + "start": 6490.119, + "text": "introduce new tokens and finally I'd" + }, + { + "start": 6491.88, + "text": "like to mention that actually there's an" + }, + { + "start": 6493.36, + "text": "entire design space of applications in" + }, + { + "start": 6495.92, + "text": "terms of introducing new tokens into a" + }, + { + "start": 6497.639, + "text": "vocabulary that go Way Beyond just" + }, + { + "start": 6499.36, + "text": "adding special tokens and special new" + }, + { + "start": 6501.199, + "text": "functionality so just to give you a" + }, + { + "start": 6503.0, + "text": "sense of the design space but this could" + }, + { + "start": 6504.36, + "text": "be an entire video just by itself uh" + }, + { + "start": 6506.599, + "text": "this is a paper on learning to compress" + }, + { + "start": 6508.639, + "text": "prompts with what they called uh gist" + }, + { + "start": 6511.04, + "text": "tokens and the rough idea is suppose" + }, + { + "start": 6513.4, + "text": "that you're using language models in a" + }, + { + "start": 6514.679, + "text": "setting that requires very long prompts" + }, + { + "start": 6517.159, + "text": "while these long prompts just slow" + }, + { + "start": 6518.8, + "text": "everything down because you have to" + }, + { + "start": 6519.84, + "text": "encode them and then you have to use" + }, + { + "start": 6521.4, + "text": "them and then you're tending over them" + }, + { + "start": 6523.119, + "text": "and it's just um you know heavy to have" + }, + { + "start": 6525.119, + "text": "very large prompts so instead what they" + }, + { + "start": 6527.639, + "text": "do here in this paper is they introduce" + }, + { + "start": 6530.679, + "text": "new tokens and um imagine basically" + }, + { + "start": 6534.56, + "text": "having a few new tokens you put them in" + }, + { + "start": 6536.4, + "text": "a 
sequence and then you train the model" + }, + { + "start": 6539.36, + "text": "by distillation so you are keeping the" + }, + { + "start": 6541.52, + "text": "entire model Frozen and you're only" + }, + { + "start": 6543.159, + "text": "training the representations of the new" + }, + { + "start": 6545.0, + "text": "tokens their embeddings and you're" + }, + { + "start": 6546.96, + "text": "optimizing over the new tokens such that" + }, + { + "start": 6549.44, + "text": "the behavior of the language model is" + }, + { + "start": 6551.92, + "text": "identical uh to the model that has a" + }, + { + "start": 6555.04, + "text": "very long prompt that works for you and" + }, + { + "start": 6557.679, + "text": "so it's a compression technique of" + }, + { + "start": 6559.0, + "text": "compressing that very long prompt into" + }, + { + "start": 6560.8, + "text": "those few new gist tokens and so you can" + }, + { + "start": 6563.8, + "text": "train this and then at test time you can" + }, + { + "start": 6565.04, + "text": "discard your old prompt and just swap in" + }, + { + "start": 6566.719, + "text": "those tokens and they sort of like uh" + }, + { + "start": 6568.639, + "text": "stand in for that very long prompt and" + }, + { + "start": 6571.119, + "text": "have an almost identical performance and" + }, + { + "start": 6573.679, + "text": "so this is one um technique in a class" + }, + { + "start": 6576.48, + "text": "of parameter efficient fine-tuning" + }, + { + "start": 6578.0, + "text": "techniques where most of the model is" + }, + { + "start": 6579.92, + "text": "basically fixed and there's no training" + }, + { + "start": 6581.88, + "text": "of the model weights there's no training" + }, + { + "start": 6583.599, + "text": "of LoRA or anything like that of new" + }, + { + "start": 6585.44, + "text": "parameters the the parameters that" + }, + { + "start": 6587.239, + "text": "you're training are now just the uh" + }, + { + "start": 6589.119, + "text": "token 
embeddings so that's just one" + }, + { + "start": 6591.199, + "text": "example but this could again be like an" + }, + { + "start": 6592.88, + "text": "entire video but just to give you a" + }, + { + "start": 6594.52, + "text": "sense that there's a whole design space" + }, + { + "start": 6595.76, + "text": "here that is potentially worth exploring" + }, + { + "start": 6597.36, + "text": "in the future the next thing I want to" + }, + { + "start": 6599.199, + "text": "briefly address is that I think recently" + }, + { + "start": 6601.199, + "text": "there's a lot of momentum in how you" + }, + { + "start": 6603.08, + "text": "actually could construct Transformers" + }, + { + "start": 6605.08, + "text": "that can simultaneously process not just" + }, + { + "start": 6606.8, + "text": "text as the input modality but a lot of" + }, + { + "start": 6608.84, + "text": "other modalities so be it images videos" + }, + { + "start": 6611.52, + "text": "audio Etc and how do you feed in all" + }, + { + "start": 6614.28, + "text": "these modalities and potentially predict" + }, + { + "start": 6616.0, + "text": "these modalities from a Transformer uh" + }, + { + "start": 6618.84, + "text": "do you have to change the architecture" + }, + { + "start": 6619.84, + "text": "in some fundamental way and I think what" + }, + { + "start": 6621.599, + "text": "a lot of people are starting to converge" + }, + { + "start": 6623.119, + "text": "towards is that you're not changing the" + }, + { + "start": 6624.28, + "text": "architecture you stick with the" + }, + { + "start": 6625.44, + "text": "Transformer you just kind of tokenize" + }, + { + "start": 6627.56, + "text": "your input domains and then call the day" + }, + { + "start": 6629.96, + "text": "and pretend it's just text tokens and" + }, + { + "start": 6631.52, + "text": "just do everything else identical in an" + }, + { + "start": 6633.96, + "text": "identical manner so here for example" + }, + { + "start": 6636.08, + "text": 
"there was an early paper that has nice" + }, + { + "start": 6637.56, + "text": "graphic for how you can take an image" + }, + { + "start": 6639.599, + "text": "and you can chunk it into" + }, + { + "start": 6642.159, + "text": "integers um and these sometimes uh so" + }, + { + "start": 6645.4, + "text": "these will basically become the tokens" + }, + { + "start": 6646.84, + "text": "of images as an example and uh these" + }, + { + "start": 6649.56, + "text": "tokens can be uh hard tokens where you" + }, + { + "start": 6652.199, + "text": "force them to be integers they can also" + }, + { + "start": 6653.92, + "text": "be soft tokens where you uh sort of" + }, + { + "start": 6657.0, + "text": "don't require uh these to be discrete" + }, + { + "start": 6660.239, + "text": "but you do Force these representations" + }, + { + "start": 6662.159, + "text": "to go through bottlenecks like in Auto" + }, + { + "start": 6664.76, + "text": "encoders uh also in this paper that came" + }, + { + "start": 6666.92, + "text": "out from OpenAI Sora which I think" + }, + { + "start": 6668.88, + "text": "really um uh blew the mind of many" + }, + { + "start": 6671.84, + "text": "people and inspired a lot of people in" + }, + { + "start": 6673.52, + "text": "terms of what's possible they have a" + }, + { + "start": 6675.199, + "text": "Graphic here and they talk briefly about" + }, + { + "start": 6676.92, + "text": "how llms have text tokens Sora has" + }, + { + "start": 6680.159, + "text": "visual patches so again they came up" + }, + { + "start": 6682.52, + "text": "with a way to chunk videos into" + }, + { + "start": 6684.92, + "text": "basically tokens with their own" + }, + { + "start": 6686.52, + "text": "vocabularies and then you can either" + }, + { + "start": 6688.52, + "text": "process discrete tokens say with auto" + }, + { + "start": 6690.04, + "text": "regressive models or even soft tokens" + }, + { + "start": 6692.079, + "text": "with diffusion models and uh all of 
that" + }, + { + "start": 6695.239, + "text": "is sort of uh being actively worked on" + }, + { + "start": 6698.239, + "text": "designed on and is beyond the scope of" + }, + { + "start": 6699.639, + "text": "this video but just something I wanted" + }, + { + "start": 6700.88, + "text": "to mention briefly okay now that we have" + }, + { + "start": 6702.96, + "text": "come quite deep into the tokenization" + }, + { + "start": 6705.119, + "text": "algorithm and we understand a lot more" + }, + { + "start": 6706.76, + "text": "about how it works let's loop back" + }, + { + "start": 6708.92, + "text": "around to the beginning of this video" + }, + { + "start": 6710.52, + "text": "and go through some of these bullet" + }, + { + "start": 6711.599, + "text": "points and really see why they happen so" + }, + { + "start": 6714.88, + "text": "first of all why can't my llm spell" + }, + { + "start": 6716.96, + "text": "words very well or do other spell" + }, + { + "start": 6718.76, + "text": "related" + }, + { + "start": 6720.56, + "text": "tasks so fundamentally this is because" + }, + { + "start": 6722.92, + "text": "as we saw these characters are chunked" + }, + { + "start": 6725.679, + "text": "up into tokens and some of these tokens" + }, + { + "start": 6727.96, + "text": "are actually fairly long so as an" + }, + { + "start": 6730.4, + "text": "example I went to the gp4 vocabulary and" + }, + { + "start": 6732.8, + "text": "I looked at uh one of the longer tokens" + }, + { + "start": 6735.28, + "text": "so that default style turns out to be a" + }, + { + "start": 6737.88, + "text": "single individual token so that's a lot" + }, + { + "start": 6739.719, + "text": "of characters for a single token so my" + }, + { + "start": 6742.159, + "text": "suspicion is that there's just too much" + }, + { + "start": 6743.76, + "text": "crammed into this single token and my" + }, + { + "start": 6746.079, + "text": "suspicion was that the model should not" + }, + { + "start": 6747.76, 
+ "text": "be very good at tasks related to" + }, + { + "start": 6750.36, + "text": "spelling of this uh single token so I" + }, + { + "start": 6754.679, + "text": "asked how many letters L are there in" + }, + { + "start": 6757.0, + "text": "the word default style and of course my" + }, + { + "start": 6761.48, + "text": "prompt is intentionally done that way" + }, + { + "start": 6764.36, + "text": "and you see how default style will be a" + }, + { + "start": 6765.76, + "text": "single token so this is what the model" + }, + { + "start": 6767.36, + "text": "sees so my suspicion is that it wouldn't" + }, + { + "start": 6769.4, + "text": "be very good at this and indeed it is" + }, + { + "start": 6771.32, + "text": "not it doesn't actually know how many" + }, + { + "start": 6773.159, + "text": "L's are in there it thinks there are" + }, + { + "start": 6774.639, + "text": "three and actually there are four if I'm" + }, + { + "start": 6777.0, + "text": "not getting this wrong myself so that" + }, + { + "start": 6779.639, + "text": "didn't go extremely well let's look look" + }, + { + "start": 6782.32, + "text": "at another kind of uh character level" + }, + { + "start": 6784.599, + "text": "task so for example here I asked uh gp4" + }, + { + "start": 6788.4, + "text": "to reverse the string default style and" + }, + { + "start": 6791.159, + "text": "they tried to use a code interpreter and" + }, + { + "start": 6793.199, + "text": "I stopped it and I said just do it just" + }, + { + "start": 6795.44, + "text": "try it and uh it gave me jumble so it" + }, + { + "start": 6799.56, + "text": "doesn't actually really know how to" + }, + { + "start": 6801.44, + "text": "reverse this string going from right to" + }, + { + "start": 6803.76, + "text": "left uh so it gave a wrong result so" + }, + { + "start": 6806.76, + "text": "again like working with this working" + }, + { + "start": 6808.32, + "text": "hypothesis that maybe this is due to the" + }, + { + "start": 6810.0, + 
"text": "tokenization I tried a different" + }, + { + "start": 6811.84, + "text": "approach I said okay let's reverse the" + }, + { + "start": 6814.119, + "text": "exact same string but take the following" + }, + { + "start": 6816.44, + "text": "approach step one just print out every" + }, + { + "start": 6818.679, + "text": "single character separated by spaces and" + }, + { + "start": 6820.719, + "text": "then as a step two reverse that list and" + }, + { + "start": 6823.28, + "text": "it again Tred to use a tool but when I" + }, + { + "start": 6824.8, + "text": "stopped it it uh first uh produced all" + }, + { + "start": 6827.76, + "text": "the characters and that was actually" + }, + { + "start": 6828.92, + "text": "correct and then It reversed them and" + }, + { + "start": 6830.92, + "text": "that was correct once it had this so" + }, + { + "start": 6833.04, + "text": "somehow it can't reverse it directly but" + }, + { + "start": 6834.88, + "text": "when you go just first uh you know" + }, + { + "start": 6837.4, + "text": "listing it out in order it can do that" + }, + { + "start": 6839.28, + "text": "somehow and then it can once it's uh" + }, + { + "start": 6841.88, + "text": "broken up this way this becomes all" + }, + { + "start": 6843.88, + "text": "these individual characters and so now" + }, + { + "start": 6846.04, + "text": "this is much easier for it to see these" + }, + { + "start": 6847.88, + "text": "individual tokens and reverse them and" + }, + { + "start": 6850.079, + "text": "print them out so that is kind of" + }, + { + "start": 6853.52, + "text": "interesting so let's continue now why" + }, + { + "start": 6856.84, + "text": "are llms worse at uh non-english langu" + }, + { + "start": 6860.4, + "text": "and I briefly covered this already but" + }, + { + "start": 6862.679, + "text": "basically um it's not only that the" + }, + { + "start": 6864.88, + "text": "language model sees less non-english" + }, + { + "start": 6867.159, + "text": "data 
during training of the model" + }, + { + "start": 6868.76, + "text": "parameters but also the tokenizer is not" + }, + { + "start": 6871.639, + "text": "um is not sufficiently trained on" + }, + { + "start": 6874.639, + "text": "non-english data and so here for example" + }, + { + "start": 6877.28, + "text": "hello how are you is five tokens and its" + }, + { + "start": 6880.52, + "text": "translation is 15 tokens so this is a" + }, + { + "start": 6882.88, + "text": "three times blow up and so for example" + }, + { + "start": 6885.8, + "text": "anang is uh just hello basically in" + }, + { + "start": 6888.639, + "text": "Korean and that end up being three" + }, + { + "start": 6890.32, + "text": "tokens I'm actually kind of surprised by" + }, + { + "start": 6891.8, + "text": "that because that is a very common" + }, + { + "start": 6893.119, + "text": "phrase there just the typical greeting" + }, + { + "start": 6895.159, + "text": "of like hello and that ends up being" + }, + { + "start": 6897.0, + "text": "three tokens whereas our hello is a" + }, + { + "start": 6898.76, + "text": "single token and so basically everything" + }, + { + "start": 6900.56, + "text": "is a lot more bloated and diffuse and" + }, + { + "start": 6902.32, + "text": "this is I think partly the reason that" + }, + { + "start": 6904.079, + "text": "the model Works worse on other" + }, + { + "start": 6907.0, + "text": "languages uh coming back why is LM bad" + }, + { + "start": 6910.04, + "text": "at simple arithmetic um that has to do" + }, + { + "start": 6913.159, + "text": "with the tokenization of numbers and so" + }, + { + "start": 6917.36, + "text": "um you'll notice that for example" + }, + { + "start": 6919.079, + "text": "addition is very sort of" + }, + { + "start": 6920.96, + "text": "like uh there's an algorithm that is" + }, + { + "start": 6923.079, + "text": "like character level for doing addition" + }, + { + "start": 6925.719, + "text": "so for example here we would first add" + 
}, + { + "start": 6927.639, + "text": "the ones and then the tens and then the" + }, + { + "start": 6929.199, + "text": "hundreds you have to refer to specific" + }, + { + "start": 6931.079, + "text": "parts of these digits but uh these" + }, + { + "start": 6934.719, + "text": "numbers are represented completely" + }, + { + "start": 6936.199, + "text": "arbitrarily based on whatever happened" + }, + { + "start": 6937.679, + "text": "to merge or not merge during the" + }, + { + "start": 6939.28, + "text": "tokenization process there's an entire" + }, + { + "start": 6941.44, + "text": "blog post about this that I think is" + }, + { + "start": 6942.84, + "text": "quite good integer tokenization is" + }, + { + "start": 6944.719, + "text": "insane and this person basically" + }, + { + "start": 6946.679, + "text": "systematically explores the tokenization" + }, + { + "start": 6948.719, + "text": "of numbers in I believe this is gpt2 and" + }, + { + "start": 6952.04, + "text": "so they notice that for example for the" + }, + { + "start": 6953.76, + "text": "for um four-digit numbers you can take a" + }, + { + "start": 6957.28, + "text": "look at whether it is uh a single token" + }, + { + "start": 6960.199, + "text": "or whether it is two tokens that is a 1" + }, + { + "start": 6962.119, + "text": "three or a 2 two or a 31 combination and" + }, + { + "start": 6964.92, + "text": "so all the different numbers are all the" + }, + { + "start": 6966.56, + "text": "different combinations and you can" + }, + { + "start": 6968.04, + "text": "imagine this is all completely" + }, + { + "start": 6969.199, + "text": "arbitrarily so and the model" + }, + { + "start": 6971.28, + "text": "unfortunately sometimes sees uh four um" + }, + { + "start": 6974.159, + "text": "a token for for all four digits" + }, + { + "start": 6976.599, + "text": "sometimes for three sometimes for two" + }, + { + "start": 6978.04, + "text": "sometimes for one and it's in an" + }, + { + "start": 6980.0, + 
"text": "arbitrary uh Manner and so this is" + }, + { + "start": 6982.52, + "text": "definitely a headwind if you will for" + }, + { + "start": 6985.0, + "text": "the language model and it's kind of" + }, + { + "start": 6986.36, + "text": "incredible that it can kind of do it and" + }, + { + "start": 6987.92, + "text": "deal with it but it's also kind of not" + }, + { + "start": 6990.119, + "text": "ideal and so that's why for example we" + }, + { + "start": 6992.0, + "text": "saw that meta when they train the Llama" + }, + { + "start": 6994.199, + "text": "2 algorithm and they use sentence piece" + }, + { + "start": 6996.44, + "text": "they make sure to split up all the um" + }, + { + "start": 6999.52, + "text": "all the digits as an example for uh" + }, + { + "start": 7002.32, + "text": "llama 2 and this is partly to improve a" + }, + { + "start": 7004.88, + "text": "simple arithmetic kind of" + }, + { + "start": 7006.92, + "text": "performance and finally why is gpt2 not" + }, + { + "start": 7010.52, + "text": "as good in Python again this is partly a" + }, + { + "start": 7012.92, + "text": "modeling issue on in the architecture" + }, + { + "start": 7014.88, + "text": "and the data set and the strength of the" + }, + { + "start": 7016.639, + "text": "model but it's also partially" + }, + { + "start": 7018.199, + "text": "tokenization because as we saw here with" + }, + { + "start": 7020.32, + "text": "the simple python example the encoding" + }, + { + "start": 7023.04, + "text": "efficiency of the tokenizer for handling" + }, + { + "start": 7025.199, + "text": "spaces in Python is terrible and every" + }, + { + "start": 7027.36, + "text": "single space is an individual token and" + }, + { + "start": 7029.44, + "text": "this dramatically reduces the context" + }, + { + "start": 7031.079, + "text": "length that the model can attend to" + }, + { + "start": 7032.52, + "text": "cross so that's almost like a" + }, + { + "start": 7034.079, + "text": "tokenization bug 
for gpt2 and that was" + }, + { + "start": 7036.8, + "text": "later fixed with gpt4 okay so here's" + }, + { + "start": 7040.0, + "text": "another fun one my llm abruptly halts" + }, + { + "start": 7042.52, + "text": "when it sees the string end of text so" + }, + { + "start": 7045.28, + "text": "here's um here's a very strange Behavior" + }, + { + "start": 7048.04, + "text": "print a string end of text is what I" + }, + { + "start": 7050.079, + "text": "told gpt4 and it says could you please" + }, + { + "start": 7052.239, + "text": "specify the string and I'm I'm telling" + }, + { + "start": 7055.119, + "text": "it give me end of text and it seems like" + }, + { + "start": 7057.159, + "text": "there's an issue it's not seeing end of" + }, + { + "start": 7059.239, + "text": "text and then I give it end of text is" + }, + { + "start": 7061.599, + "text": "the string and then here's a string and" + }, + { + "start": 7064.239, + "text": "then it just doesn't print it so" + }, + { + "start": 7065.84, + "text": "obviously something is breaking here" + }, + { + "start": 7067.119, + "text": "with respect to the handling of the" + }, + { + "start": 7068.32, + "text": "special token and I don't actually know" + }, + { + "start": 7070.199, + "text": "what OpenAI is doing under the hood" + }, + { + "start": 7072.639, + "text": "here and whether they are potentially" + }, + { + "start": 7074.52, + "text": "parsing this as an um as an actual token" + }, + { + "start": 7078.96, + "text": "instead of this just being uh end of" + }, + { + "start": 7081.159, + "text": "text um as like individual sort of" + }, + { + "start": 7084.599, + "text": "pieces of it without the special token" + }, + { + "start": 7086.44, + "text": "handling logic and so it might be that" + }, + { + "start": 7089.52, + "text": "someone when they're calling .encode" + }, + { + "start": 7091.76, + "text": "uh they are passing in the allowed" + }, + { + "start": 7093.36, + "text": "special and they are 
allowing end of" + }, + { + "start": 7096.199, + "text": "text as a special character in the user" + }, + { + "start": 7098.36, + "text": "prompt but the user prompt of course is" + }, + { + "start": 7100.84, + "text": "is a sort of um attacker controlled text" + }, + { + "start": 7103.52, + "text": "so you would hope that they don't really" + }, + { + "start": 7105.32, + "text": "parse or use special tokens or you know" + }, + { + "start": 7108.76, + "text": "from that kind of input but it appears" + }, + { + "start": 7110.599, + "text": "that there's something definitely going" + }, + { + "start": 7111.76, + "text": "wrong here and um so your knowledge of" + }, + { + "start": 7114.8, + "text": "these special tokens ends up being in a" + }, + { + "start": 7116.4, + "text": "tax surface potentially and so if you'd" + }, + { + "start": 7118.88, + "text": "like to confuse llms then just um try to" + }, + { + "start": 7123.0, + "text": "give them some special tokens and see if" + }, + { + "start": 7124.32, + "text": "you're breaking something by chance okay" + }, + { + "start": 7126.4, + "text": "so this next one is a really fun one uh" + }, + { + "start": 7129.48, + "text": "the trailing whites space issue so if" + }, + { + "start": 7132.88, + "text": "you come to playground and uh we come" + }, + { + "start": 7136.0, + "text": "here to GPT 3.5 turbo instruct so this" + }, + { + "start": 7138.44, + "text": "is not a chat model this is a completion" + }, + { + "start": 7140.32, + "text": "model so think of it more like it's a" + }, + { + "start": 7142.88, + "text": "lot more closer to a base model it does" + }, + { + "start": 7145.28, + "text": "completion it will continue the token" + }, + { + "start": 7147.599, + "text": "sequence so here's a tagline for ice" + }, + { + "start": 7149.88, + "text": "cream shop and we want to continue the" + }, + { + "start": 7151.639, + "text": "sequence and so we can submit and get a" + }, + { + "start": 7154.239, + "text": "bunch 
of tokens okay no problem but now" + }, + { + "start": 7158.239, + "text": "suppose I do this but instead of" + }, + { + "start": 7160.84, + "text": "pressing submit here I do here's a" + }, + { + "start": 7163.119, + "text": "tagline for ice cream shop space so I" + }, + { + "start": 7166.0, + "text": "have a space here before I click" + }, + { + "start": 7168.96, + "text": "submit we get a warning your text ends" + }, + { + "start": 7171.84, + "text": "in a trail Ling space which causes worse" + }, + { + "start": 7173.4, + "text": "performance due to how API splits text" + }, + { + "start": 7175.84, + "text": "into tokens so what's happening here it" + }, + { + "start": 7178.239, + "text": "still gave us a uh sort of completion" + }, + { + "start": 7180.56, + "text": "here but let's take a look at what's" + }, + { + "start": 7182.8, + "text": "happening so here's a tagline for an ice" + }, + { + "start": 7184.88, + "text": "cream shop and then what does this look" + }, + { + "start": 7188.679, + "text": "like in the actual actual training data" + }, + { + "start": 7190.159, + "text": "suppose you found the completion in the" + }, + { + "start": 7192.28, + "text": "training document somewhere on the" + }, + { + "start": 7193.56, + "text": "internet and the llm trained on this" + }, + { + "start": 7195.679, + "text": "data so maybe it's something like oh" + }, + { + "start": 7198.32, + "text": "yeah maybe that's the tagline that's a" + }, + { + "start": 7200.4, + "text": "terrible tagline but notice here that" + }, + { + "start": 7202.76, + "text": "when I create o you see that because" + }, + { + "start": 7205.76, + "text": "there's the the space character is" + }, + { + "start": 7207.8, + "text": "always a prefix to these tokens in GPT" + }, + { + "start": 7211.159, + "text": "so it's not an O token it's a space o" + }, + { + "start": 7213.48, + "text": "token the space is part of the O and" + }, + { + "start": 7216.76, + "text": "together they are token 8840 
that's" + }, + { + "start": 7219.239, + "text": "that's space o so what's What's" + }, + { + "start": 7221.92, + "text": "Happening Here is that when I just have" + }, + { + "start": 7224.119, + "text": "it like this and I let it complete the" + }, + { + "start": 7227.04, + "text": "next token it can sample the space o" + }, + { + "start": 7230.04, + "text": "token but instead if I have this and I" + }, + { + "start": 7232.599, + "text": "add my space then what I'm doing here" + }, + { + "start": 7234.76, + "text": "when I incode this string is I have" + }, + { + "start": 7237.639, + "text": "basically here's a t line for an ice" + }, + { + "start": 7239.079, + "text": "cream uh shop and this space at the very" + }, + { + "start": 7242.0, + "text": "end becomes a token" + }, + { + "start": 7244.079, + "text": "220 and so we've added token 220 and" + }, + { + "start": 7247.84, + "text": "this token otherwise would be part of" + }, + { + "start": 7249.76, + "text": "the tagline because if there actually is" + }, + { + "start": 7251.88, + "text": "a tagline here so space o is the token" + }, + { + "start": 7255.239, + "text": "and so this is suddenly a of" + }, + { + "start": 7257.32, + "text": "distribution for the model because this" + }, + { + "start": 7259.679, + "text": "space is part of the next token but" + }, + { + "start": 7261.52, + "text": "we're putting it here like this and the" + }, + { + "start": 7264.04, + "text": "model has seen very very little data of" + }, + { + "start": 7267.199, + "text": "actual Space by itself and we're asking" + }, + { + "start": 7270.079, + "text": "it to complete the sequence like add in" + }, + { + "start": 7271.719, + "text": "more tokens but the problem is that" + }, + { + "start": 7273.48, + "text": "we've sort of begun the first token and" + }, + { + "start": 7276.36, + "text": "now it's been split up and now we're out" + }, + { + "start": 7278.76, + "text": "of this distribution and now arbitrary" + }, + { + "start": 
7280.76, + "text": "bad things happen and it's just a very" + }, + { + "start": 7283.04, + "text": "rare example for it to see something" + }, + { + "start": 7284.56, + "text": "like that and uh that's why we get the" + }, + { + "start": 7286.92, + "text": "warning so the fundamental issue here is" + }, + { + "start": 7289.119, + "text": "of course that um the llm is on top of" + }, + { + "start": 7292.44, + "text": "these tokens and these tokens are text" + }, + { + "start": 7294.599, + "text": "chunks they're not characters in a way" + }, + { + "start": 7296.56, + "text": "you and I would think of them they are" + }, + { + "start": 7298.199, + "text": "these are the atoms of what the LM is" + }, + { + "start": 7300.36, + "text": "seeing and there's a bunch of weird" + }, + { + "start": 7301.8, + "text": "stuff that comes out of it let's go back" + }, + { + "start": 7303.639, + "text": "to our default cell style I bet you that" + }, + { + "start": 7308.0, + "text": "the model has never in its training set" + }, + { + "start": 7309.96, + "text": "seen default cell sta without Le in" + }, + { + "start": 7314.199, + "text": "there it's always seen this as a single" + }, + { + "start": 7316.599, + "text": "group because uh this is some kind of a" + }, + { + "start": 7319.239, + "text": "function in um I'm guess I don't" + }, + { + "start": 7322.0, + "text": "actually know what this is part of this" + }, + { + "start": 7323.079, + "text": "is some kind of API but I bet you that" + }, + { + "start": 7325.119, + "text": "it's never seen this combination of" + }, + { + "start": 7327.079, + "text": "tokens uh in its training data because" + }, + { + "start": 7330.639, + "text": "or I think it would be extremely rare so" + }, + { + "start": 7332.36, + "text": "I took this and I copy pasted it here" + }, + { + "start": 7334.719, + "text": "and I had I tried to complete from it" + }, + { + "start": 7337.48, + "text": "and the it immediately gave me a big" + }, + { + "start": 
7339.199, + "text": "error and it said the model predicted to" + }, + { + "start": 7341.079, + "text": "completion that begins with a stop" + }, + { + "start": 7342.32, + "text": "sequence resulting in no output consider" + }, + { + "start": 7344.159, + "text": "adjusting your prompt or stop sequences" + }, + { + "start": 7346.36, + "text": "so what happened here when I clicked" + }, + { + "start": 7347.639, + "text": "submit is that immediately the model" + }, + { + "start": 7350.199, + "text": "emitted and sort of like end of text" + }, + { + "start": 7352.239, + "text": "token I think or something like that it" + }, + { + "start": 7354.44, + "text": "basically predicted the stop sequence" + }, + { + "start": 7356.44, + "text": "immediately so it had no completion and" + }, + { + "start": 7358.76, + "text": "so this is why I'm getting a warning" + }, + { + "start": 7360.199, + "text": "again because we're off the data" + }, + { + "start": 7362.159, + "text": "distribution and the model is just uh" + }, + { + "start": 7365.119, + "text": "predicting just totally arbitrary things" + }, + { + "start": 7367.639, + "text": "it's just really confused basically this" + }, + { + "start": 7369.44, + "text": "is uh this is giving it brain damage" + }, + { + "start": 7370.92, + "text": "it's never seen this before it's shocked" + }, + { + "start": 7373.32, + "text": "and it's predicting end of text or" + }, + { + "start": 7374.56, + "text": "something I tried it again here and it" + }, + { + "start": 7377.04, + "text": "in this case it completed it but then" + }, + { + "start": 7379.079, + "text": "for some reason this request May violate" + }, + { + "start": 7381.44, + "text": "our usage policies this was" + }, + { + "start": 7383.639, + "text": "flagged um basically something just like" + }, + { + "start": 7386.639, + "text": "goes wrong and there's something like" + }, + { + "start": 7387.679, + "text": "Jank you can just feel the Jank because" + }, + { + "start": 
7389.52, + "text": "the model is like extremely unhappy with" + }, + { + "start": 7391.4, + "text": "just this and it doesn't know how to" + }, + { + "start": 7392.96, + "text": "complete it because it's never occurred" + }, + { + "start": 7394.159, + "text": "in training set in a training set it" + }, + { + "start": 7396.199, + "text": "always appears like this and becomes a" + }, + { + "start": 7398.32, + "text": "single token" + }, + { + "start": 7400.04, + "text": "so these kinds of issues where tokens" + }, + { + "start": 7401.96, + "text": "are either you sort of like complete the" + }, + { + "start": 7404.239, + "text": "first character of the next token or you" + }, + { + "start": 7406.76, + "text": "are sort of you have long tokens that" + }, + { + "start": 7408.56, + "text": "you then have just some of the" + }, + { + "start": 7409.8, + "text": "characters off all of these are kind of" + }, + { + "start": 7412.32, + "text": "like issues with partial tokens is how I" + }, + { + "start": 7415.36, + "text": "would describe it and if you actually" + }, + { + "start": 7417.76, + "text": "dig into the T token" + }, + { + "start": 7419.8, + "text": "repository go to the rust code and" + }, + { + "start": 7421.96, + "text": "search for" + }, + { + "start": 7424.159, + "text": "unstable and you'll see um en code" + }, + { + "start": 7427.079, + "text": "unstable native unstable token tokens" + }, + { + "start": 7429.239, + "text": "and a lot of like special case handling" + }, + { + "start": 7431.52, + "text": "none of this stuff about unstable tokens" + }, + { + "start": 7433.4, + "text": "is documented anywhere but there's a ton" + }, + { + "start": 7435.48, + "text": "of code dealing with unstable tokens and" + }, + { + "start": 7438.36, + "text": "unstable tokens is exactly kind of like" + }, + { + "start": 7440.8, + "text": "what I'm describing here what you would" + }, + { + "start": 7442.76, + "text": "like out of a completion API is" + }, + { + "start": 
7445.239, + "text": "something a lot more fancy like if we're" + }, + { + "start": 7446.599, + "text": "putting in default cell sta if we're" + }, + { + "start": 7448.96, + "text": "asking for the next token sequence we're" + }, + { + "start": 7450.679, + "text": "not actually trying to append the next" + }, + { + "start": 7452.239, + "text": "token exactly after this list we're" + }, + { + "start": 7454.639, + "text": "actually trying to append we're trying" + }, + { + "start": 7456.48, + "text": "to consider lots of tokens um" + }, + { + "start": 7459.52, + "text": "that if we were or I guess like we're" + }, + { + "start": 7462.159, + "text": "trying to search over characters that if" + }, + { + "start": 7465.76, + "text": "we retened would be of high probability" + }, + { + "start": 7468.159, + "text": "if that makes sense um so that we can" + }, + { + "start": 7470.679, + "text": "actually add a single individual" + }, + { + "start": 7472.32, + "text": "character uh instead of just like adding" + }, + { + "start": 7474.48, + "text": "the next full token that comes after" + }, + { + "start": 7476.679, + "text": "this partial token list so I this is" + }, + { + "start": 7479.36, + "text": "very tricky to describe and I invite you" + }, + { + "start": 7481.32, + "text": "to maybe like look through this it ends" + }, + { + "start": 7483.04, + "text": "up being extremely gnarly and hairy kind" + }, + { + "start": 7484.679, + "text": "of topic it and it comes from" + }, + { + "start": 7486.36, + "text": "tokenization fundamentally so um maybe I" + }, + { + "start": 7489.4, + "text": "can even spend an entire video talking" + }, + { + "start": 7490.8, + "text": "about unstable tokens sometime in the" + }, + { + "start": 7492.119, + "text": "future okay and I'm really saving the" + }, + { + "start": 7494.199, + "text": "best for last my favorite one by far is" + }, + { + "start": 7496.599, + "text": "the solid gold" + }, + { + "start": 7499.199, + "text": "Magikarp 
and it just okay so this comes" + }, + { + "start": 7501.36, + "text": "from this blog post uh solid gold" + }, + { + "start": 7503.639, + "text": "Magikarp and uh this is um internet" + }, + { + "start": 7507.0, + "text": "famous now for those of us in llms and" + }, + { + "start": 7510.079, + "text": "basically I I would advise you to uh" + }, + { + "start": 7511.84, + "text": "read this block Post in full but" + }, + { + "start": 7513.679, + "text": "basically what this person was doing is" + }, + { + "start": 7516.559, + "text": "this person went to the um" + }, + { + "start": 7519.239, + "text": "token embedding stable and clustered the" + }, + { + "start": 7522.32, + "text": "tokens based on their embedding" + }, + { + "start": 7524.8, + "text": "representation and this person noticed" + }, + { + "start": 7527.28, + "text": "that there's a cluster of tokens that" + }, + { + "start": 7529.239, + "text": "look really strange so there's a cluster" + }, + { + "start": 7531.159, + "text": "here at rot e stream Fame solid gold" + }, + { + "start": 7534.079, + "text": "Magikarp Signet message like really" + }, + { + "start": 7536.0, + "text": "weird tokens in uh basically in this" + }, + { + "start": 7539.96, + "text": "embedding cluster and so what are these" + }, + { + "start": 7542.239, + "text": "tokens and where do they even come from" + }, + { + "start": 7543.679, + "text": "like what is solid gold magikarpet makes" + }, + { + "start": 7545.4, + "text": "no sense and then they found bunch of" + }, + { + "start": 7548.96, + "text": "these" + }, + { + "start": 7550.199, + "text": "tokens and then they notice that" + }, + { + "start": 7552.119, + "text": "actually the plot thickens here because" + }, + { + "start": 7553.559, + "text": "if you ask the model about these tokens" + }, + { + "start": 7556.04, + "text": "like you ask it uh some very benign" + }, + { + "start": 7558.639, + "text": "question like please can you repeat back" + }, + { + "start": 7560.199, 
+ "text": "to me the string sold gold Magikarp uh" + }, + { + "start": 7562.96, + "text": "then you get a variety of basically" + }, + { + "start": 7564.8, + "text": "totally broken llm Behavior so either" + }, + { + "start": 7567.76, + "text": "you get evasion so I'm sorry I can't" + }, + { + "start": 7569.84, + "text": "hear you or you get a bunch of" + }, + { + "start": 7571.4, + "text": "hallucinations as a response um you can" + }, + { + "start": 7574.559, + "text": "even get back like insults so you ask it" + }, + { + "start": 7577.28, + "text": "uh about streamer bot it uh tells the" + }, + { + "start": 7580.0, + "text": "and the model actually just calls you" + }, + { + "start": 7582.04, + "text": "names uh or it kind of comes up with" + }, + { + "start": 7584.159, + "text": "like weird humor like you're actually" + }, + { + "start": 7586.239, + "text": "breaking the model by asking about these" + }, + { + "start": 7588.48, + "text": "very simple strings like at Roth and" + }, + { + "start": 7590.52, + "text": "sold gold Magikarp so like what the hell" + }, + { + "start": 7592.84, + "text": "is happening and there's a variety of" + }, + { + "start": 7594.48, + "text": "here documented behaviors uh there's a" + }, + { + "start": 7597.079, + "text": "bunch of tokens not just so good" + }, + { + "start": 7598.48, + "text": "Magikarp that have that kind of a" + }, + { + "start": 7600.28, + "text": "behavior and so basically there's a" + }, + { + "start": 7602.119, + "text": "bunch of like trigger words and if you" + }, + { + "start": 7604.159, + "text": "ask the model about these trigger words" + }, + { + "start": 7606.04, + "text": "or you just include them in your prompt" + }, + { + "start": 7608.04, + "text": "the model goes haywire and has all kinds" + }, + { + "start": 7610.0, + "text": "of uh really Strange Behaviors including" + }, + { + "start": 7612.8, + "text": "sort of ones that violate typical safety" + }, + { + "start": 7614.84, + "text": 
"guidelines uh and the alignment of the" + }, + { + "start": 7617.0, + "text": "model like it's swearing back at you so" + }, + { + "start": 7619.84, + "text": "what is happening here and how can this" + }, + { + "start": 7621.76, + "text": "possibly be true well this again comes" + }, + { + "start": 7624.559, + "text": "down to tokenization so what's happening" + }, + { + "start": 7626.719, + "text": "here is that sold gold Magikarp if you" + }, + { + "start": 7628.76, + "text": "actually dig into it is a Reddit user so" + }, + { + "start": 7631.719, + "text": "there's a u Sol gold" + }, + { + "start": 7634.04, + "text": "Magikarp and probably what happened here" + }, + { + "start": 7636.8, + "text": "even though I I don't know that this has" + }, + { + "start": 7638.0, + "text": "been like really definitively explored" + }, + { + "start": 7640.44, + "text": "but what is thought to have happened is" + }, + { + "start": 7643.159, + "text": "that the tokenization data set was very" + }, + { + "start": 7645.559, + "text": "different from the training data set for" + }, + { + "start": 7648.0, + "text": "the actual language model so in the" + }, + { + "start": 7649.92, + "text": "tokenization data set there was a ton of" + }, + { + "start": 7651.52, + "text": "redded data potentially where the user" + }, + { + "start": 7654.599, + "text": "solid gold Magikarp was mentioned in the" + }, + { + "start": 7656.4, + "text": "text because solid gold Magikarp was a" + }, + { + "start": 7659.199, + "text": "very common um sort of uh person who" + }, + { + "start": 7661.679, + "text": "would post a lot uh this would be a" + }, + { + "start": 7663.679, + "text": "string that occurs many times in a" + }, + { + "start": 7665.28, + "text": "tokenization data set because it occurs" + }, + { + "start": 7668.0, + "text": "many times in a tokenization data set" + }, + { + "start": 7670.0, + "text": "these tokens would end up getting merged" + }, + { + "start": 7671.48, + "text": "to the 
single individual token for that" + }, + { + "start": 7673.52, + "text": "single Reddit user sold gold Magikarp so" + }, + { + "start": 7676.4, + "text": "they would have a dedicated token in a" + }, + { + "start": 7678.36, + "text": "vocabulary of was it 50,000 tokens in" + }, + { + "start": 7680.719, + "text": "gpd2 that is devoted to that Reddit user" + }, + { + "start": 7684.119, + "text": "and then what happens is the" + }, + { + "start": 7685.599, + "text": "tokenization data set has those strings" + }, + { + "start": 7688.599, + "text": "but then later when you train the model" + }, + { + "start": 7690.92, + "text": "the language model itself um this data" + }, + { + "start": 7693.92, + "text": "from Reddit was not present and so" + }, + { + "start": 7696.679, + "text": "therefore in the entire training set for" + }, + { + "start": 7698.8, + "text": "the language model sold gold Magikarp" + }, + { + "start": 7701.28, + "text": "never occurs that token never appears in" + }, + { + "start": 7704.32, + "text": "the training set for the actual language" + }, + { + "start": 7705.84, + "text": "model later so this token never gets" + }, + { + "start": 7708.92, + "text": "activated it's initialized at random in" + }, + { + "start": 7711.04, + "text": "the beginning of optimization then you" + }, + { + "start": 7712.88, + "text": "have forward backward passes and updates" + }, + { + "start": 7714.48, + "text": "to the model and this token is just" + }, + { + "start": 7716.0, + "text": "never updated in the embedding table" + }, + { + "start": 7717.92, + "text": "that row Vector never gets sampled it" + }, + { + "start": 7720.0, + "text": "never gets used so it never gets trained" + }, + { + "start": 7722.04, + "text": "and it's completely untrained it's kind" + }, + { + "start": 7723.88, + "text": "of like unallocated memory in a typical" + }, + { + "start": 7726.4, + "text": "binary program written in C or something" + }, + { + "start": 7728.159, + "text": "like 
that that so it's unallocated" + }, + { + "start": 7730.0, + "text": "memory and then at test time if you" + }, + { + "start": 7731.84, + "text": "evoke this token then you're basically" + }, + { + "start": 7734.28, + "text": "plucking out a row of the embedding" + }, + { + "start": 7735.639, + "text": "table that is completely untrained and" + }, + { + "start": 7737.32, + "text": "that feeds into a Transformer and" + }, + { + "start": 7738.92, + "text": "creates undefined behavior and that's" + }, + { + "start": 7740.96, + "text": "what we're seeing here this completely" + }, + { + "start": 7742.159, + "text": "undefined never before seen in a" + }, + { + "start": 7743.88, + "text": "training behavior and so any of these" + }, + { + "start": 7746.559, + "text": "kind of like weird tokens would evoke" + }, + { + "start": 7748.0, + "text": "this Behavior because fundamentally the" + }, + { + "start": 7749.32, + "text": "model is um is uh uh out of sample out" + }, + { + "start": 7754.48, + "text": "of distribution okay and the very last" + }, + { + "start": 7756.76, + "text": "thing I wanted to just briefly mention" + }, + { + "start": 7758.52, + "text": "point out although I think a lot of" + }, + { + "start": 7759.679, + "text": "people are quite aware of this is that" + }, + { + "start": 7761.639, + "text": "different kinds of formats and different" + }, + { + "start": 7763.159, + "text": "representations and different languages" + }, + { + "start": 7765.0, + "text": "and so on might be more or less" + }, + { + "start": 7766.88, + "text": "efficient with GPD tokenizers uh or any" + }, + { + "start": 7769.8, + "text": "tokenizers for any other L for that" + }, + { + "start": 7771.4, + "text": "matter so for example Json is actually" + }, + { + "start": 7773.559, + "text": "really dense in tokens and yaml is a lot" + }, + { + "start": 7776.32, + "text": "more efficient in tokens um so for" + }, + { + "start": 7779.239, + "text": "example this are these are the same 
in" + }, + { + "start": 7781.32, + "text": "Json and in yaml the Json is" + }, + { + "start": 7784.599, + "text": "116 and the yaml is 99 so quite a bit of" + }, + { + "start": 7788.119, + "text": "an Improvement and so in the token" + }, + { + "start": 7791.639, + "text": "economy where we are paying uh per token" + }, + { + "start": 7793.639, + "text": "in many ways and you are paying in the" + }, + { + "start": 7795.679, + "text": "context length and you're paying in um" + }, + { + "start": 7797.639, + "text": "dollar amount for uh the cost of" + }, + { + "start": 7799.88, + "text": "processing all this kind of structured" + }, + { + "start": 7801.199, + "text": "data when you have to um so prefer to" + }, + { + "start": 7803.52, + "text": "use theal over Json and in general kind" + }, + { + "start": 7806.079, + "text": "of like the tokenization density is" + }, + { + "start": 7807.599, + "text": "something that you have to um sort of" + }, + { + "start": 7809.84, + "text": "care about and worry about at all times" + }, + { + "start": 7811.679, + "text": "and try to find efficient encoding" + }, + { + "start": 7813.4, + "text": "schemes and spend a lot of time in tick" + }, + { + "start": 7815.4, + "text": "tokenizer and measure the different" + }, + { + "start": 7816.88, + "text": "token efficiencies of different formats" + }, + { + "start": 7818.92, + "text": "and settings and so on okay so that" + }, + { + "start": 7821.0, + "text": "concludes my fairly long video on" + }, + { + "start": 7823.36, + "text": "tokenization I know it's a try I know" + }, + { + "start": 7825.96, + "text": "it's annoying I know it's irritating I" + }, + { + "start": 7828.44, + "text": "personally really dislike the stage what" + }, + { + "start": 7830.88, + "text": "I do have to say at this point is don't" + }, + { + "start": 7832.599, + "text": "brush it off there's a lot of foot guns" + }, + { + "start": 7834.96, + "text": "sharp edges here security issues uh AI" + }, + { + 
"start": 7838.119, + "text": "safety issues as we saw plugging in" + }, + { + "start": 7839.88, + "text": "unallocated memory into uh language" + }, + { + "start": 7842.079, + "text": "models so um it's worth understanding" + }, + { + "start": 7845.159, + "text": "this stage um that said I will say that" + }, + { + "start": 7848.48, + "text": "eternal glory goes to anyone who can get" + }, + { + "start": 7850.32, + "text": "rid of it uh I showed you one possible" + }, + { + "start": 7852.559, + "text": "paper that tried to uh do that and I" + }, + { + "start": 7854.679, + "text": "think I hope a lot more can follow over" + }, + { + "start": 7857.04, + "text": "time and my final recommendations for" + }, + { + "start": 7859.4, + "text": "the application right now are if you can" + }, + { + "start": 7861.44, + "text": "reuse the GPT 4 tokens and the" + }, + { + "start": 7863.04, + "text": "vocabulary uh in your application then" + }, + { + "start": 7865.0, + "text": "that's something you should consider and" + }, + { + "start": 7866.199, + "text": "just use Tech token because it is very" + }, + { + "start": 7867.84, + "text": "efficient and nice library for inference" + }, + { + "start": 7871.239, + "text": "for bpe I also really like the bite" + }, + { + "start": 7873.719, + "text": "level BP that uh Tik toen and openi uses" + }, + { + "start": 7877.32, + "text": "uh if you for some reason want to train" + }, + { + "start": 7879.04, + "text": "your own vocabulary from scratch um then" + }, + { + "start": 7882.679, + "text": "I would use uh the bpe with sentence" + }, + { + "start": 7885.0, + "text": "piece um oops as I mentioned I'm not a" + }, + { + "start": 7888.119, + "text": "huge fan of sentence piece I don't like" + }, + { + "start": 7890.679, + "text": "its uh bite fallback and I don't like" + }, + { + "start": 7893.92, + "text": "that it's doing BP on unic code code" + }, + { + "start": 7895.559, + "text": "points I think it's uh it also has like" + }, + { + 
"start": 7897.76, + "text": "a million settings and I think there's a" + }, + { + "start": 7899.119, + "text": "lot of foot gonss here and I think it's" + }, + { + "start": 7900.4, + "text": "really easy to Mis calibrate them and" + }, + { + "start": 7902.199, + "text": "you end up cropping your sentences or" + }, + { + "start": 7903.76, + "text": "something like that uh because of some" + }, + { + "start": 7905.8, + "text": "type of parameter that you don't fully" + }, + { + "start": 7907.28, + "text": "understand so so be very careful with" + }, + { + "start": 7909.44, + "text": "the settings try to copy paste exactly" + }, + { + "start": 7911.719, + "text": "maybe where what meta did or basically" + }, + { + "start": 7914.28, + "text": "spend a lot of time looking at all the" + }, + { + "start": 7916.119, + "text": "hyper parameters and go through the code" + }, + { + "start": 7917.48, + "text": "of sentence piece and make sure that you" + }, + { + "start": 7919.079, + "text": "have this correct um but even if you" + }, + { + "start": 7922.04, + "text": "have all the settings correct I still" + }, + { + "start": 7923.48, + "text": "think that the algorithm is kind of" + }, + { + "start": 7924.92, + "text": "inferior to what's happening here and" + }, + { + "start": 7927.679, + "text": "maybe the best if you really need to" + }, + { + "start": 7929.52, + "text": "train your vocabulary maybe the best" + }, + { + "start": 7931.32, + "text": "thing is to just wait for M bpe to" + }, + { + "start": 7933.159, + "text": "becomes as efficient as possible and uh" + }, + { + "start": 7936.84, + "text": "that's something that maybe I hope to" + }, + { + "start": 7938.159, + "text": "work on and at some point maybe we can" + }, + { + "start": 7940.8, + "text": "be training basically really what we" + }, + { + "start": 7942.88, + "text": "want is we want tick token but training" + }, + { + "start": 7944.96, + "text": "code and that is the ideal thing that" + }, + { + 
"start": 7947.84, + "text": "currently does not exist and MBP is um" + }, + { + "start": 7951.36, + "text": "is in implementation of it but currently" + }, + { + "start": 7953.239, + "text": "it's in Python so that's currently what" + }, + { + "start": 7955.88, + "text": "I have to say for uh tokenization there" + }, + { + "start": 7958.199, + "text": "might be an advanced video that has even" + }, + { + "start": 7960.4, + "text": "drier and even more detailed in the" + }, + { + "start": 7961.92, + "text": "future but for now I think we're going" + }, + { + "start": 7963.639, + "text": "to leave things off here and uh I hope" + }, + { + "start": 7966.76, + "text": "that was helpful bye" + }, + { + "start": 7974.119, + "text": "and uh they increase this contact size" + }, + { + "start": 7976.04, + "text": "from gpt1 of 512 uh to 1024 and GPT 4" + }, + { + "start": 7982.679, + "text": "two the" + }, + { + "start": 7985.44, + "text": "next okay next I would like us to" + }, + { + "start": 7987.639, + "text": "briefly walk through the code from open" + }, + { + "start": 7989.8, + "text": "AI on the gpt2 encoded" + }, + { + "start": 7995.84, + "text": "ATP I'm sorry I'm gonna sneeze" + }, + { + "start": 7999.119, + "text": "and then what's Happening Here" + }, + { + "start": 8001.84, + "text": "is this is a spous layer that I will" + }, + { + "start": 8004.639, + "text": "explain in a" + }, + { + "start": 8006.119, + "text": "bit What's Happening Here" + }, + { + "start": 8013.159, + "text": "is" + } +] \ No newline at end of file