Added model and tokenizer

- .DS_Store +0 -0
- .gitattributes +1 -0
- .gitignore +3 -1
- app.py +18 -5
- model/.DS_Store +0 -0
- model/model_1000_.bin +3 -0
- my_gpt.py +186 -0
- tokenizer/__init__.py +0 -0
- tokenizer/__pycache__/__init__.cpython-38.pyc +0 -0
- tokenizer/__pycache__/base.cpython-311.pyc +0 -0
- tokenizer/__pycache__/base.cpython-38.pyc +0 -0
- tokenizer/__pycache__/base.cpython-39.pyc +0 -0
- tokenizer/__pycache__/tokenizer.cpython-38.pyc +0 -0
- tokenizer/__pycache__/tokenizer.cpython-39.pyc +0 -0
- tokenizer/base.py +110 -0
- tokenizer/tokenizer.py +72 -0
.DS_Store
ADDED
Binary file (6.15 kB)
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+model/model_1000_.bin filter=lfs diff=lfs merge=lfs -text
.gitignore
CHANGED
@@ -1 +1,3 @@
-*.txt
+*.txt
+tokenizer/README.md
+__pycache__
app.py
CHANGED
@@ -1,11 +1,24 @@
 import gradio as gr
-import
-
+import torch
+from my_gpt import my_gpt
+from tokenizer.tokenizer import BPE
 
+##Load model
+model = my_gpt.load_pretrained("model/model_1000_.bin")
+tokenizer = BPE()
 
 
-def greet(name):
-    return "Hello " + name + "!!"
 
-
+def generate(input_text):
+    tokens = tokenizer.encode(input_text)
+    gen_ids = model.generate(torch.tensor([tokens]))
+    output = tokenizer.decode(gen_ids[0].tolist())
+    return output
+
+iface = gr.Interface(fn=generate,
+                     inputs="text",
+                     outputs="text",
+                     title="GPT - 1000 steps",
+                     description="""This model is trained for 1000 steps only. It is not
+able to generate perfect sentences/words. However, it has learnt a gist of the English language""")
 iface.launch()
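One behavioral note on generate(): my_gpt.generate returns the prompt ids with the sampled ids appended, so the decoded output repeats the input text before the continuation. A hypothetical variant (not part of this commit) that strips the echoed prompt before decoding:

def generate_continuation_only(input_text):
    # hypothetical helper, same scope as app.py's generate (model, tokenizer, torch)
    tokens = tokenizer.encode(input_text)
    gen_ids = model.generate(torch.tensor([tokens]))
    new_ids = gen_ids[0].tolist()[len(tokens):]  # drop the echoed prompt ids
    return tokenizer.decode(new_ids)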
model/.DS_Store
ADDED
Binary file (6.15 kB)
model/model_1000_.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:566dd60c0869b306eda84fd1d64dbacce02b256492da9ea626d227b046125bd2
+size 56950645
my_gpt.py
ADDED
@@ -0,0 +1,186 @@
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+import json
+import logging
+
+
+block_size = 256
+vocab_size = 500
+n_embed = 384
+dropout = 0.2
+n_head = 6
+n_layer = 6
+
+class Head(nn.Module):
+    def __init__(self, head_size=16):
+        super().__init__()
+        self.query = nn.Linear(n_embed, head_size, bias=False)
+        self.key = nn.Linear(n_embed, head_size, bias=False)
+        self.value = nn.Linear(n_embed, head_size, bias=False)
+        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x):
+        B, T, C = x.shape
+
+        q = self.query(x)
+        k = self.key(x)
+
+        wei = (q @ k.transpose(-2, -1)) * (k.shape[-1] ** (-0.5))
+        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
+        wei = F.softmax(wei, dim=-1)
+        wei = self.dropout(wei)
+
+        v = self.value(x)
+
+        out = wei @ v  ## (B,T,HS)
+
+        return out
+
+class MultiHeadAttention(nn.Module):
+    def __init__(self, num_heads, head_size):
+        super().__init__()
+
+        self.heads = nn.ModuleList(Head(head_size=head_size) for _ in range(num_heads))
+        self.proj = nn.Linear(head_size * num_heads, n_embed)
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x):
+        out = torch.cat([h(x) for h in self.heads], dim=-1)
+        out = self.dropout(self.proj(out))
+        return out
+
+class FeedForward(nn.Module):
+    def __init__(self, n_embed) -> None:
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(n_embed, 4 * n_embed),
+            nn.ReLU(),
+            nn.Linear(4 * n_embed, n_embed),
+            nn.Dropout(dropout),
+        )
+
+    def forward(self, x):
+        x = self.net(x)
+        return x
+
+class decoder_block(nn.Module):
+    def __init__(self, n_embed, n_heads):
+        super().__init__()
+        self.sa = MultiHeadAttention(n_heads, n_embed // n_heads)
+        self.ln1 = nn.LayerNorm(n_embed)
+        self.ln2 = nn.LayerNorm(n_embed)
+        self.ffwd = FeedForward(n_embed)
+
+    def forward(self, x):
+        x = x + self.sa(self.ln1(x))
+        x = x + self.ffwd(self.ln2(x))
+        return x
+
+
+class my_gpt(nn.Module):
+    def __init__(self, block_size=256):
+        super().__init__()
+        self.block_size = block_size  ## context window size
+        self.token_embed = nn.Embedding(vocab_size, n_embed)
+        self.pos_embed = nn.Embedding(vocab_size, n_embed)
+        self.lm_head = nn.Linear(n_embed, vocab_size)
+        self.sa_head = Head(vocab_size)
+        self.d_blocks = nn.Sequential(*[decoder_block(n_embed=n_embed, n_heads=n_head) for _ in range(n_layer)])
+        self.ln_f = nn.LayerNorm(n_embed)  # final layer norm
+
+        self.apply(self._init_weights)
+
+    def _init_weights(self, module):
+        if isinstance(module, nn.Linear):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+            if module.bias is not None:
+                torch.nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.Embedding):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+
+    def forward(self, idx, targets=None):
+        """
+        Args:
+            idx: int (B,T) token ids
+            targets: int (B,T) next-token targets, optional
+
+        Returns:
+            logits, loss
+        """
+        B, T = idx.shape
+        tok_emd = self.token_embed(idx)  ## (B,T,C)
+        pos_emd = self.pos_embed(idx)
+
+        x = tok_emd + pos_emd
+
+        x = self.d_blocks(x)
+        x = self.ln_f(x)  # (B,T,C)
+
+        logits = self.lm_head(x)  ## (B,T,vocab_size)
+        if targets is None:
+            loss = None
+        else:
+            B, T, C = logits.shape
+            logits = logits.view(B * T, C)
+            targets = targets.view(B * T)
+
+            loss = F.cross_entropy(logits, targets)
+
+        return logits, loss
+
+    def generate(self, context: torch.Tensor, max_new_tokens: int = 46, use_cache=False):
+        """
+        Generates the next "max_new_tokens" number of tokens.
+
+        Args:
+            context (B,T): token ids to condition on
+            max_new_tokens (int): number of tokens to sample
+
+        Returns:
+            (B, T + max_new_tokens) tensor of token ids, prompt included.
+        """
+        for _ in range(max_new_tokens):
+            ## Take only the last block_size tokens as context
+            idx_tokens = context[:, -self.block_size:]
+
+            ## generate logits for the next token
+            logits, loss = self(idx_tokens)
+
+            ## keep only the logits at the last position
+            logits = logits[:, -1, :]  ## (B,vocab_size)
+
+            probs = F.softmax(logits, dim=-1)
+            idx_next = torch.multinomial(probs, num_samples=1)  ## (B,1)
+
+            context = torch.concatenate([context, idx_next], dim=1)
+
+        return context
+
+    def save_pretrained(self, path):
+        torch.save(self.state_dict(), path)
+        print("Saved pretrained Successfully")
+
+    @classmethod
+    def load_pretrained(cls, path):
+        print("Loading pretrained model...")
+        model = cls()
+        model.load_state_dict(torch.load(path))
+        return model
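Two reading notes on my_gpt. First, pos_embed is an nn.Embedding(vocab_size, n_embed) indexed with the token ids rather than with positions, so it effectively acts as a second token embedding; the shipped checkpoint was trained that way, so nothing below changes it. Second, inference reduces to a few lines; a minimal sketch mirroring app.py (the eval() call is an assumption on my part, it disables dropout during sampling and is not in app.py):

import torch
from my_gpt import my_gpt
from tokenizer.tokenizer import BPE

model = my_gpt.load_pretrained("model/model_1000_.bin")
model.eval()  # assumption: disable dropout for sampling; app.py skips this step
tokenizer = BPE()

ids = tokenizer.encode("Once upon a time")
out = model.generate(torch.tensor([ids]), max_new_tokens=46)
print(tokenizer.decode(out[0].tolist()))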
tokenizer/__init__.py
ADDED
File without changes
tokenizer/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (186 Bytes)

tokenizer/__pycache__/base.cpython-311.pyc
ADDED
Binary file (4.35 kB)

tokenizer/__pycache__/base.cpython-38.pyc
ADDED
Binary file (2.8 kB)

tokenizer/__pycache__/base.cpython-39.pyc
ADDED
Binary file (2.02 kB)

tokenizer/__pycache__/tokenizer.cpython-38.pyc
ADDED
Binary file (2.34 kB)

tokenizer/__pycache__/tokenizer.cpython-39.pyc
ADDED
Binary file (1.97 kB)
tokenizer/base.py
ADDED
@@ -0,0 +1,110 @@
+import json
+import sys
+import os
+
+
+sys.path.append("./")
+
+def render_token(t: bytes) -> str:
+    # pretty print a token (control-character escaping is left out here)
+    s = t.decode('utf-8', errors='replace')
+    return s
+
+
+def get_freq_pairs(inp_toks):
+    """Returns a count of the adjacent pairs."""
+    count = {}
+    for pair in zip(inp_toks, inp_toks[1:]):
+        count[pair] = count.get(pair, 0) + 1
+    return count
+
+
+def merge(id_list, pair, replace_with_idx):
+    """
+    Replace each occurrence of 'pair' in 'id_list' with 'replace_with_idx'.
+
+    id_list : list of tokens
+    pair : tuple of 2 token ids
+    replace_with_idx : int value
+
+    Returns a new list with the pair replaced.
+    """
+    i = 0
+    new_ids_list = []
+    while i < len(id_list):
+        if i < len(id_list) - 1 and id_list[i] == pair[0] and id_list[i + 1] == pair[1]:
+            new_ids_list.append(replace_with_idx)
+            i += 2
+        else:
+            new_ids_list.append(id_list[i])
+            i += 1
+
+    return new_ids_list
+
+class Tokenizer():
+    def __init__(self):
+        self.merges = {}
+        ## vocab -> (int) : bytes, for all ints (0-255, 256+ from new merges)
+        self.vocab = {}
+        self.load()
+
+    def save(self):
+        with open('merges.txt', 'w') as f:
+            ## Write only the pairs, not the index of the merged pairs.
+            ## When the tokenizer is loaded, allow the user to specify the index.
+            for p1, p2 in self.merges.keys():
+                f.write(f"{p1} {p2}\n")
+
+        with open('vocab.txt', 'w') as f:
+            for idx, byte in self.vocab.items():
+                s = render_token(byte)
+                f.write(f"{idx} {s}\n")
+
+    def _build_vocab(self):
+        self.vocab = {idx: bytes([idx]) for idx in range(256)}
+        try:
+            for (tok0, tok1), idx in self.merges.items():
+                self.vocab[idx] = self.vocab[tok0] + self.vocab[tok1]
+        except Exception as e:
+            print(e)
+
+    def load(self):
+        try:
+            with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'merges.txt'), 'r') as file:
+                idx = 256
+                for line in file:
+                    tok0, tok1 = map(int, line.split())
+                    self.merges[(tok0, tok1)] = idx
+                    idx += 1
+        except Exception as e:
+            print(e)
+
+        ## Build the vocab even if merges.txt is missing,
+        ## so the 256 base byte tokens always decode.
+        self._build_vocab()
+
+
+if __name__ == '__main__':
+    # print(merge([5, 6, 6, 7, 9, 1], (6, 7), 99))
+    tokenizer = Tokenizer()
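The commented-out call in __main__ hints at how merge works; expanded with get_freq_pairs, a single BPE merge step on a toy id list looks like this (outputs worked out by hand):

from tokenizer.base import get_freq_pairs, merge

ids = [5, 6, 6, 7, 9, 1]
print(get_freq_pairs(ids))     # {(5, 6): 1, (6, 6): 1, (6, 7): 1, (7, 9): 1, (9, 1): 1}
print(merge(ids, (6, 7), 99))  # [5, 6, 99, 9, 1], the pair (6, 7) becomes token 99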
tokenizer/tokenizer.py
ADDED
@@ -0,0 +1,72 @@
+from .base import get_freq_pairs, merge, Tokenizer
+
+class BPE(Tokenizer):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def train(self, vocab_size, text):
+        ## Vocabulary must contain at least the 256 single-byte tokens
+        assert vocab_size >= 256
+
+        num_merges = vocab_size - 256
+        tokens = list(text.encode('utf-8'))
+        merges = {}
+        vocab = {idx: bytes([idx]) for idx in range(256)}
+
+        for i in range(num_merges):
+            stats = get_freq_pairs(tokens)
+            max_pair = max(stats, key=stats.get)
+            idx = 256 + i
+            tokens = merge(tokens, max_pair, idx)
+            merges[max_pair] = idx
+            vocab[idx] = vocab[max_pair[0]] + vocab[max_pair[1]]
+
+        self.merges = merges
+        self.vocab = vocab
+
+        self.save()
+
+    def encode(self, text):
+        ids = list(text.encode('utf-8'))
+        ## keep merging while at least one pair remains to be looked up
+        while len(ids) >= 2:
+            pair_counts = get_freq_pairs(ids)
+
+            ## pick the pair that was merged earliest during training
+            min_index_pair = min(pair_counts, key=lambda x: self.merges.get(x, float('inf')))
+            if min_index_pair not in self.merges:
+                break
+
+            idx = self.merges.get(min_index_pair)
+            ids = merge(ids, min_index_pair, idx)
+        return ids
+
+    def decode(self, ids):
+        # given ids (list of integers), return a Python string
+        text_bytes = b"".join(self.vocab[idx] for idx in ids)
+        text = text_bytes.decode("utf-8", errors="replace")
+        return text
+
+
+if __name__ == "__main__":
+    tokenizer = BPE()
+
+    with open('cindrella_stories.txt', 'r') as f:
+        text = f.read()
+
+    tokenizer.train(500, text)
+
+    s = "😁"
+    print("String is", s)
+
+    ids = tokenizer.encode(s)
+    print("Encoded string ", ids)
+    decoded_string = tokenizer.decode(ids)
+    print("Decoded string ", decoded_string)
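A round-trip sketch using a pre-trained merge table. It assumes a merges.txt file sits next to base.py (Tokenizer.load reads it from there; no merges.txt is included in this commit). With load() building the base vocab as above, the round trip also works when merges.txt is absent, in which case the ids are plain UTF-8 bytes:

from tokenizer.tokenizer import BPE

bpe = BPE()                      # loads tokenizer/merges.txt if present, then builds the vocab
ids = bpe.encode("hello world")  # str -> UTF-8 bytes -> merged BPE ids
text = bpe.decode(ids)           # ids -> bytes -> str
assert text == "hello world"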