initial no big file

Files changed (8) hide show

README.md +59 -0
added_tokens.json +1 -0
config.json +59 -0
merges.txt +0 -0
model.py +500 -0
special_tokens_map.json +1 -0
tokenizer_config.json +1 -0
vocab.json +0 -0

README.md ADDED Viewed

	@@ -0,0 +1,59 @@

+---
+language:
+- en
+thumbnail:
+tags:
+- gpt2
+- conversational
+license: apache-2.0
+datasets:
+- wikipedia-turkish
+metrics:
+- perplexity
+- accuracy
+widget:
+- text: Bu yazıyı bir bilgisayar yazdı. Yazarken
+  context: ''
+- text: İnternete kolay erişim sayesinde dünya daha da küçüldü. Bunun sonucunda
+  context: ''
+---
+# GPT2 Persona Chatbot based on Movie Characters
+Model used for https://www.metayazar.com/chatbot
+GPT2 Small Trained on movie scripts (especially Sci-fi)
+This work is based on the Persona Chatbot originally built by the Hugging Face team (https://medium.com/huggingface/how-to-build-a-state-of-the-art-conversational-ai-with-transfer-learning-2d818ac26313)
+I also provide the code I used to clean the movie scripts:
+https://github.com/gorkemgoknar/moviescriptcleaner
+Example persona how to:
+https://gist.github.com/gorkemgoknar/ae29bf9d14fa814e6a64d0e57a4a4ed7
+For obvious reasons I cannot share the raw persona file, but the gist above shows an example of how to create one.
+A working "full" demo can be seen in https://www.metayazar.com/chatbot
+For Turkish version (with limited training) https://www.metayazar.com/chatbot_tr
+```python
+tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-small')
+model = AutoModelWithLMHead.from_pretrained('output-small')
+# Let's chat for 5 lines
+for step in range(100):
+    # encode the new user input, add the eos_token and return a tensor in Pytorch
+    new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')
+    # print(new_user_input_ids)
+    # append the new user input tokens to the chat history
+    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids
+    # generated a response while limiting the total chat history to 1000 tokens,
+    chat_history_ids = model.generate(
+        bot_input_ids, max_length=500,
+        pad_token_id=tokenizer.eos_token_id,
+        no_repeat_ngram_size=3,
+        do_sample=True,
+        top_k=100,
+        top_p=0.7,
+        temperature = 0.8
+    )
+    # pretty print last ouput tokens from bot
+    print("AI: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))
+```

added_tokens.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"<bos>": 50257, "<eos>": 50258, "<pad>": 50259, "<speaker1>": 50260, "<speaker2>": 50261}

config.json ADDED Viewed

	@@ -0,0 +1,59 @@

+{
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "do_sample": false,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "eos_token_ids": null,
+  "finetuning_task": null,
+  "id2label": {
+    "0": "LABEL_0",
+    "1": "LABEL_1"
+  },
+  "initializer_range": 0.02,
+  "is_decoder": false,
+  "label2id": {
+    "LABEL_0": 0,
+    "LABEL_1": 1
+  },
+  "layer_norm_epsilon": 1e-05,
+  "length_penalty": 1.0,
+  "max_length": 20,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_layer": 12,
+  "n_positions": 1024,
+  "num_beams": 1,
+  "num_labels": 1,
+  "num_return_sequences": 1,
+  "output_attentions": false,
+  "output_hidden_states": false,
+  "output_past": true,
+  "pad_token_id": null,
+  "pruned_heads": {},
+  "repetition_penalty": 1.0,
+  "resid_pdrop": 0.1,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 50
+    }
+  },
+  "temperature": 1.0,
+  "top_k": 50,
+  "top_p": 1.0,
+  "torchscript": false,
+  "use_bfloat16": false,
+  "vocab_size": 50262
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model.py ADDED Viewed

	@@ -0,0 +1,500 @@

+from transformers import AutoConfig
+from transformers import GPT2Tokenizer, GPT2LMHeadModel
+from utils import SPECIAL_TOKENS, build_input_from_segments, add_special_tokens_
+from utils import get_dataset, download_pretrained_model
+import timeit
+import logging
+logging.basicConfig(format='%(asctime)s: %(message)s',level=logging.INFO)
+logger = logging.getLogger(__file__)
+import random
+from itertools import chain
+from pprint import pformat
+#import warnings
+import torch
+import torch.nn.functional as F
+import boto3
+import os
+import tarfile
+import io
+import base64
+import json
+import re
+from types import SimpleNamespace
+import warnings
+warnings.simplefilter(action='ignore', category=FutureWarning)
+print("Loading Model.py module...")
+s3 = boto3.client('s3')
+def is_list_of_strings(lst):
+        if lst and isinstance(lst, list):
+            return all(isinstance(elem, str) for elem in lst)
+        else:
+            return False
+class ServerlessModel:
+    def __init__(self, model_path=None, s3_bucket=None, file_prefix=None, efs_path=None):
+        #logging.basicConfig(level=logging.INFO)
+        #logger = logging.getLogger(__file__)
+        print("Trying to init model")
+        self.model = None
+        self.tokenizer = None
+        self.dataset = None
+        if s3_bucket is None:
+                if model_path is not None and efs_path is None :
+                    print("Loading model from local..")
+                    self.model, self.tokenizer, self.dataset = self.from_pretrained_local_path(model_path, file_prefix)
+                    logging.debug("Done loading")
+                else:
+                    ##Load model from EFS, with config and tokenizer from local lambda space
+                    if model_path is not None and efs_path is not None:
+                        print("loading model from EFS")
+                        self.model, self.tokenizer, self.dataset = self.from_pretrained(model_path, s3_bucket, file_prefix, efs_path=efs_path)
+                        logging.debug("Done loading")
+                    else:
+                        #no bucket no path fail
+                        print("ERROR: Model path not found")
+                        raise Exception("No model path found")
+        else:
+                print("Loading model from s3 path..")
+                print(s3_bucket)
+                self.model, self.tokenizer, self.dataset = self.from_pretrained(
+                    model_path, s3_bucket, file_prefix)
+                logging.debug("Done loading")
+        self.parameters = {
+            'max_length' : 25, #60
+            'min_length' : 1,
+            'device' : 'cpu',
+            'temperature' : 1.0, #1.5
+            'dynamic_temperature' : True,
+            'dynamic_temperature_range' : 0.15,
+            'top_k' : 50,  #50
+            'top_p' : 0.9, #0.9
+            'no_sample' : False,
+            'max_history' : 2,
+        }
+        print("Done initializing model")
+    def from_pretrained(self, model_path: str, s3_bucket: str, file_prefix: str , efs_path = None ):
+        if efs_path is None:
+            model = self.load_model_from_s3(model_path, s3_bucket, file_prefix)
+        else:
+            model = self.load_model_from_efs(model_path,efs_path)
+        print("Model loaded.")
+        print("loading tokenizer from path: ", model_path)
+        tokenizer = self.load_tokenizer(model_path)
+        # Get sequence length max of 1024
+        tokenizer.model_max_length = 1024
+        print("tokenizer loaded")
+        self.model = model
+        self.tokenizer = tokenizer
+        add_special_tokens_(self.model, self.tokenizer)
+        #Will only use if it cannot find cache
+        DATASET_PATH = model_path + '/personafile.json'  #maynot be needed if cache exists!
+        ##We have cache no need for dataset path
+        DATASET_CACHE = model_path +'/persona_good'  ##persona_good_gpt2_cache  (no zip extension)
+        dataset = self.load_dataset(DATASET_PATH, DATASET_CACHE)
+        self.dataset = dataset
+        print("dataset loaded")
+        model.eval()
+        print("Model in eval mode, dataset and tokenizer also loaded")
+        return model, tokenizer, dataset
+    def load_model_from_path(self, model_path:str):
+        print("Loading model from path:",model_path)
+        model = GPT2LMHeadModel.from_pretrained(model_path)
+        model.eval()
+        self.model = model
+        return model
+    def from_pretrained_local_path(self, model_path: str, file_prefix: str):
+        print("Local model loading...")
+        model = GPT2LMHeadModel.from_pretrained(model_path)
+        tokenizer = self.load_tokenizer(model_path)
+        self.model = model
+        self.tokenizer = tokenizer
+        # Get sequence length max of 1024
+        tokenizer.model_max_length = 1024
+        add_special_tokens_(model, tokenizer)
+        #Will only use if it cannot find cache
+        DATASET_PATH = model_path + '/personafile.json'  #maynot be needed if cache exists!
+        ##We have cache no need for dataset path
+        DATASET_CACHE = model_path +'/persona_good'  ##persona_good_gpt2_cache  (no zip extension)
+        dataset = self.load_dataset(DATASET_PATH, DATASET_CACHE)
+        self.dataset = dataset
+        model.eval()
+        print("Model in eval mode, dataset and tokenizer also loaded")
+        return model, tokenizer, dataset
+    def load_model_from_efs(self, model_path: str, efs_path: str):
+        if model_path and efs_path:
+            config = AutoConfig.from_pretrained(f'{model_path}/config.json')
+            with open(efs_path, 'rb') as f:
+                # state messes things just use classics!
+                state = torch.load(io.BytesIO(
+                        f.read()), map_location=lambda storage, loc: storage)
+                '''alt
+                with open(efs_path, 'rb') as f:
+                    state = pickle.load(f, encoding='latin1')
+                '''
+                model = GPT2LMHeadModel.from_pretrained(
+                    pretrained_model_name_or_path=None, state_dict=state, config=config)
+            return model
+        else:
+            raise KeyError('No model config path or EFS bin path')
+    def load_model_from_s3(self, model_path: str, s3_bucket: str, file_prefix: str):
+        if model_path and s3_bucket and file_prefix:
+            obj = s3.get_object(Bucket=s3_bucket, Key=file_prefix)
+            bytestream = io.BytesIO(obj['Body'].read())
+            tar = tarfile.open(fileobj=bytestream, mode="r:gz")
+            config = AutoConfig.from_pretrained(f'{model_path}/config.json')
+            for member in tar.getmembers():
+                if member.name.startswith("./._"):
+                    # osx tar adds ./._XXX copyfile need to pass this file
+                    continue
+                if member.name.endswith(".bin"):
+                    f = tar.extractfile(member)
+                    print("Model file extracted: " + member.name)
+                    # state messes things just use classics!
+                    state = torch.load(io.BytesIO(
+                        f.read()), map_location=lambda storage, loc: storage)
+                    model = GPT2LMHeadModel.from_pretrained(
+                        pretrained_model_name_or_path=None, state_dict=state, config=config)
+                    #model = AutoModelWithLMHead.from_pretrained("./",  config=config)
+            return model
+        else:
+            raise KeyError('No S3 Bucket and Key Prefix provided')
+    def load_tokenizer(self, model_path: str):
+        print("loading tokenizer")
+        tokenizer = GPT2Tokenizer.from_pretrained(model_path)
+        return tokenizer
+    def load_dataset(self, DATASET_PATH: str, DATASET_CACHE: str, use_efs= False):
+        print("loading dataset")
+        dataset = get_dataset(self.tokenizer, DATASET_PATH, DATASET_CACHE)
+        return dataset
+    def encode(self, question, context):
+        encoded = self.tokenizer.encode_plus(question, context)
+        return encoded["input_ids"], encoded["attention_mask"]
+    def decode(self, token):
+        answer_tokens = self.tokenizer.convert_ids_to_tokens(
+            token, skip_special_tokens=True)
+        return self.tokenizer.convert_tokens_to_string(answer_tokens)
+    def generate_word(self, text, model=None, tokenizer=None, noprint=False):
+        if model is None or tokenizer is None:
+            print("ERROR: No model or tokenizer")
+            return None
+        inputs = tokenizer(text, return_tensors="pt")
+        # model output
+        outputs = model(**inputs, labels=inputs["input_ids"])
+        loss, logits = outputs[:2]
+        predicted_index = torch.argmax(logits[0, -1, :]).item()
+        predicted_text = tokenizer.decode([predicted_index])
+        # results
+        if not noprint:
+            print('input text:', text)
+            print('predicted text:', predicted_text)
+        return predicted_text
+    def top_filtering(self,logits, top_k=0., top_p=0.9, threshold=-float('Inf'), filter_value=-float('Inf')):
+        """ Filter a distribution of logits using top-k, top-p (nucleus) and/or threshold filtering
+            Args:
+                logits: logits distribution shape (vocabulary size)
+                top_k: <=0: no filtering, >0: keep only top k tokens with highest probability.
+                top_p: <=0.0: no filtering, >0.0: keep only a subset S of candidates, where S is the smallest subset
+                    whose total probability mass is greater than or equal to the threshold top_p.
+                    In practice, we select the highest probability tokens whose cumulative probability mass exceeds
+                    the threshold top_p.
+                threshold: a minimal threshold to keep logits
+        """
+        assert logits.dim() == 1  # Only work for batch size 1 for now - could update but it would obfuscate a bit the code
+        top_k = min(top_k, logits.size(-1))
+        if top_k > 0:
+            # Remove all tokens with a probability less than the last token in the top-k tokens
+            indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
+            logits[indices_to_remove] = filter_value
+        if top_p > 0.0:
+            # Compute cumulative probabilities of sorted tokens
+            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+            cumulative_probabilities = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
+            # Remove tokens with cumulative probability above the threshold
+            sorted_indices_to_remove = cumulative_probabilities > top_p
+            # Shift the indices to the right to keep also the first token above the threshold
+            sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+            sorted_indices_to_remove[..., 0] = 0
+            # Back to unsorted indices and set them to -infinity
+            indices_to_remove = sorted_indices[sorted_indices_to_remove]
+            logits[indices_to_remove] = filter_value
+        indices_to_remove = logits < threshold
+        logits[indices_to_remove] = filter_value
+        return logits
+    def sample_sequence(self,personality, history, tokenizer, model, params=None, current_output=None):
+        start = timeit.default_timer()
+        if params is not None:
+            for k,v in params.items():
+                self.parameters[k] = v
+        ##to access as dot notation
+        ##param = SimpleNamespace(**parameters)
+        special_tokens_ids = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS)
+        if current_output is None:
+            current_output = []
+        for i in range(self.parameters['max_length']):
+            #print(">: {}/{}       ".format(i, self.parameters['max_length'] ) ,end='\r', flush=True)
+            instance = build_input_from_segments(personality, history, current_output, tokenizer, with_eos=False)
+            input_ids = torch.tensor(instance["input_ids"], device=self.parameters['device']).unsqueeze(0)
+            token_type_ids = torch.tensor(instance["token_type_ids"], device=self.parameters['device']).unsqueeze(0)
+            logits = model(input_ids, token_type_ids=token_type_ids)
+            if isinstance(logits, tuple):  # for gpt2 and maybe others
+                logits = logits[0]
+            #SPECIAL Dynamic Temperature mode
+            if self.parameters['dynamic_temperature']:
+                #random temperature withing -0.1 / + 0.1 or 'dynamic_temperature_range'
+                rand_range = random.uniform(-1 * self.parameters['dynamic_temperature_range'] , self.parameters['dynamic_temperature_range'])
+                temperature = self.parameters['temperature'] + rand_range
+            else:
+                temperature = self.parameters['temperature']
+            logits = logits[0, -1, :] / temperature
+            logits = self.top_filtering(logits, top_k=self.parameters['top_k'], top_p=self.parameters['top_p'])
+            probs = F.softmax(logits, dim=-1)
+            prev = torch.topk(probs, 1)[1] if self.parameters['no_sample'] else torch.multinomial(probs, 1)
+            if i < self.parameters['min_length'] and prev.item() in special_tokens_ids:
+                while prev.item() in special_tokens_ids:
+                    if probs.max().item() == 1:
+                        warnings.warn("Warning: model generating special token with probability 1.")
+                        break  # avoid infinitely looping over special token
+                    prev = torch.multinomial(probs, num_samples=1)
+            if prev.item() in special_tokens_ids:
+                ##breaks here if found end of anser!!
+                break
+            current_output.append(prev.item())
+        stop = timeit.default_timer()
+        #print(f"\nPredict in {stop - start} seconds\n")
+        return current_output
+    def dump_personalities_with_movies(self):
+        personalities = [ [dialog["name"], dialog["moviename"]] for dialog in self.dataset["train"]]
+        name_list = []
+        for person in personalities:
+            try:
+                name_tokenized = person[0]
+                name = self.tokenizer.decode(name_tokenized)
+                movies_tokenized = person[1]
+                movienames= ""
+                ##check type of first element
+                ##if int , only 1 movie
+                if isinstance(movies_tokenized[0], int):
+                    movienames = self.tokenizer.decode(movies_tokenized)
+                    movienames = movienames.replace(".txt", "")
+                else:
+                    for movie in movies_tokenized:
+                        moviename = self.tokenizer.decode(movie)
+                        moviename = moviename.replace(".txt", "")
+                        movienames = movienames + " / " + moviename
+                name_list.append([name,movienames])
+            except:
+                print("Could not do name:", self.tokenizer.decode(person[0]))
+        return name_list
+    def dump_personalities(self,as_list=False):
+        personalities = [dialog["personality"] for dialog in self.dataset["train"]]
+        name_list = []
+        for person in personalities:
+            name_tokenized = person[-1]
+            name = self.tokenizer.decode(name_tokenized)
+            name = name.replace("My name is ", "")[:-1]
+            name_list.append(name)
+            #print(name)
+        if as_list:
+            return name_list
+        else:
+            return " | ".join(name_list)
+    def get_personalities(self):
+        ##THIS FUNCTION IS NOW LEGACY, USE dump_personalities
+        personalities = [dialog["personality"] for dialog in self.dataset["train"]]
+        people = [item[-1][-10:-1] for item in personalities]
+        ##will get My Name is Something
+        people_list = self.tokenizer.decode(chain(*people))
+        #print( " | ".join( people_list.split(" ") ) )
+        text_to_remove = "My name is "
+        people_list = people_list.replace(text_to_remove,  " | ")
+        #characters =  " | ".join( people_list.split(" ") )
+        return people_list
+    def select_personality(self,characters,select_random=False):
+        ##FIND people list
+        ##this is for debug, usually has " is Name"
+        #people = [item[-1][-3:-1] for item in personalities]
+        personalities = [dialog["personality"]  for dialog in  self.dataset["train"]]
+        if select_random : return random.choice(personalities)
+        #people = [item[-1][-2:-1] for item in personalities]
+        #people_list = self.tokenizer.decode(chain(*people))
+        #print( " | ".join( people_list.split(" ") ) )
+        personality = None
+        name = "My name is " + str(characters)
+        name_token = self.tokenizer.encode(name)
+        #print(name_token)
+        index_start = len(name_token)+1
+        try:
+          index_of_name = [ item[-1][-1*index_start: -1]== name_token for item in personalities].index(True)
+          #print("Selected {} is at: {}".format(characters, str(index_of_name) ) )
+          personality = personalities[index_of_name]
+        except:
+          print("Not found ... Select again")
+          return None
+        ##TALK TO HAL
+        #personality_hal = ["that's true. My name is Hal"]
+        #personality = tokenize(personality_hal)
+        #print(personality)
+        print("Selected personality: %s", self.tokenizer.decode(chain(*personality)))
+        return personality
+    def get_answer(self, input_text, personality, history, params=None):
+        ##Check length of history (to save 1 computation!)
+        if len(history)>0:
+            #mostly it will be empty list so need a length check for performance
+            #would do string check also but just assume it is list of list of strings, as not public
+            new_hist = []
+            for ele in history:
+                new_hist.append( self.tokenizer.encode(ele) )
+            history = new_hist.copy()
+        history.append(self.tokenizer.encode(input_text))
+        with torch.no_grad():
+            out_ids = self.sample_sequence(personality, history, self.tokenizer, self.model, params=params)
+        history.append(out_ids)
+        history = history[-(2*self.parameters['max_history']+1):]
+        out_text = self.tokenizer.decode(out_ids, skip_special_tokens=True)
+        #print(out_text)
+        history_decoded = []
+        for ele in history:
+            history_decoded.append(self.tokenizer.decode(ele))
+        return out_text, history_decoded, self.parameters
+    def predict(self, question, parameter_dict):
+        try:
+            answer = self.generate_text(question, model=self.model,
+                                        tokenizer=self.tokenizer,
+                                        parameter_dict=parameter_dict,
+                                        )
+            return answer
+        except Exception as e:
+            raise Exception(
+                "Runtime error see cloudwatch logs : {}".format(repr(e)))

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"bos_token": "<bos>", "eos_token": "<eos>", "unk_token": "<|endoftext|>", "pad_token": "<pad>", "additional_special_tokens": ["<speaker1>", "<speaker2>"]}

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"max_len": 1024, "bos_token": "<bos>", "eos_token": "<eos>", "unk_token": "<|endoftext|>", "pad_token": "<pad>", "additional_special_tokens": ["<speaker1>", "<speaker2>"]}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff