Tuchuanhuhuhu committed · b8e4532 · 1 Parent(s): c915adf
MOSS: support streaming output (MOSS支持流式传输)

modules/models/MOSS.py +246 -4
modules/models/MOSS.py
CHANGED
@@ -1,11 +1,16 @@
+# Code mainly adapted from https://github.com/OpenLMLab/MOSS/blob/main/moss_inference.py
+
 import os
 import torch
 import warnings
 import platform
+import time
+from typing import Union, List, Tuple, Optional, Dict

 from huggingface_hub import snapshot_download
 from transformers.generation.utils import logger
 from accelerate import init_empty_weights, load_checkpoint_and_dispatch
+from transformers.modeling_outputs import BaseModelOutputWithPast
 try:
     from transformers import MossForCausalLM, MossTokenizer
 except (ImportError, ModuleNotFoundError):
@@ -25,7 +30,7 @@ class MOSS_Client(BaseLLMModel):
         logger.setLevel("ERROR")
         warnings.filterwarnings("ignore")
         if MOSS_MODEL is None:
-            model_path = "
+            model_path = "/home/guest/llm_models/moss/moss-moon-003-sft"
             if not os.path.exists(model_path):
                 model_path = snapshot_download("fnlp/moss-moon-003-sft")

@@ -57,10 +62,33 @@ class MOSS_Client(BaseLLMModel):
         self.text_to_image_switch = '- Text-to-image: disabled.\n'
         self.image_edition_switch = '- Image edition: disabled.\n'
         self.text_to_speech_switch = '- Text-to-speech: disabled.\n'
-        self.token_upper_limit =
-        self.top_p = 0.
-        self.top_k =
+        self.token_upper_limit = 2048
+        self.top_p = 0.8
+        self.top_k = 40
         self.temperature = 0.7
+        self.repetition_penalty = 1.1
+        self.max_generation_token = 2048
+
+        self.default_paras = {
+            "temperature": 0.7,
+            "top_k": 0,
+            "top_p": 0.8,
+            "length_penalty": 1,
+            "max_time": 60,
+            "repetition_penalty": 1.1,
+            "max_iterations": 512,
+            "regulation_start": 512,
+        }
+        self.num_layers, self.heads, self.hidden, self.vocab_size = 34, 24, 256, 107008
+
+        self.moss_startwords = torch.LongTensor([27, 91, 44, 18420, 91, 31175])
+        self.tool_startwords = torch.LongTensor([27, 91, 6935, 1746, 91, 31175])
+        self.tool_specialwords = torch.LongTensor([6045])
+
+        self.innerthought_stopwords = torch.LongTensor([MOSS_TOKENIZER.convert_tokens_to_ids("<eot>")])
+        self.tool_stopwords = torch.LongTensor([MOSS_TOKENIZER.convert_tokens_to_ids("<eoc>")])
+        self.result_stopwords = torch.LongTensor([MOSS_TOKENIZER.convert_tokens_to_ids("<eor>")])
+        self.moss_stopwords = torch.LongTensor([MOSS_TOKENIZER.convert_tokens_to_ids("<eom>")])

     def _get_main_instruction(self):
         return self.system_prompt + self.web_search_switch + self.calculator_switch + self.equation_solver_switch + self.text_to_image_switch + self.image_edition_switch + self.text_to_speech_switch
@@ -86,6 +114,7 @@ class MOSS_Client(BaseLLMModel):
                 top_k=self.top_k,
                 top_p=self.top_p,
                 temperature=self.temperature,
+                repetition_penalty=self.repetition_penalty,
                 num_return_sequences=1,
                 eos_token_id=106068,
                 pad_token_id=MOSS_TOKENIZER.pad_token_id)
@@ -93,6 +122,219 @@ class MOSS_Client(BaseLLMModel):
         response = response.lstrip("<|MOSS|>: ")
         return response, len(response)

+    def get_answer_stream_iter(self):
+        prompt = self._get_moss_style_inputs()
+        it = self.forward(prompt)
+        for i in it:
+            yield i
+
+    def preprocess(self, raw_text: str) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Preprocesses the raw input text by adding the prefix and tokenizing it.
+
+        Args:
+            raw_text (str): The raw input text.
+
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor]: A tuple containing the tokenized input IDs and attention mask.
+        """
+
+        tokens = MOSS_TOKENIZER.batch_encode_plus([raw_text], return_tensors="pt")
+        input_ids, attention_mask = tokens['input_ids'], tokens['attention_mask']
+
+        return input_ids, attention_mask
+
+    def forward(
+        self, data: str, paras: Optional[Dict[str, float]] = None
+    ) -> List[str]:
+        """
+        Generates text using the model, given the input data and generation parameters.
+
+        Args:
+            data (str): The input text for generation.
+            paras (Optional[Dict[str, float]], optional): A dictionary of generation parameters. Defaults to None.
+
+        Returns:
+            List[str]: The list of generated texts.
+        """
+        input_ids, attention_mask = self.preprocess(data)
+
+        if not paras:
+            paras = self.default_paras
+
+        streaming_iter = self.streaming_topk_search(
+            input_ids,
+            attention_mask,
+            temperature=self.temperature,
+            repetition_penalty=self.repetition_penalty,
+            top_k=self.top_k,
+            top_p=self.top_p,
+            max_iterations=self.max_generation_token,
+            regulation_start=paras["regulation_start"],
+            length_penalty=paras["length_penalty"],
+            max_time=paras["max_time"],
+        )
+
+        for outputs in streaming_iter:
+
+            preds = MOSS_TOKENIZER.batch_decode(outputs)
+
+            res = [pred.lstrip(data) for pred in preds]
+
+            yield res[0]
+
+    def streaming_topk_search(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: torch.Tensor,
+        temperature: float = 0.7,
+        repetition_penalty: float = 1.1,
+        top_k: int = 0,
+        top_p: float = 0.92,
+        max_iterations: int = 1024,
+        regulation_start: int = 512,
+        length_penalty: float = 1,
+        max_time: int = 60,
+    ) -> torch.Tensor:
+        """
+        Performs a streaming top-k search using the given parameters.
+
+        Args:
+            input_ids (torch.Tensor): The input IDs tensor.
+            attention_mask (torch.Tensor): The attention mask tensor.
+            temperature (float, optional): The temperature for logits. Defaults to 0.7.
+            repetition_penalty (float, optional): The repetition penalty factor. Defaults to 1.1.
+            top_k (int, optional): The top-k value for filtering. Defaults to 0.
+            top_p (float, optional): The top-p value for filtering. Defaults to 0.92.
+            max_iterations (int, optional): The maximum number of iterations. Defaults to 1024.
+            regulation_start (int, optional): The number of iterations after which regulation starts. Defaults to 512.
+            length_penalty (float, optional): The length penalty factor. Defaults to 1.
+            max_time (int, optional): The maximum allowed time in seconds. Defaults to 60.
+
+        Returns:
+            torch.Tensor: The generated output IDs tensor.
+        """
+        assert input_ids.dtype == torch.int64 and attention_mask.dtype == torch.int64
+
+        self.bsz, self.seqlen = input_ids.shape
+
+        input_ids, attention_mask = input_ids.to('cuda'), attention_mask.to('cuda')
+        last_token_indices = attention_mask.sum(1) - 1
+
+        moss_stopwords = self.moss_stopwords.to(input_ids.device)
+        queue_for_moss_stopwords = torch.empty(size=(self.bsz, len(self.moss_stopwords)), device=input_ids.device, dtype=input_ids.dtype)
+        all_shall_stop = torch.tensor([False] * self.bsz, device=input_ids.device)
+        moss_stop = torch.tensor([False] * self.bsz, device=input_ids.device)
+
+        generations, start_time = torch.ones(self.bsz, 1, dtype=torch.int64), time.time()
+
+        past_key_values = None
+        for i in range(int(max_iterations)):
+            logits, past_key_values = self.infer_(input_ids if i == 0 else new_generated_id, attention_mask, past_key_values)
+
+            if i == 0:
+                logits = logits.gather(1, last_token_indices.view(self.bsz, 1, 1).repeat(1, 1, self.vocab_size)).squeeze(1)
+            else:
+                logits = logits[:, -1, :]
+
+
+            if repetition_penalty > 1:
+                score = logits.gather(1, input_ids)
+                # if score < 0 then repetition penalty has to be multiplied to reduce the previous token probability
+                # just gather the history tokens from input_ids, preprocess, then scatter back
+                # here we apply extra work to exclude special tokens
+
+                score = torch.where(score < 0, score * repetition_penalty, score / repetition_penalty)
+
+                logits.scatter_(1, input_ids, score)
+
+            logits = logits / temperature
+
+            filtered_logits = self.top_k_top_p_filtering(logits, top_k, top_p)
+            probabilities = torch.softmax(filtered_logits, dim=-1)
+
+            cur_len = i
+            if cur_len > int(regulation_start):
+                for i in self.moss_stopwords:
+                    probabilities[:, i] = probabilities[:, i] * pow(length_penalty, cur_len - regulation_start)
+
+            new_generated_id = torch.multinomial(probabilities, 1)
+
+            # update extra_ignored_tokens
+            new_generated_id_cpu = new_generated_id.cpu()
+
+            input_ids, attention_mask = torch.cat([input_ids, new_generated_id], dim=1), torch.cat([attention_mask, torch.ones((self.bsz, 1), device=attention_mask.device, dtype=attention_mask.dtype)], dim=1)
+
+            generations = torch.cat([generations, new_generated_id.cpu()], dim=1)
+
+            # stop words components
+            queue_for_moss_stopwords = torch.cat([queue_for_moss_stopwords[:, 1:], new_generated_id], dim=1)
+
+            moss_stop |= (queue_for_moss_stopwords == moss_stopwords).all(1)
+
+            all_shall_stop |= moss_stop
+
+            if all_shall_stop.all().item():
+                break
+            elif time.time() - start_time > max_time:
+                break
+
+            yield input_ids
+
+    def top_k_top_p_filtering(self, logits, top_k, top_p, filter_value=-float("Inf"), min_tokens_to_keep=1):
+        if top_k > 0:
+            # Remove all tokens with a probability less than the last token of the top-k
+            indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
+            logits[indices_to_remove] = filter_value
+
+        if top_p < 1.0:
+            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+            cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
+
+            # Remove tokens with cumulative probability above the threshold (tokens with 0 are kept)
+            sorted_indices_to_remove = cumulative_probs > top_p
+            if min_tokens_to_keep > 1:
+                # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below)
+                sorted_indices_to_remove[..., :min_tokens_to_keep] = 0
+            # Shift the indices to the right to keep also the first token above the threshold
+            sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+            sorted_indices_to_remove[..., 0] = 0
+            # scatter sorted tensors to original indexing
+            indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
+            logits[indices_to_remove] = filter_value
+
+        return logits
+
+    def infer_(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: torch.Tensor,
+        past_key_values: Optional[Tuple[torch.Tensor]],
+    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor]]:
+        """
+        Inference method that computes logits and past key values.
+
+        Args:
+            input_ids (torch.Tensor): The input IDs tensor.
+            attention_mask (torch.Tensor): The attention mask tensor.
+            past_key_values (Optional[Tuple[torch.Tensor]]): The past key values tuple.
+
+        Returns:
+            Tuple[torch.Tensor, Tuple[torch.Tensor]]: A tuple containing the logits and past key values.
+        """
+        inputs = {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "past_key_values": past_key_values,
+        }
+        with torch.no_grad():
+            outputs: BaseModelOutputWithPast = MOSS_MODEL(**inputs)
+
+        return outputs.logits, outputs.past_key_values
+
+    def __call__(self, input):
+        return self.forward(input)
+

 if __name__ == "__main__":
     model = MOSS_Client("MOSS")
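With this commit, MOSS responses can be rendered incrementally: get_answer_stream_iter() re-decodes the growing token sequence on every step and yields the partial reply. A minimal consumption sketch (the import path follows this diff; the history attribute and its format are assumptions about the surrounding ChuanhuChatGPT BaseLLMModel, which is not shown here):

    # Hedged sketch: assumes the Space's module layout and that BaseLLMModel
    # keeps the conversation in `client.history` as {"role", "content"} dicts.
    from modules.models.MOSS import MOSS_Client

    client = MOSS_Client("MOSS")
    client.history = [{"role": "user", "content": "Hello"}]  # hypothetical history format

    # Each yielded value is the decoded text so far, so a UI can simply
    # overwrite the displayed answer after every generated token.
    for partial in client.get_answer_stream_iter():
        print(partial, end="\r", flush=True)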
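Inside streaming_topk_search, the repetition penalty gathers the logits of every token already in the context, shrinks them, and scatters them back: positive logits are divided by the penalty and negative ones multiplied, so already-seen tokens lose probability either way. A self-contained toy illustration of that gather/where/scatter pattern (the values are made up for the demo):

    import torch

    # One sequence over a 6-token vocabulary; tokens 1 and 4 already appeared.
    logits = torch.tensor([[1.0, 2.0, 0.5, -0.5, -1.0, 3.0]])
    context_ids = torch.tensor([[1, 4]])
    penalty = 1.1

    # Gather the already-seen scores, penalize them, scatter back in place.
    score = logits.gather(1, context_ids)                      # tensor([[ 2., -1.]])
    score = torch.where(score < 0, score * penalty, score / penalty)
    logits.scatter_(1, context_ids, score)
    print(logits)  # token 1: 2.0 -> ~1.82, token 4: -1.0 -> -1.10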
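The new top_k_top_p_filtering helper is the standard top-k / nucleus-sampling filter: it masks everything outside the k highest logits, then everything outside the smallest set of tokens whose cumulative probability exceeds top_p. A self-contained sketch of the same technique on a toy batch (written for illustration, not the committed implementation verbatim):

    import torch

    def top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("inf")):
        if top_k > 0:
            # Drop every logit smaller than the k-th largest one in its row.
            kth_best = torch.topk(logits, top_k)[0][..., -1, None]
            logits = logits.masked_fill(logits < kth_best, filter_value)
        if top_p < 1.0:
            # Keep the smallest prefix of sorted tokens whose cumulative
            # probability exceeds top_p; always keep the highest-scoring token.
            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
            cumulative = torch.softmax(sorted_logits, dim=-1).cumsum(dim=-1)
            remove = cumulative > top_p
            remove[..., 1:] = remove[..., :-1].clone()
            remove[..., 0] = False
            logits = logits.masked_fill(remove.scatter(1, sorted_indices, remove), filter_value)
        return logits

    # With top_k=2 only the two largest logits survive; sampling from the
    # softmax of the result can then never pick the masked tokens.
    logits = torch.tensor([[2.0, 1.0, 0.5, 0.1, -1.0]])
    print(top_k_top_p_filtering(logits, top_k=2))  # tensor([[2., 1., -inf, -inf, -inf]])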