Spaces:

not-lain
/

text-streaming

Running on Zero

not-lain committed on Apr 9, 2024

Commit

0c3b8fb

verified ·

1 Parent(s): 1f8acf3

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -10,7 +10,7 @@ model = AutoModelForCausalLM.from_pretrained("google/gemma-7b-it",
                                              # torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                                              torch_dtype=torch.float16,
                                              token=token)
-tok = AutoTokenizer.from_pretrained("google/gemma-7b-it",token=token)
 # using CUDA for an optimal experience
 # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 device = torch.device('cuda')
@@ -25,11 +25,11 @@ def chat(message, history):
         if item[1] is not None:
             chat.append({"role": "assistant", "content": item[1]})
     chat.append({"role": "user", "content": message})
-    messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
     # Tokenize the messages string
-    model_inputs = tok([messages], return_tensors="pt").to(device)
     streamer = TextIteratorStreamer(
-        tok, timeout=10., skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
         model_inputs,
         streamer=streamer,

                                              # torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                                              torch_dtype=torch.float16,
                                              token=token)
+tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b-it",token=token)
 # using CUDA for an optimal experience
 # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 device = torch.device('cuda')
         if item[1] is not None:
             chat.append({"role": "assistant", "content": item[1]})
     chat.append({"role": "user", "content": message})
+    messages = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
     # Tokenize the messages string
+    model_inputs = tokenizer([messages], return_tensors="pt").to(device)
     streamer = TextIteratorStreamer(
+        tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
         model_inputs,
         streamer=streamer,