|
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig |
|
import gradio as gr |
|
import torch |
|
|
|
|
|
# --- Text shown in the Gradio UI -------------------------------------------
# NOTE(review): the leading "????" in the title looks like a mis-encoded
# character, and the description mentions DialoGPT while the checkpoint
# loaded below is a Llama-2 fine-tune -- confirm the intended wording.
title = "????AI ChatBot bajo GPU"
description = "A State-of-the-Art Large-scale Pretrained Response generation model (DialoGPT)"
examples = [["How are you?"]]

# --- Checkpoint and quantization settings ----------------------------------
model_id = "clibrain/Llama-2-13b-ft-instruct-es-gptq-4bit"

config = AutoConfig.from_pretrained(model_id)
# Override two entries of the checkpoint's quantization settings before the
# model is instantiated.
config.quantization_config["disable_exllama"] = True
config.quantization_config["exllama_config"] = {"version": 2}

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Report the selected device between two banner lines.
print("********************", device, "********************", sep="\n")

# Load the model with the patched config and move it to the chosen device.
model = AutoModelForCausalLM.from_pretrained(
    model_id, config=config, low_cpu_mem_usage=True
).to(device)

tokenizer = AutoTokenizer.from_pretrained(model_id)
|
|
|
|
|
def predict(input, history=None):
    """Generate the next chatbot reply and return the updated conversation.

    Args:
        input: Latest user message as plain text.
        history: Token ids of the conversation so far, as returned by a
            previous call (a tensor or a nested list of ints). ``None`` or
            an empty value starts a new conversation.

    Returns:
        A ``(pairs, history)`` tuple. ``pairs`` is a list of
        ``(first_turn, second_turn)`` text tuples built from consecutive
        EOS-separated segments of the decoded conversation; ``history`` is
        the token-id tensor returned by ``model.generate``, meant to be fed
        back in as ``history`` on the next call.
    """
    # Use the tokenizer's own EOS marker both to terminate the user turn and
    # to split the decoded conversation, so the two always agree regardless
    # of which checkpoint is loaded.
    eos = tokenizer.eos_token

    new_user_input_ids = tokenizer.encode(
        input + eos, return_tensors="pt"
    ).to(device)

    if history is None or len(history) == 0:
        # First turn: nothing to prepend.
        bot_input_ids = new_user_input_ids
    else:
        # Accept either a nested list or a tensor (possibly already on
        # ``device``) from the previous call.
        previous_ids = torch.as_tensor(history, dtype=torch.long, device=device)
        bot_input_ids = torch.cat([previous_ids, new_user_input_ids], dim=-1)

    history = model.generate(
        bot_input_ids, max_length=4000, pad_token_id=tokenizer.eos_token_id
    )

    turns = tokenizer.decode(history[0]).split(eos)
    print(turns)

    # Pair up consecutive segments; a trailing unpaired segment is dropped.
    pairs = list(zip(turns[::2], turns[1::2]))

    return pairs, history
|
|
|
|
|
# Build the chat UI: a text box plus session state go in, and the chatbot
# transcript plus the updated state come out of `predict`.
demo = gr.Interface(
    fn=predict,
    inputs=["text", "state"],
    outputs=["chatbot", "state"],
    title=title,
    description=description,
    examples=examples,
    theme="finlaymacklon/boxy_violet",
)

# Start serving the interface.
demo.launch()