import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Path to the (already merged) model checkpoint.
model_id = "/share/models/open-zharfa"

# add_eos_token=True makes the tokenizer append EOS whenever special tokens are added.
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)

base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,      # stream weights in to keep peak CPU RAM low
    return_dict=True,
    torch_dtype=torch.float16,   # load weights in half precision
    device_map="auto",           # let accelerate place layers on the available GPU(s)
)

# Sample by default so generate() uses temperature instead of greedy decoding.
base_model.generation_config.do_sample = True

# Reuse the UNK token as the pad token and pad on the right.
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = "right"


def get_completion_merged(query: str, model, tokenizer) -> str:
    """Generate a completion for a single query using the merged model."""
    device = "cuda:0"  # inputs are placed on the first GPU

    # OpenChat / GPT4-Correct prompt format expected by the fine-tuned model.
    prompt_template = "GPT4 Correct User: {query}<|end_of_turn|>GPT4 Correct Assistant:"
    prompt = prompt_template.format(query=query)

    # add_special_tokens=True prepends BOS (and appends EOS, since the tokenizer was
    # loaded with add_eos_token=True).
    encodeds = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
    model_inputs = encodeds.to(device)

    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=1000,
        do_sample=True,
        temperature=0.5,
        pad_token_id=tokenizer.unk_token_id,
    )
    decoded = tokenizer.batch_decode(generated_ids)
    return decoded[0]


# Simple interactive loop: read a query, generate, and print the decoded output.
while True:
    q = input("q : ")
    result = get_completion_merged(query=q, model=base_model, tokenizer=tokenizer)
    print(result)