import time

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Cache of already-loaded models and tokenizers, keyed by Hub repo id,
# so repeated calls do not reload the weights from disk.
loaded_hf_models = {}

def complete_text_hf(message,
                     model="huggingface/codellama/CodeLlama-7b-hf",
                     max_tokens=2000,
                     temperature=0.5,
                     json_object=False,
                     max_retry=1,
                     sleep_time=0,
                     stop_sequences=None,  # accepted for interface parity; not used below
                     **kwargs):
    if json_object:
        message = "You are a helpful assistant designed to output in JSON format." + message
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Drop the leading "huggingface/" prefix to obtain the Hub repo id.
    model = model.split("/", 1)[1]
    if model in loaded_hf_models:
        hf_model, tokenizer = loaded_hf_models[model]
    else:
        hf_model = AutoModelForCausalLM.from_pretrained(model).to(device)
        tokenizer = AutoTokenizer.from_pretrained(model)
        loaded_hf_models[model] = (hf_model, tokenizer)
    encoded_input = tokenizer(message,
                              return_tensors="pt",
                              return_token_type_ids=False).to(device)
    for cnt in range(max_retry):
        try:
            output = hf_model.generate(
                **encoded_input,
                temperature=temperature,
                max_new_tokens=max_tokens,
                do_sample=True,
                return_dict_in_generate=True,
                output_scores=True,
                **kwargs,
            )
            sequences = output.sequences
            # Keep only the newly generated tokens by slicing off the prompt.
            sequences = [sequence[len(encoded_input.input_ids[0]):] for sequence in sequences]
            all_decoded_text = tokenizer.batch_decode(sequences)
            completion = all_decoded_text[0]
            return completion
        except Exception as e:
            print(cnt, "=>", e)
            # Re-raise only after the final attempt; otherwise back off and retry.
            if cnt == max_retry - 1:
                raise e
            time.sleep(sleep_time)
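
# Example usage: a minimal sketch, assuming the CodeLlama-7b checkpoint can be
# downloaded from the Hugging Face Hub and that enough CPU/GPU memory is
# available to hold it. The prompt and generation settings below are
# illustrative, not part of the original snippet.
if __name__ == "__main__":
    prompt = "def fibonacci(n):"
    completion = complete_text_hf(prompt,
                                  model="huggingface/codellama/CodeLlama-7b-hf",
                                  max_tokens=128,
                                  temperature=0.5)
    print(completion)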