# Disabled inference path: load the Merak-7B-v3 Mini-Orca Indo GPTQ model with
# AutoGPTQ and expose predict(), which wraps a prompt in a SYSTEM/USER/ASSISTANT
# template and generates a response. Kept commented out for reference.

# from transformers import AutoTokenizer, pipeline, logging
# from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

# model_name_or_path = "asyafiqe/Merak-7B-v3-Mini-Orca-Indo-GPTQ"
# model_basename = "Merak-7B-v3-Mini-Orca-Indo-GPTQ"

# use_triton = False

# tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

# model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
#                                            model_basename=model_basename,
#                                            use_safetensors=True,
#                                            trust_remote_code=True,
#                                            device="cuda:0",
#                                            use_triton=use_triton,
#                                            quantize_config=None)

# def predict(prompt):
#     # prompt = "Buat rencana untuk menghemat listrik di rumah"
#     # System message (Indonesian): "You are an AI assistant. You will be given a
#     # task. You must produce a detailed and long answer."
#     system_message = "Anda adalah asisten AI. Anda akan diberi tugas. Anda harus menghasilkan jawaban yang rinci dan panjang.\n"
#     prompt_template = f'''SYSTEM: {system_message}
# USER: {prompt}
# ASSISTANT: '''
#
#     print("\n\n*** Generate:")
#
#     input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
#     # Note: recent transformers releases also require do_sample=True for
#     # temperature to take effect; without it, generation is greedy.
#     output = model.generate(inputs=input_ids, temperature=0.7, max_new_tokens=512)
#     print(tokenizer.decode(output[0]))
#
#     # Inference can also be done using transformers' pipeline
#     # Prevent printing spurious transformers error when using pipeline with AutoGPTQ
#     logging.set_verbosity(logging.CRITICAL)
#
#     print("*** Pipeline:")
#     pipe = pipeline(
#         "text-generation",
#         model=model,
#         tokenizer=tokenizer,
#         max_new_tokens=512,
#         temperature=0.7,
#         top_p=0.95,
#         repetition_penalty=1.15
#     )
#
#     result = pipe(prompt_template)[0]['generated_text']
#     return result
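
# Usage sketch (an assumption, not part of the original file): if the block above
# were re-enabled on a machine with a CUDA GPU and the auto-gptq package installed,
# predict() could be called with the example prompt commented above, which is
# Indonesian for "Make a plan to save electricity at home".
#
# if __name__ == "__main__":
#     answer = predict("Buat rencana untuk menghemat listrik di rumah")
#     print(answer)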