Spaces:

rodrigomasini
/

rephrase

Paused

App Files Files Community

rodrigomasini commited on Nov 7, 2023

Commit

2995eda

•

1 Parent(s): 3124ebe

Create app_v3.py

Browse files

Files changed (1) hide show

app_v3.py +69 -0

app_v3.py ADDED Viewed

	@@ -0,0 +1,69 @@

+from transformers import AutoTokenizer, TextStreamer, pipeline, logging
+from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
+import time
+model_name_or_path = "TheBloke/llama2_7b_chat_uncensored-GPTQ"
+model_basename = "gptq_model-4bit-128g"
+use_triton = False
+tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True, legacy=False)
+model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
+        model_basename=model_basename,
+        use_safetensors=True,
+        trust_remote_code=True,
+        device="cuda:0",
+        use_triton=use_triton,
+        quantize_config=None)
+"""
+To download from a specific branch, use the revision parameter, as in this example:
+model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
+        revision="gptq-4bit-32g-actorder_True",
+        model_basename=model_basename,
+        use_safetensors=True,
+        trust_remote_code=True,
+        device="cuda:0",
+        quantize_config=None)
+"""
+prompt = "Tell me about AI"
+prompt_template=f'''### HUMAN:
+{prompt}
+### RESPONSE:
+'''
+print("\n\n*** Generate:")
+start_time = time.time()
+input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
+streamer = TextStreamer(tokenizer)
+# output = model.generate(inputs=input_ids, temperature=0.7, max_new_tokens=512)
+# print(tokenizer.decode(output[0]))
+_ = model.generate(inputs=input_ids, streamer=streamer, temperature=0.7, max_new_tokens=512)
+print(f"Inference time: {time.time() - start_time:.4f} seconds")
+# Inference can also be done using transformers' pipeline
+# Prevent printing spurious transformers error when using pipeline with AutoGPTQ
+logging.set_verbosity(logging.CRITICAL)
+print("*** Pipeline:")
+start_time = time.time()
+pipe = pipeline(
+    "text-generation",
+    model=model,
+    tokenizer=tokenizer,
+    streamer=streamer,
+    max_new_tokens=512,
+    temperature=0.7,
+    top_p=0.95,
+    repetition_penalty=1.15
+)
+pipe(prompt_template)
+#print(pipe(prompt_template)[0]['generated_text'])
+print(f"Inference time: {time.time() - start_time:.4f} seconds")