rodrigomasini committed
Commit 37f4fec
Parent(s): 70d3e9b

Update app_v3.py

Files changed (1)
  1. app_v3.py +32 -32
app_v3.py CHANGED
@@ -7,38 +7,37 @@ import gc
 
  # Define pretrained and quantized model directories
  pretrained_model_dir = "FPHam/Jackson_The_Formalizer_V2_13b_GPTQ"
- cwd = os.getcwd()
+ #cwd = os.getcwd()
 
- quantized_model_dir = cwd + "/Jackson2-4bit-128g-GPTQ"
+ #quantized_model_dir = cwd + "/Jackson2-4bit-128g-GPTQ"
 
  # Check if the model directory is empty (i.e., model not downloaded yet)
- if not os.path.exists(quantized_model_dir) or not os.listdir(quantized_model_dir):
+ #if not os.path.exists(quantized_model_dir) or not os.listdir(quantized_model_dir):
      # Create the cache directory if it doesn't exist
-     os.makedirs(quantized_model_dir, exist_ok=True)
-     snapshot_download(repo_id=pretrained_model_dir, local_dir=quantized_model_dir, local_dir_use_symlinks=True)
+ #    os.makedirs(quantized_model_dir, exist_ok=True)
+ #    snapshot_download(repo_id=pretrained_model_dir, local_dir=quantized_model_dir, local_dir_use_symlinks=True)
 
- st.write(f'{os.listdir(quantized_model_dir)}')
- model_name_or_path = quantized_model_dir
- model_basename = "Jackson2-4bit-128g-GPTQ"
+ #st.write(f'{os.listdir(quantized_model_dir)}')
+ #model_name_or_path = quantized_model_dir
+ #model_basename = "Jackson2-4bit-128g-GPTQ"
 
  #os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 
  # Before allocating or loading the model, clear up memory
- gc.collect()
- torch.cuda.empty_cache()
+ #gc.collect()
+ #torch.cuda.empty_cache()
 
  use_triton = False
 
- tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True, legacy=False)
-
+ #tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True, legacy=False)
+ tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
  model = AutoGPTQForCausalLM.from_quantized(
-     model_name_or_path,
-     model_basename=model_basename,
+     pretrained_model_dir,
+     #model_basename=model_basename,
      use_safetensors=True,
-     trust_remote_code=True,
      device="cuda:0",
-     use_triton=use_triton,
-     quantize_config=None
+     #use_triton=use_triton,
+     #quantize_config=None
  )
 
  user_input = st.text_input("Input a phrase")
@@ -46,18 +45,19 @@ user_input = st.text_input("Input a phrase")
  prompt_template = f'USER: {user_input}\nASSISTANT:'
 
  if st.button("Generate the prompt"):
-     input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
-     streamer = TextStreamer(tokenizer)
-     pipe = pipeline(
-         "text-generation",
-         model=model,
-         tokenizer=tokenizer,
-         streamer=streamer,
-         max_new_tokens=512,
-         temperature=0.2,
-         top_p=0.95,
-         repetition_penalty=1.15
-     )
-     # You had called pipe(prompt_template) twice which was unnecessary. Just call it once.
-     output = pipe(prompt_template)
-     st.write(output[0]['generated_text'])
+     inputs = tokenizer(prompt_template, return_tensors='pt')
+     #streamer = TextStreamer(tokenizer)
+     #pipe = pipeline(
+     #    "text-generation",
+     #    model=model,
+     #    tokenizer=tokenizer,
+     #    streamer=streamer,
+     #    max_new_tokens=512,
+     #    temperature=0.2,
+     #    top_p=0.95,
+     #    repetition_penalty=1.15
+     #)
+
+     output = model.generate(**prompt_template)
+     st.markdown(f"tokenizer.decode(output)")
+     #st.write(output[0]['generated_text'])
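Collapsing the commented-out lines above, the loading path this revision leaves active looks roughly like the sketch below. The imports are assumptions (they sit above this hunk and are not shown in the diff); the identifiers match the committed code.

import streamlit as st
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

pretrained_model_dir = "FPHam/Jackson_The_Formalizer_V2_13b_GPTQ"

# Tokenizer and 4-bit GPTQ weights are now pulled straight from the Hub repo,
# instead of being snapshot_download-ed into a local Jackson2-4bit-128g-GPTQ directory.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
model = AutoGPTQForCausalLM.from_quantized(
    pretrained_model_dir,
    use_safetensors=True,
    device="cuda:0",
)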
 
 
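Two issues in the new generation block are worth flagging: model.generate(**prompt_template) tries to **-unpack a plain string and will raise a TypeError, and st.markdown(f"tokenizer.decode(output)") renders the literal text of the f-string rather than the decoded output. A minimal corrected sketch, reusing the tokenizer, model, and prompt_template defined above and keeping the removed pipeline's max_new_tokens=512:

if st.button("Generate the prompt"):
    # Tokenize the prompt and move the tensors to the model's GPU.
    inputs = tokenizer(prompt_template, return_tensors='pt').to("cuda:0")
    # Unpack the tokenizer output (input_ids, attention_mask), not the prompt string.
    output = model.generate(**inputs, max_new_tokens=512)
    # Decode the generated ids and render the text itself, not a literal f-string.
    st.markdown(tokenizer.decode(output[0], skip_special_tokens=True))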