rodrigomasini committed
Commit 59bf219
1 Parent(s): 22bcba4

Update app_v1.py

Files changed (1)
  1. app_v1.py +33 -31
app_v1.py CHANGED
@@ -1,13 +1,12 @@
 import streamlit as st
 from transformers import AutoTokenizer
-from auto_gptq import AutoGPTQForCausalLM
+from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
 from huggingface_hub import snapshot_download
-
 import os
 import threading
 
 cwd = os.getcwd()
-cachedir = cwd+'/cache'
+cachedir = cwd + '/cache'
 
 # Check if the directory exists before creating it
 if not os.path.exists(cachedir):
@@ -17,37 +16,31 @@ os.environ['HF_HOME'] = cachedir
 
 local_folder = cachedir + "/model"
 
-
 quantized_model_dir = "FPHam/Jackson_The_Formalizer_V2_13b_GPTQ"
-
 snapshot_download(repo_id=quantized_model_dir, local_dir=local_folder, local_dir_use_symlinks=True)
 
 model_basename = cachedir + "/model/Jackson2-4bit-128g-GPTQ"
 
-class QuantizedModel:
-    def __init__(self, model_dir):
-        self.tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=False)
-        self.model = AutoGPTQForCausalLM.from_quantized(
-            model_dir,
-            use_safetensors=True,
-            strict=False,
-            device="cuda:0",
-            use_triton=False
-        )
-
-    def generate(self, prompt, max_new_tokens=512, temperature=0.1, top_p=0.95, repetition_penalty=1.15):
-        inputs = self.tokenizer(prompt, return_tensors="pt")
-        outputs = self.model.generate(
-            input_ids=inputs['input_ids'].to("cuda:0"),
-            attention_mask=inputs['attention_mask'].to("cuda:0"),
-            max_length=max_new_tokens + inputs['input_ids'].size(-1),
-            temperature=temperature,
-            top_p=top_p,
-            repetition_penalty=repetition_penalty
-        )
-        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-quantized_model = QuantizedModel(local_folder)
+use_strict = False
+use_triton = False
+
+tokenizer = AutoTokenizer.from_pretrained(local_folder, use_fast=False)
+
+quantize_config = BaseQuantizeConfig(
+    bits=4,
+    group_size=128,
+    desc_act=False
+)
+
+model = AutoGPTQForCausalLM.from_quantized(
+    local_folder,
+    use_safetensors=True,
+    strict=use_strict,
+    model_basename=model_basename,
+    device="cuda:0",
+    use_triton=use_triton,
+    quantize_config=quantize_config
+)
 
 user_input = st.text_input("Input a phrase")
 
@@ -55,5 +48,14 @@ prompt_template = f'USER: {user_input}\nASSISTANT:'
 
 # Generate output when the "Generate" button is pressed
 if st.button("Generate the prompt"):
-    output = quantized_model.generate(prompt_template)
-    st.text_area("Prompt", value=output)
+    inputs = tokenizer(prompt_template, return_tensors="pt")
+    outputs = model.generate(
+        input_ids=inputs.input_ids.to("cuda:0"),
+        attention_mask=inputs.attention_mask.to("cuda:0"),
+        max_length=512 + inputs.input_ids.size(-1),
+        temperature=0.1,
+        top_p=0.95,
+        repetition_penalty=1.15
+    )
+    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    st.text_area("Prompt", value=generated_text)
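One practical note on the rewritten loading code: Streamlit re-executes the entire script on every widget interaction, so as committed the snapshot download, the tokenizer load, and the quantized-model load all run again each time the button is pressed. A minimal sketch of one way to avoid that, assuming Streamlit 1.18+ (where st.cache_resource is available) and the same auto_gptq API as above; the load_model helper is hypothetical:

import streamlit as st
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

@st.cache_resource  # runs once per process; reused across reruns and sessions
def load_model(local_folder: str, model_basename: str):
    tokenizer = AutoTokenizer.from_pretrained(local_folder, use_fast=False)
    quantize_config = BaseQuantizeConfig(bits=4, group_size=128, desc_act=False)
    model = AutoGPTQForCausalLM.from_quantized(
        local_folder,
        use_safetensors=True,
        model_basename=model_basename,
        device="cuda:0",
        use_triton=False,
        quantize_config=quantize_config,
    )
    return tokenizer, model

tokenizer, model = load_model(local_folder, model_basename)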
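A smaller, more tentative observation about the same from_quantized call: in the auto_gptq releases of this era, model_basename is the checkpoint file stem relative to the model directory, not a full path; the library joins it onto the first argument and appends the weights extension itself. The absolute path used in the commit can still resolve on POSIX, since os.path.join discards the leading directory when the second component is absolute, but the relative stem is the documented form:

# Hypothetical tweak, not in the commit: the stem of
# Jackson2-4bit-128g-GPTQ.safetensors inside local_folder.
model_basename = "Jackson2-4bit-128g-GPTQ"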
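Finally, two caveats about the new generate call, based on transformers' documented behavior rather than anything in this repo: temperature and top_p only take effect when do_sample=True is passed (the default is greedy decoding, under which they are ignored; newer versions warn about this), and decoding outputs[0] returns the prompt plus the completion, so the text area shows the full "USER: ... ASSISTANT: ..." string rather than just the reply. A sketch addressing both, with max_new_tokens standing in for the manual max_length arithmetic:

inputs = tokenizer(prompt_template, return_tensors="pt").to("cuda:0")
outputs = model.generate(
    **inputs,
    max_new_tokens=512,
    do_sample=True,           # without this, temperature/top_p are ignored
    temperature=0.1,
    top_p=0.95,
    repetition_penalty=1.15,
)
# outputs[0] holds the prompt tokens followed by the completion;
# slice off the prompt so only the model's reply is displayed.
completion_ids = outputs[0][inputs["input_ids"].shape[-1]:]
generated_text = tokenizer.decode(completion_ids, skip_special_tokens=True)
st.text_area("Prompt", value=generated_text)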