rodrigomasini committed
Commit 37f4fec
Parent(s): 70d3e9b

Update app_v3.py

Files changed (1)
  1. app_v3.py +32 -32
app_v3.py CHANGED
@@ -7,38 +7,37 @@ import gc
 
  # Define pretrained and quantized model directories
  pretrained_model_dir = "FPHam/Jackson_The_Formalizer_V2_13b_GPTQ"
- cwd = os.getcwd()
+ #cwd = os.getcwd()
 
- quantized_model_dir = cwd + "/Jackson2-4bit-128g-GPTQ"
+ #quantized_model_dir = cwd + "/Jackson2-4bit-128g-GPTQ"
 
  # Check if the model directory is empty (i.e., model not downloaded yet)
- if not os.path.exists(quantized_model_dir) or not os.listdir(quantized_model_dir):
+ #if not os.path.exists(quantized_model_dir) or not os.listdir(quantized_model_dir):
      # Create the cache directory if it doesn't exist
-     os.makedirs(quantized_model_dir, exist_ok=True)
-     snapshot_download(repo_id=pretrained_model_dir, local_dir=quantized_model_dir, local_dir_use_symlinks=True)
+ #    os.makedirs(quantized_model_dir, exist_ok=True)
+ #    snapshot_download(repo_id=pretrained_model_dir, local_dir=quantized_model_dir, local_dir_use_symlinks=True)
 
- st.write(f'{os.listdir(quantized_model_dir)}')
- model_name_or_path = quantized_model_dir
- model_basename = "Jackson2-4bit-128g-GPTQ"
+ #st.write(f'{os.listdir(quantized_model_dir)}')
+ #model_name_or_path = quantized_model_dir
+ #model_basename = "Jackson2-4bit-128g-GPTQ"
 
  #os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 
  # Before allocating or loading the model, clear up memory
- gc.collect()
- torch.cuda.empty_cache()
+ #gc.collect()
+ #torch.cuda.empty_cache()
 
  use_triton = False
 
- tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True, legacy=False)
-
+ #tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True, legacy=False)
+ tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
  model = AutoGPTQForCausalLM.from_quantized(
-     model_name_or_path,
-     model_basename=model_basename,
+     pretrained_model_dir,
+     #model_basename=model_basename,
      use_safetensors=True,
-     trust_remote_code=True,
      device="cuda:0",
-     use_triton=use_triton,
-     quantize_config=None
+     #use_triton=use_triton,
+     #quantize_config=None
  )
 
  user_input = st.text_input("Input a phrase")
@@ -46,18 +45,19 @@ user_input = st.text_input("Input a phrase")
  prompt_template = f'USER: {user_input}\nASSISTANT:'
 
  if st.button("Generate the prompt"):
-     input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
-     streamer = TextStreamer(tokenizer)
-     pipe = pipeline(
-         "text-generation",
-         model=model,
-         tokenizer=tokenizer,
-         streamer=streamer,
-         max_new_tokens=512,
-         temperature=0.2,
-         top_p=0.95,
-         repetition_penalty=1.15
-     )
-     # You had called pipe(prompt_template) twice which was unnecessary. Just call it once.
-     output = pipe(prompt_template)
-     st.write(output[0]['generated_text'])
+     inputs = tokenizer(prompt_template, return_tensors='pt')
+     #streamer = TextStreamer(tokenizer)
+     #pipe = pipeline(
+     #    "text-generation",
+     #    model=model,
+     #    tokenizer=tokenizer,
+     #    streamer=streamer,
+     #    max_new_tokens=512,
+     #    temperature=0.2,
+     #    top_p=0.95,
+     #    repetition_penalty=1.15
+     #)
+
+     output = model.generate(**prompt_template)
+     st.markdown(f"tokenizer.decode(output)")
+     #st.write(output[0]['generated_text'])
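Collapsing the commented-out lines above, the loading path this revision leaves active looks roughly like the sketch below. The imports are assumptions (they sit above this hunk and are not shown in the diff); the identifiers match the committed code.

import streamlit as st
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

pretrained_model_dir = "FPHam/Jackson_The_Formalizer_V2_13b_GPTQ"

# Tokenizer and 4-bit GPTQ weights are now pulled straight from the Hub repo,
# instead of being snapshot_download-ed into a local Jackson2-4bit-128g-GPTQ directory.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
model = AutoGPTQForCausalLM.from_quantized(
    pretrained_model_dir,
    use_safetensors=True,
    device="cuda:0",
)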
 
 
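Two issues in the new generation block are worth flagging: model.generate(**prompt_template) tries to **-unpack a plain string and will raise a TypeError, and st.markdown(f"tokenizer.decode(output)") renders the literal text of the f-string rather than the decoded output. A minimal corrected sketch, reusing the tokenizer, model, and prompt_template defined above and keeping the removed pipeline's max_new_tokens=512:

if st.button("Generate the prompt"):
    # Tokenize the prompt and move the tensors to the model's GPU.
    inputs = tokenizer(prompt_template, return_tensors='pt').to("cuda:0")
    # Unpack the tokenizer output (input_ids, attention_mask), not the prompt string.
    output = model.generate(**inputs, max_new_tokens=512)
    # Decode the generated ids and render the text itself, not a literal f-string.
    st.markdown(tokenizer.decode(output[0], skip_special_tokens=True))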