################### WHATEVER IT TAKES PHASE ###################################
import streamlit as st
from transformers import AutoTokenizer, TextStreamer, pipeline
from auto_gptq import AutoGPTQForCausalLM
from huggingface_hub import snapshot_download
import os
import torch
import subprocess

# os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'  # => only worth setting with more than one GPU, since it tries to split allocations
#os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"  # => only relevant with multiple GPUs
#os.environ["CUDA_VISIBLE_DEVICES"]="0,1"  # => example list of device indices to expose


# Pretrained (already-quantized) GPTQ model repo on the Hugging Face Hub; the local quantized_model_dir logic below is disabled
pretrained_model_dir = "FPHam/Jackson_The_Formalizer_V2_13b_GPTQ"
#cwd = os.getcwd()

#quantized_model_dir = cwd + "/Jackson2-4bit-128g-GPTQ"

# Check if the model directory is empty (i.e., model not downloaded yet)
#if not os.path.exists(quantized_model_dir) or not os.listdir(quantized_model_dir):
    # Create the cache directory if it doesn't exist
#    os.makedirs(quantized_model_dir, exist_ok=True)
#    snapshot_download(repo_id=pretrained_model_dir, local_dir=quantized_model_dir, local_dir_use_symlinks=True)

#st.write(f'{os.listdir(quantized_model_dir)}')
#model_name_or_path = quantized_model_dir
model_basename = "Jackson2-4bit-128g-GPTQ"

# Before allocating or loading the model, clear up memory
#gc.collect()
#torch.cuda.empty_cache()

use_triton = False

if torch.cuda.is_available():
    torch.cuda.empty_cache()

#tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True, legacy=False)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
device = "cuda:0" if torch.cuda.is_available() else "cpu"  # simplest explicit placement short of the "auto" device map
model = AutoGPTQForCausalLM.from_quantized(
    pretrained_model_dir,
    model_basename=model_basename,
    use_safetensors=True,
    device=device,
    max_memory={0: "10GIB"}
)


viz = torch.cuda.memory_summary()
st.write(viz)

# Show the GPU status; if nvidia-smi is not available (CPU-only host), list the working directory instead
def run():
    output: str = ""
    try:
        output = subprocess.check_output(["nvidia-smi"], text=True)
    except FileNotFoundError:
        output = subprocess.check_output(["ls", "-alh"], text=True)
    return output
st.code(run())
user_input = st.text_input("Input a phrase")

prompt_template = f'USER: {user_input}\nASSISTANT:'

if st.button("Generate the prompt"):
    # tokenize the single prompt and move the tensors to the model's device
    inputs = tokenizer(prompt_template, return_tensors='pt', max_length=512, truncation=True).to(device)
    #inputs = tokenizer(prompt_template, return_tensors='pt')
    #streamer = TextStreamer(tokenizer)
    #pipe = pipeline(
    #    "text-generation",
    #    model=model,
    #    tokenizer=tokenizer,
    #    streamer=streamer,
    #    max_new_tokens=512,
    #    temperature=0.2,
    #    top_p=0.95,
    #    repetition_penalty=1.15
    #)
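    # Alternative sketch (not the path used below): if the streaming pipeline above were
    # re-enabled, generation could go through it instead of model.generate, e.g.:
    #     output = pipe(prompt_template)
    #     st.write(output[0]['generated_text'])
    # The TextStreamer would then also print tokens to stdout as they are produced.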

    output = model.generate(**inputs, max_new_tokens=512)
    st.markdown(tokenizer.decode(output[0], skip_special_tokens=True))
    #st.write(output[0]['generated_text'])
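
# Usage sketch (assumption: the script's filename is not given in the original; app.py is a guess):
#     streamlit run app.py
# On the first request the GPTQ weights are fetched from the Hugging Face Hub and cached
# locally, so the initial model load dominates startup time; later runs reuse the cache.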