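"""Streamlit demo for the FPHam/Jackson_The_Formalizer_V2_13b_GPTQ model.

Loads the 4-bit GPTQ checkpoint with AutoGPTQ, reports free/total GPU memory
via nvidia-smi before and after loading, and generates text from a simple
USER/ASSISTANT prompt template.
"""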
import subprocess
import traceback

import streamlit as st
import torch
from transformers import AutoTokenizer, pipeline
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

# Function to get memory info
def get_gpu_memory():
    try:
        result = subprocess.check_output(["nvidia-smi", "--query-gpu=memory.free,memory.total", "--format=csv,nounits,noheader"], text=True)
        memory_info = [x.split(',') for x in result.strip().split('\n')]
        memory_info = [{"free": int(x[0].strip()), "total": int(x[1].strip())} for x in memory_info]
    except (FileNotFoundError, subprocess.CalledProcessError):
        # nvidia-smi is missing or failed (e.g. no GPU on this machine)
        memory_info = [{"free": "N/A", "total": "N/A"}]
    return memory_info

# Display GPU memory information before loading the model
gpu_memory_before = get_gpu_memory()
st.write(f"GPU Memory Info before loading the model: {gpu_memory_before}")

# Define pretrained model directory
pretrained_model_dir = "FPHam/Jackson_The_Formalizer_V2_13b_GPTQ"

# Check if CUDA is available and get the device
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Before allocating or loading the model, clear up memory if CUDA is available
if device == "cuda:0":
    torch.cuda.empty_cache()

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token  # Ensure padding token is set correctly for the model

# Cache the loaded model so Streamlit reruns don't reload it from disk
@st.cache_resource
def load_gptq_model():
    model = AutoGPTQForCausalLM.from_quantized(
        pretrained_model_dir,
        model_basename="Jackson2-4bit-128g-GPTQ",
        use_safetensors=True,
        device=device,
        disable_exllamav2=True
    )
    model.eval()  # Set the model to inference mode
    return model

model_loaded = False
# Attempt to load the model, catch any OOM errors
try:
    model = load_gptq_model()
    model_loaded = True
except RuntimeError as e:
    if 'CUDA out of memory' in str(e):
        st.error("CUDA out of memory while loading the model. Try reducing the model size or restarting the app.")
        st.stop()
    else:
        raise e

if model_loaded:
    # Display GPU memory information after loading the model
    gpu_memory_after = get_gpu_memory()
    st.write(f"GPU Memory Info after loading the model: {gpu_memory_after}")

    col1, col2 = st.columns(2)
    with col1:
        user_input = st.text_input("Input a phrase")
    with col2:
        max_token = st.number_input(label="Select max number of generated tokens", min_value=1, max_value=512, value=50, step=5)

    # Generate button
    if st.button("Generate the prompt"):
        try:
            prompt_template = f'USER: {user_input}\nASSISTANT:'
            inputs = tokenizer(prompt_template, return_tensors='pt', max_length=512, truncation=True)  # single prompt, so no padding; right-padding hurts decoder-only generation
            inputs = inputs.to(device)  # Move inputs to the same device as model
            # Generate text using torch.inference_mode for better performance during inference
            with torch.inference_mode():
                output = model.generate(**inputs, max_new_tokens=max_token)

            # Cut the tokens at the input length to display only the generated text
            output_ids_cut = output[:, inputs["input_ids"].shape[1]:]
            generated_text = tokenizer.decode(output_ids_cut[0], skip_special_tokens=True)
            
            st.markdown(f"**Generated Text:**\n\n{generated_text}")  # blank line so markdown renders the text on its own line
        except RuntimeError as e:
            if 'CUDA out of memory' in str(e):
                st.error("CUDA out of memory during generation. Try reducing the input length or restarting the app.")
                # Log the detailed error message
                with open('error_log.txt', 'a') as f:
                    f.write(traceback.format_exc())
            else:
                # Log the error and re-raise it
                with open('error_log.txt', 'a') as f:
                    f.write(traceback.format_exc())
                raise e

        # Display GPU memory information after generation
        gpu_memory_after_generation = get_gpu_memory()
        st.write(f"GPU Memory Info after generation: {gpu_memory_after_generation}")

tokenizer = AutoTokenizer.from_pretrained(local_folder, use_fast=False)

quantize_config = BaseQuantizeConfig(
    bits=4,
    group_size=128,
    desc_act=False
)

model = AutoGPTQForCausalLM.from_quantized(
    local_folder,
    use_safetensors=True,
    strict=use_strict,
    model_basename=model_basename,
    device=device,  # reuse the device selected above instead of hard-coding "cuda:0"
    use_triton=use_triton,
    quantize_config=quantize_config
)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    temperature=0.1,
    top_p=0.95,
    repetition_penalty=1.15
)

# Explicit keys keep these widgets from colliding with the identical ones above
user_input = st.text_input("Input a phrase", key="pipeline_input")

prompt_template = f'''USER: {user_input}
ASSISTANT:'''

# Generate output when the "Generate" button is pressed
if st.button("Generate the prompt", key="pipeline_generate"):
    output = pipe(prompt_template)[0]['generated_text']
    st.text_area("Prompt", value=output)