import streamlit as st
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
import torch
import subprocess
import traceback

# Query nvidia-smi for free/total GPU memory (in MiB), one entry per GPU
def get_gpu_memory():
    try:
        result = subprocess.check_output(["nvidia-smi", "--query-gpu=memory.free,memory.total", "--format=csv,nounits,noheader"], text=True)
        memory_info = [x.split(',') for x in result.strip().split('\n')]
        memory_info = [{"free": int(x[0].strip()), "total": int(x[1].strip())} for x in memory_info]
    except (FileNotFoundError, subprocess.CalledProcessError):
        # nvidia-smi is missing or failed (e.g. no GPU available); report placeholders instead
        memory_info = [{"free": "N/A", "total": "N/A"}]
    return memory_info

# Display GPU memory information before loading the model
gpu_memory_before = get_gpu_memory()
st.write(f"GPU Memory Info before loading the model: {gpu_memory_before}")

# Define pretrained model directory
pretrained_model_dir = "FPHam/Jackson_The_Formalizer_V2_13b_GPTQ"

# Check if CUDA is available and get the device
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Before allocating or loading the model, clear up memory if CUDA is available
if device == "cuda:0":
    torch.cuda.empty_cache()

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token  # Ensure padding token is set correctly for the model

# Cache the loaded model across Streamlit reruns so it is only loaded once
@st.cache_resource
def load_gptq_model():
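    """Load the 4-bit GPTQ-quantized model onto the selected device."""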
    model = AutoGPTQForCausalLM.from_quantized(
        pretrained_model_dir,
        model_basename="Jackson2-4bit-128g-GPTQ",
        use_safetensors=True,
        device=device,
        disable_exllamav2=True  # do not use the ExLlamaV2 kernel backend
    )
    model.eval()  # Set the model to inference mode
    return model

# Track whether loading succeeded so the UI only renders with a usable model
model_loaded = False
# Attempt to load the model, catch any OOM errors
try:
    model = load_gptq_model()
    model_loaded = True
except RuntimeError as e:
    if 'CUDA out of memory' in str(e):
        st.error("CUDA out of memory while loading the model. Try reducing the model size or restarting the app.")
        st.stop()
    else:
        raise

if model_loaded:
    # Display GPU memory information after loading the model
    gpu_memory_after = get_gpu_memory()
    st.write(f"GPU Memory Info after loading the model: {gpu_memory_after}")

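    # Two-column layout: prompt input on the left, max-token selector on the right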
    col1, col2 = st.columns(2)
    with col1:
        user_input = st.text_input("Input a phrase")
    with col2:
        max_token = st.number_input(label="Select max number of generated tokens", min_value=1, max_value=512, value=50, step=5)

    # Generate button
    if st.button("Generate the prompt"):
        try:
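            # Wrap the user input in a USER:/ASSISTANT: prompt template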
            prompt_template = f'USER: {user_input}\nASSISTANT:'
            # Tokenize the prompt; a single prompt needs no padding, and right-padding
            # a causal LM before generate() degrades the output
            inputs = tokenizer(prompt_template, return_tensors='pt', max_length=512, truncation=True)
            inputs = inputs.to(device)  # Move inputs to the same device as the model
            # Generate text using torch.inference_mode for better performance during inference
            with torch.inference_mode():
                output = model.generate(**inputs, max_new_tokens=max_token)
            
            # Cut the tokens at the input length to display only the generated text
            output_ids_cut = output[:, inputs["input_ids"].shape[1]:]
            generated_text = tokenizer.decode(output_ids_cut[0], skip_special_tokens=True)
            
            st.markdown(f"**Generated Text:**\n\n{generated_text}")
        except RuntimeError as e:
            if 'CUDA out of memory' in str(e):
                st.error("CUDA out of memory during generation. Try reducing the input length or restarting the app.")
                # Log the detailed error message
                with open('error_log.txt', 'a') as f:
                    f.write(traceback.format_exc())
            else:
                # Log the error and re-raise it
                with open('error_log.txt', 'a') as f:
                    f.write(traceback.format_exc())
                raise

        # Display GPU memory information after generation
        gpu_memory_after_generation = get_gpu_memory()
        st.write(f"GPU Memory Info after generation: {gpu_memory_after_generation}")