import streamlit as st
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
import torch
import subprocess
import traceback

# Function to get memory info
def get_gpu_memory():
    try:
        result = subprocess.check_output(["nvidia-smi", "--query-gpu=memory.free,memory.total", "--format=csv,nounits,noheader"], text=True)
        memory_info = [x.split(',') for x in result.strip().split('\n')]
        memory_info = [{"free": int(x[0].strip()), "total": int(x[1].strip())} for x in memory_info]
    except (FileNotFoundError, subprocess.CalledProcessError):
        # nvidia-smi is missing or failed (e.g. no NVIDIA driver); report placeholders
        memory_info = [{"free": "N/A", "total": "N/A"}]
    return memory_info
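
# Example return value of get_gpu_memory() on a single-GPU machine
# (illustrative numbers only; nvidia-smi reports memory in MiB):
#   [{"free": 10240, "total": 24576}]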

# Display GPU memory information before loading the model
gpu_memory_before = get_gpu_memory()
st.write(f"GPU Memory Info before loading the model: {gpu_memory_before}")

# Define pretrained model directory
pretrained_model_dir = "FPHam/Jackson_The_Formalizer_V2_13b_GPTQ"

# Check if CUDA is available and get the device
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Before allocating or loading the model, clear up memory if CUDA is available
if device == "cuda:0":
    torch.cuda.empty_cache()

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=False)

# Attempt to load the model, catch any OOM errors
model_loaded = False
try:
    model = AutoGPTQForCausalLM.from_quantized(
        pretrained_model_dir,
        # model_basename should match the quantized weights filename in the repo
        model_basename="Jackson2-4bit-128g-GPTQ",
        use_safetensors=True,
        device=device
    )
    model.eval()  # Set the model to inference mode
    model_loaded = True
except RuntimeError as e:
    if 'CUDA out of memory' in str(e):
        st.error("CUDA out of memory while loading the model. Try reducing the model size or restarting the app.")
        st.stop()
    else:
        raise  # re-raise unexpected errors, preserving the original traceback

if model_loaded:
    # Display GPU memory information after loading the model
    gpu_memory_after = get_gpu_memory()
    st.write(f"GPU Memory Info after loading the model: {gpu_memory_after}")

    # User input for the model
    user_input = st.text_input("Input a phrase")

    # Generate button
    if st.button("Generate the prompt"):
        try:
            prompt_template = f'USER: {user_input}\nASSISTANT:'
            # Truncate only: padding a single prompt is unnecessary, and LLaMA-style
            # tokenizers often lack a pad token, so padding='max_length' would raise
            inputs = tokenizer(prompt_template, return_tensors='pt', max_length=512, truncation=True)
            inputs = inputs.to(device)  # Move inputs to the same device as the model
            # Bound the response length; generate() otherwise stops at its short
            # default max_length (256 here is an arbitrary cap)
            output = model.generate(**inputs, max_new_tokens=256)
            # Skip special tokens so BOS/EOS markers do not appear in the output
            st.markdown(f"**Generated Text:**\n{tokenizer.decode(output[0], skip_special_tokens=True)}")
        except RuntimeError as e:
            if 'CUDA out of memory' in str(e):
                st.error("CUDA out of memory during generation. Try reducing the input length or restarting the app.")
                # Log the detailed error message
                with open('error_log.txt', 'a') as f:
                    f.write(traceback.format_exc())
            else:
                # Log the error and re-raise it
                with open('error_log.txt', 'a') as f:
                    f.write(traceback.format_exc())
                raise e
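
# Usage sketch (assumes this script is saved as app.py and that streamlit,
# torch, transformers, and auto-gptq are installed):
#   streamlit run app.py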