rodrigomasini committed
Commit 24eb0d4
1 Parent(s): 3b33c19

Update app_v4.py

Files changed (1)
  1. app_v4.py +30 -26
app_v4.py CHANGED
@@ -33,43 +33,47 @@ if device == "cuda:0":
 tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=False)
 
 # Attempt to load the model, catch any OOM errors
+model_loaded = False
 try:
     model = AutoGPTQForCausalLM.from_quantized(
         pretrained_model_dir,
         model_basename="Jackson2-4bit-128g-GPTQ",
         use_safetensors=True,
-        device=device,
-        max_memory={0: "15GIB"}
+        device=device
     )
+    model.eval()  # Set the model to inference mode
+    model_loaded = True
 except RuntimeError as e:
     if 'CUDA out of memory' in str(e):
-        st.error("CUDA out of memory while loading the model. Try reducing the model size or input length.")
+        st.error("CUDA out of memory while loading the model. Try reducing the model size or restarting the app.")
         st.stop()
     else:
         raise e
 
-# Display GPU memory information after loading the model
-gpu_memory_after = get_gpu_memory()
-st.write(f"GPU Memory Info after loading the model: {gpu_memory_after}")
+if model_loaded:
+    # Display GPU memory information after loading the model
+    gpu_memory_after = get_gpu_memory()
+    st.write(f"GPU Memory Info after loading the model: {gpu_memory_after}")
 
-# User input for the model
-user_input = st.text_input("Input a phrase")
+    # User input for the model
+    user_input = st.text_input("Input a phrase")
 
-# Generate button
-if st.button("Generate the prompt"):
-    try:
-        prompt_template = f'USER: {user_input}\nASSISTANT:'
-        inputs = tokenizer(prompt_template, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
-        output = model.generate(**inputs)
-        st.markdown(f"**Generated Text:**\n{tokenizer.decode(output[0])}")
-    except RuntimeError as e:
-        if 'CUDA out of memory' in str(e):
-            st.error("CUDA out of memory during generation. Try reducing the input length.")
-            # Log the detailed error message
-            with open('error_log.txt', 'a') as f:
-                f.write(traceback.format_exc())
-        else:
-            # Log the error and re-raise it
-            with open('error_log.txt', 'a') as f:
-                f.write(traceback.format_exc())
-            raise e
+    # Generate button
+    if st.button("Generate the prompt"):
+        try:
+            prompt_template = f'USER: {user_input}\nASSISTANT:'
+            inputs = tokenizer(prompt_template, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
+            inputs = inputs.to(device)  # Move inputs to the same device as model
+            output = model.generate(**inputs)
+            st.markdown(f"**Generated Text:**\n{tokenizer.decode(output[0])}")
+        except RuntimeError as e:
+            if 'CUDA out of memory' in str(e):
+                st.error("CUDA out of memory during generation. Try reducing the input length or restarting the app.")
+                # Log the detailed error message
+                with open('error_log.txt', 'a') as f:
+                    f.write(traceback.format_exc())
+            else:
+                # Log the error and re-raise it
+                with open('error_log.txt', 'a') as f:
+                    f.write(traceback.format_exc())
+                raise e
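
Note: the code above calls a get_gpu_memory() helper that is defined earlier in app_v4.py and is not part of this diff. A minimal sketch of what such a helper could look like, assuming PyTorch is available; the actual implementation in app_v4.py may differ:

import torch

def get_gpu_memory():
    # Report free/total memory for each visible CUDA device.
    # Returns a list of human-readable strings; purely illustrative.
    if not torch.cuda.is_available():
        return ["No CUDA device available"]
    info = []
    for i in range(torch.cuda.device_count()):
        free_bytes, total_bytes = torch.cuda.mem_get_info(i)
        info.append(f"cuda:{i}: {free_bytes / 1024**3:.2f} GiB free / {total_bytes / 1024**3:.2f} GiB total")
    return info

A helper in this shape returns one string per device, so st.write(get_gpu_memory()) renders a short list in the Streamlit app after the quantized model has been loaded.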