rodrigomasini committed
Commit 9fc5e2e · 1 Parent(s): dacf75f
Update app_v4.py
app_v4.py CHANGED
@@ -63,10 +63,11 @@ if model_loaded:
     gpu_memory_after = get_gpu_memory()
     st.write(f"GPU Memory Info after loading the model: {gpu_memory_after}")
 
-
-    col1
-
-
+    col1, col2 = st.columns(2)
+    with col1:
+        user_input = st.text_input("Input a phrase")
+    with col2:
+        max_token = st.number_input(label="Select max number of generated tokens", min_value=1, max_value=1024, value=50, step=5)
 
     # Generate button
     if st.button("Generate the prompt"):
@@ -74,15 +75,17 @@ if model_loaded:
             prompt_template = f'USER: {user_input}\nASSISTANT:'
             inputs = tokenizer(prompt_template, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
             inputs = inputs.to(device) # Move inputs to the same device as model
-
+
+            # Generate text using torch.inference_mode for better performance during inference
             with torch.inference_mode():
-                output = model.generate(**inputs, max_new_tokens=max_token)
-
-            #
-
-
-
-
+                output = model.generate(**inputs, max_new_tokens=max_token, num_return_sequences=2)
+
+            # Display generated texts
+            for i in range(2): # Loop through the number of return sequences
+                output_ids_cut = output[i, inputs["input_ids"].shape[1]:]
+                generated_text = tokenizer.decode(output_ids_cut, skip_special_tokens=True)
+                st.markdown(f"**Generated Text {i+1}:**\n{generated_text}")
+
         except RuntimeError as e:
             if 'CUDA out of memory' in str(e):
                 st.error("CUDA out of memory during generation. Try reducing the input length or restarting the app.")
@@ -93,4 +96,8 @@ if model_loaded:
             # Log the error and re-raise it
            with open('error_log.txt', 'a') as f:
                 f.write(traceback.format_exc())
-            raise e
+            raise e
+
+        # Display GPU memory information after generation
+        gpu_memory_after_generation = get_gpu_memory()
+        st.write(f"GPU Memory Info after generation: {gpu_memory_after_generation}")