rodrigomasini committed on
Commit 9fc5e2e
1 parent: dacf75f

Update app_v4.py

Files changed (1)
    app_v4.py +20 -13
app_v4.py CHANGED
@@ -63,10 +63,11 @@ if model_loaded:
     gpu_memory_after = get_gpu_memory()
     st.write(f"GPU Memory Info after loading the model: {gpu_memory_after}")
 
-    # User input for the model
-    col1, col2 =st.columns(2)
-    user_input = col1.st.text_input("Input a phrase")
-    max_token = col2.st.number_input(label="Select maxnumber of generated tokens", min_value=1, max_value=1024, value=350, step = 5)
+    col1, col2 = st.columns(2)
+    with col1:
+        user_input = st.text_input("Input a phrase")
+    with col2:
+        max_token = st.number_input(label="Select max number of generated tokens", min_value=1, max_value=1024, value=50, step=5)
 
     # Generate button
     if st.button("Generate the prompt"):
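
Note: the removed calls col1.st.text_input(...) and col2.st.number_input(...) are not valid Streamlit API, since the column objects returned by st.columns have no st attribute. The commit switches to with blocks, which is one of the two idiomatic forms. A minimal sketch of both forms follows; the key arguments in the second form are illustrative assumptions, added only to avoid duplicate-widget errors if both forms were enabled at once.

    import streamlit as st

    col1, col2 = st.columns(2)

    # Form used in this commit: the column objects as context managers
    with col1:
        user_input = st.text_input("Input a phrase")
    with col2:
        max_token = st.number_input("Select max number of generated tokens",
                                    min_value=1, max_value=1024, value=50, step=5)

    # Equivalent form: call the widget methods directly on the column objects
    # (commented out so the two widgets are not created twice)
    # user_input = col1.text_input("Input a phrase", key="phrase_direct")
    # max_token = col2.number_input("Select max number of generated tokens",
    #                               min_value=1, max_value=1024, value=50, step=5,
    #                               key="max_tok_direct")
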
@@ -74,15 +75,17 @@ if model_loaded:
             prompt_template = f'USER: {user_input}\nASSISTANT:'
             inputs = tokenizer(prompt_template, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
             inputs = inputs.to(device)  # Move inputs to the same device as model
-            # Generate text using torch.inference_mode for better performance during inference
+
+            # Generate text using torch.inference_mode for better performance during inference
             with torch.inference_mode():
-                output = model.generate(**inputs, max_new_tokens=max_token)
-
-            # Cut the tokens at the input length to display only the generated text
-            output_ids_cut = output[:, inputs["input_ids"].shape[1]:]
-            generated_text = tokenizer.decode(output_ids_cut[0], skip_special_tokens=True)
-
-            st.markdown(f"**Generated Text:**\n{generated_text}")
+                output = model.generate(**inputs, max_new_tokens=max_token, num_return_sequences=2)
+
+            # Display generated texts
+            for i in range(2):  # Loop through the number of return sequences
+                output_ids_cut = output[i, inputs["input_ids"].shape[1]:]
+                generated_text = tokenizer.decode(output_ids_cut, skip_special_tokens=True)
+                st.markdown(f"**Generated Text {i+1}:**\n{generated_text}")
+
         except RuntimeError as e:
             if 'CUDA out of memory' in str(e):
                 st.error("CUDA out of memory during generation. Try reducing the input length or restarting the app.")
@@ -93,4 +96,8 @@ if model_loaded:
             # Log the error and re-raise it
             with open('error_log.txt', 'a') as f:
                 f.write(traceback.format_exc())
-            raise e
+            raise e
+
+        # Display GPU memory information after generation
+        gpu_memory_after_generation = get_gpu_memory()
+        st.write(f"GPU Memory Info after generation: {gpu_memory_after_generation}")