rodrigomasini committed
Commit 557f7d8
1 Parent(s): 9fc5e2e

Update app_v4.py

Files changed (1):
  1. app_v4.py  +9 -11
app_v4.py CHANGED
@@ -67,7 +67,7 @@ if model_loaded:
     with col1:
         user_input = st.text_input("Input a phrase")
     with col2:
-        max_token = st.number_input(label="Select max number of generated tokens", min_value=1, max_value=1024, value=50, step=5)
+        max_token = st.number_input(label="Select max number of generated tokens", min_value=1, max_value=512, value=50, step=5)
 
     # Generate button
     if st.button("Generate the prompt"):
@@ -75,17 +75,15 @@ if model_loaded:
             prompt_template = f'USER: {user_input}\nASSISTANT:'
             inputs = tokenizer(prompt_template, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
             inputs = inputs.to(device)  # Move inputs to the same device as model
-
-            # Generate text using torch.inference_mode for better performance during inference
+            # Generate text using torch.inference_mode for better performance during inference
             with torch.inference_mode():
-                output = model.generate(**inputs, max_new_tokens=max_token, num_return_sequences=2)
-
-            # Display generated texts
-            for i in range(2):  # Loop through the number of return sequences
-                output_ids_cut = output[i, inputs["input_ids"].shape[1]:]
-                generated_text = tokenizer.decode(output_ids_cut, skip_special_tokens=True)
-                st.markdown(f"**Generated Text {i+1}:**\n{generated_text}")
-
+                output = model.generate(**inputs, max_new_tokens=max_token)
+
+            # Cut the tokens at the input length to display only the generated text
+            output_ids_cut = output[:, inputs["input_ids"].shape[1]:]
+            generated_text = tokenizer.decode(output_ids_cut[0], skip_special_tokens=True)
+
+            st.markdown(f"**Generated Text:**\n{generated_text}")
         except RuntimeError as e:
             if 'CUDA out of memory' in str(e):
                 st.error("CUDA out of memory during generation. Try reducing the input length or restarting the app.")