rodrigomasini committed
Commit 24eb0d4
1 Parent(s): 3b33c19

Update app_v4.py

Files changed (1)
  1. app_v4.py +30 -26
app_v4.py CHANGED
@@ -33,43 +33,47 @@ if device == "cuda:0":
 tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=False)
 
 # Attempt to load the model, catch any OOM errors
+model_loaded = False
 try:
     model = AutoGPTQForCausalLM.from_quantized(
         pretrained_model_dir,
         model_basename="Jackson2-4bit-128g-GPTQ",
         use_safetensors=True,
-        device=device,
-        max_memory={0: "15GIB"}
+        device=device
     )
+    model.eval()  # Set the model to inference mode
+    model_loaded = True
 except RuntimeError as e:
     if 'CUDA out of memory' in str(e):
-        st.error("CUDA out of memory while loading the model. Try reducing the model size or input length.")
+        st.error("CUDA out of memory while loading the model. Try reducing the model size or restarting the app.")
         st.stop()
     else:
         raise e
 
-# Display GPU memory information after loading the model
-gpu_memory_after = get_gpu_memory()
-st.write(f"GPU Memory Info after loading the model: {gpu_memory_after}")
+if model_loaded:
+    # Display GPU memory information after loading the model
+    gpu_memory_after = get_gpu_memory()
+    st.write(f"GPU Memory Info after loading the model: {gpu_memory_after}")
 
-# User input for the model
-user_input = st.text_input("Input a phrase")
+    # User input for the model
+    user_input = st.text_input("Input a phrase")
 
-# Generate button
-if st.button("Generate the prompt"):
-    try:
-        prompt_template = f'USER: {user_input}\nASSISTANT:'
-        inputs = tokenizer(prompt_template, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
-        output = model.generate(**inputs)
-        st.markdown(f"**Generated Text:**\n{tokenizer.decode(output[0])}")
-    except RuntimeError as e:
-        if 'CUDA out of memory' in str(e):
-            st.error("CUDA out of memory during generation. Try reducing the input length.")
-            # Log the detailed error message
-            with open('error_log.txt', 'a') as f:
-                f.write(traceback.format_exc())
-        else:
-            # Log the error and re-raise it
-            with open('error_log.txt', 'a') as f:
-                f.write(traceback.format_exc())
-            raise e
+    # Generate button
+    if st.button("Generate the prompt"):
+        try:
+            prompt_template = f'USER: {user_input}\nASSISTANT:'
+            inputs = tokenizer(prompt_template, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
+            inputs = inputs.to(device)  # Move inputs to the same device as model
+            output = model.generate(**inputs)
+            st.markdown(f"**Generated Text:**\n{tokenizer.decode(output[0])}")
+        except RuntimeError as e:
+            if 'CUDA out of memory' in str(e):
+                st.error("CUDA out of memory during generation. Try reducing the input length or restarting the app.")
+                # Log the detailed error message
+                with open('error_log.txt', 'a') as f:
+                    f.write(traceback.format_exc())
+            else:
+                # Log the error and re-raise it
+                with open('error_log.txt', 'a') as f:
+                    f.write(traceback.format_exc())
+                raise e
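
Note: the code above calls a get_gpu_memory() helper that is defined earlier in app_v4.py and is not part of this diff. A minimal sketch of what such a helper could look like, assuming PyTorch is available; the actual implementation in app_v4.py may differ:

import torch

def get_gpu_memory():
    # Report free/total memory for each visible CUDA device.
    # Returns a list of human-readable strings; purely illustrative.
    if not torch.cuda.is_available():
        return ["No CUDA device available"]
    info = []
    for i in range(torch.cuda.device_count()):
        free_bytes, total_bytes = torch.cuda.mem_get_info(i)
        info.append(f"cuda:{i}: {free_bytes / 1024**3:.2f} GiB free / {total_bytes / 1024**3:.2f} GiB total")
    return info

A helper in this shape returns one string per device, so st.write(get_gpu_memory()) renders a short list in the Streamlit app after the quantized model has been loaded.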