rodrigomasini committed on
Commit f093c76
1 Parent(s): ce75bc0

Update app_v3.py

Files changed (1)
  1. app_v3.py +5 -7
app_v3.py CHANGED
@@ -6,9 +6,9 @@ import os
 import torch
 import subprocess
 
-os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
-os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
-os.environ["CUDA_VISIBLE_DEVICES"]="1"
+# os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'  # => only makes sense with more than one GPU, since it is trying to split
+# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # => only makes sense with more GPUs
+# os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"  # => example of device numbers
 
 
 # Define pretrained and quantized model directories
@@ -27,8 +27,6 @@ pretrained_model_dir = "FPHam/Jackson_The_Formalizer_V2_13b_GPTQ"
 #model_name_or_path = quantized_model_dir
 model_basename = "Jackson2-4bit-128g-GPTQ"
 
-#os.environ['CUDA_VISIBLE_DEVICES'] = '0'
-
 # Before allocating or loading the model, clear up memory
 #gc.collect()
 #torch.cuda.empty_cache()
@@ -38,15 +36,15 @@ use_triton = False
 if torch.cuda.is_available():
     torch.cuda.empty_cache()
 
-
 #tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True, legacy=False)
 tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
-device = "cuda:0" if torch.cuda.is_available() else "cpu"
+device = "cuda:0" if torch.cuda.is_available() else "cpu"  # best configuration besides the auto option
 model = AutoGPTQForCausalLM.from_quantized(
     pretrained_model_dir,
     model_basename=model_basename,
     use_safetensors=True,
     device=device,
+    max_memory={0: "10GIB"}
 )
 
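
For reference, the model-loading path of app_v3.py reads roughly as follows after this commit. This is a minimal consolidated sketch of the new side of the diff, with imports added so it is self-contained; the rest of the file is omitted.

import torch
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

# GPTQ repository and quantized weights basename used by the Space
pretrained_model_dir = "FPHam/Jackson_The_Formalizer_V2_13b_GPTQ"
model_basename = "Jackson2-4bit-128g-GPTQ"

# Release any cached GPU memory before loading the quantized weights
if torch.cuda.is_available():
    torch.cuda.empty_cache()

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)

# Pin the model to the first GPU (or fall back to CPU) and cap its GPU footprint at 10 GiB
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = AutoGPTQForCausalLM.from_quantized(
    pretrained_model_dir,
    model_basename=model_basename,
    use_safetensors=True,
    device=device,
    max_memory={0: "10GIB"},
)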
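
The environment variables this commit comments out only matter on a multi-GPU machine, as the new comments note. If the Space were ever moved to two GPUs, they could be re-enabled along the lines below; the device indices and memory caps here are illustrative assumptions, not values from the repo, and the variables must be set before the first CUDA call.

import os

# Order devices by PCI bus ID and expose two of them (example numbering)
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

# Hypothetical per-device caps, in the same string format the diff uses for GPU 0
max_memory = {0: "10GIB", 1: "10GIB", "cpu": "30GIB"}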