asdaswadefswefr committed
Commit 95d1255 · verified · 1 parent: a2fb46d

Update app.py

Files changed (1): app.py (+17 −16)
app.py CHANGED
@@ -1,41 +1,42 @@
 import gradio as gr
-from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 
-# Quantization configuration
-quantization_config = BitsAndBytesConfig(
-    load_in_4bit=True,  # or use True for 4-bit
-    bnb_4bit_compute_dtype=torch.float16,
-    bnb_4bit_use_double_quant=True,
-    bnb_4bit_quant_type="nf4"
-)
-
 # Initialize the model and tokenizer
 model_name = "Orenguteng/Llama-3-8B-Lexi-Uncensored"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+# Load on CPU with reduced precision
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
     torch_dtype=torch.float16,
-    device_map="auto",
-    quantization_config=quantization_config
+    low_cpu_mem_usage=True,
+    device_map="cpu"
 )
 
 def generate_text(prompt):
     inputs = tokenizer(prompt, return_tensors="pt")
+
+    # Adjust generation parameters to save memory
     outputs = model.generate(
         inputs["input_ids"],
-        max_new_tokens=100,
+        max_new_tokens=50,  # reduced to save memory
         temperature=0.7,
-        pad_token_id=tokenizer.eos_token_id
+        pad_token_id=tokenizer.eos_token_id,
+        num_beams=1,  # single beam (no beam search) to save memory
+        do_sample=True
     )
     return tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-# Create the interface
+# Create the interface with a longer timeout
 iface = gr.Interface(
     fn=generate_text,
     inputs="text",
     outputs="text",
-    title="LLama Chat"
+    title="LLama Chat",
+    examples=["Hello, how are you?"],
+    cache_examples=False,
 )
 
-iface.launch()
+# Increase the timeout because generation is slower on CPU
+iface.launch(share=True, server_timeout=180)
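
For a quick local check outside the Space, a minimal smoke-test script could mirror the CPU settings introduced by this commit. This is only a sketch: it assumes the same model name as above and a machine with enough RAM to hold the 8B weights in float16 (on the order of 16 GB), and it skips the Gradio layer entirely.

# smoke_test.py — sketch: load the model with the same CPU settings as app.py
# and run one short generation to confirm it works before launching Gradio.
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "Orenguteng/Llama-3-8B-Lexi-Uncensored"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,   # reduced precision, as in app.py
    low_cpu_mem_usage=True,      # lowers peak RAM while loading the weights
    device_map="cpu",
)

prompt = "Hello, how are you?"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(
    inputs["input_ids"],
    max_new_tokens=50,
    temperature=0.7,
    do_sample=True,
    num_beams=1,
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Generating with an 8B model on CPU is slow, which is the stated reason this commit lowers max_new_tokens and raises the interface timeout; the smoke test surfaces that latency directly in the terminal instead of as a timed-out request in the UI.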