mounseflit committed on
Commit e5f8e18 · verified · 1 Parent(s): 35a9a74

Update app.py

Files changed (1)
  1. app.py +23 -9
app.py CHANGED
@@ -1,25 +1,39 @@
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from peft import PeftModel
+import gradio as gr
 
-# Load the tokenizer and base model
+# Set model name and path
 model_name = "ybelkada/falcon-7b-sharded-bf16"
 fine_tuned_model = "mounseflit/falcon-7b-marrakech"
 
+# Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(model_name)
-base_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
 
-# Load the fine-tuned LoRA model
+# Load base model with 8-bit precision and offload to CPU
+base_model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    load_in_8bit=True,           # Quantization to 8-bit
+    device_map="auto",           # Auto device map for offloading
+    offload_folder="offload",    # Offload large parts of the model to disk
+    offload_state_dict=True      # Enable state dict offloading to reduce memory usage
+)
+
+# Load the fine-tuned LoRA model on top of the quantized model
 model = PeftModel.from_pretrained(base_model, fine_tuned_model)
 
-# Define the function for generating text
+# Ensure the model is in evaluation mode
+model.eval()
+
+# Function to generate text
 def generate_text(prompt):
-    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
-    outputs = model.generate(**inputs, max_length=200)
+    inputs = tokenizer(prompt, return_tensors="pt", max_length=50, truncation=True).to("cpu")  # Reduce input length
+    with torch.no_grad():
+        outputs = model.generate(**inputs, max_length=100)  # Reduce output length
     return tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-# Gradio Interface
-import gradio as gr
+# Create Gradio interface
+iface = gr.Interface(fn=generate_text, inputs="text", outputs="text", title="Falcon 7B Lite")
 
-iface = gr.Interface(fn=generate_text, inputs="text", outputs="text")
+# Launch the app
 iface.launch()
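
Note on the 8-bit load: in more recent transformers releases the load_in_8bit flag is usually passed through a BitsAndBytesConfig object rather than as a bare keyword argument. A minimal sketch of an equivalent load call under that assumption (not part of this commit; requires the bitsandbytes package):

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Assumed alternative to load_in_8bit=True: wrap the quantization settings in a config object
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

base_model = AutoModelForCausalLM.from_pretrained(
    "ybelkada/falcon-7b-sharded-bf16",
    quantization_config=bnb_config,  # 8-bit quantization via bitsandbytes
    device_map="auto",               # let accelerate place layers across GPU, CPU and disk
    offload_folder="offload",        # spill layers that do not fit in memory to disk
)

The rest of the commit (the PeftModel wrapper, generate_text, and the Gradio interface) would be unchanged under this variant.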