samdeniyi committed
Commit 4c13408 · 1 Parent(s): 92d0750

new handler

Files changed (2)
  1. Old_file_handler_old.py +0 -108
  2. handler.py +105 -25
Old_file_handler_old.py DELETED
@@ -1,108 +0,0 @@
-from typing import Dict, Any
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
-from peft import PeftModel, PeftConfig
-import torch
-import time
-
-class EndpointHandler:
-    def __init__(self, path="samadeniyi/lora_lesson_plan_model"):
-        # Load the model configuration
-        config = PeftConfig.from_pretrained(path)
-
-        try:
-            config = PeftConfig.from_pretrained(path)
-        except TypeError as e:
-            print(f"Error while loading config: {e}")
-            # Manually filter out any unsupported config parameters (e.g., 'layer_replication')
-            config_dict = PeftConfig.from_pretrained(path).__dict__
-            if "layer_replication" in config_dict:
-                del config_dict["layer_replication"]
-            config = PeftConfig(**config_dict)
-
-        # Define 4-bit quantization configuration (this is necessary for low-memory usage)
-        bnb_config = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_use_double_quant=True,
-            bnb_4bit_quant_type="nf4",
-            bnb_4bit_compute_dtype=torch.float16,
-        )
-
-        # Load the model using 4-bit quantization and optimized settings
-        self.model = AutoModelForCausalLM.from_pretrained(
-            config.base_model_name_or_path,
-            return_dict=True,
-            load_in_4bit=True,
-            device_map={"": 0},  # Map to CUDA device 0
-            trust_remote_code=True,
-            quantization_config=bnb_config,
-        )
-
-        # Load tokenizer and ensure it matches the model
-        self.tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
-        self.tokenizer.pad_token = self.tokenizer.eos_token
-
-        # Apply PEFT (Parameter-Efficient Fine-Tuning) to the model
-        self.model = PeftModel.from_pretrained(self.model, path)
-
-    def __call__(self, data: Any) -> Dict[str, Any]:
-        """
-        Args:
-            data (:obj:`dict`): The object should contain {"instruction": "some text", "input": "some text"}:
-                - "instruction": The instruction describing what to generate.
-                - "input": Context to guide the generation.
-
-        Returns:
-            A :obj:`dict` containing {"generated_text": "the generated lesson plan", "time": "..."}:
-                - "generated_text": The generated text based on the input.
-                - "time": The time taken to generate the output.
-        """
-
-        # Parse input data
-        inputs = data.pop("inputs", data)
-        instruction = inputs.get("instruction", "")
-        input_context = inputs.get("input", "")
-
-        # Create the lesson plan prompt based on your preparation format
-        lesson_prompt = f"""Below is an instruction that describes how to create a lesson plan, paired with an input that provides further context. Write a response that appropriately completes the request.
-
-### Instruction:
-{instruction}
-
-### Input:
-{input_context}
-
-### Response:
-"""
-
-        # Tokenize the prompt
-        batch = self.tokenizer(
-            lesson_prompt,
-            padding=True,
-            truncation=True,
-            return_tensors='pt'
-        )
-        batch = batch.to('cuda:0')
-
-        # Configure generation settings
-        generation_config = self.model.generation_config
-        generation_config.top_p = 0.7
-        generation_config.temperature = 0.7
-        generation_config.max_new_tokens = 256
-        generation_config.num_return_sequences = 1
-        generation_config.pad_token_id = self.tokenizer.eos_token_id
-        generation_config.eos_token_id = self.tokenizer.eos_token_id
-
-        # Time the prediction
-        start = time.time()
-        with torch.cuda.amp.autocast():
-            output_tokens = self.model.generate(
-                input_ids=batch.input_ids,
-                generation_config=generation_config,
-            )
-        end = time.time()
-
-        # Decode generated tokens into text
-        generated_text = self.tokenizer.decode(output_tokens[0], skip_special_tokens=True)
-
-        # Return the generated text and the time taken
-        return {"generated_text": generated_text, "time": f"{(end-start):.2f} s"}
handler.py CHANGED
@@ -1,28 +1,108 @@
1
- from transformers import AutoModelForCausalLM, AutoTokenizer
2
- from peft import PeftModel
 
3
  import torch
4
-
5
 
6
  class EndpointHandler:
7
- def __init__(self, path="unsloth/Meta-Llama-3.1-8B-bnb-4bit"):
8
- # Load model and tokenizer
9
- self.tokenizer = AutoTokenizer.from_pretrained(path)
10
- base_model = AutoModelForCausalLM.from_pretrained(path)
11
- self.model = PeftModel.from_pretrained(base_model, path)
12
- self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
13
- self.model.to(self.device)
14
-
15
- def __call__(self, data):
16
- # Extract input text
17
- input_text = data.get("inputs", {}).get("text", "")
18
-
19
- # Tokenize input text
20
- inputs = self.tokenizer(input_text, return_tensors="pt").to(self.device)
21
-
22
- # Generate output
23
- output_tokens = self.model.generate(inputs["input_ids"], max_length=1024)
24
-
25
- # Decode generated tokens
26
- generated_text = self.tokenizer.decode(output_tokens[0], skip_special_tokens=True)
27
-
28
- return {"generated_text": generated_text}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, Any
2
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
3
+ from peft import PeftModel, PeftConfig
4
  import torch
5
+ import time
6
 
7
  class EndpointHandler:
8
+ def __init__(self, path="samadeniyi/lora_lesson_plan_model"):
9
+ # Load the model configuration
10
+ config = PeftConfig.from_pretrained(path)
11
+
12
+ try:
13
+ config = PeftConfig.from_pretrained(path)
14
+ except TypeError as e:
15
+ print(f"Error while loading config: {e}")
16
+ # Manually filter out any unsupported config parameters (e.g., 'layer_replication')
17
+ config_dict = PeftConfig.from_pretrained(path).__dict__
18
+ if "layer_replication" in config_dict:
19
+ del config_dict["layer_replication"]
20
+ config = PeftConfig(**config_dict)
21
+
22
+ # Define 4-bit quantization configuration (this is necessary for low-memory usage)
23
+ bnb_config = BitsAndBytesConfig(
24
+ load_in_4bit=True,
25
+ bnb_4bit_use_double_quant=True,
26
+ bnb_4bit_quant_type="nf4",
27
+ bnb_4bit_compute_dtype=torch.float16,
28
+ )
29
+
30
+ # Load the model using 4-bit quantization and optimized settings
31
+ self.model = AutoModelForCausalLM.from_pretrained(
32
+ config.base_model_name_or_path,
33
+ return_dict=True,
34
+ load_in_4bit=True,
35
+ device_map={"": 0}, # Map to CUDA device 0
36
+ trust_remote_code=True,
37
+ quantization_config=bnb_config,
38
+ )
39
+
40
+ # Load tokenizer and ensure it matches the model
41
+ self.tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
42
+ self.tokenizer.pad_token = self.tokenizer.eos_token
43
+
44
+ # Apply PEFT (Parameter-Efficient Fine-Tuning) to the model
45
+ self.model = PeftModel.from_pretrained(self.model, path)
46
+
47
+ def __call__(self, data: Any) -> Dict[str, Any]:
48
+ """
49
+ Args:
50
+ data :obj:`dict`:. The object should contain {"instruction": "some text", "input": "some text"}:
51
+ - "instruction": The instruction describing what to generate.
52
+ - "input": Context to guide the generation.
53
+
54
+ Returns:
55
+ A :obj:`dict` containing {"generated_text": "the generated lesson plan", "time": "..."}:
56
+ - "generated_text": The generated text based on the input.
57
+ - "time": The time taken to generate the output.
58
+ """
59
+
60
+ # Parse input data
61
+ inputs = data.pop("inputs", data)
62
+ instruction = inputs.get("instruction", "")
63
+ input_context = inputs.get("input", "")
64
+
65
+ # Create the lesson plan prompt based on your preparation format
66
+ lesson_prompt = f"""Below is an instruction that describes how to create a lesson plan, paired with an input that provides further context. Write a response that appropriately completes the request.
67
+
68
+ ### Instruction:
69
+ {instruction}
70
+
71
+ ### Input:
72
+ {input_context}
73
+
74
+ ### Response:
75
+ """
76
+
77
+ # Tokenize the prompt
78
+ batch = self.tokenizer(
79
+ lesson_prompt,
80
+ padding=True,
81
+ truncation=True,
82
+ return_tensors='pt'
83
+ )
84
+ batch = batch.to('cuda:0')
85
+
86
+ # Configure generation settings
87
+ generation_config = self.model.generation_config
88
+ generation_config.top_p = 0.7
89
+ generation_config.temperature = 0.7
90
+ generation_config.max_new_tokens = 256
91
+ generation_config.num_return_sequences = 1
92
+ generation_config.pad_token_id = self.tokenizer.eos_token_id
93
+ generation_config.eos_token_id = self.tokenizer.eos_token_id
94
+
95
+ # Time the prediction
96
+ start = time.time()
97
+ with torch.cuda.amp.autocast():
98
+ output_tokens = self.model.generate(
99
+ input_ids=batch.input_ids,
100
+ generation_config=generation_config,
101
+ )
102
+ end = time.time()
103
+
104
+ # Decode generated tokens into text
105
+ generated_text = self.tokenizer.decode(output_tokens[0], skip_special_tokens=True)
106
+
107
+ # Return the generated text and the time taken
108
+ return {"generated_text": generated_text, "time": f"{(end-start):.2f} s"}
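
For reference, a minimal sketch of how the updated handler could be exercised locally (not part of this commit). It assumes the file is importable as handler.py, that a CUDA device is available (the handler moves the tokenized batch to cuda:0), and that the instruction/input strings below are placeholder examples, not taken from the repo:

from handler import EndpointHandler

# Instantiate the handler; the default path already points at the adapter repo.
handler = EndpointHandler(path="samadeniyi/lora_lesson_plan_model")

# Payload shape follows the docstring: {"inputs": {"instruction": ..., "input": ...}}.
payload = {
    "inputs": {
        "instruction": "Create a 40-minute lesson plan on fractions.",  # placeholder text
        "input": "Subject: Mathematics. Class: Primary 5.",  # placeholder text
    }
}

result = handler(payload)
print(result["generated_text"])
print("generation time:", result["time"])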