samdeniyi committed
Commit f89ab5e
1 Parent(s): f6153b4

fixing layer_replication error 1

Files changed (2)
  1. handler.py +86 -83
  2. requirements.txt +10 -4
handler.py CHANGED
@@ -4,57 +4,60 @@ from peft import PeftModel, PeftConfig
 import torch
 import time
 
-
 class EndpointHandler:
-    def __init__(self, path="samadeniyi/lora_lesson_plan_model"):
-        # Load the model configuration
-        config = PeftConfig.from_pretrained(path)
-
-        # Define 4-bit quantization configuration
-        bnb_config = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_use_double_quant=True,
-            bnb_4bit_quant_type="nf4",
-            bnb_4bit_compute_dtype=torch.float16,
-        )
-
-        # Load the model without extra arguments like `layer_replication`
-        self.model = AutoModelForCausalLM.from_pretrained(
-            config.base_model_name_or_path,
-            return_dict=True,
-            load_in_4bit=True,
-            device_map={"": 0},
-            trust_remote_code=True,
-            quantization_config=bnb_config,
-        )
-
-        # Load tokenizer
-        self.tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
-        self.tokenizer.pad_token = self.tokenizer.eos_token
-
-        # Apply PEFT (Parameter-Efficient Fine-Tuning) to the model
-        self.model = PeftModel.from_pretrained(self.model, path)
-
-    def __call__(self, data: Any) -> Dict[str, Any]:
-        """
-        Args:
-            data :obj:`dict`:. The object should contain {"instruction": "some text", "input": "some text"}:
-            - "instruction": The instruction describing what to generate.
-            - "input": Context to guide the generation.
-
-        Returns:
-            A :obj:`dict` containing {"generated_text": "the generated lesson plan", "time": "..."}:
-            - "generated_text": The generated text based on the input.
-            - "time": The time taken to generate the output.
-        """
-
-        # Parse input data
-        inputs = data.pop("inputs", data)
-        instruction = inputs.get("instruction", "")
-        input_context = inputs.get("input", "")
-
-        # Create the lesson plan prompt based on your preparation format
-        lesson_prompt = f"""Below is an instruction that describes how to create a lesson plan, paired with an input that provides further context. Write a response that appropriately completes the request.
 
 ### Instruction:
 {instruction}
@@ -64,36 +67,36 @@ class EndpointHandler:
 
 ### Response:
 """
-
-        # Tokenize the prompt
-        batch = self.tokenizer(
-            lesson_prompt,
-            padding=True,
-            truncation=True,
-            return_tensors='pt'
-        )
-        batch = batch.to('cuda:0')
-
-        # Configure generation settings
-        generation_config = self.model.generation_config
-        generation_config.top_p = 0.7
-        generation_config.temperature = 0.7
-        generation_config.max_new_tokens = 256
-        generation_config.num_return_sequences = 1
-        generation_config.pad_token_id = self.tokenizer.eos_token_id
-        generation_config.eos_token_id = self.tokenizer.eos_token_id
-
-        # Time the prediction
-        start = time.time()
-        with torch.cuda.amp.autocast():
-            output_tokens = self.model.generate(
-                input_ids=batch.input_ids,
-                generation_config=generation_config,
-            )
-        end = time.time()
-
-        # Decode generated tokens into text
-        generated_text = self.tokenizer.decode(output_tokens[0], skip_special_tokens=True)
-
-        # Return the generated text and the time taken
-        return {"generated_text": generated_text, "time": f"{(end - start):.2f} s"}
 
 import torch
 import time
 
 class EndpointHandler:
+    def __init__(self, path="samadeniyi/lora_lesson_plan_model"):
+        # Load the model configuration
+        config = PeftConfig.from_pretrained(path)
+
+        # Filter out unsupported arguments like 'layer_replication'
+        if "layer_replication" in config.__dict__:
+            del config.__dict__["layer_replication"]
+
+        # Define 4-bit quantization configuration (this is necessary for low-memory usage)
+        bnb_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_compute_dtype=torch.float16,
+        )
+
+        # Load the model using 4-bit quantization and optimized settings
+        self.model = AutoModelForCausalLM.from_pretrained(
+            config.base_model_name_or_path,
+            return_dict=True,
+            load_in_4bit=True,
+            device_map={"": 0},  # Map to CUDA device 0
+            trust_remote_code=True,
+            quantization_config=bnb_config,
+        )
+
+        # Load tokenizer and ensure it matches the model
+        self.tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
+        self.tokenizer.pad_token = self.tokenizer.eos_token
+
+        # Apply PEFT (Parameter-Efficient Fine-Tuning) to the model
+        self.model = PeftModel.from_pretrained(self.model, path)
+
+    def __call__(self, data: Any) -> Dict[str, Any]:
+        """
+        Args:
+            data :obj:`dict`:. The object should contain {"instruction": "some text", "input": "some text"}:
+            - "instruction": The instruction describing what to generate.
+            - "input": Context to guide the generation.
+
+        Returns:
+            A :obj:`dict` containing {"generated_text": "the generated lesson plan", "time": "..."}:
+            - "generated_text": The generated text based on the input.
+            - "time": The time taken to generate the output.
+        """
+
+        # Parse input data
+        inputs = data.pop("inputs", data)
+        instruction = inputs.get("instruction", "")
+        input_context = inputs.get("input", "")
+
+        # Create the lesson plan prompt based on your preparation format
+        lesson_prompt = f"""Below is an instruction that describes how to create a lesson plan, paired with an input that provides further context. Write a response that appropriately completes the request.
 
 ### Instruction:
 {instruction}
 
 ### Response:
 """
+
+        # Tokenize the prompt
+        batch = self.tokenizer(
+            lesson_prompt,
+            padding=True,
+            truncation=True,
+            return_tensors='pt'
+        )
+        batch = batch.to('cuda:0')
+
+        # Configure generation settings
+        generation_config = self.model.generation_config
+        generation_config.top_p = 0.7
+        generation_config.temperature = 0.7
+        generation_config.max_new_tokens = 256
+        generation_config.num_return_sequences = 1
+        generation_config.pad_token_id = self.tokenizer.eos_token_id
+        generation_config.eos_token_id = self.tokenizer.eos_token_id
+
+        # Time the prediction
+        start = time.time()
+        with torch.cuda.amp.autocast():
+            output_tokens = self.model.generate(
+                input_ids=batch.input_ids,
+                generation_config=generation_config,
+            )
+        end = time.time()
+
+        # Decode generated tokens into text
+        generated_text = self.tokenizer.decode(output_tokens[0], skip_special_tokens=True)
+
+        # Return the generated text and the time taken
+        return {"generated_text": generated_text, "time": f"{(end-start):.2f} s"}
requirements.txt CHANGED
@@ -1,4 +1,10 @@
-torch
-transformers
-optimum
-peft>=0.3.0
+torch>=2.0.0
+transformers>=4.25.0
+unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git
+xformers==0.0.27  # or the appropriate version for your Torch version
+trl
+peft
+accelerate
+bitsandbytes
+triton
+wandb