wambugu71 committed on
Commit cb989c6 · verified · 1 Parent(s): d61ba05

Update app.py

Files changed (1)
  1. app.py +85 -205
app.py CHANGED
@@ -1,34 +1,15 @@
  import gradio as gr
- import spaces
  import torch
- from torch.cuda.amp import autocast
- import subprocess
- from huggingface_hub import InferenceClient
- import os
- import psutil
-
-
-
- """
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
- """
-
- from accelerate import init_empty_weights, infer_auto_device_map, load_checkpoint_and_dispatch
- from accelerate import Accelerator
-
-
- subprocess.run(
-     "pip install psutil",
-
-     shell=True,
+ from transformers import (
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     TextIteratorStreamer,
  )
-
- import bitsandbytes as bnb  # Import bitsandbytes for 8-bit quantization
-
-
-
- from datetime import datetime
-
+ import os
+ from threading import Thread
+ import spaces
+ import time
+ import subprocess

  subprocess.run(
      "pip install flash-attn --no-build-isolation",
@@ -36,191 +17,90 @@ subprocess.run(
      shell=True,
  )

- # client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
- # pip install 'git+https://github.com/huggingface/transformers.git'
-
-
-
- token=os.getenv('token')
- print('token = ',token)
-
- from transformers import AutoModelForCausalLM, AutoTokenizer
- import transformers
-
- # model_id = "mistralai/Mistral-7B-v0.3"
-
- model_id = "microsoft/Phi-3-medium-4k-instruct"
- # model_id = "microsoft/phi-4"
-
- # model_id = "Qwen/Qwen2-7B-Instruct"
-
-
- tokenizer = AutoTokenizer.from_pretrained(
-     # model_id
-     model_id,
-     # use_fast=False
-     token= token,
-     trust_remote_code=True)
-
-
- accelerator = Accelerator()
-
- model = AutoModelForCausalLM.from_pretrained(model_id, token= token,
-     # torch_dtype= torch.uint8,
-     torch_dtype=torch.bfloat16,
-     # load_in_8bit=True,
-     # # # torch_dtype=torch.fl,
-     attn_implementation="flash_attention_2",
-     low_cpu_mem_usage=True,
-     trust_remote_code=True,
-     device_map='cuda',
-     # device_map=accelerator.device_map,
-
- )
+ token = os.environ["HF_TOKEN"]


-
-
-
- #
- model = accelerator.prepare(model)
- from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
-
- pipe = pipeline(
-     "text-generation",
-     model=model,
-     tokenizer=tokenizer,
+ model = AutoModelForCausalLM.from_pretrained(
+     "microsoft/phi-4",
+     token=token,
+     trust_remote_code=True,
+     torch_dtype=torch.bfloat16
  )
+ tok = AutoTokenizer.from_pretrained("microsoft/phi-4", token=token)
+ terminators = [
+     tok.eos_token_id,
+ ]
+
+ if torch.cuda.is_available():
+     device = torch.device("cuda")
+     print(f"Using GPU: {torch.cuda.get_device_name(device)}")
+ else:
+     device = torch.device("cpu")
+     print("Using CPU")
+
+ model = model.to(device)
+ # Dispatch Errors
+
+
+ @spaces.GPU(duration=60)
+ def chat(message, history, temperature, do_sample, max_tokens):
+     chat = []
+     for item in history:
+         chat.append({"role": "user", "content": item[0]})
+         if item[1] is not None:
+             chat.append({"role": "assistant", "content": item[1]})
+     chat.append({"role": "user", "content": message})
+     messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
+     model_inputs = tok([messages], return_tensors="pt").to(device)
+     streamer = TextIteratorStreamer(
+         tok, timeout=20.0, skip_prompt=True, skip_special_tokens=True
+     )
+     generate_kwargs = dict(
+         model_inputs,
+         streamer=streamer,
+         max_new_tokens=max_tokens,
+         do_sample=True,
+         temperature=temperature,
+         eos_token_id=terminators,
+     )
+
+     if temperature == 0:
+         generate_kwargs["do_sample"] = False
+
+     t = Thread(target=model.generate, kwargs=generate_kwargs)
+     t.start()
+
+     partial_text = ""
+     for new_text in streamer:
+         partial_text += new_text
+         yield partial_text
+
+     yield partial_text


-
-
- # pipeline = transformers.pipeline(
- #     "text-generation",
- #     model="microsoft/phi-4",
- #     model_kwargs={"torch_dtype": "auto"},
- #     device_map="auto",
- # )
-
-
- # device_map = infer_auto_device_map(model, max_memory={0: "79GB", "cpu":"65GB" })
-
- # Load the model with the inferred device map
- # model = load_checkpoint_and_dispatch(model, model_id, device_map=device_map, no_split_module_classes=["GPTJBlock"])
- # model.half()
-
- import json
-
- def str_to_json(str_obj):
-     json_obj = json.loads(str_obj)
-     return json_obj
-
-
- @spaces.GPU(duration=170)
- def respond(
-     message,
-     history: list[tuple[str, str]],
-     system_message,
-     max_tokens,
-     temperature,
-     top_p,
- ):
-     # yield 'retuend'
-     # model.to(accelerator.device)
-
-     messages = []
-     json_obj = str_to_json(message)
-     print(json_obj)
-
-     messages= json_obj
-
-     # input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(accelerator.device)
-     # input_ids2 = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, return_tensors="pt") #.to('cuda')
-     # print(f"Converted input_ids dtype: {input_ids.dtype}")
-     # input_str= str(input_ids2)
-     # print('input str = ', input_str)
-
-     generation_args = {
-         "max_new_tokens": max_tokens,
-         "return_full_text": False,
-         "temperature": temperature,
-         "do_sample": False,
-     }
-
-     output = pipe(messages, **generation_args)
-     print(output[0]['generated_text'])
-     gen_text=output[0]['generated_text']
-
-     # with torch.no_grad():
-     #     gen_tokens = model.generate(
-     #         input_ids,
-     #         max_new_tokens=max_tokens,
-     #         # do_sample=True,
-     #         temperature=temperature,
-     #     )
-
-     # gen_text = tokenizer.decode(gen_tokens[0])
-     # print(gen_text)
-     # gen_text= gen_text.replace(input_str,'')
-     # gen_text= gen_text.replace('<|im_end|>','')
-
-     yield gen_text
-
-
- # messages = [
- #     # {"role": "user", "content": "What is your favourite condiment?"},
- #     # {"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"},
- #     # {"role": "user", "content": "Do you have mayonnaise recipes?"}
- # ]
-
- # inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda")
-
- # outputs = model.generate(inputs, max_new_tokens=2000)
- # gen_text=tokenizer.decode(outputs[0], skip_special_tokens=True)
-
- # print(gen_text)
- # yield gen_text
- # for val in history:
- #     if val[0]:
- #         messages.append({"role": "user", "content": val[0]})
- #     if val[1]:
- #         messages.append({"role": "assistant", "content": val[1]})
-
- #     messages.append({"role": "user", "content": message})
-
-     # response = ""
-
-     # for message in client.chat_completion(
-     #     messages,
-     #     max_tokens=max_tokens,
-     #     stream=True,
-     #     temperature=temperature,
-     #     top_p=top_p,
-     # ):
-     #     token = message.choices[0].delta.content
-
-     #     response += token
-     #     yield response
-
- """
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
- """
  demo = gr.ChatInterface(
-     respond,
+     fn=chat,
+     examples=[["Write me a poem about Machine Learning."]],
+     # multimodal=False,
+     additional_inputs_accordion=gr.Accordion(
+         label="⚙️ Parameters", open=False, render=False
+     ),
      additional_inputs=[
-         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
          gr.Slider(
-             minimum=0.1,
-             maximum=1.0,
-             value=0.95,
-             step=0.05,
-             label="Top-p (nucleus sampling)",
+             minimum=0, maximum=1, step=0.1, value=0.9, label="Temperature", render=False
+         ),
+         gr.Checkbox(label="Sampling", value=True),
+         gr.Slider(
+             minimum=128,
+             maximum=4096,
+             step=1,
+             value=512,
+             label="Max new tokens",
+             render=False,
          ),
      ],
+     stop_btn="Stop Generation",
+     title="Chat With LLMs",
+     description="Now Running [microsoft/phi-4](https://huggingface.co/microsoft/phi-4)",
  )
-
-
- if __name__ == "__main__":
-     demo.launch()
+ demo.launch()
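
For context, the new app.py replaces the blocking `pipeline(...)` call of the old version with token streaming: `model.generate` runs on a background thread while the main thread reads decoded text from a `TextIteratorStreamer`. Below is a minimal, self-contained sketch of that pattern; it is not part of the commit, and "gpt2" plus the example prompt are placeholders so the snippet runs on CPU (the Space itself loads microsoft/phi-4 and yields partial text back to gr.ChatInterface).

# Minimal sketch of the TextIteratorStreamer + Thread pattern used by chat() above.
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tok = AutoTokenizer.from_pretrained("gpt2")          # stand-in model
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tok(["Write me a poem about Machine Learning."], return_tensors="pt")
streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)

# generate() blocks until finished, so it runs in a worker thread while the
# main thread iterates over the streamer and receives decoded text chunks
# as soon as they are produced.
thread = Thread(
    target=model.generate,
    kwargs=dict(inputs, streamer=streamer, max_new_tokens=64, do_sample=False),
)
thread.start()

partial_text = ""
for new_text in streamer:
    partial_text += new_text  # a Gradio generator would yield partial_text here
thread.join()
print(partial_text)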