xu song committed on
Commit 726a01e
1 Parent(s): 698d703
Files changed (5)
  1. README.md +9 -1
  2. app.py +12 -9
  3. app_util.py +2 -2
  4. config.py +2 -3
  5. models/cpp_qwen2.py +55 -47
README.md CHANGED
@@ -10,4 +10,12 @@ pinned: false
license: apache-2.0
---

- An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
+ An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
+
+
+ ## Installation issues
+
+ Installing directly from source makes inference slow, so the following build arguments are added:
+ ```sh
+ pip install git+https://github.com/abetlen/llama-cpp-python.git -C cmake.args="-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS"
+ ```
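For context, a minimal sketch of how one might verify the resulting llama-cpp-python build; the model path below is a placeholder and not part of this commit. With `verbose=True`, the printed system info should indicate whether BLAS support took effect.

```python
# Sanity-check sketch for the OpenBLAS-enabled llama-cpp-python build.
# The model path is a placeholder; point it at the converted GGUF file.
from llama_cpp import Llama

llm = Llama(model_path="qwen2-0.5b-instruct-q8_0.gguf", n_ctx=2048, verbose=True)
out = llm("Hello", max_tokens=8)
print(out["choices"][0]["text"])
```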
app.py CHANGED
@@ -36,8 +36,9 @@ Essentially, it is a form of model compression.

with gr.Blocks() as demo:
# Knowledge Distillation through Self Chatting
- #
- gr.HTML("""<h1 align="center">Distilling the Knowledge from LLM through Self Chatting</h1>""")
+ # Distilling the Knowledge from LLM through Self Chatting
+ # Generating Synthetic Data through Self Chat
+ gr.HTML("""<h1 align="center">Generating Synthetic Data through Self Chat</h1>""")
with gr.Row():
with gr.Column(scale=5):
system = gr.Dropdown(
@@ -53,6 +54,8 @@ with gr.Blocks() as demo:
show_share_button=True,
avatar_images=("assets/man.png", "assets/bot.png"))

+ gr.Textbox("For faster inference, you can build locally with ")
+ # ss
with gradio.Tab("Self Chat"):
input_text_1 = gr.Textbox(show_label=False, placeholder="...", lines=10, visible=False)
generate_btn = gr.Button("🤔️ Self-Chat", variant="primary")
@@ -66,6 +69,7 @@ with gr.Blocks() as demo:
"It is based on user simulator and response generator.",
visible=True)

+ # also known as chat-assistant
with gradio.Tab("Response Generator"):
with gr.Row():
input_text_2 = gr.Textbox(show_label=False, placeholder="Please type your input", scale=7)
@@ -76,6 +80,7 @@ with gr.Blocks() as demo:
clear_btn_2 = gr.Button("🗑️ Clear", variant="secondary", size="sm", ) # 🧹 Clear History
gr.Markdown("Response simulator is the most commonly used chatbot.")

+ #
with gradio.Tab("User Simulator"):
with gr.Row():
input_text_3 = gr.Textbox(show_label=False, placeholder="Please type your response", scale=7)
@@ -96,8 +101,8 @@ with gr.Blocks() as demo:
# visible=False
)
with gr.Accordion(label="Parameters", open=True):
- slider_max_tokens = gr.Slider(minimum=1, maximum=config.MAX_SEQUENCE_LENGTH,
- value=config.DEFAULT_MAX_TOKENS, step=1, label="Max tokens")
+ slider_max_new_tokens = gr.Slider(minimum=1, maximum=4096,
+ value=config.DEFAULT_MAX_NEW_TOKENS, step=1, label="Max tokens")
slider_temperature = gr.Slider(minimum=0.1, maximum=10.0,
value=config.DEFAULT_TEMPERATURE, step=0.1, label="Temperature",
info="Larger temperature increase the randomness")
@@ -136,7 +141,7 @@ with gr.Blocks() as demo:
.then(generate, [chatbot, history], outputs=[chatbot, history],
show_progress="full")
undo_btn_2.click(undo_generate, [chatbot, history], outputs=[chatbot, history])
- clear_btn_2.click(reset_state, inputs=[system], outputs=[chatbot, history])\
+ clear_btn_2.click(reset_state, inputs=[system], outputs=[chatbot, history]) \
.then(reset_user_input, outputs=[input_text_2])

######## tab3
@@ -147,12 +152,10 @@ with gr.Blocks() as demo:
.then(generate, [chatbot, history], outputs=[chatbot, history],
show_progress="full")
undo_btn_3.click(undo_generate, [chatbot, history], outputs=[chatbot, history])
- clear_btn_3.click(reset_state, inputs=[system], outputs=[chatbot, history])\
+ clear_btn_3.click(reset_state, inputs=[system], outputs=[chatbot, history]) \
.then(reset_user_input, outputs=[input_text_3])

-
-
- slider_max_tokens.change(set_max_tokens, inputs=[slider_max_tokens])
+ slider_max_new_tokens.change(set_max_new_tokens, inputs=[slider_max_new_tokens])
slider_temperature.change(set_temperature, inputs=[slider_temperature])
slider_top_p.change(set_top_p, inputs=[slider_top_p])
slider_top_k.change(set_top_k, inputs=[slider_top_k])
app_util.py CHANGED
@@ -112,8 +112,8 @@ def reset_state(system):
return [], [{"role": "system", "content": system}]


- def set_max_tokens(max_tokens):
- bot.generation_kwargs["max_tokens"] = max_tokens
+ def set_max_new_tokens(max_new_tokens):
+ bot.generation_kwargs["max_tokens"] = max_new_tokens

def set_temperature(temperature):
bot.generation_kwargs["temperature"] = temperature
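Taken together, the renamed slider in app.py and this setter follow one pattern: each slider's change event writes straight into the bot's shared generation kwargs, so the new value is picked up on the next generation call. A minimal self-contained sketch of that pattern (the names and values here are illustrative, not part of the commit):

```python
# Sketch of the slider -> generation_kwargs pattern (illustrative names only).
import gradio as gr

generation_kwargs = {"max_tokens": 128}

def set_max_new_tokens(max_new_tokens):
    # The generator reads this dict on every call, so the change applies to
    # the next request without rebuilding the UI.
    generation_kwargs["max_tokens"] = max_new_tokens

with gr.Blocks() as demo:
    slider_max_new_tokens = gr.Slider(minimum=1, maximum=4096, value=128, step=1, label="Max tokens")
    slider_max_new_tokens.change(set_max_new_tokens, inputs=[slider_max_new_tokens])

demo.launch()
```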
config.py CHANGED
@@ -1,9 +1,8 @@


- MAX_SEQUENCE_LENGTH = 2048 # max_seq_len
+ MAX_SEQUENCE_LENGTH = 32768 # max_seq_len

- DEFAULT_MAX_TOKENS = 128
- # DEFAULT_MAX_NEW_TOKENS = None
+ DEFAULT_MAX_NEW_TOKENS = 128
DEFAULT_TOP_K = 100
DEFAULT_TOP_P = 0.95
DEFAULT_TEMPERATURE = 5
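As a rough orientation (the real wiring lives in models/cpp_qwen2.py and app.py, so details may differ), MAX_SEQUENCE_LENGTH is the kind of value used for the model's context window, while DEFAULT_MAX_NEW_TOKENS only caps how many new tokens a single call may generate:

```python
# Sketch only: how these constants typically map onto llama-cpp-python parameters.
# The model path is a placeholder; the actual construction happens in models/cpp_qwen2.py.
import llama_cpp
import config

llm = llama_cpp.Llama(
    model_path="qwen2-0.5b-instruct-q8_0.gguf",
    n_ctx=config.MAX_SEQUENCE_LENGTH,  # full context window (prompt + generated tokens)
)
output = llm.create_completion(
    "Hello",
    max_tokens=config.DEFAULT_MAX_NEW_TOKENS,  # cap on newly generated tokens per call
    temperature=config.DEFAULT_TEMPERATURE,
    top_p=config.DEFAULT_TOP_P,
    top_k=config.DEFAULT_TOP_K,
)
print(output["choices"][0]["text"])
```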
models/cpp_qwen2.py CHANGED
@@ -1,6 +1,4 @@
"""
-
-
## convert to gguf

python convert_hf_to_gguf.py /workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct/
@@ -15,49 +13,45 @@ python convert_hf_to_gguf.py /workspace/xusong/huggingface/models/Qwen2-0.5B-Ins


**Chongqing GPU server, empty cache**
- llama_print_timings: load time = 1711.48 ms
- llama_print_timings: sample time = 214.87 ms / 122 runs ( 1.76 ms per token, 567.78 tokens per second)
- llama_print_timings: prompt eval time = 892.14 ms / 5 tokens ( 178.43 ms per token, 5.60 tokens per second)
- llama_print_timings: eval time = 4277.26 ms / 121 runs ( 35.35 ms per token, 28.29 tokens per second)
- llama_print_timings: total time = 8351.28 ms / 126 tokens
-
- llama_print_timings: load time = 1711.48 ms
- llama_print_timings: sample time = 45.11 ms / 25 runs ( 1.80 ms per token, 554.24 tokens per second)
- llama_print_timings: prompt eval time = 1059.46 ms / 5 tokens ( 211.89 ms per token, 4.72 tokens per second)
- llama_print_timings: eval time = 843.71 ms / 24 runs ( 35.15 ms per token, 28.45 tokens per second)
- llama_print_timings: total time = 2501.50 ms / 29 tokens
-
-
- llama_print_timings: load time = 1711.48 ms
- llama_print_timings: sample time = 227.75 ms / 125 runs ( 1.82 ms per token, 548.85 tokens per second)
- llama_print_timings: prompt eval time = 2056.86 ms / 5 tokens ( 411.37 ms per token, 2.43 tokens per second)
- llama_print_timings: eval time = 4657.86 ms / 124 runs ( 37.56 ms per token, 26.62 tokens per second)
- llama_print_timings: total time = 9532.50 ms / 129 tokens
-
llama_print_timings: load time = 1711.48 ms
llama_print_timings: sample time = 73.89 ms / 41 runs ( 1.80 ms per token, 554.84 tokens per second)
- llama_print_timings: prompt eval time = 2621.25 ms / 5 tokens ( 524.25 ms per token, 1.91 tokens per second) # 0.5 s/token
+ llama_print_timings: prompt eval time = 2621.25 ms / 5 tokens ( 524.25 ms per token, 1.91 tokens per second) # 0.2-0.5 s/token
llama_print_timings: eval time = 1430.91 ms / 40 runs ( 35.77 ms per token, 27.95 tokens per second)
llama_print_timings: total time = 4848.09 ms / 45 tokens

- **hf-space, empty cache** -----------
+ llama_print_timings: load time = 1939.72 ms
+ llama_print_timings: sample time = 286.69 ms / 170 runs ( 1.69 ms per token, 592.99 tokens per second)
+ llama_print_timings: prompt eval time = 0.00 ms / 0 tokens ( -nan ms per token, -nan tokens per second) # noticeably faster after warmup
+ llama_print_timings: eval time = 5737.50 ms / 170 runs ( 33.75 ms per token, 29.63 tokens per second)
+ llama_print_timings: total time = 8219.82 ms / 170 tokens
+
+
+ **hf-space, empty cache (GGML_BLAS disabled)** -----------
llama_print_timings: load time = 28230.06 ms
- llama_print_timings: sample time = 147.58 ms / 8 runs ( 18.45 ms per token, 54.21 tokens per second)
- llama_print_timings: prompt eval time = 28864.82 ms / 5 tokens ( 5772.96 ms per token, 0.17 tokens per second) # 5.7 s/token
+ llama_print_timings: sample time = 147.58 ms / 8 runs ( 18.45 ms per token, 54.21 tokens per second) # 18ms/token
+ llama_print_timings: prompt eval time = 28864.82 ms / 5 tokens ( 5772.96 ms per token, 0.17 tokens per second) # 5.7s/token
llama_print_timings: eval time = 1557.94 ms / 7 runs ( 222.56 ms per token, 4.49 tokens per second)
llama_print_timings: total time = 30753.48 ms / 12 tokens

- llama_print_timings: load time = 28230.06 ms
- llama_print_timings: sample time = 74.34 ms / 61 runs ( 1.22 ms per token, 820.52 tokens per second)
- llama_print_timings: prompt eval time = 28821.26 ms / 9 tokens ( 3202.36 ms per token, 0.31 tokens per second)
- llama_print_timings: eval time = 21634.71 ms / 60 runs ( 360.58 ms per token, 2.77 tokens per second)
- llama_print_timings: total time = 51255.55 ms / 69 tokens

- llama_print_timings: load time = 28230.06 ms
- llama_print_timings: sample time = 98.03 ms / 68 runs ( 1.44 ms per token, 693.66 tokens per second)
- llama_print_timings: prompt eval time = 27749.35 ms / 5 tokens ( 5549.87 ms per token, 0.18 tokens per second)
- llama_print_timings: eval time = 26998.58 ms / 67 runs ( 402.96 ms per token, 2.48 tokens per second)
- llama_print_timings: total time = 56335.37 ms / 72 tokens
+ **hf-space, empty cache (GGML_BLAS enabled)** -----------
+ llama_print_timings: load time = 27347.29 ms
+ llama_print_timings: sample time = 82.53 ms / 26 runs ( 3.17 ms per token, 315.05 tokens per second) # 3ms/token
+ llama_print_timings: prompt eval time = 28855.64 ms / 9 tokens ( 3206.18 ms per token, 0.31 tokens per second) # 3s/token
+ llama_print_timings: eval time = 9810.01 ms / 25 runs ( 392.40 ms per token, 2.55 tokens per second)
+ llama_print_timings: total time = 39073.77 ms / 34 tokens
+
+ llama_print_timings: load time = 27347.29 ms
+ llama_print_timings: sample time = 272.12 ms / 96 runs ( 2.83 ms per token, 352.79 tokens per second) # 2.8ms/token
+ llama_print_timings: prompt eval time = 0.00 ms / 0 tokens ( -nan ms per token, -nan tokens per second)
+ llama_print_timings: eval time = 19974.85 ms / 96 runs ( 208.07 ms per token, 4.81 tokens per second)
+ llama_print_timings: total time = 22517.08 ms / 96 tokens
+
+
+ ## TODO:
+
+ - Fix the slow warmup
+ - Support cache, and pre-cache all preset system prompts in advance.

## reference

@@ -75,7 +69,6 @@ import os

from models.base_model import Simulator
import llama_cpp
- # import llama_cpp.llama_tokenizer
from transformers import AutoTokenizer
from utils.logging_util import logger
import config
@@ -121,7 +114,7 @@ class Qwen2Simulator(Simulator):
temperature=config.DEFAULT_TEMPERATURE,
top_p=config.DEFAULT_TOP_P,
top_k=config.DEFAULT_TOP_K,
- max_tokens=config.DEFAULT_MAX_TOKENS,
+ max_tokens=config.DEFAULT_MAX_NEW_TOKENS,
repeat_penalty=1.1,
# qwen2-0.5b-chat sometimes ends generation without <|im_end|>, jumping straight to <|im_start|>
stop=self.stop_words,
@@ -131,12 +124,9 @@
self.assistant_start_tokens = self.tokenize("<|im_start|>assistant\n")
# self.llm.generate .set_cache .last_n_tokens_size .reset .ctx ._ctx

-
- self.cache_size=10
-
- cache = llama_cpp.LlamaRAMCache(capacity_bytes=self.cache_size)
-
- # self.llm.set_cache()
+ # cache = llama_cpp.LlamaDiskCache(capacity_bytes=cache_size)
+ cache = llama_cpp.LlamaRAMCache(capacity_bytes=2 << 30)
+ self.llm.set_cache(cache)

def tokenize(self, text):
return self.llm.tokenize(text.encode("utf-8"))
@@ -195,7 +185,8 @@
if stream["choices"][0]["finish_reason"] is None:
yield stream["choices"][0]["completion_text"], stream["choices"][0]["completion_tokens"]
else:
- logger.info(f'finish_reason {stream["choices"][0]["finish_reason"]} with text: {stream["choices"][0]["text"]}')
+ logger.info(
+ f'finish_reason {stream["choices"][0]["finish_reason"]} with text: {stream["choices"][0]["text"]}')

# warmup for next turn (speeds up decoding of the next turn)
if suffix_tokens:
@@ -204,11 +195,28 @@
self.llm.eval([151645, 198] + suffix_tokens) # increases n_tokens
logger.info(f"after warmup: n_tokens = {self.llm.n_tokens}")

+ def pre_cache_system(self, system_list):
+ """
+ :param system_list:
+ :return:
+ """
+ logger.info(f"cache size {self.llm.cache.cache_size}")
+ for system_prompt in system_list:
+ logger.info(f"pre caching {system_prompt}")
+ input_ids = self.tokenize(f"<|im_start|>system{system_prompt}<|im_end|>\n<|im_start|>user\n")
+ output = self.llm.create_completion(
+ input_ids,
+ stream=False,
+ max_tokens=3,
+ top_k=1
+ )
+ logger.info(f"cache size {self.llm.cache.cache_size}")

+ # disable cache after
+ llama_cpp.LlamaRAMCache.__setitem__ = lambda *args: None

-
-
-
+ def complete(self):
+ pass


bot = Qwen2Simulator()
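A possible way to use the new pre_cache_system hook at startup, in line with the TODO about pre-caching all preset system prompts; the prompt list below is illustrative and not part of the commit:

```python
# Illustrative only: warm the RAM cache for preset system prompts right after startup.
# The real prompt list lives elsewhere in the app; these strings are placeholders.
preset_systems = [
    "You are a helpful assistant.",
    "You simulate a curious user chatting with an assistant.",
]
bot.pre_cache_system(preset_systems)
```

Because pre_cache_system replaces LlamaRAMCache.__setitem__ with a no-op once the loop finishes, only the prefixes cached during this call stay in the cache; later generations can still read from it but no longer write new entries.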