xu song committed
Commit 726a01e · Parent(s): 698d703

update

Changed files:
- README.md +9 -1
- app.py +12 -9
- app_util.py +2 -2
- config.py +2 -3
- models/cpp_qwen2.py +55 -47
README.md CHANGED
@@ -10,4 +10,12 @@ pinned: false
 license: apache-2.0
 ---
 
-An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
+An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
+
+
+## Installation notes
+
+Installing directly from source makes inference slow, so the following build parameters are added.
+```sh
+pip install git+https://github.com/abetlen/llama-cpp-python.git -C cmake.args="-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS"
+```
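A quick way to confirm that the rebuilt wheel actually picked up OpenBLAS is to print llama.cpp's system info after import. A minimal sketch, assuming the installed llama-cpp-python version exposes the low-level `llama_print_system_info()` binding:

```python
# Sketch: verify that the locally built llama-cpp-python was compiled with BLAS support.
# Assumes llama_cpp exposes the low-level llama_print_system_info() binding (returns bytes).
import llama_cpp

info = llama_cpp.llama_print_system_info()
print(info.decode("utf-8"))  # look for a "BLAS = 1" / OpenBLAS backend entry in the output
```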
app.py CHANGED
@@ -36,8 +36,9 @@ Essentially, it is a form of model compression.
 
 with gr.Blocks() as demo:
     # Knowledge Distillation through Self Chatting
-    #
-
+    # Distilling the Knowledge from LLM through Self Chatting
+    # Generating Synthetic Data through Self Chat
+    gr.HTML("""<h1 align="center">Generating Synthetic Data through Self Chat</h1>""")
     with gr.Row():
         with gr.Column(scale=5):
             system = gr.Dropdown(
@@ -53,6 +54,8 @@ with gr.Blocks() as demo:
                 show_share_button=True,
                 avatar_images=("assets/man.png", "assets/bot.png"))
 
+            gr.Textbox("For faster inference, you can build locally with ")
+            # ss
             with gradio.Tab("Self Chat"):
                 input_text_1 = gr.Textbox(show_label=False, placeholder="...", lines=10, visible=False)
                 generate_btn = gr.Button("🤔️ Self-Chat", variant="primary")
@@ -66,6 +69,7 @@ with gr.Blocks() as demo:
                     "It is based on user simulator and response generator.",
                     visible=True)
 
+            # also called chat-assistant
             with gradio.Tab("Response Generator"):
                 with gr.Row():
                     input_text_2 = gr.Textbox(show_label=False, placeholder="Please type your input", scale=7)
@@ -76,6 +80,7 @@ with gr.Blocks() as demo:
                     clear_btn_2 = gr.Button("🗑️ Clear", variant="secondary", size="sm", )  # 🧹 Clear History
                 gr.Markdown("Response simulator is the most commonly used chatbot.")
 
+            #
             with gradio.Tab("User Simulator"):
                 with gr.Row():
                     input_text_3 = gr.Textbox(show_label=False, placeholder="Please type your response", scale=7)
@@ -96,8 +101,8 @@ with gr.Blocks() as demo:
                # visible=False
            )
            with gr.Accordion(label="Parameters", open=True):
-
-
+                slider_max_new_tokens = gr.Slider(minimum=1, maximum=4096,
+                                                  value=config.DEFAULT_MAX_NEW_TOKENS, step=1, label="Max tokens")
                slider_temperature = gr.Slider(minimum=0.1, maximum=10.0,
                                               value=config.DEFAULT_TEMPERATURE, step=0.1, label="Temperature",
                                               info="Larger temperature increase the randomness")
@@ -136,7 +141,7 @@ with gr.Blocks() as demo:
        .then(generate, [chatbot, history], outputs=[chatbot, history],
              show_progress="full")
    undo_btn_2.click(undo_generate, [chatbot, history], outputs=[chatbot, history])
-    clear_btn_2.click(reset_state, inputs=[system], outputs=[chatbot, history])\
+    clear_btn_2.click(reset_state, inputs=[system], outputs=[chatbot, history]) \
        .then(reset_user_input, outputs=[input_text_2])
 
    ######## tab3
@@ -147,12 +152,10 @@ with gr.Blocks() as demo:
        .then(generate, [chatbot, history], outputs=[chatbot, history],
              show_progress="full")
    undo_btn_3.click(undo_generate, [chatbot, history], outputs=[chatbot, history])
-    clear_btn_3.click(reset_state, inputs=[system], outputs=[chatbot, history])\
+    clear_btn_3.click(reset_state, inputs=[system], outputs=[chatbot, history]) \
        .then(reset_user_input, outputs=[input_text_3])
 
-
-
-    slider_max_tokens.change(set_max_tokens, inputs=[slider_max_tokens])
+    slider_max_new_tokens.change(set_max_new_tokens, inputs=[slider_max_new_tokens])
    slider_temperature.change(set_temperature, inputs=[slider_temperature])
    slider_top_p.change(set_top_p, inputs=[slider_top_p])
    slider_top_k.change(set_top_k, inputs=[slider_top_k])
app_util.py CHANGED
@@ -112,8 +112,8 @@ def reset_state(system):
     return [], [{"role": "system", "content": system}]
 
 
-def set_max_tokens(max_tokens):
-    bot.generation_kwargs["max_tokens"] = max_tokens
+def set_max_new_tokens(max_new_tokens):
+    bot.generation_kwargs["max_tokens"] = max_new_tokens
 
 def set_temperature(temperature):
     bot.generation_kwargs["temperature"] = temperature
config.py CHANGED
@@ -1,9 +1,8 @@
 
 
-MAX_SEQUENCE_LENGTH = 
+MAX_SEQUENCE_LENGTH = 32768  # max_seq_len
 
-
-# DEFAULT_MAX_NEW_TOKENS = None
+DEFAULT_MAX_NEW_TOKENS = 128
 DEFAULT_TOP_K = 100
 DEFAULT_TOP_P = 0.95
 DEFAULT_TEMPERATURE = 5
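For context, a hedged sketch of how these constants would typically be consumed when the llama-cpp-python model and its default sampling settings are set up; the gguf path below is a placeholder, and mapping `MAX_SEQUENCE_LENGTH` to `n_ctx` is an assumption rather than something the diff states:

```python
# Sketch of how config.py's constants could feed llama-cpp-python (assumed mapping).
import llama_cpp
import config

llm = llama_cpp.Llama(
    model_path="Qwen2-0.5B-Instruct-fp16.gguf",  # placeholder path
    n_ctx=config.MAX_SEQUENCE_LENGTH,            # presumably the context window (max_seq_len)
    verbose=False,
)

generation_kwargs = dict(
    max_tokens=config.DEFAULT_MAX_NEW_TOKENS,
    temperature=config.DEFAULT_TEMPERATURE,
    top_p=config.DEFAULT_TOP_P,
    top_k=config.DEFAULT_TOP_K,
)
```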
models/cpp_qwen2.py CHANGED
@@ -1,6 +1,4 @@
 """
-
-
 ## convert to gguf
 
 python convert_hf_to_gguf.py /workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct/
@@ -15,49 +13,45 @@ python convert_hf_to_gguf.py /workspace/xusong/huggingface/models/Qwen2-0.5B-Ins
 
 
 **Chongqing GPU server, empty cache**
-llama_print_timings:        load time =    1711.48 ms
-llama_print_timings:      sample time =     214.87 ms /   122 runs   (    1.76 ms per token,   567.78 tokens per second)
-llama_print_timings: prompt eval time =     892.14 ms /     5 tokens (  178.43 ms per token,     5.60 tokens per second)
-llama_print_timings:        eval time =    4277.26 ms /   121 runs   (   35.35 ms per token,    28.29 tokens per second)
-llama_print_timings:       total time =    8351.28 ms /   126 tokens
-
-llama_print_timings:        load time =    1711.48 ms
-llama_print_timings:      sample time =      45.11 ms /    25 runs   (    1.80 ms per token,   554.24 tokens per second)
-llama_print_timings: prompt eval time =    1059.46 ms /     5 tokens (  211.89 ms per token,     4.72 tokens per second)
-llama_print_timings:        eval time =     843.71 ms /    24 runs   (   35.15 ms per token,    28.45 tokens per second)
-llama_print_timings:       total time =    2501.50 ms /    29 tokens
-
-
-llama_print_timings:        load time =    1711.48 ms
-llama_print_timings:      sample time =     227.75 ms /   125 runs   (    1.82 ms per token,   548.85 tokens per second)
-llama_print_timings: prompt eval time =    2056.86 ms /     5 tokens (  411.37 ms per token,     2.43 tokens per second)
-llama_print_timings:        eval time =    4657.86 ms /   124 runs   (   37.56 ms per token,    26.62 tokens per second)
-llama_print_timings:       total time =    9532.50 ms /   129 tokens
-
 llama_print_timings:        load time =    1711.48 ms
 llama_print_timings:      sample time =      73.89 ms /    41 runs   (    1.80 ms per token,   554.84 tokens per second)
-llama_print_timings: prompt eval time =    2621.25 ms /     5 tokens (  524.25 ms per token,     1.91 tokens per second)  # 0.5 s/token
+llama_print_timings: prompt eval time =    2621.25 ms /     5 tokens (  524.25 ms per token,     1.91 tokens per second)  # 0.2-0.5 s/token
 llama_print_timings:        eval time =    1430.91 ms /    40 runs   (   35.77 ms per token,    27.95 tokens per second)
 llama_print_timings:       total time =    4848.09 ms /    45 tokens
 
-
+llama_print_timings:        load time =    1939.72 ms
+llama_print_timings:      sample time =     286.69 ms /   170 runs   (    1.69 ms per token,   592.99 tokens per second)
+llama_print_timings: prompt eval time =       0.00 ms /     0 tokens (    -nan ms per token,     -nan tokens per second)  # clearly faster after warmup
+llama_print_timings:        eval time =    5737.50 ms /   170 runs   (   33.75 ms per token,    29.63 tokens per second)
+llama_print_timings:       total time =    8219.82 ms /   170 tokens
+
+
+**hf-space, empty cache (GGML_BLAS off)** -----------
 llama_print_timings:        load time =   28230.06 ms
-llama_print_timings:      sample time =     147.58 ms /     8 runs   (   18.45 ms per token,    54.21 tokens per second)
-llama_print_timings: prompt eval time =   28864.82 ms /     5 tokens ( 5772.96 ms per token,     0.17 tokens per second)  # 5.
+llama_print_timings:      sample time =     147.58 ms /     8 runs   (   18.45 ms per token,    54.21 tokens per second)  # 18 ms/token
+llama_print_timings: prompt eval time =   28864.82 ms /     5 tokens ( 5772.96 ms per token,     0.17 tokens per second)  # 5.7 s/token
 llama_print_timings:        eval time =    1557.94 ms /     7 runs   (  222.56 ms per token,     4.49 tokens per second)
 llama_print_timings:       total time =   30753.48 ms /    12 tokens
 
-llama_print_timings:        load time =   28230.06 ms
-llama_print_timings:      sample time =      74.34 ms /    61 runs   (    1.22 ms per token,   820.52 tokens per second)
-llama_print_timings: prompt eval time =   28821.26 ms /     9 tokens ( 3202.36 ms per token,     0.31 tokens per second)
-llama_print_timings:        eval time =   21634.71 ms /    60 runs   (  360.58 ms per token,     2.77 tokens per second)
-llama_print_timings:       total time =   51255.55 ms /    69 tokens
 
-
-llama_print_timings:
-llama_print_timings:
-llama_print_timings:
-llama_print_timings:
+**hf-space, empty cache (GGML_BLAS on)** -----------
+llama_print_timings:        load time =   27347.29 ms
+llama_print_timings:      sample time =      82.53 ms /    26 runs   (    3.17 ms per token,   315.05 tokens per second)  # 3 ms/token
+llama_print_timings: prompt eval time =   28855.64 ms /     9 tokens ( 3206.18 ms per token,     0.31 tokens per second)  # 3 s/token
+llama_print_timings:        eval time =    9810.01 ms /    25 runs   (  392.40 ms per token,     2.55 tokens per second)
+llama_print_timings:       total time =   39073.77 ms /    34 tokens
+
+llama_print_timings:        load time =   27347.29 ms
+llama_print_timings:      sample time =     272.12 ms /    96 runs   (    2.83 ms per token,   352.79 tokens per second)  # 2.8 ms/token
+llama_print_timings: prompt eval time =       0.00 ms /     0 tokens (    -nan ms per token,     -nan tokens per second)
+llama_print_timings:        eval time =   19974.85 ms /    96 runs   (  208.07 ms per token,     4.81 tokens per second)
+llama_print_timings:       total time =   22517.08 ms /    96 tokens
+
+
+## TODO:
+
+- fix the slow warmup
+- support caching, and pre-cache all preset system prompts up front
 
 ## reference
 
@@ -75,7 +69,6 @@ import os
 
 from models.base_model import Simulator
 import llama_cpp
-# import llama_cpp.llama_tokenizer
 from transformers import AutoTokenizer
 from utils.logging_util import logger
 import config
@@ -121,7 +114,7 @@ class Qwen2Simulator(Simulator):
             temperature=config.DEFAULT_TEMPERATURE,
             top_p=config.DEFAULT_TOP_P,
             top_k=config.DEFAULT_TOP_K,
-            max_tokens=config.
+            max_tokens=config.DEFAULT_MAX_NEW_TOKENS,
             repeat_penalty=1.1,
             # qwen2-0.5b-chat sometimes finishes generation without <|im_end|>, going straight to <|im_start|>
             stop=self.stop_words,
@@ -131,12 +124,9 @@ class Qwen2Simulator(Simulator):
         self.assistant_start_tokens = self.tokenize("<|im_start|>assistant\n")
         # self.llm.generate  .set_cache  .last_n_tokens_size  .reset  .ctx  ._ctx
 
-
-
-
-        cache = llama_cpp.LlamaRAMCache(capacity_bytes=self.cache_size)
-
-        # self.llm.set_cache()
+        # cache = llama_cpp.LlamaDiskCache(capacity_bytes=cache_size)
+        cache = llama_cpp.LlamaRAMCache(capacity_bytes=2 << 30)
+        self.llm.set_cache(cache)
 
     def tokenize(self, text):
         return self.llm.tokenize(text.encode("utf-8"))
@@ -195,7 +185,8 @@ class Qwen2Simulator(Simulator):
             if stream["choices"][0]["finish_reason"] is None:
                 yield stream["choices"][0]["completion_text"], stream["choices"][0]["completion_tokens"]
             else:
-                logger.info(
+                logger.info(
+                    f'finish_reason {stream["choices"][0]["finish_reason"]} with text: {stream["choices"][0]["text"]}')
 
         # warmup for next turn (speeds up decoding of the next turn)
         if suffix_tokens:
@@ -204,11 +195,28 @@ class Qwen2Simulator(Simulator):
             self.llm.eval([151645, 198] + suffix_tokens)  # increases n_tokens
             logger.info(f"after warmup: n_tokens = {self.llm.n_tokens}")
 
+    def pre_cache_system(self, system_list):
+        """
+        :param system_list:
+        :return:
+        """
+        logger.info(f"cache size {self.llm.cache.cache_size}")
+        for system_prompt in system_list:
+            logger.info(f"pre caching {system_prompt}")
+            input_ids = self.tokenize(f"<|im_start|>system{system_prompt}<|im_end|>\n<|im_start|>user\n")
+            output = self.llm.create_completion(
+                input_ids,
+                stream=False,
+                max_tokens=3,
+                top_k=1
+            )
+            logger.info(f"cache size {self.llm.cache.cache_size}")
 
-
-
-
+        # disable cache after
+        llama_cpp.LlamaRAMCache.__setitem__ = lambda *args: None
+
+    def complete(self):
+        pass
 
 
 bot = Qwen2Simulator()
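Taken together, the cache changes above amount to: install a `LlamaRAMCache` via `set_cache`, warm it once per preset system prompt with a tiny completion, then patch `__setitem__` to a no-op so the cache stops growing but the pre-computed prefixes stay available. A standalone sketch of the same idea against the public llama-cpp-python API; the model path and system prompts below are placeholders, not values from this repo:

```python
# Sketch: pre-populate llama-cpp-python's RAM prompt cache with preset system prompts,
# then freeze the cache so later conversations reuse it without adding new entries.
import llama_cpp

llm = llama_cpp.Llama(model_path="qwen2-0.5b-instruct-fp16.gguf", n_ctx=4096, verbose=False)

cache = llama_cpp.LlamaRAMCache(capacity_bytes=2 << 30)  # ~2 GiB of cached KV state
llm.set_cache(cache)

SYSTEM_PROMPTS = [
    "You are a helpful assistant.",
    "You are a rigorous translator.",
]

for system_prompt in SYSTEM_PROMPTS:
    # Evaluate the chat prefix once; a tiny completion is enough to store its state in the cache.
    prefix = f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n"
    llm.create_completion(prefix, max_tokens=1, top_k=1, stream=False)

# Freeze the cache: lookups still hit the pre-cached prefixes, but nothing new is stored.
llama_cpp.LlamaRAMCache.__setitem__ = lambda *args: None
```

Monkey-patching `__setitem__` keeps cache lookups working while presumably avoiding the cost of snapshotting the KV state after every later completion; the trade-off is that only the prefixes cached at startup ever benefit.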