yibolu committed on
Commit
693dde8
0 Parent(s):

Feat: Add support for cuda 11.x and faster model load speed

Browse files
.gitattributes ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ models/glm6b-kv-cache-dy-bs8.ftm filter=lfs diff=lfs merge=lfs -text
36
+ models/glm6b-bs8.ftm filter=lfs diff=lfs merge=lfs -text
37
+ *.so filter=lfs diff=lfs merge=lfs -text
CHANGES.rst ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ Changelog (lyraChatGLM)
2
+
3
+ ## 2.0
4
+ - rebuild the whole system using a modified FasterTransformer
5
+ - add dynamic library & models for Volta architecture.
6
+ - further acceleration, remove token generation limits.
7
+
8
+ ## 1.0
9
+
10
+ - add lyraChatGLM model, from original weights
LISENCE ADDED
@@ -0,0 +1,420 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Tencent Music Entertainment
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
22
+
23
+
24
+ Other dependencies and licenses:
25
+
26
+ Open Source Software Licensed under The ChatGLM-6B License and the Apache License Version 2.0 :
27
+ --------------------------------------------------------------------
28
+ 1. chatglm-6b
29
+
30
+ File:https://github.com/THUDM/ChatGLM-6B
31
+ License:The ChatGLM-6B License and Apache License Version 2.0
32
+ For details:https://github.com/THUDM/ChatGLM-6B/blob/main/MODEL_LICENSE
33
+ https://github.com/THUDM/ChatGLM-6B/blob/main/LICENSE
34
+
35
+ APPENDIX: How to apply the Apache License to your work.
36
+
37
+ To apply the Apache License to your work, attach the following
38
+ boilerplate notice, with the fields enclosed by brackets "[]"
39
+ replaced with your own identifying information. (Don't include
40
+ the brackets!) The text should be enclosed in the appropriate
41
+ comment syntax for the file format. We also recommend that a
42
+ file or class name and description of purpose be included on the
43
+ same "printed page" as the copyright notice for easier
44
+ identification within third-party archives.
45
+
46
+ Copyright Zhengxiao Du
47
+
48
+ Licensed under the Apache License, Version 2.0 (the "License");
49
+ you may not use this file except in compliance with the License.
50
+ You may obtain a copy of the License at
51
+
52
+ http://www.apache.org/licenses/LICENSE-2.0
53
+
54
+ Unless required by applicable law or agreed to in writing, software
55
+ distributed under the License is distributed on an "AS IS" BASIS,
56
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
57
+ See the License for the specific language governing permissions and
58
+ limitations under the License.
59
+
60
+ A copy of the Apache License Version 2.0 is included in this file.
61
+
62
+
63
+ Terms of The ChatGLM-6B License:
64
+ --------------------------------------------------------------------
65
+
66
+ 一、定义
67
+
68
+ “许可方”是指分发其软件的 ChatGLM-6B 模型团队。
69
+
70
+ “软件”是指根据本许可提供的 ChatGLM-6B 模型参数。
71
+
72
+ 2. 许可授予
73
+
74
+ 根据本许可的条款和条件,许可方特此授予您非排他性、全球性、不可转让、不可再许可、可撤销、免版税的版权许可,仅用于您的非商业研究目的。
75
+
76
+ 上述版权声明和本许可声明应包含在本软件的所有副本或重要部分中。
77
+
78
+ 3.限制
79
+
80
+ 您不得出于任何商业、军事或非法目的使用、复制、修改、合并、发布、分发、复制或创建本软件的全部或部分衍生作品。
81
+
82
+ 您不得利用本软件从事任何危害国家安全和国家统一、危害社会公共利益、侵犯人身权益的行为。
83
+
84
+ 4.免责声明
85
+
86
+ 本软件“按原样”提供,不提供任何明示或暗示的保证,包括但不限于对适销性、特定用途的适用性和非侵权性的保证。 在任何情况下,作者或版权持有人均不对任何索赔、损害或其他责任负责,无论是在合同诉讼、侵权行为还是其他方面,由软件或软件的使用或其他交易引起、由软件引起或与之相关 软件。
87
+
88
+ 5. 责任限制
89
+
90
+ 除适用法律禁止的范围外,在任何情况下且根据任何法律理论,无论是基于侵权行为、疏忽、合同、责任或其他原因,任何许可方均不对您承担任何直接、间接、特殊、偶然、示范性、 或间接损害,或任何其他商业损失,即使许可人已被告知此类损害的可能性。
91
+
92
+ 6.争议解决
93
+
94
+ 本许可受中华人民共和国法律管辖并按其解释。 因本许可引起的或与本许可有关的任何争议应提交北京市海淀区人民法院。
95
+
96
+ 请注意,许可证可能会更新到更全面的版本。 有关许可和版权的任何问题,请通过 [email protected] 与我们联系。
97
+
98
+ 1. Definitions
99
+
100
+ “Licensor” means the ChatGLM-6B Model Team that distributes its Software.
101
+
102
+ “Software” means the ChatGLM-6B model parameters made available under this license.
103
+
104
+ 2. License Grant
105
+
106
+ Subject to the terms and conditions of this License, the Licensor hereby grants to you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty-free copyright license to use the Software solely for your non-commercial research purposes.
107
+
108
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
109
+
110
+ 3. Restriction
111
+
112
+ You will not use, copy, modify, merge, publish, distribute, reproduce, or create derivative works of the Software, in whole or in part, for any commercial, military, or illegal purposes.
113
+
114
+ You will not use the Software for any act that may undermine China's national security and national unity, harm the public interest of society, or infringe upon the rights and interests of human beings.
115
+
116
+ 4. Disclaimer
117
+
118
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
119
+
120
+ 5. Limitation of Liability
121
+
122
+ EXCEPT TO THE EXTENT PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER BASED IN TORT, NEGLIGENCE, CONTRACT, LIABILITY, OR OTHERWISE WILL ANY LICENSOR BE LIABLE TO YOU FOR ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES, OR ANY OTHER COMMERCIAL LOSSES, EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
123
+
124
+ 6. Dispute Resolution
125
+
126
+ This license shall be governed and construed in accordance with the laws of People’s Republic of China. Any dispute arising from or in connection with this License shall be submitted to Haidian District People's Court in Beijing.
127
+
128
+ Note that the license is subject to update to a more comprehensive version. For any questions related to the license and copyright, please contact us at [email protected].
129
+
130
+
131
+ Open Source Software Licensed under the Apache License Version 2.0:
132
+ --------------------------------------------------------------------
133
+ 1. huggingface/transformers
134
+ Copyright 2018- The Hugging Face team. All rights reserved.
135
+
136
+
137
+ Terms of the Apache License Version 2.0:
138
+ --------------------------------------------------------------------
139
+ Apache License
140
+
141
+ Version 2.0, January 2004
142
+
143
+ http://www.apache.org/licenses/
144
+
145
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
146
+ 1. Definitions.
147
+
148
+ "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.
149
+
150
+ "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.
151
+
152
+ "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.
153
+
154
+ "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.
155
+
156
+ "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.
157
+
158
+ "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.
159
+
160
+ "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).
161
+
162
+ "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.
163
+
164
+ "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
165
+
166
+ "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.
167
+
168
+ 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.
169
+
170
+ 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.
171
+
172
+ 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
173
+
174
+ You must give any other recipients of the Work or Derivative Works a copy of this License; and
175
+
176
+ You must cause any modified files to carry prominent notices stating that You changed the files; and
177
+
178
+ You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and
179
+
180
+ If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.
181
+
182
+ You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.
183
+
184
+ 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.
185
+
186
+ 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.
187
+
188
+ 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.
189
+
190
+ 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
191
+
192
+ 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
193
+
194
+ END OF TERMS AND CONDITIONS
195
+
196
+
197
+ Open Source Software Licensed under the Modified BSD License:
198
+ --------------------------------------------------------------------
199
+ 1. pytorch
200
+
201
+ From PyTorch:
202
+
203
+ Copyright (c) 2016- Facebook, Inc (Adam Paszke)
204
+ Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
205
+ Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
206
+ Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
207
+ Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
208
+ Copyright (c) 2011-2013 NYU (Clement Farabet)
209
+ Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
210
+ Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
211
+ Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
212
+
213
+ From Caffe2:
214
+
215
+ Copyright (c) 2016-present, Facebook Inc. All rights reserved.
216
+
217
+ All contributions by Facebook:
218
+ Copyright (c) 2016 Facebook Inc.
219
+
220
+ All contributions by Google:
221
+ Copyright (c) 2015 Google Inc.
222
+ All rights reserved.
223
+
224
+ All contributions by Yangqing Jia:
225
+ Copyright (c) 2015 Yangqing Jia
226
+ All rights reserved.
227
+
228
+ All contributions by Kakao Brain:
229
+ Copyright 2019-2020 Kakao Brain
230
+
231
+ All contributions by Cruise LLC:
232
+ Copyright (c) 2022 Cruise LLC.
233
+ All rights reserved.
234
+
235
+ All contributions from Caffe:
236
+ Copyright(c) 2013, 2014, 2015, the respective contributors
237
+ All rights reserved.
238
+
239
+ All other contributions:
240
+ Copyright(c) 2015, 2016 the respective contributors
241
+ All rights reserved.
242
+
243
+ Caffe2 uses a copyright model similar to Caffe: each contributor holds
244
+ copyright over their contributions to Caffe2. The project versioning records
245
+ all such contribution and copyright details. If a contributor wants to further
246
+ mark their specific copyright on a particular contribution, they should
247
+ indicate their copyright solely in the commit message of the change when it is
248
+ committed.
249
+
250
+ All rights reserved.
251
+
252
+
253
+ Terms of the Modified BSD License:
254
+ -------------------------------------------------------------------
255
+ This project is licensed under the terms of the Modified BSD License, as follows:
256
+
257
+ Redistribution and use in source and binary forms, with or without
258
+ modification, are permitted provided that the following conditions are met:
259
+
260
+ 1. Redistributions of source code must retain the above copyright
261
+ notice, this list of conditions and the following disclaimer.
262
+
263
+ 2. Redistributions in binary form must reproduce the above copyright
264
+ notice, this list of conditions and the following disclaimer in the
265
+ documentation and/or other materials provided with the distribution.
266
+
267
+ 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
268
+ and IDIAP Research Institute nor the names of its contributors may be
269
+ used to endorse or promote products derived from this software without
270
+ specific prior written permission.
271
+
272
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
273
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
274
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
275
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
276
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
277
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
278
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
279
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
280
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
281
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
282
+ POSSIBILITY OF SUCH DAMAGE.
283
+
284
+
285
+ Open Source Software Licensed under the Python Software Foundation License Version 2:
286
+ --------------------------------------------------------------------------
287
+ 1. Python/cpython
288
+ Copyright © 2001-2023 Python Software Foundation. All rights reserved
289
+
290
+
291
+ A. HISTORY OF THE SOFTWARE
292
+ ==========================
293
+
294
+ Python was created in the early 1990s by Guido van Rossum at Stichting
295
+ Mathematisch Centrum (CWI, see https://www.cwi.nl) in the Netherlands
296
+ as a successor of a language called ABC. Guido remains Python's
297
+ principal author, although it includes many contributions from others.
298
+
299
+ In 1995, Guido continued his work on Python at the Corporation for
300
+ National Research Initiatives (CNRI, see https://www.cnri.reston.va.us)
301
+ in Reston, Virginia where he released several versions of the
302
+ software.
303
+
304
+ In May 2000, Guido and the Python core development team moved to
305
+ BeOpen.com to form the BeOpen PythonLabs team. In October of the same
306
+ year, the PythonLabs team moved to Digital Creations, which became
307
+ Zope Corporation. In 2001, the Python Software Foundation (PSF, see
308
+ https://www.python.org/psf/) was formed, a non-profit organization
309
+ created specifically to own Python-related Intellectual Property.
310
+ Zope Corporation was a sponsoring member of the PSF.
311
+
312
+ All Python releases are Open Source (see https://opensource.org for
313
+ the Open Source Definition). Historically, most, but not all, Python
314
+ releases have also been GPL-compatible; the table below summarizes
315
+ the various releases.
316
+
317
+ Release Derived Year Owner GPL-
318
+ from compatible? (1)
319
+
320
+ 0.9.0 thru 1.2 1991-1995 CWI yes
321
+ 1.3 thru 1.5.2 1.2 1995-1999 CNRI yes
322
+ 1.6 1.5.2 2000 CNRI no
323
+ 2.0 1.6 2000 BeOpen.com no
324
+ 1.6.1 1.6 2001 CNRI yes (2)
325
+ 2.1 2.0+1.6.1 2001 PSF no
326
+ 2.0.1 2.0+1.6.1 2001 PSF yes
327
+ 2.1.1 2.1+2.0.1 2001 PSF yes
328
+ 2.1.2 2.1.1 2002 PSF yes
329
+ 2.1.3 2.1.2 2002 PSF yes
330
+ 2.2 and above 2.1.1 2001-now PSF yes
331
+
332
+ Footnotes:
333
+
334
+ (1) GPL-compatible doesn't mean that we're distributing Python under
335
+ the GPL. All Python licenses, unlike the GPL, let you distribute
336
+ a modified version without making your changes open source. The
337
+ GPL-compatible licenses make it possible to combine Python with
338
+ other software that is released under the GPL; the others don't.
339
+
340
+ (2) According to Richard Stallman, 1.6.1 is not GPL-compatible,
341
+ because its license has a choice of law clause. According to
342
+ CNRI, however, Stallman's lawyer has told CNRI's lawyer that 1.6.1
343
+ is "not incompatible" with the GPL.
344
+
345
+ Thanks to the many outside volunteers who have worked under Guido's
346
+ direction to make these releases possible.
347
+
348
+
349
+ B. TERMS AND CONDITIONS FOR ACCESSING OR OTHERWISE USING PYTHON
350
+ ===============================================================
351
+
352
+ Python software and documentation are licensed under the
353
+ Python Software Foundation License Version 2.
354
+
355
+ Starting with Python 3.8.6, examples, recipes, and other code in
356
+ the documentation are dual licensed under the PSF License Version 2
357
+ and the Zero-Clause BSD license.
358
+
359
+ Some software incorporated into Python is under different licenses.
360
+ The licenses are listed with code falling under that license.
361
+
362
+
363
+ PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2
364
+ --------------------------------------------
365
+
366
+ 1. This LICENSE AGREEMENT is between the Python Software Foundation
367
+ ("PSF"), and the Individual or Organization ("Licensee") accessing and
368
+ otherwise using this software ("Python") in source or binary form and
369
+ its associated documentation.
370
+
371
+ 2. Subject to the terms and conditions of this License Agreement, PSF hereby
372
+ grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce,
373
+ analyze, test, perform and/or display publicly, prepare derivative works,
374
+ distribute, and otherwise use Python alone or in any derivative version,
375
+ provided, however, that PSF's License Agreement and PSF's notice of copyright,
376
+ i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
377
+ 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023 Python Software Foundation;
378
+ All Rights Reserved" are retained in Python alone or in any derivative version
379
+ prepared by Licensee.
380
+
381
+ 3. In the event Licensee prepares a derivative work that is based on
382
+ or incorporates Python or any part thereof, and wants to make
383
+ the derivative work available to others as provided herein, then
384
+ Licensee hereby agrees to include in any such work a brief summary of
385
+ the changes made to Python.
386
+
387
+ 4. PSF is making Python available to Licensee on an "AS IS"
388
+ basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
389
+ IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND
390
+ DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS
391
+ FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT
392
+ INFRINGE ANY THIRD PARTY RIGHTS.
393
+
394
+ 5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON
395
+ FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS
396
+ A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON,
397
+ OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
398
+
399
+ 6. This License Agreement will automatically terminate upon a material
400
+ breach of its terms and conditions.
401
+
402
+ 7. Nothing in this License Agreement shall be deemed to create any
403
+ relationship of agency, partnership, or joint venture between PSF and
404
+ Licensee. This License Agreement does not grant permission to use PSF
405
+ trademarks or trade name in a trademark sense to endorse or promote
406
+ products or services of Licensee, or any third party.
407
+
408
+ 8. By copying, installing or otherwise using Python, Licensee
409
+ agrees to be bound by the terms and conditions of this License
410
+ Agreement.
411
+
412
+
413
+ Open Source Software:
414
+ --------------------------------------------------------------------
415
+ 1. icetk
416
+ File:https://github.com/THUDM/icetk
417
+
418
+
419
+
420
+
README.md ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ language: en
4
+ tags:
5
+ - LLM
6
+ - ChatGLM6B
7
+ ---
8
+ ## New Features (2023-06-20)
9
+ - We now support both CUDA 11.X and CUDA 12.X
10
+ - lyraChatGLM has been further optimized: model load time is reduced from a few minutes to less than 10s in non-int8 mode, and to around 1 min in int8 mode!
11
+
12
+ ## Breaking Changes!
13
+
14
+ **We know what you want, and here you go!**
15
+
16
+ - Newly released lyraChatGLM model, suitable for Ampere (A100/A10) as well as Volta (V100)
17
+ - lyraChatGLM has been further optimized, reaching **9000 tokens/s** on A100 and **3900 tokens/s** on V100, about **5.5x** faster than the up-to-date official version (2023/6/1).
18
+ - The memory usage was optimized too, now we can set batch_size up to **256** on A100!
19
+ - INT8 weight only PTQ is supported
20
+
21
+ **Note that the code was fully updated too, you need to use the new API, see `Uses` below**
22
+
23
+ If you like our work and are considering joining us, feel free to drop a line to [email protected].
24
+
25
+ P.S. Recently we have received a lot of inquiries about accelerating customized models. At this moment we **do not plan** to release the conversion tool, nor do we think it would be possible to apply your customized models to our current release.
26
+
27
+ ****
28
+
29
+ ## Model Card for lyraChatGLM
30
+
31
+ lyraChatGLM is currently the **fastest ChatGLM-6B** available. To the best of our knowledge, it is the **first accelerated version of ChatGLM-6B**.
32
+
33
+ The inference speed of lyraChatGLM has achieved **300x** acceleration upon the early original version. We are still working hard to further improve the performance.
34
+
35
+ Among its main features are:
36
+ - weights: original ChatGLM-6B weights released by THUDM.
37
+ - device: Nvidia GPU with Ampere architecture or Volta architecture (A100, A10, V100...).
38
+ - batch_size: compiled with dynamic batch size, maximum depends on device. 
39
+
40
+ ## Speed
41
+ - original version (fixed batch infer): commit id 1d240ba
42
+
43
+ ### test on A100 40G
44
+ 1. The maximum batch size and maximum speed table for each version of the model.
45
+ |version|max_batch_size|max_speed|
46
+ |:-:|:-:|:-:|
47
+ |original|1|30 tokens/s|
48
+ |original(fixed batch infer)|192|1638.52 tokens/s|
49
+ |lyraChatGLM(current)|256|9082.60 tokens/s|
50
+ 2. The speed table for the same batch size.
51
+ |version|1 batch_size|8 batch_size| 64 batch_size | 128 batch_size |
52
+ |:-:|:-:|:-:|:-:|:-:|
53
+ |original|30 tokens/s| - | - | - |
54
+ |original(fixed batch infer)|34.48 tokens/s|356.29 tokens/s|1638.52 tokens/s|1338.45 tokens/s|
55
+ |lyraChatGLM(current)|110.05 tokens/s|843.60 tokens/s|4926.92 tokens/s|7235.04 tokens/s|
56
+
57
+ ### test on V100
58
+ 1. The maximum batch size and maximum speed table for each version of the model.
59
+ |version|max_batch_size|max_speed|
60
+ |:-:|:-:|:-:|
61
+ |original|1|17.83 tokens/s|
62
+ |original(fixed batch infer)|128|992.20 tokens/s|
63
+ |lyraChatGLM(current)|192|3958.39 tokens/s|
64
+ 2. The speed table for the same batch size.
65
+ |version|1 batch_size|8 batch_size| 64 batch_size | 128 batch_size |
66
+ |:-:|:-:|:-:|:-:|:-:|
67
+ |original|17.83 tokens/s| - | - | - |
68
+ |original(fixed batch infer)|17.83 tokens/s|228.95 tokens/s|889.7 tokens/s|922.20 tokens/s|
69
+ |lyraChatGLM(current)|59.33 tokens/s|514.15 tokens/s|2849.88 tokens/s|3958.39 tokens/s|
70
+
71
+ ## Model Sources
72
+
73
+ - **Repository:** https://huggingface.co/THUDM/chatglm-6b
74
+
75
+ ## Docker Environment Recommendation
76
+
77
+ - For Cuda 11.X: we recommend ```nvcr.io/nvidia/pytorch:22.12-py3```
78
+ - For Cuda 12.0: we recommend ```nvcr.io/nvidia/pytorch:23.02-py3```
79
+
80
+ ```bash
81
+ docker pull nvcr.io/nvidia/pytorch:23.02-py3
82
+ docker run --rm -it --gpus all -v ./:/lyraChatGLM nvcr.io/nvidia/pytorch:23.02-py3
83
+
84
+ pip install -r requirements.txt
85
+ python demo.py
86
+ ```
87
+
88
+ ## Uses
89
+
90
+ ```python
91
+ from lyraChatGLM import LyraChatGLM6B
92
+
93
+ model_path = "./models/1-gpu-fp16.h5"
94
+ tokenizer_path = "./models"
95
+ data_type = "fp16"
96
+ int8_mode = 0 # 1 for INT8 WEIGHT ONLY PTQ
97
+ max_output_length = 150
98
+ arch = "Ampere" # Ampere or Volta
99
+ cuda_version = 12
100
+
101
+ model = LyraChatGLM6B(model_path, tokenizer_path, data_type, int8_mode, arch, cuda_version)
102
+ prompt = "列出3个不同的机器学习算法,并说明它们的适用范围."
103
+ test_batch_size = 256
104
+
105
+ prompts = [prompt, ]
106
+
107
+ # If you want to get different output in same batch, you can set do_sample to True
108
+ output_texts = model.generate(prompts, output_length=max_output_length,top_k=30, top_p=0.85, temperature=0.35, repetition_penalty=1.2, do_sample=False)
109
+
110
+ print(output_texts)
111
+
112
+ ```
113
+ ## Demo output
114
+
115
+ ### input
116
+ 列出3个不同的机器学习算法,并说明它们的适用范围.
117
+
118
+ ### output
119
+ 以下是三个常见的机器学习算法及其适用范围:
120
+
121
+ 1. 决策树(Decision Tree):决策树是一种基于分类和回归问题的朴素贝叶斯模型。它通过构建一系列逐步分裂的分支来预测结果。适用于那些具有简单特征、大量数据且数据集大小在可接受范围内的情况。
122
+
123
+ 2. 随机森林(Random Forest):随机森林是一种集成学习算法,由多个决策树组成。它的优点是能够处理大规模数据和高维度的特征。适用于需要对多个变量进行建模的场景,例如医疗诊断、金融风险评估等。
124
+
125
+ 3. 支持向量机(Support Vector Machine):支持向量机是一种监督学习方法,通常用于分类问题。它可以处理高维数据,并且具有较高的准确性。适用于需要对高维数据进行分类或回归的问题,例如图像识别、自然语言处理等。
126
+
127
+ ## INT8
128
+
129
+ **Int8 usage**:
130
+
131
+ Our current version supports INT8 weight only PTQ. To enable this mode, simply modify the `int8_mode` to `1` in the demo.py file.
132
+
133
+ **In this mode, gpu memory can be further reduced by about half and the speed can be doubled.**
134
+
135
+ This solves the issue mentioned in https://github.com/THUDM/ChatGLM-6B/issues/1042.
136
+
137
+ However, the speed gain is best achieved with a batch size of no more than 128. If you don't use A100 GPU, you can adjust the
138
+ batch size to reduce it and get the benefits. We recommend a batch size of 64. This mode is very suitable for GPUs with
139
+ limited VRAM or scenarios where it is difficult to use larger batch sizes in real-time services.
140
+
141
+ It should be noted that although we have aligned the accuracy in our test cases, there may be slight differences
142
+ in accuracy in some untested scenarios with int8. Please be aware of this.
143
+
144
+
145
+ ## Citation
146
+ ``` bibtex
147
+ @Misc{lyraChatGLM2023,
148
+   author =       {Kangjian Wu, Zhengtao Wang, Yibo Lu, Bin Wu},
149
+   title =        {lyraChatGLM: Accelerating ChatGLM to 9000+ tokens/s},
150
+   howpublished = {\url{https://huggingface.co/TMElyralab/lyraChatGLM}},
151
+   year =         {2023}
152
+ }
153
+ ```
154
+
155
+ ## Report bug
156
+ - start a discussion to report any bugs!--> https://huggingface.co/TMElyralab/lyraChatGLM/discussions
157
+ - report bug with a `[bug]` mark in the title.
demo.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from lyraChatGLM import LyraChatGLM6B
import numpy as np

# --- Where the converted weights and the tokenizer files live -------------
model_path = "./models/1-gpu-fp16.bin"
tokenizer_path = "./models"

# --- Inference settings ----------------------------------------------------
data_type = "fp16"
int8_mode = 0
max_output_length = 150
arch = "Ampere"  # Ampere or Volta
cuda_version = 12  # cuda version, we currently support 11 and 12

# Build the accelerated model; the keyword names mirror the constructor
# signature of LyraChatGLM6B.
model = LyraChatGLM6B(
    model_path,
    tokenizer_path,
    dtype=data_type,
    int8_mode=int8_mode,
    arch=arch,
    cuda_version=cuda_version,
)

prompt = "今天天气大概 25度,有点小雨,吹着风,我想去户外散步,应该穿什么样的衣服裤子鞋子搭配。"

# A batch is simply a list of prompts; this demo sends a single one.
prompts = [prompt]

# If you want to get different output in same batch, you can set do_sample to True
output_texts = model.generate(
    prompts,
    output_length=max_output_length,
    top_k=30,
    top_p=0.85,
    temperature=0.35,
    repetition_penalty=1.2,
    do_sample=False,
)

print(output_texts)
lyraChatGLM/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .lyra_glm import LyraChatGLM6B
lyraChatGLM/config.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dataclasses
2
+ from typing import Optional
3
+
4
+
5
@dataclasses.dataclass
class ChatGLM6BParam:
    """Built-in default hyper-parameters for the ChatGLM-6B model.

    The values mirror the ones shipped in ``models/config.ini`` and are
    validated once on construction.

    Raises:
        ValueError: if ``shared_contexts_ratio`` lies outside ``[0.0, 1.0]``.
    """

    num_heads: int = 32
    size_per_head: int = 128
    inter_size: int = 16384
    num_layers: int = 28
    vocab_size: int = 130528
    start_id: Optional[int] = 130004
    end_id: Optional[int] = 130005
    tensor_para_size: int = 1
    pipeline_para_size: int = 1
    remove_padding: bool = True
    shared_contexts_ratio: float = 0.0
    layernorm_eps: float = 1e-5
    weights_data_type: str = "fp16"

    def __post_init__(self):
        # The chained comparison also rejects NaN, which fails both bounds.
        if not 0.0 <= self.shared_contexts_ratio <= 1.0:
            # The message names the real field (shared_contexts_ratio) so the
            # user can find the offending argument.
            raise ValueError(
                f'Got an invalid value of shared_contexts_ratio '
                f'{self.shared_contexts_ratio} - range: [0.0, 1.0]')

    def asdict(self):
        """Return the parameters as a plain ``dict`` keyed by field name."""
        return dataclasses.asdict(self)


# Shared module-level instance holding the defaults above.
CHATGLM_6B_PARAM = ChatGLM6BParam()
lyraChatGLM/ftlib/libth_transformer_sm70_cu11.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4a778897f6c5f77b0ea1cb14bb63732da9c3cc4e16ff16d9f911dcc8b6f6be5
3
+ size 114267536
lyraChatGLM/ftlib/libth_transformer_sm70_cu12.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99ac80b2f4c161bbacbf64a7607f323c612c7c5f26b83eaec7f559425f3a818b
3
+ size 114186112
lyraChatGLM/ftlib/libth_transformer_sm80_cu11.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1d6cd03321b671275fcabb4136562845233875564047ccde20401fca4df45c2
3
+ size 200834616
lyraChatGLM/ftlib/libth_transformer_sm80_cu12.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2da10aad8e92bcdf45b15884cee63e845f582cd28bcc0f7f1c2a4f6a101e9646
3
+ size 200916960
lyraChatGLM/lyra_glm.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import configparser
4
+ import pathlib
5
+ import typing
6
+
7
+ import torch
8
+ import transformers
9
+
10
+ from .config import CHATGLM_6B_PARAM
11
+ from .model import ChatGLM6BModel
12
+
13
class LyraChatGLM6B:
    """Convenience wrapper pairing a ``ChatGLM6BModel`` with its tokenizer.

    Args:
        model_path: Path to the converted weight file. A ``config.ini`` located
            in the same directory is used for the model hyper-parameters when
            it exists; otherwise the built-in ``CHATGLM_6B_PARAM`` defaults
            are used.
        tokenizer_path: Directory/name given to ``AutoTokenizer``. Falls back
            to ``model_path`` when ``None``.
        dtype: Inference data type (e.g. ``'fp16'``). ``None`` means "use the
            weight data type".
        int8_mode: Forwarded unchanged to the model.
        arch: ``"Ampere"`` or ``"Volta"``.
        cuda_version: CUDA major version; ``11``, ``"11"``, ``"11.8"`` and
            ``12.0`` style values are all accepted.

    Raises:
        ValueError: for an unsupported ``arch`` or an unparsable
            ``cuda_version``.
    """

    _SUPPORTED_ARCHS = ("Ampere", "Volta")

    # Token id looked up in every tokenized prompt to build ``mask_positions``.
    # NOTE(review): presumably the id of the tokenizer's [gMASK] token - confirm.
    _MASK_TOKEN_ID = 130001

    def __init__(self, model_path, tokenizer_path=None, dtype='fp16', int8_mode=0, arch="Ampere", cuda_version="11") -> None:
        # Validate cheap arguments first so a typo does not cost a full
        # tokenizer/model load before it is reported.
        if arch not in self._SUPPORTED_ARCHS:
            raise ValueError("Only support GPU device Ampere(A100,A10) or Volta(V100)")

        self.model_path = model_path
        self.tokenizer_path = tokenizer_path
        self.dtype = dtype
        self.arch = arch
        # The model compares the version against the integers 11 and 12, so
        # the (string) default must be normalised before it is forwarded.
        self.cuda_version = self._parse_cuda_major(cuda_version)
        self.int8_mode = int8_mode

        self.model, self.tokenizer = self.load_model_and_tokenizer()

        print("Got model and tokenizer")

    @staticmethod
    def _parse_cuda_major(cuda_version) -> int:
        """Return the major CUDA version of ``cuda_version`` as an ``int``."""
        try:
            return int(str(cuda_version).strip().split(".")[0])
        except ValueError as err:
            raise ValueError(f"unsupported cuda version: {cuda_version}") from err

    def load_model_and_tokenizer(self):
        """Load the tokenizer, assemble the model arguments and build the model.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        if self.tokenizer_path is None:
            tokenizer_path = self.model_path
        else:
            tokenizer_path = self.tokenizer_path

        print(f'Loading tokenizer from {tokenizer_path}')
        # NOTE: trust_remote_code=True executes the tokenizer code found at
        # tokenizer_path; only point it at locations you trust.
        tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)

        checkpoint_path = pathlib.Path(self.model_path)
        config_path = checkpoint_path.parent / 'config.ini'

        if config_path.exists():
            # Read model params from config.
            cfg = configparser.ConfigParser()
            cfg.read(config_path)
            model_name = 'glm6b'
            inference_data_type = self.dtype
            if inference_data_type is None:
                inference_data_type = cfg.get(model_name, "weight_data_type")
            model_args = dict(
                head_num=cfg.getint(model_name, 'head_num'),
                size_per_head=cfg.getint(model_name, "size_per_head"),
                layer_num=cfg.getint(model_name, "num_layer"),
                tensor_para_size=cfg.getint(model_name, "tensor_para_size"),
                vocab_size=cfg.getint(model_name, "vocab_size"),
                start_id=cfg.getint(model_name, "start_id"),
                end_id=cfg.getint(model_name, "end_id"),
                weights_data_type=cfg.get(model_name, "weight_data_type"),
                layernorm_eps=cfg.getfloat(model_name, 'layernorm_eps'),
                inference_data_type=inference_data_type)
        else:
            # No config next to the weights: fall back to built-in defaults.
            inference_data_type = self.dtype
            if inference_data_type is None:
                inference_data_type = CHATGLM_6B_PARAM.weights_data_type
            model_args = dict(
                head_num=CHATGLM_6B_PARAM.num_heads,
                size_per_head=CHATGLM_6B_PARAM.size_per_head,
                vocab_size=CHATGLM_6B_PARAM.vocab_size,
                start_id=CHATGLM_6B_PARAM.start_id or tokenizer.bos_token_id,
                end_id=CHATGLM_6B_PARAM.end_id or tokenizer.eos_token_id,
                layer_num=CHATGLM_6B_PARAM.num_layers,
                tensor_para_size=CHATGLM_6B_PARAM.tensor_para_size,
                weights_data_type=CHATGLM_6B_PARAM.weights_data_type,
                layernorm_eps=CHATGLM_6B_PARAM.layernorm_eps,
                inference_data_type=inference_data_type,
            )

        # update common parameters
        model_args.update(dict(
            rotary_embedding_dim=64,
            max_seq_len=0,  # for position seq embedding
            pipeline_para_size=CHATGLM_6B_PARAM.pipeline_para_size,
            shared_contexts_ratio=CHATGLM_6B_PARAM.shared_contexts_ratio,
            int8_mode=self.int8_mode,
            model_path=self.model_path,
            cuda_version=self.cuda_version,
        ))

        print('[INFO] Load Our Highly Optimized LyraChatGLM6B model')
        for k, v in model_args.items():
            print(f' - {k.ljust(25, ".")}: {v}')

        # Check sanity and consistency between the model and tokenizer.
        checklist = ['head_num', 'size_per_head', 'vocab_size', 'layer_num',
                     'tensor_para_size', 'pipeline_para_size', 'weights_data_type']
        none_params = [p for p in checklist if model_args[p] is None]
        if none_params:
            print(f'[WARNING] Found None parameters {none_params}. They must '
                  f'be provided either by config file or CLI arguments.')
        if model_args['start_id'] != tokenizer.bos_token_id:
            print('[WARNING] Given start_id is not matched with the bos token '
                  'id of the pretrained tokenizer.')
        if model_args['end_id'] not in (tokenizer.pad_token_id, tokenizer.eos_token_id):
            print('[WARNING] Given end_id is not matched with neither pad '
                  'token id nor eos token id of the pretrained tokenizer.')

        print(f'Loading model from {self.model_path}')
        model = ChatGLM6BModel(arch=self.arch, **model_args)

        return model, tokenizer

    def generate(self, prompts: typing.Union[typing.List[str], str],
                 output_length: int = 512,
                 beam_width: int = 1,
                 top_k: typing.Optional[torch.IntTensor] = 1,
                 top_p: typing.Optional[torch.FloatTensor] = 1.0,
                 beam_search_diversity_rate: typing.Optional[torch.FloatTensor] = 0.0,
                 temperature: typing.Optional[torch.FloatTensor] = 1.0,
                 len_penalty: typing.Optional[torch.FloatTensor] = 0.0,
                 repetition_penalty: typing.Optional[torch.FloatTensor] = 1.0,
                 presence_penalty: typing.Optional[torch.FloatTensor] = None,
                 min_length: typing.Optional[torch.IntTensor] = None,
                 bad_words_list: typing.Optional[torch.IntTensor] = None,
                 do_sample: bool = False,
                 return_output_length: bool = False,
                 return_cum_log_probs: int = 0):
        """Generate a completion for every prompt.

        Args:
            prompts: One prompt or a list of prompts (a single string is
                wrapped into a one-element batch).
            output_length: Forwarded to the model as ``output_len``.
            top_k, top_p, beam_search_diversity_rate, temperature,
            len_penalty, repetition_penalty: Scalars that are broadcast to one
                value per batch entry before being passed to the model.
            do_sample: When true, a random seed is drawn per batch entry;
                otherwise no seed is passed.
            return_output_length, return_cum_log_probs: Forwarded to the
                model; the extra model outputs are discarded here.

        Returns:
            A list with one decoded string per prompt. Only the tokens after
            the (padded) prompt of the first beam are decoded, and special
            tokens are kept in the text.

        Raises:
            ValueError: if a tokenized prompt does not contain the mask token
                id (raised by ``list.index``).
        """
        if isinstance(prompts, str):
            prompts = [prompts, ]

        batch_size = len(prompts)
        ones_int = torch.ones(size=[batch_size], dtype=torch.int32)
        ones_float = torch.ones(size=[batch_size], dtype=torch.float32)

        input_token_ids = self.tokenizer(prompts, return_tensors="pt", padding=True).input_ids.int()
        # input_token_ids is a padded 2-D tensor, so every entry here equals
        # the padded sequence length.
        input_lengths = torch.IntTensor([len(ids) for ids in input_token_ids])
        mask_positions = torch.IntTensor(
            [seq.index(self._MASK_TOKEN_ID) for seq in input_token_ids.tolist()])

        random_seed = None
        if do_sample:
            random_seed = torch.randint(0, 262144, (batch_size,), dtype=torch.long)

        outputs = self.model(start_ids=input_token_ids,
                             start_lengths=input_lengths,
                             mask_positions=mask_positions,
                             output_len=output_length,
                             beam_width=beam_width,
                             top_k=top_k * ones_int,
                             top_p=top_p * ones_float,
                             beam_search_diversity_rate=beam_search_diversity_rate * ones_float,
                             temperature=temperature * ones_float,
                             len_penalty=len_penalty * ones_float,
                             repetition_penalty=repetition_penalty * ones_float,
                             presence_penalty=presence_penalty,
                             min_length=min_length,
                             random_seed=random_seed,
                             bad_words_list=bad_words_list,
                             return_output_length=return_output_length,
                             return_cum_log_probs=return_cum_log_probs)

        # The model returns a tuple (ids, lengths[, cum_log_probs]) only when
        # return_output_length is set; otherwise it returns the ids directly.
        if return_output_length:
            outputs = outputs[0]  # output_token_ids.

        # Slice the generated token ids of the 1st beam result.
        # output = input tokens + generated tokens.
        output_token_ids = [out[0, length:].cpu()
                            for out, length in zip(outputs, input_lengths)]

        output_texts = self.tokenizer.batch_decode(
            output_token_ids, skip_special_tokens=False)

        return output_texts
lyraChatGLM/model.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import h5py
3
+ import pathlib
4
+ import typing
5
+
6
+ import numpy as np
7
+ import torch
8
+ import torch.distributed as dist
9
+ import torch.nn as nn
10
+
11
class ChatGLM6BModel(nn.Module):
    """PyTorch module wrapping the prebuilt ``FasterTransformer.GlmOp`` op.

    On construction the shared library matching ``arch`` and ``cuda_version``
    is loaded from the ``ftlib`` directory next to this file and a ``GlmOp``
    instance is created from the given hyper-parameters.

    Raises:
        AssertionError: if CUDA is unavailable, or ``head_num`` /
            ``layer_num`` are not multiples of the tensor / pipeline parallel
            sizes.
        ValueError: for an unsupported ``arch`` or ``cuda_version``.
    """

    # Compute-capability tag of the prebuilt library for each architecture.
    _SM_BY_ARCH = {"Ampere": "sm80", "Volta": "sm70"}
    # CUDA major versions for which a prebuilt library is shipped.
    _SUPPORTED_CUDA_MAJORS = (11, 12)

    def __init__(self,
                 head_num, size_per_head,
                 vocab_size,
                 rotary_embedding_dim,
                 start_id, end_id, layer_num,
                 arch,
                 max_seq_len: int,
                 tensor_para_size: int,
                 pipeline_para_size: int,
                 inference_data_type: str,
                 model_path,
                 cuda_version,
                 inter_size: int = 0,
                 # glm_variant_params
                 layernorm_eps: float = 1e-5,
                 layernorm_type: typing.Literal['pre_layernorm', 'post_layernorm'] = "pre_layernorm",
                 activation_type: str = "Gelu",
                 gpt_with_moe: bool = False,
                 expert_num: int = 0,
                 moe_k: int = 0,
                 moe_layer_index: typing.Optional[typing.List] = None,
                 has_positional_encoding: bool = False,
                 has_pre_decoder_layernorm: bool = False,
                 has_post_decoder_layernorm: bool = True,
                 has_adapters: bool = False,
                 adapter_inter_size: int = 0,
                 use_attention_linear_bias: bool = False,
                 int8_mode: int = 0,
                 weights_data_type: typing.Union[str, np.dtype] = np.float32,
                 shared_contexts_ratio: float = 1.0):
        super().__init__()
        self.head_num = head_num
        self.size_per_head = size_per_head
        self.vocab_size = vocab_size
        self.rotary_embedding_dim = rotary_embedding_dim
        self.start_id = start_id
        self.end_id = end_id
        self.layer_num = layer_num
        # inter_size == 0 means "derive it": 4 * hidden size.
        self.inter_size = inter_size if inter_size != 0 else 4 * self.head_num * self.size_per_head
        self.arch = arch
        self.model_path = model_path
        # gpt_variant_params
        self.layernorm_eps = layernorm_eps
        self.layernorm_type = layernorm_type
        self.activation_type = activation_type
        self.gpt_with_moe = gpt_with_moe
        self.expert_num = expert_num
        self.moe_k = moe_k
        # A fresh list per instance; a mutable default argument would be
        # shared between all instances.
        self.moe_layer_index = [] if moe_layer_index is None else moe_layer_index
        self.has_positional_encoding = has_positional_encoding
        self.has_pre_decoder_layernorm = has_pre_decoder_layernorm
        self.has_post_decoder_layernorm = has_post_decoder_layernorm
        self.has_adapters = has_adapters
        self.adapter_inter_size = adapter_inter_size
        self.use_attention_linear_bias = use_attention_linear_bias

        # multi-gpu params
        self.tensor_para_size = tensor_para_size
        self.pipeline_para_size = pipeline_para_size
        self.use_sparse_gemm = False
        self.build_model = False
        self.int8_mode = int8_mode
        self.weights_data_type = weights_data_type
        self.shared_contexts_ratio = shared_contexts_ratio

        assert torch.cuda.is_available(), "CUDA is required for this model."

        assert head_num % tensor_para_size == 0, "head_num must be a multiple of tensor_para_size."
        assert layer_num % pipeline_para_size == 0, "layer_num must be a multiple of pipeline_para_size."

        # Index of the CUDA device the inputs are moved to in forward().
        self.device = 0

        # Load the C++ model into Pytorch model.
        lib_path = pathlib.Path(__file__).parent / "ftlib" / self._library_name(arch, cuda_version)
        torch.classes.load_library(os.path.abspath(lib_path))

        self.model = torch.classes.FasterTransformer.GlmOp(
            self.head_num, self.size_per_head, self.inter_size,
            self.layer_num,
            self.expert_num,
            self.moe_k,
            self.moe_layer_index,
            self.vocab_size,
            self.rotary_embedding_dim,
            self.start_id, self.end_id,
            self.tensor_para_size, self.pipeline_para_size, self.int8_mode,
            # GLM variant parameters
            self.layernorm_eps,
            self.layernorm_type,
            self.activation_type,
            self.has_positional_encoding,
            self.has_pre_decoder_layernorm,
            self.has_post_decoder_layernorm,
            self.has_adapters,
            self.adapter_inter_size,
            self.use_attention_linear_bias,
            self.model_path,
            inference_data_type,
            self.shared_contexts_ratio)
        self.build_model = True

    @staticmethod
    def _library_name(arch, cuda_version) -> str:
        """Return the file name of the prebuilt library for the given target.

        ``cuda_version`` may be an int or a string such as ``"11"`` or
        ``"11.8"``; only the major version is used.

        Raises:
            ValueError: if the architecture or CUDA major version has no
                prebuilt library.
        """
        sm = ChatGLM6BModel._SM_BY_ARCH.get(arch)
        if sm is None:
            raise ValueError(f"unsupported arch: {arch}")

        try:
            major = int(str(cuda_version).strip().split(".")[0])
        except ValueError:
            major = None
        if major not in ChatGLM6BModel._SUPPORTED_CUDA_MAJORS:
            raise ValueError(f"unsupported cuda version: {cuda_version}")

        return f"libth_transformer_{sm}_cu{major}.so"

    def forward(self,
                start_ids: torch.IntTensor,
                start_lengths: torch.IntTensor,
                mask_positions: torch.IntTensor,
                output_len: int,
                beam_width: int = 1,
                top_k: typing.Optional[torch.IntTensor] = None,
                top_p: typing.Optional[torch.FloatTensor] = None,
                beam_search_diversity_rate: typing.Optional[torch.FloatTensor] = None,
                temperature: typing.Optional[torch.FloatTensor] = None,
                len_penalty: typing.Optional[torch.FloatTensor] = None,
                repetition_penalty: typing.Optional[torch.FloatTensor] = None,
                presence_penalty: typing.Optional[torch.FloatTensor] = None,
                min_length: typing.Optional[torch.IntTensor] = None,
                random_seed: typing.Optional[torch.LongTensor] = None,
                bad_words_list: typing.Optional[torch.IntTensor] = None,
                return_output_length: bool = False,
                return_cum_log_probs: int = 0):
        """Run generation through the underlying ``GlmOp``.

        ``start_ids``, ``start_lengths`` and ``mask_positions`` are moved to
        the CUDA device before the call; the remaining arguments are passed
        through unchanged.

        Returns:
            ``output_ids`` when ``return_output_length`` is false. Otherwise a
            tuple ``(output_ids, output_lengths)``, extended by
            ``output_cum_log_probs`` when ``return_cum_log_probs > 0``.
        """
        input_len = start_ids.size(1)
        assert input_len > 0, "input len must be larger than zero. For an unconditional case, use start_id as the first token."

        # Inputs to device
        start_ids = start_ids.cuda(self.device)
        start_lengths = start_lengths.cuda(self.device)
        mask_positions = mask_positions.cuda(self.device)

        # outputs: output_ids, output_lengths, output_cum_log_probs (optional)
        outputs = self.model.forward(start_ids,
                                     start_lengths,
                                     mask_positions,
                                     output_len,
                                     beam_width,  # optional, can be None
                                     top_k,  # optional, can be None
                                     top_p,  # optional, can be None
                                     beam_search_diversity_rate,  # optional, can be None
                                     temperature,  # optional, can be None
                                     len_penalty,  # optional, can be None
                                     repetition_penalty,  # optional, can be None
                                     presence_penalty,  # optional, can be None
                                     min_length,  # optional, can be None
                                     random_seed,  # optional, can be None
                                     bad_words_list,  # optional, can be None
                                     return_cum_log_probs)  # optional, can be None
        if return_cum_log_probs == 0:
            output_ids, output_lengths = outputs
        else:
            output_ids, output_lengths, output_cum_log_probs = outputs

        if not return_output_length:
            return output_ids
        if return_cum_log_probs > 0:
            return output_ids, output_lengths, output_cum_log_probs
        return output_ids, output_lengths

    def set_input_tensor(self, input_tensor):
        """Set input tensor to be used instead of forward()'s input.

        When doing pipeline parallelism the input from the previous
        stage comes from communication, not from the input, so the
        model's forward_step_func won't have it. This function is thus
        used by internal code to bypass the input provided by the
        forward_step_func"""
        self.input_tensor = input_tensor
models/config.ini ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [glm6b]
2
+ model_name = chatglm-6b
3
+ head_num = 32
4
+ size_per_head = 128
5
+ inter_size = 16384
6
+ max_pos_seq_len = 2048
7
+ num_layer = 28
8
+ vocab_size = 130528
9
+ start_id = 130004
10
+ end_id = 130005
11
+ weight_data_type = fp16
12
+ tensor_para_size = 1
13
+ layernorm_eps = 1e-5
models/ice_text.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e974d9a69c242ce014c88c2b26089270f6198f3c0b700a887666cd3e816f17e
3
+ size 2706249
models/tokenization_chatglm.py ADDED
@@ -0,0 +1,443 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tokenization classes for ChatGLM."""
2
+ from typing import List, Optional, Union
3
+ import os
4
+
5
+ from transformers.tokenization_utils import PreTrainedTokenizer
6
+ from transformers.utils import logging, PaddingStrategy
7
+ from transformers.tokenization_utils_base import EncodedInput, BatchEncoding
8
+ from typing import Dict
9
+ import sentencepiece as spm
10
+ import numpy as np
11
+
12
+ logger = logging.get_logger(__name__)
13
+
14
+ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
15
+ "THUDM/chatglm-6b": 2048,
16
+ }
17
+
18
+
19
class TextTokenizer:
    """Thin adapter around a SentencePiece processor loaded from a model file."""

    def __init__(self, model_path):
        processor = spm.SentencePieceProcessor()
        processor.Load(model_path)
        self.sp = processor
        # Cached once; __len__ reports this value.
        self.num_tokens = processor.vocab_size()

    def encode(self, text):
        """Encode ``text`` into a list of SentencePiece ids."""
        return self.sp.EncodeAsIds(text)

    def decode(self, ids: List[int]):
        """Decode SentencePiece ``ids`` back into text."""
        return self.sp.DecodeIds(ids)

    def tokenize(self, text):
        """Split ``text`` into SentencePiece pieces (strings)."""
        return self.sp.EncodeAsPieces(text)

    def convert_tokens_to_string(self, tokens):
        """Join SentencePiece pieces back into a single string."""
        return self.sp.DecodePieces(tokens)

    def convert_tokens_to_ids(self, tokens):
        """Map every piece in ``tokens`` to its id."""
        piece_to_id = self.sp.PieceToId
        return [piece_to_id(piece) for piece in tokens]

    def convert_token_to_id(self, token):
        """Map a single piece to its id."""
        return self.sp.PieceToId(token)

    def convert_id_to_token(self, idx):
        """Map a single id to its piece."""
        return self.sp.IdToPiece(idx)

    def __len__(self):
        return self.num_tokens
48
+
49
+
50
class SPTokenizer:
    """Text tokenizer whose ids are shifted past a block of image-token ids.

    Ids ``0 .. num_image_tokens - 1`` stand for ``<image_N>`` tokens; every id
    produced by the wrapped ``TextTokenizer`` is offset by
    ``num_image_tokens``. Line breaks, tabs and runs of blanks are rewritten
    to placeholder tokens before encoding and restored after decoding.
    """

    def __init__(
            self,
            vocab_file,
            num_image_tokens=20000,
            max_blank_length=80,
            byte_fallback=True,
    ):
        assert vocab_file is not None
        self.vocab_file = vocab_file
        self.num_image_tokens = num_image_tokens
        self.special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "<unused_0>", "<sop>", "<eop>", "<ENC>", "<dBLOCK>"]
        self.max_blank_length = max_blank_length
        self.byte_fallback = byte_fallback
        self.text_tokenizer = TextTokenizer(vocab_file)

    def _get_text_tokenizer(self):
        return self.text_tokenizer

    @staticmethod
    def get_blank_token(length: int):
        """Return the placeholder token for a run of ``length`` (>= 2) blanks."""
        assert length >= 2
        return f"<|blank_{length}|>"

    @staticmethod
    def get_tab_token():
        """Return the placeholder token for a tab character."""
        return "<|tab|>"

    @property
    def num_text_tokens(self):
        """Vocabulary size of the wrapped text tokenizer."""
        return self.text_tokenizer.num_tokens

    @property
    def num_tokens(self):
        """Total vocabulary size: image tokens plus text tokens."""
        return self.num_image_tokens + self.num_text_tokens

    @staticmethod
    def _encode_whitespaces(text: str, max_len: int = 80):
        """Replace tabs and runs of 2..max_len blanks by placeholder tokens."""
        encoded = text.replace("\t", SPTokenizer.get_tab_token())
        # Longest runs first, so a long run is not consumed by shorter tokens.
        run_length = max_len
        while run_length > 1:
            encoded = encoded.replace(" " * run_length, SPTokenizer.get_blank_token(run_length))
            run_length -= 1
        return encoded

    def _preprocess(self, text: str, linebreak=True, whitespaces=True):
        """Apply the optional line-break and whitespace rewrites to ``text``."""
        if linebreak:
            text = text.replace("\n", "<n>")
        if whitespaces:
            text = self._encode_whitespaces(text, max_len=self.max_blank_length)
        return text

    def encode(
            self, text: str, linebreak=True, whitespaces=True, add_dummy_prefix=True
    ) -> List[int]:
        """
        Encode ``text`` into ids already shifted by ``num_image_tokens``.

        @param text: Text to encode.
        @param linebreak: Whether to encode newlines in text.
        @param whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
        @param add_dummy_prefix: Whether to add dummy blank space in the beginning.
        """
        prepared = self._preprocess(text, linebreak, whitespaces)
        if add_dummy_prefix:
            raw_ids = self._get_text_tokenizer().encode(prepared)
            return [raw_id + self.num_image_tokens for raw_id in raw_ids]
        # Without the dummy prefix: encode with a leading "<n>" and drop the
        # first two resulting ids.
        raw_ids = self._get_text_tokenizer().encode("<n>" + prepared)
        shifted = [raw_id + self.num_image_tokens for raw_id in raw_ids]
        return shifted[2:]

    def postprocess(self, text):
        """Undo the line-break, tab and blank-run placeholders in ``text``."""
        restored = text.replace("<n>", "\n")
        restored = restored.replace(SPTokenizer.get_tab_token(), "\t")
        for width in range(2, self.max_blank_length + 1):
            restored = restored.replace(self.get_blank_token(width), " " * width)
        return restored

    def decode(self, text_ids: List[int]) -> str:
        """Decode shifted ids to text; ids below the text range are dropped."""
        offset = self.num_image_tokens
        unshifted = (int(token_id) - offset for token_id in text_ids)
        text_only = [token_id for token_id in unshifted if token_id >= 0]
        decoded = self._get_text_tokenizer().decode(text_only)
        return self.postprocess(decoded)

    def decode_tokens(self, tokens: List[str]) -> str:
        """Join pieces into text and restore the whitespace placeholders."""
        joined = self._get_text_tokenizer().convert_tokens_to_string(tokens)
        return self.postprocess(joined)

    def tokenize(
            self, text: str, linebreak=True, whitespaces=True, add_dummy_prefix=True
    ) -> List[str]:
        """
        Split ``text`` into pieces (strings).

        @param text: Text to encode.
        @param linebreak: Whether to encode newlines in text.
        @param whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
        @param add_dummy_prefix: Whether to add dummy blank space in the beginning.
        """
        prepared = self._preprocess(text, linebreak, whitespaces)
        if add_dummy_prefix:
            return self._get_text_tokenizer().tokenize(prepared)
        # Same trick as in encode(): prepend "<n>" and drop two pieces.
        pieces = self._get_text_tokenizer().tokenize("<n>" + prepared)
        return pieces[2:]

    def __getitem__(self, x: Union[int, str]):
        """Translate an id to its token (int key) or a token to its id (str key)."""
        if isinstance(x, int):
            if x < self.num_image_tokens:
                return "<image_{}>".format(x)
            return self.text_tokenizer.convert_id_to_token(x - self.num_image_tokens)
        if isinstance(x, str):
            is_image_token = x.startswith("<image_") and x.endswith(">") and x[7:-1].isdigit()
            if is_image_token:
                return int(x[7:-1])
            return self.text_tokenizer.convert_token_to_id(x) + self.num_image_tokens
        raise ValueError("The key should be str or int.")
165
+
166
+
167
class ChatGLMTokenizer(PreTrainedTokenizer):
    """
    Construct a ChatGLM tokenizer that delegates tokenization to `SPTokenizer`.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        do_lower_case (`bool`, *optional*, defaults to `False`):
            Whether to lower-case the text before tokenizing.
        remove_space (`bool`, *optional*, defaults to `False`):
            Whether to strip the text and collapse whitespace runs into single spaces before tokenizing.
        bos_token (`str`, *optional*, defaults to `"<sop>"`):
            Token appended after the gMASK token by `build_inputs_with_special_tokens`; `_pad` treats
            everything before its first occurrence as context.
        eos_token (`str`, *optional*, defaults to `"<eop>"`):
            Token appended after the second sequence of a pair.
        end_token (`str`, *optional*, defaults to `"</s>"`):
            Token whose id is exposed as `end_token_id`.
        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
            Mask token; `_pad` prefers it over `gmask_token` when computing position ids.
        gmask_token (`str`, *optional*, defaults to `"[gMASK]"`):
            Token appended to the first sequence by `build_inputs_with_special_tokens`.
        padding_side (`str`, *optional*, defaults to `"left"`):
            Side on which to pad. `_pad` only supports `"left"`.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            Token used for padding.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            Unknown token, forwarded to the base class.
        num_image_tokens (`int`, *optional*, defaults to 20000):
            Number of image tokens, forwarded to `SPTokenizer`.
    """

    vocab_files_names = {"vocab_file": "ice_text.model"}
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask", "position_ids"]

    def __init__(
            self,
            vocab_file,
            do_lower_case=False,
            remove_space=False,
            bos_token='<sop>',
            eos_token='<eop>',
            end_token='</s>',
            mask_token='[MASK]',
            gmask_token='[gMASK]',
            padding_side="left",
            pad_token="<pad>",
            unk_token="<unk>",
            num_image_tokens=20000,
            **kwargs
    ) -> None:
        # Plain attributes and the SPTokenizer backend are created *before* the
        # base constructor runs: newer `transformers` releases query the
        # vocabulary (vocab_size / get_vocab / token-to-id conversion) from
        # inside `PreTrainedTokenizer.__init__`, which fails with
        # AttributeError if `sp_tokenizer` does not exist yet. Older releases
        # do not touch these attributes, so the order is safe for them too.
        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.vocab_file = vocab_file
        self.sp_tokenizer = SPTokenizer(vocab_file, num_image_tokens=num_image_tokens)

        super().__init__(
            do_lower_case=do_lower_case,
            remove_space=remove_space,
            padding_side=padding_side,
            bos_token=bos_token,
            eos_token=eos_token,
            end_token=end_token,
            mask_token=mask_token,
            gmask_token=gmask_token,
            pad_token=pad_token,
            unk_token=unk_token,
            num_image_tokens=num_image_tokens,
            **kwargs
        )

        # Special tokens are assigned after the base constructor, which owns
        # the storage behind the standard special-token attributes.
        self.bos_token = bos_token
        self.eos_token = eos_token
        self.end_token = end_token
        self.mask_token = mask_token
        self.gmask_token = gmask_token

    @property
    def gmask_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: Id of the gMASK token in the vocabulary. Returns `None` if the token has not been set.
        """
        if self.gmask_token is None:
            return None
        return self.convert_tokens_to_ids(self.gmask_token)

    @property
    def end_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: Id of the end of context token in the vocabulary. Returns `None` if the token has not been
        set.
        """
        if self.end_token is None:
            return None
        return self.convert_tokens_to_ids(self.end_token)

    @property
    def vocab_size(self):
        """ Returns vocab size """
        return self.sp_tokenizer.num_tokens

    def get_vocab(self):
        """ Returns vocab as a dict mapping token to id, including added tokens """
        vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def preprocess_text(self, inputs):
        """
        Normalise raw text according to `remove_space` and `do_lower_case`.

        Args:
            inputs (`str`):
                The text to normalise.

        Returns:
            `str`: The text, stripped with whitespace runs collapsed to single spaces when `remove_space` is set,
            and lower-cased when `do_lower_case` is set.
        """
        if self.remove_space:
            outputs = " ".join(inputs.strip().split())
        else:
            outputs = inputs

        if self.do_lower_case:
            outputs = outputs.lower()

        return outputs

    def _tokenize(self, text, **kwargs):
        """ Returns a tokenized string. Extra keyword arguments are accepted but not used. """
        text = self.preprocess_text(text)
        return self.sp_tokenizer.tokenize(text)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """ Converts a sequence of tokens (str) into a single string. """
        return self.sp_tokenizer.decode_tokens(tokens)

    def _decode(
            self,
            token_ids: Union[int, List[int]],
            **kwargs
    ) -> str:
        """
        Decode token ids into text after dropping padding ids.

        Args:
            token_ids (`Union[int, List[int]]`):
                A single id or a list of ids.
            **kwargs:
                Forwarded unchanged to the base class `_decode`.

        Returns:
            `str`: The decoded text; the empty string for an empty id list.
        """
        if isinstance(token_ids, int):
            token_ids = [token_ids]
        if len(token_ids) == 0:
            return ""
        pad_id = self.pad_token_id
        if pad_id in token_ids:  # remove pad
            # A plain `!=` comparison is used on purpose: `int.__ne__` returns
            # the truthy `NotImplemented` for non-`int` ids (e.g. NumPy
            # integers), which would silently keep the padding.
            token_ids = [token_id for token_id in token_ids if token_id != pad_id]
        return super()._decode(token_ids, **kwargs)

    def _convert_token_to_id(self, token):
        """ Converts a token (str) in an id using the vocab. """
        return self.sp_tokenizer[token]

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.sp_tokenizer[index]

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """
        Copy the vocabulary file this tokenizer was loaded from to a new location.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary. If it is not an existing directory, it is used
                as the destination file path itself.
            filename_prefix (`str`, *optional*):
                Accepted for interface compatibility; not used by this implementation.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory, self.vocab_files_names["vocab_file"]
            )
        else:
            vocab_file = save_directory

        # Read the whole source first so that saving onto the source path itself still works.
        with open(self.vocab_file, 'rb') as fin:
            proto_str = fin.read()

        with open(vocab_file, "wb") as writer:
            writer.write(proto_str)

        return (vocab_file,)

    def build_inputs_with_special_tokens(
            self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences by appending special tokens. The result has the
        following format:

        - single sequence: `X gmask_token bos_token`
        - pair of sequences: `A gmask_token bos_token B eos_token`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        gmask_id = self.sp_tokenizer[self.gmask_token]
        bos_id = self.sp_tokenizer[self.bos_token]
        token_ids_0 = token_ids_0 + [gmask_id, bos_id]
        if token_ids_1 is not None:
            eos_id = self.sp_tokenizer[self.eos_token]
            token_ids_0 = token_ids_0 + token_ids_1 + [eos_id]
        return token_ids_0

    @staticmethod
    def _get_context_length(input_ids, bos_token_id):
        """ Returns the index of the first BOS id in `input_ids`, or `len(input_ids)` when it is absent. """
        if bos_token_id in input_ids:
            return input_ids.index(bos_token_id)
        return len(input_ids)

    def _pad(
            self,
            encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
            max_length: Optional[int] = None,
            padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
            pad_to_multiple_of: Optional[int] = None,
            return_attention_mask: Optional[bool] = None,
            padding_side: Optional[str] = None,
    ) -> dict:
        """
        Pad encoded inputs on the left, creating `attention_mask` and `position_ids` when they are missing.

        Args:
            encoded_inputs:
                Dictionary of tokenized inputs for a single example; the entry named by
                `self.model_input_names[0]` must support `len`, `in` and `.index`.
            max_length: target length. When it is `None` after resolving `padding_strategy`, neither the
                attention mask nor the position ids are created.
            padding_strategy: PaddingStrategy to use for padding.

                - PaddingStrategy.LONGEST: `max_length` is replaced by the length of this input
                - PaddingStrategy.MAX_LENGTH: Pad to `max_length`
                - PaddingStrategy.DO_NOT_PAD: Do not pad (default)
            pad_to_multiple_of: (optional) Integer; if set, `max_length` is rounded up to the next multiple of
                this value.
            return_attention_mask:
                (optional) Accepted for interface compatibility; not consulted by this implementation.
            padding_side:
                (optional) Per-call override of `self.padding_side`, as passed by newer `transformers`
                releases. Only `"left"` is supported.

        Returns:
            `dict`: `encoded_inputs`, updated in place. `attention_mask` is a boolean array of shape
            `(1, length, length)` that is `False` on the lower triangle and on every column before the first
            BOS token, and `True` elsewhere (including padding). `position_ids` is an integer array of shape
            `(2, length)` stacking the position ids and the block position ids.
        """
        # Load from model defaults
        bos_token_id = self.sp_tokenizer[self.bos_token]
        mask_token_id = self.sp_tokenizer[self.mask_token]
        gmask_token_id = self.sp_tokenizer[self.gmask_token]
        effective_side = self.padding_side if padding_side is None else padding_side
        assert effective_side == "left"

        input_key = self.model_input_names[0]
        required_input = encoded_inputs[input_key]
        seq_length = len(required_input)

        if padding_strategy == PaddingStrategy.LONGEST:
            max_length = seq_length

        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and seq_length != max_length

        # Initialize attention mask and position ids if not present.
        if max_length is not None:
            if "attention_mask" not in encoded_inputs:
                context_length = self._get_context_length(required_input, bos_token_id)
                # Lower-triangular matrix, plus every column of the context part.
                attention_mask = np.ones((1, seq_length, seq_length))
                attention_mask = np.tril(attention_mask)
                attention_mask[:, :, :context_length] = 1
                # Invert: True now marks the zero entries of the matrix above.
                attention_mask = np.bool_(attention_mask < 0.5)
                encoded_inputs["attention_mask"] = attention_mask

            if "position_ids" not in encoded_inputs:
                context_length = self._get_context_length(required_input, bos_token_id)
                position_ids = np.arange(seq_length, dtype=np.int64)
                mask_token = mask_token_id if mask_token_id in required_input else gmask_token_id
                if mask_token in required_input:
                    # Everything from the BOS token onwards shares the mask token's position.
                    mask_position = required_input.index(mask_token)
                    position_ids[context_length:] = mask_position
                # Zero for the context, then 1, 2, ... from the BOS token onwards.
                block_position_ids = np.concatenate(
                    [np.zeros(context_length, dtype=np.int64),
                     np.arange(1, seq_length - context_length + 1, dtype=np.int64)])
                encoded_inputs["position_ids"] = np.stack([position_ids, block_position_ids], axis=0)

        if needs_to_be_padded:
            difference = max_length - seq_length

            if "attention_mask" in encoded_inputs:
                # Padded rows and columns are filled with True.
                encoded_inputs["attention_mask"] = np.pad(encoded_inputs["attention_mask"],
                                                          pad_width=[(0, 0), (difference, 0), (difference, 0)],
                                                          mode='constant', constant_values=True)
            if "token_type_ids" in encoded_inputs:
                encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
                    "token_type_ids"
                ]
            if "special_tokens_mask" in encoded_inputs:
                encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
            if "position_ids" in encoded_inputs:
                # Default np.pad mode fills the new leading positions with zeros.
                encoded_inputs["position_ids"] = np.pad(encoded_inputs["position_ids"],
                                                        pad_width=[(0, 0), (difference, 0)])
            encoded_inputs[input_key] = [self.pad_token_id] * difference + required_input

        return encoded_inputs
models/tokenizer_config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name_or_path": "THUDM/chatglm-6b",
3
+ "bos_token": "<sop>",
4
+ "eos_token": "<eop>",
5
+ "end_token": "</s>",
6
+ "gmask_token": "[gMASK]",
7
+ "mask_token": "[MASK]",
8
+ "pad_token": "<pad>",
9
+ "unk_token": "<unk>",
10
+ "remove_space": false,
11
+ "do_lower_case": false,
12
+ "tokenizer_class": "ChatGLMTokenizer",
13
+ "num_image_tokens": 0,
14
+ "auto_map": {
15
+ "AutoTokenizer": [
16
+ "tokenization_chatglm.ChatGLMTokenizer",
17
+ null
18
+ ]
19
+ }
20
+ }
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ icetk
2
+ cpm_kernels
3
+ transformers
4
+ huggingface_hub
5
+ numpy
6
+ setuptools
7
+ torch
8
+ h5py
9
+ protobuf==3.20.3