Arrcttacsrks committed on
Commit f392106 · verified · 1 Parent(s): 2403e2b

Upload llama.cpp/ggml/src/ggml-amx.cpp with huggingface_hub

Files changed (1)
  1. llama.cpp/ggml/src/ggml-amx.cpp +436 -0
llama.cpp/ggml/src/ggml-amx.cpp ADDED
@@ -0,0 +1,436 @@
#include "ggml-amx.h"
#include "ggml-amx/common.h"
#include "ggml-amx/mmq.h"
#include "ggml-backend-impl.h"
#include "ggml-impl.h"

#if defined(__gnu_linux__)
#include <sys/syscall.h>
#include <unistd.h>
#endif

#include <cstdlib>
#include <cstring>
#include <memory>

#if defined(__AMX_INT8__)

// AMX buffer interface
static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    free(buffer->context);
}

static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
    return (void *)(buffer->context);
}

static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
    memset((char *)tensor->data + offset, value, size);

    GGML_UNUSED(buffer);
}

static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    if (qtype_has_amx_kernels(tensor->type)) {
        ggml_backend_amx_convert_weight(tensor, data, offset, size);
    } else {
        memcpy((char *)tensor->data + offset, data, size);
    }

    GGML_UNUSED(buffer);
}

static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    GGML_ASSERT(!qtype_has_amx_kernels(tensor->type));
    memcpy(data, (const char *)tensor->data + offset, size);

    GGML_UNUSED(buffer);
}

static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
    if (ggml_backend_buffer_is_host(src->buffer)) {
        if (qtype_has_amx_kernels(src->type)) {
            ggml_backend_amx_convert_weight(dst, src->data, 0, ggml_backend_amx_get_alloc_size(dst));
        } else {
            memcpy(dst->data, src->data, ggml_nbytes(src));
        }
        return true;
    }
    return false;

    GGML_UNUSED(buffer);
}

static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    memset(buffer->context, value, buffer->size);
}

static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
    /* .free_buffer = */ ggml_backend_amx_buffer_free_buffer,
    /* .get_base = */ ggml_backend_amx_buffer_get_base,
    /* .init_tensor = */ NULL, // no initialization required
    /* .memset_tensor = */ ggml_backend_amx_buffer_memset_tensor,
    /* .set_tensor = */ ggml_backend_amx_buffer_set_tensor,
    /* .get_tensor = */ ggml_backend_amx_buffer_get_tensor,
    /* .cpy_tensor = */ ggml_backend_amx_buffer_cpy_tensor,
    /* .clear = */ ggml_backend_amx_buffer_clear,
    /* .reset = */ NULL,
};

static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
    return "AMX";

    GGML_UNUSED(buft);
}

static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    void * data = aligned_alloc(TENSOR_ALIGNMENT, size);
    if (data == NULL) {
        fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
        return NULL;
    }

    return ggml_backend_buffer_init(buft, ggml_backend_amx_buffer_interface, data, size);
}

static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
    return TENSOR_ALIGNMENT;

    GGML_UNUSED(buft);
}

static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
    return ggml_backend_amx_get_alloc_size(tensor);

    GGML_UNUSED(buft);
}

static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
    return false;

    GGML_UNUSED(buft);
}

ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
    static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
        /* .iface = */ {
            /* .get_name = */ ggml_backend_amx_buffer_type_get_name,
            /* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer,
            /* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment,
            /* .get_max_size = */ NULL, // defaults to SIZE_MAX
            /* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
            /* .is_host = */ ggml_backend_amx_buffer_type_is_host,
        },
        /* .device = */ ggml_backend_reg_dev_get(ggml_backend_amx_reg(), 0),
        /* .context = */ NULL,
    };

    return &ggml_backend_buffer_type_amx;
}

// backend interface

static const char * ggml_backend_amx_name(ggml_backend_t backend) {
    return "AMX";

    GGML_UNUSED(backend);
}

static void ggml_backend_amx_free(ggml_backend_t backend) {
    ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context;
    delete ctx;
    delete backend;
}

static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context;

    for (int i = 0; i < cgraph->n_nodes; i++) {
        struct ggml_tensor * node = cgraph->nodes[i];

        switch (node->op) {
        case GGML_OP_MUL_MAT:
            ggml_backend_amx_mul_mat(ctx, node);
            break;

        case GGML_OP_NONE:
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_PERMUTE:
        case GGML_OP_TRANSPOSE:
            break;

        default:
            fprintf(stderr, "%s: unsupported op %s\n", __func__, ggml_op_desc(node));
            GGML_ASSERT(false);
        }
    }

    return GGML_STATUS_SUCCESS;

    GGML_UNUSED(backend);
}

static struct ggml_backend_i ggml_backend_amx_i = {
    /* .get_name = */ ggml_backend_amx_name,
    /* .free = */ ggml_backend_amx_free,
    /* .set_tensor_async = */ NULL,
    /* .get_tensor_async = */ NULL,
    /* .cpy_tensor_async = */ NULL,
    /* .synchronize = */ NULL,
    /* .graph_plan_create = */ NULL,
    /* .graph_plan_free = */ NULL,
    /* .graph_plan_update = */ NULL,
    /* .graph_plan_compute = */ NULL,
    /* .graph_compute = */ ggml_backend_amx_graph_compute,
    /* .event_record = */ NULL,
    /* .event_wait = */ NULL,
};

static ggml_guid_t ggml_backend_amx_guid() {
    static ggml_guid guid = { 0x13, 0xb8, 0xa4, 0xc4, 0xba, 0xfe, 0x51, 0x67, 0x87, 0x44, 0x55, 0x15, 0xb2, 0x35, 0x62, 0x3e };
    return &guid;
}

#define ARCH_GET_XCOMP_PERM 0x1022
#define ARCH_REQ_XCOMP_PERM 0x1023
#define XFEATURE_XTILECFG 17
#define XFEATURE_XTILEDATA 18

static bool ggml_amx_init() {
#if defined(__gnu_linux__)
    if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
        fprintf(stderr, "AMX is not ready to be used!\n");
        return false;
    }
    return true;
#elif defined(_WIN32)
    return true;
#endif
}

ggml_backend_t ggml_backend_amx_init() {

    // invoke a Linux system call to request access to AMX features
    ggml_amx_init();

    // backend context
    ggml_backend_amx_context * ctx = new ggml_backend_amx_context;

    // ggml amx backend
    ggml_backend_t backend = new ggml_backend {
        /* .guid = */ ggml_backend_amx_guid(),
        /* .interface = */ ggml_backend_amx_i,
        /* .device = */ ggml_backend_reg_dev_get(ggml_backend_amx_reg(), 0),
        /* .context = */ ctx,
    };

    return backend;
}

bool ggml_backend_is_amx(ggml_backend_t backend) {
    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_amx_guid());
}

void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) {
    GGML_ASSERT(ggml_backend_is_amx(backend_amx));

    ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend_amx->context;
    ctx->n_threads = n_threads;
}

// device interface

static const char * ggml_backend_amx_device_get_name(ggml_backend_dev_t dev) {
    return "AMX";

    GGML_UNUSED(dev);
}

static const char * ggml_backend_amx_device_get_description(ggml_backend_dev_t dev) {
    return "Intel Advanced Matrix Extensions";

    GGML_UNUSED(dev);
}

static void ggml_backend_amx_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
    // TODO
    *free = 0;
    *total = 0;

    GGML_UNUSED(dev);
}

static enum ggml_backend_dev_type ggml_backend_amx_device_get_type(ggml_backend_dev_t dev) {
    return GGML_BACKEND_DEVICE_TYPE_ACCEL;

    GGML_UNUSED(dev);
}

static void ggml_backend_amx_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
    props->name = ggml_backend_amx_device_get_name(dev);
    props->description = ggml_backend_amx_device_get_description(dev);
    props->type = ggml_backend_amx_device_get_type(dev);
    ggml_backend_amx_device_get_memory(dev, &props->memory_free, &props->memory_total);

    // `buffer_from_host_ptr` is intended to be used in mmap, when memory layout unchanged
    props->caps = {
        /* .async = */ false,
        /* .host_buffer = */ false,
        /* .buffer_from_host_ptr = */ false,
        /* .events = */ false,
    };
}

static ggml_backend_t ggml_backend_amx_device_init(ggml_backend_dev_t dev, const char * params) {
    return ggml_backend_amx_init();

    GGML_UNUSED(dev);
    GGML_UNUSED(params);
}

static ggml_backend_buffer_type_t ggml_backend_amx_device_get_buffer_type(ggml_backend_dev_t dev) {
    return ggml_backend_amx_buffer_type();

    GGML_UNUSED(dev);
}

static bool ggml_backend_amx_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {

    // handle only 2d gemm for now
    auto is_contiguous_2d = [](const struct ggml_tensor * t) {
        return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
    };

    switch (op->op) {
    case GGML_OP_NONE:
    case GGML_OP_RESHAPE:
    case GGML_OP_VIEW:
    case GGML_OP_PERMUTE:
    case GGML_OP_TRANSPOSE:
        return true;

    case GGML_OP_MUL_MAT: {
        const struct ggml_tensor * src0 = op->src[0];
        const struct ggml_tensor * src1 = op->src[1];

        const enum ggml_type type = src0->type;
        const int64_t ne0 = op->ne[0];

        bool is_training = src0->grad || src1->grad;

        // amx kernels enabled for Q4_0, Q4_1, Q8_0, F16
        // Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256
        bool has_amx_kernels = qtype_has_amx_kernels(type) || (type == GGML_TYPE_F16);

        bool can_use_amx =
            is_contiguous_2d(src0) &&      // src0 must be contiguous
            is_contiguous_2d(src1) &&      // src1 must be contiguous
            !is_training &&                // inference only
            src1->type == GGML_TYPE_F32 && // src1 must be float32
            has_amx_kernels &&             // with amx kernel impls
            ne0 % (TILE_N * 2) == 0;       // out_features is 32x

        return can_use_amx;
    }
    default:
        return false;
    }

    GGML_UNUSED(dev);
}

static bool ggml_backend_amx_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
    return buft->iface.get_name == ggml_backend_amx_buffer_type_get_name;

    GGML_UNUSED(dev);
}

static const struct ggml_backend_device_i ggml_backend_amx_device_i = {
    /* .get_name = */ ggml_backend_amx_device_get_name,
    /* .get_description = */ ggml_backend_amx_device_get_description,
    /* .get_memory = */ ggml_backend_amx_device_get_memory,
    /* .get_type = */ ggml_backend_amx_device_get_type,
    /* .get_props = */ ggml_backend_amx_device_get_props,
    /* .init_backend = */ ggml_backend_amx_device_init,
    /* .get_buffer_type = */ ggml_backend_amx_device_get_buffer_type,
    /* .get_host_buffer_type = */ NULL,
    /* .buffer_from_host_ptr = */ NULL,
    /* .supports_op = */ ggml_backend_amx_device_supports_op,
    /* .supports_buft = */ ggml_backend_amx_device_supports_buft,
    /* .offload_op = */ NULL,
    /* .event_new = */ NULL,
    /* .event_free = */ NULL,
    /* .event_synchronize = */ NULL,
};

// backend reg interface

static const char * ggml_backend_amx_reg_get_name(ggml_backend_reg_t reg) {
    return "AMX";

    GGML_UNUSED(reg);
}

static size_t ggml_backend_amx_reg_get_device_count(ggml_backend_reg_t reg) {
    return 1;

    GGML_UNUSED(reg);
}

static ggml_backend_dev_t ggml_backend_amx_reg_get_device(ggml_backend_reg_t reg, size_t index) {
    GGML_ASSERT(index == 0);

    static ggml_backend_device ggml_backend_amx_device = {
        /* .iface = */ ggml_backend_amx_device_i,
        /* .reg = */ reg,
        /* .context = */ nullptr,
    };

    return &ggml_backend_amx_device;

    GGML_UNUSED(reg);
    GGML_UNUSED(index);
}

static void * ggml_backend_amx_get_proc_address(ggml_backend_reg_t reg, const char * name) {
    if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) {
        return (void *)ggml_backend_amx_set_n_threads;
    }
    return NULL;

    GGML_UNUSED(reg);
    GGML_UNUSED(name);
}

static const struct ggml_backend_reg_i ggml_backend_amx_reg_i = {
    /* .get_name = */ ggml_backend_amx_reg_get_name,
    /* .get_device_count = */ ggml_backend_amx_reg_get_device_count,
    /* .get_device = */ ggml_backend_amx_reg_get_device,
    /* .get_proc_address = */ ggml_backend_amx_get_proc_address,
};

ggml_backend_reg_t ggml_backend_amx_reg(void) {
    static struct ggml_backend_reg ggml_backend_amx_reg = {
        /* .iface = */ ggml_backend_amx_reg_i,
        /* .context = */ NULL,
    };

    return &ggml_backend_amx_reg;
}

#else // if defined(__AMX_INT8__)

ggml_backend_t ggml_backend_amx_init(void) {
    fprintf(stderr, "GGML is not compiled with AMX support!\n");
    return ggml_backend_t{};
}

void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) {
    fprintf(stderr, "GGML is not compiled with AMX support!\n");

    GGML_UNUSED(backend_amx);
    GGML_UNUSED(n_threads);
}

#endif
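
For reference, here is a minimal sketch, not part of the uploaded file, of how the public entry points defined above might be exercised from application code. It assumes a build with __AMX_INT8__ defined, the ggml headers on the include path, and linking against ggml; the thread count of 8 is an arbitrary example value.

#include "ggml-amx.h"
#include "ggml-backend.h"
#include <cstdio>

int main() {
    // ggml_backend_amx_init() requests AMX tile-data permission via arch_prctl on Linux
    // and returns a backend instance wired to the device and registry defined above.
    ggml_backend_t backend = ggml_backend_amx_init();
    if (backend == NULL || !ggml_backend_is_amx(backend)) {
        fprintf(stderr, "AMX backend unavailable\n");
        return 1;
    }

    // number of worker threads used by the AMX matmul kernels (example value)
    ggml_backend_amx_set_n_threads(backend, 8);

    // weights intended for the AMX kernels would be allocated from
    // ggml_backend_amx_buffer_type(), so that set_tensor repacks them on upload,
    // and a graph containing GGML_OP_MUL_MAT nodes would then be run with
    // ggml_backend_graph_compute(backend, graph).

    ggml_backend_free(backend); // releases the context via ggml_backend_amx_free
    return 0;
}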