File size: 4,089 Bytes
8b7c501 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 |
// Copyright 2022 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
#include <xnnpack/assembly.h>
# void xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld32(
# size_t mr, x0
# size_t nc, x1
# size_t kc, x2 / x0
# size_t ks, x3 / x9
# const void** restrict a, x4
# const void* restrict w, x5
# void* restrict c, x6
# size_t cm_stride, (x7) - unused
# size_t cn_stride, [sp] -> x10
# size_t a_offset, [sp + 8] -> x11
# const void* zero, [sp + 16] -> x12
# const xnn_f16_minmax_params params [sp + 24] -> (x8)
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
// Register usage
// A0 x8 v0
// B x5 v20 v21 v22 v23
// C0 x6 v24 v25
// clamp v4, v5
BEGIN_FUNCTION xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld32
# Load cn_stride, a_offset
LDP x10, x11, [sp]
# Load zero, params pointer
LDP x12, x8, [sp, 16]
# Load params values
LD2R {v4.8h, v5.8h}, [x8]
0:
# Load initial bias from w into accumulators
LDR q24, [x5], 16
LDR q25, [x5], 16
MOVI v26.8h, 0 // second set of C for pipelining FMLA
MOVI v27.8h, 0
MOV x9, x3 // p = ks
1:
# Load next A pointer
LDR x8, [x4], 8
CMP x8, x12 // if a0 == zero
ADD x8, x8, x11 // a0 += a_offset
CSEL x8, x12, x8, EQ // a0 = zero, else += a0 + a_offset
# Is there at least 2 halffloats (4 bytes)?
SUBS x0, x2, 4 // k = kc - 4
B.LO 4f
.p2align 3
# Main loop - 2 halffloats of A (4 bytes)
2:
LDR s0, [x8], 4
LDR q20, [x5, 0]
LDR q21, [x5, 16]
LDR q22, [x5, 32]
LDR q23, [x5, 48]
SUBS x0, x0, 4
FMLA v24.8h, v20.8h, v0.h[0]
FMLA v25.8h, v21.8h, v0.h[0]
FMLA v26.8h, v22.8h, v0.h[1]
FMLA v27.8h, v23.8h, v0.h[1]
ADD x5, x5, 64
B.HS 2b
# Is there a remainder?- 1 halffloat of A (2 bytes)
TBNZ x0, 1, 4f
3:
# ks loop
SUBS x9, x9, 8 // ks -= MR * sizeof(void*)
B.HI 1b
FADD v24.8h, v24.8h, v26.8h
FADD v25.8h, v25.8h, v27.8h
# Clamp
FMAX v24.8h, v24.8h, v4.8h
FMAX v25.8h, v25.8h, v4.8h
FMIN v24.8h, v24.8h, v5.8h
FMIN v25.8h, v25.8h, v5.8h
# Store full 1 x 16
SUBS x1, x1, 16
B.LO 5f
STP q24, q25, [x6]
ADD x6, x6, x10
SUB x4, x4, x3 // a -= ks
# nc loop
B.HI 0b
RET
# Remainder- 1 halffloat of A
4:
LDR h0, [x8], 2
LDR q20, [x5], 16
LDR q21, [x5], 16
FMLA v24.8h, v20.8h, v0.h[0]
FMLA v25.8h, v21.8h, v0.h[0]
B 3b
# Store odd width
5:
TBZ x1, 3, 6f
STR q24, [x6], 16
MOV v24.16b, v25.16b
6:
TBZ x1, 2, 7f
STR d24, [x6], 8
DUP d24, v24.d[1]
7:
TBZ x1, 1, 8f
STR s24, [x6], 4
DUP s24, v24.s[1]
8:
TBZ x1, 0, 9f
STR h24, [x6]
9:
RET
END_FUNCTION xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld32
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
|