// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// -----------------------------------------------------------------------------
// xnn_f32_dwconv_minmax_ukernel_9p4c__asm_aarch64_neonfma
//
// For every output pixel, and for every group of 4 float channels, computes
//
//     out[c] = clamp(bias[c] + sum_{k=0..8} w[k][c] * in[k][c], min, max)
//
// where in[0..8] are 9 input rows whose pointers are read from a pointer
// array. (The name suggests a 9-tap depthwise convolution, 4 channels per
// step.)
//
// Syntax: GNU-style AArch64 assembly. BEGIN_FUNCTION / END_FUNCTION are macros
// that are not defined in this file; presumably they come from a project
// assembly header - verify against the build.
//
// Register roles on entry, as established by how the code below uses them
// (names in backticks are taken from the inline comments; the others are
// descriptive only - TODO confirm against the C prototype):
//   x0      number of float channels per pixel (consumed 4 at a time)
//   x1      number of output pixels; must be non-zero, the outer loop is
//           do/while shaped
//   x2      pointer to an array of 9 input-row pointers for the current pixel;
//           advanced by x5 bytes after each pixel
//   x3      packed weights; per group of 4 channels: 4 bias floats followed by
//           9 x 4 weight floats (160 bytes). Re-read from the start per pixel.
//   x4      output pointer; advanced by the bytes stored, then by x6 per pixel
//   x5      byte stride applied to x2 after each pixel
//   x6      byte increment applied to x4 after each pixel
//   x7      `input_offset`, added to every row pointer that is not `zero`
//   [sp]    `zero`: sentinel row pointer; rows equal to it are used as-is
//   [sp+8]  pointer to two consecutive floats {min, max}
//
// Clobbers: x1, x2, x4, x8-x17, v0-v7, v16, v17, v21-v31, condition flags.
// x19/x20 are used but saved and restored. x18 and v8-v15 are not touched.
//
// NOTE: the 1..3 channel tail still loads a full 4-float vector from each
// input row and a full 160-byte weight group; only the valid lanes are
// stored. Callers must provide buffers that tolerate that over-read.
// -----------------------------------------------------------------------------
BEGIN_FUNCTION xnn_f32_dwconv_minmax_ukernel_9p4c__asm_aarch64_neonfma

        // Fetch both stack arguments before sp is adjusted below.
        LDP     x17, x16, [sp]          // x17 = zero, x16 = {min, max} pointer

        // x19/x20 are callee-saved; a 16-byte push keeps sp 16-byte aligned.
        STP     x19, x20, [sp, -16]!

        // Broadcast the clamp bounds: v30 = min (all lanes), v31 = max.
        // x16 is free for reuse as the i8 row pointer after this load.
        LD2R    {v30.4s, v31.4s}, [x16]

0:
        // ---- Per output pixel -------------------------------------------------
        // Load the 9 input-row pointers i0..i8 for this pixel.
        LDP     x8, x9, [x2]
        LDP     x10, x11, [x2, 16]
        LDP     x12, x13, [x2, 32]
        LDP     x14, x15, [x2, 48]
        LDR     x16, [x2, 64]

        // Branch-free fix-up of each pointer:
        //     iN = (iN == zero) ? zero : iN + input_offset
        // ADD does not alter the flags, so CSEL still sees the CMP result.
        CMP     x8, x17                 // i0 == zero ?
        ADD     x8, x8, x7              // tentatively i0 + input_offset
        CSEL    x8, x17, x8, EQ         // keep zero, else use offset pointer
        CMP     x9, x17                 // i1 == zero ?
        ADD     x9, x9, x7              // tentatively i1 + input_offset
        CSEL    x9, x17, x9, EQ         // keep zero, else use offset pointer
        CMP     x10, x17                // i2 == zero ?
        ADD     x10, x10, x7            // tentatively i2 + input_offset
        CSEL    x10, x17, x10, EQ       // keep zero, else use offset pointer
        CMP     x11, x17                // i3 == zero ?
        ADD     x11, x11, x7            // tentatively i3 + input_offset
        CSEL    x11, x17, x11, EQ       // keep zero, else use offset pointer
        CMP     x12, x17                // i4 == zero ?
        ADD     x12, x12, x7            // tentatively i4 + input_offset
        CSEL    x12, x17, x12, EQ       // keep zero, else use offset pointer
        CMP     x13, x17                // i5 == zero ?
        ADD     x13, x13, x7            // tentatively i5 + input_offset
        CSEL    x13, x17, x13, EQ       // keep zero, else use offset pointer
        CMP     x14, x17                // i6 == zero ?
        ADD     x14, x14, x7            // tentatively i6 + input_offset
        CSEL    x14, x17, x14, EQ       // keep zero, else use offset pointer
        CMP     x15, x17                // i7 == zero ?
        ADD     x15, x15, x7            // tentatively i7 + input_offset
        CSEL    x15, x17, x15, EQ       // keep zero, else use offset pointer
        CMP     x16, x17                // i8 == zero ?
        ADD     x16, x16, x7            // tentatively i8 + input_offset
        CSEL    x16, x17, x16, EQ       // keep zero, else use offset pointer

        // Step the pointer array to the next pixel's 9 row pointers.
        ADD     x2, x2, x5

        // x20 = channels - 4. Carry clear (LO) means fewer than 4 channels.
        SUBS    x20, x0, 4

        // x19 = running weights pointer, restarted for every pixel.
        // MOV leaves the flags from SUBS intact for the branch below.
        MOV     x19, x3

        // Fewer than 4 channels: go straight to the tail.
        B.LO    2f

1:
        // ---- Main loop: 4 channels per iteration ------------------------------
        LDR     q21, [x8], 16           // v21..v29 = 4 floats from rows i0..i8
        LDP     q0, q1, [x19], 32       // v0 = bias, v1 = weights for row i0
        LDR     q22, [x9], 16
        LDR     q23, [x10], 16
        LDR     q24, [x11], 16
        LDR     q25, [x12], 16
        LDR     q26, [x13], 16
        LDR     q27, [x14], 16
        LDR     q28, [x15], 16
        LDR     q29, [x16], 16
        LDP     q2, q3, [x19], 32       // weights for rows i1, i2
        LDP     q4, q5, [x19], 32       // weights for rows i3, i4
        LDP     q6, q7, [x19], 32       // weights for rows i5, i6
        LDP     q16, q17, [x19], 32     // weights for rows i7, i8

        // Accumulate the 9 products onto the bias in v0.
        FMLA    v0.4S, v1.4S, v21.4S
        FMLA    v0.4S, v2.4S, v22.4S
        FMLA    v0.4S, v3.4S, v23.4S
        FMLA    v0.4S, v4.4S, v24.4S
        FMLA    v0.4S, v5.4S, v25.4S
        FMLA    v0.4S, v6.4S, v26.4S
        FMLA    v0.4S, v7.4S, v27.4S
        FMLA    v0.4S, v16.4S, v28.4S
        FMLA    v0.4S, v17.4S, v29.4S
        SUBS    x20, x20, 4             // flags consumed by B.HS below

        // Clamp to [min, max]; none of these three instructions alter flags.
        FMAX    v0.4S, v0.4S, v30.4S
        FMIN    v0.4S, v0.4S, v31.4S
        STR     q0, [x4], 16
        B.HS    1b                      // loop while >= 4 channels remain

2:
        // ---- Tail: 1 to 3 remaining channels ----------------------------------
        // x20 is (remaining - 4) here, so its low two bits equal remaining & 3.
        TST     x20, 3
        B.EQ    4f                      // nothing left for this pixel

        // Same computation as the main loop on a full vector; only the valid
        // lanes are stored afterwards.
        LDR     q21, [x8], 16           // v21..v29 = 4 floats from rows i0..i8
        LDP     q0, q1, [x19], 32       // v0 = bias, v1 = weights for row i0
        LDR     q22, [x9], 16
        LDR     q23, [x10], 16
        LDR     q24, [x11], 16
        LDR     q25, [x12], 16
        LDR     q26, [x13], 16
        LDR     q27, [x14], 16
        LDR     q28, [x15], 16
        LDR     q29, [x16], 16
        LDP     q2, q3, [x19], 32       // weights for rows i1, i2
        LDP     q4, q5, [x19], 32       // weights for rows i3, i4
        LDP     q6, q7, [x19], 32       // weights for rows i5, i6
        LDP     q16, q17, [x19], 32     // weights for rows i7, i8

        FMLA    v0.4S, v1.4S, v21.4S
        FMLA    v0.4S, v2.4S, v22.4S
        FMLA    v0.4S, v3.4S, v23.4S
        FMLA    v0.4S, v4.4S, v24.4S
        FMLA    v0.4S, v5.4S, v25.4S
        FMLA    v0.4S, v6.4S, v26.4S
        FMLA    v0.4S, v7.4S, v27.4S
        FMLA    v0.4S, v16.4S, v28.4S
        FMLA    v0.4S, v17.4S, v29.4S

        FMAX    v0.4S, v0.4S, v30.4S
        FMIN    v0.4S, v0.4S, v31.4S

        // Bit 1 clear => exactly one channel left.
        TBZ     x20, 1, 3f

        // Two or three channels: store lanes 0-1, then move lanes 2-3 down.
        STR     d0, [x4], 8
        DUP     d0, v0.D[1]
        TBZ     x20, 0, 4f              // bit 0 clear => exactly two, done
3:
        // Store the final single channel (lane 0 of v0).
        STR     s0, [x4], 4
4:
        // ---- Advance to the next pixel ----------------------------------------
        SUBS    x1, x1, 1               // one pixel done; flags used by B.NE

        // ADD does not alter the flags set by SUBS above.
        ADD     x4, x4, x6

        B.NE    0b                      // more pixels to process

        // Restore callee-saved registers and return.
        LDP     x19, x20, [sp], 16
        RET

END_FUNCTION xnn_f32_dwconv_minmax_ukernel_9p4c__asm_aarch64_neonfma
|
|
|
|
|
// Mark the stack as non-executable for ELF linkers. The directive below uses
// ELF-specific section syntax, so it is emitted only for ELF targets; other
// object formats would reject it.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
|
|
|
|