File size: 2,367 Bytes
8b7c501 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
// Copyright 2022 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <xnnpack/math.h>
#include <xnnpack/fft.h>
#include <arm_neon.h>
// In-place pass over interleaved complex int16 data that pairs every element
// with its mirror from the opposite end of the buffer and recombines the pair
// through a twiddle factor. NEON variant: four elements from each end are
// handled per loop iteration.
//
//   samples  loop count control; must be a non-zero multiple of 8. Each
//            iteration consumes 8 of it.
//   data     interleaved (re, im) int16 pairs, rewritten in place. Complex
//            elements 0 .. samples (inclusive) are touched, i.e.
//            2 * samples + 2 int16 values.
//   twiddle  interleaved (re, im) int16 twiddle factors; 8 int16 values are
//            read per iteration, `samples` values in total.
void xnn_cs16_fftr_ukernel__neon_x4(
    size_t samples,
    int16_t* data,
    const int16_t* twiddle)
{
  assert(samples != 0);
  assert(samples % 8 == 0);
  assert(data != NULL);
  assert(twiddle != NULL);

  int16_t* lo = data;                // walks forward from the first element
  int16_t* hi = data + samples * 2;  // walks backward from the last element

  // The two end elements are handled in scalar code: element 0 is scaled by
  // 16383/32768 (with rounding) and its real/imaginary sum and difference
  // become the real parts of the first and last element. Both imaginary
  // parts are cleared.
  const int32_t dc_re = math_asr_s32((int32_t) lo[0] * 16383 + 16384, 15);
  const int32_t dc_im = math_asr_s32((int32_t) lo[1] * 16383 + 16384, 15);
  lo[0] = (int16_t) (dc_re + dc_im);
  lo[1] = 0;
  hi[0] = (int16_t) (dc_re - dc_im);
  hi[1] = 0;
  lo += 2;

  // Q15 multiplier of 16383/32768 (just under one half) applied to every
  // input lane.
  const int16x4_t half = vdup_n_s16(16383);

  do {
    hi -= 8;

    // De-interleaving loads: val[0] holds the real parts, val[1] the
    // imaginary parts of four consecutive complex values.
    const int16x4x2_t tw = vld2_s16(twiddle);
    twiddle += 8;
    const int16x4x2_t lo_in = vld2_s16(lo);
    const int16x4x2_t hi_in = vld2_s16(hi);

    // Scale the inputs. The right-hand window is lane-reversed first so that
    // lane k of the left window lines up with its mirror element.
    const int16x4_t l_re = vqrdmulh_s16(lo_in.val[0], half);
    const int16x4_t l_im = vqrdmulh_s16(lo_in.val[1], half);
    const int16x4_t r_re = vqrdmulh_s16(vrev64_s16(hi_in.val[0]), half);
    const int16x4_t r_im = vqrdmulh_s16(vrev64_s16(hi_in.val[1]), half);

    // Part that goes straight to the output (widened to 32 bits) ...
    const int32x4_t sum_re = vaddl_s16(l_re, r_re);
    const int32x4_t dif_im = vsubl_s16(l_im, r_im);
    // ... and the part that is multiplied by the twiddle factor (16 bits).
    const int16x4_t dif_re = vsub_s16(l_re, r_re);
    const int16x4_t sum_im = vadd_s16(l_im, r_im);

    // Complex product (dif_re + i * sum_im) * (tw_re + i * tw_im), brought
    // back from the 32-bit product with a rounding shift by 15.
    const int32x4_t rot_re = vrshrq_n_s32(
        vmlsl_s16(vmull_s16(dif_re, tw.val[0]), sum_im, tw.val[1]), 15);
    const int32x4_t rot_im = vrshrq_n_s32(
        vmlal_s16(vmull_s16(dif_re, tw.val[1]), sum_im, tw.val[0]), 15);

    // Halving add/subtract, then narrow back to 16 bits.
    int16x4x2_t lo_out;
    lo_out.val[0] = vmovn_s32(vhaddq_s32(sum_re, rot_re));
    lo_out.val[1] = vmovn_s32(vhaddq_s32(dif_im, rot_im));

    // The right-hand results are lane-reversed again to restore memory order.
    int16x4x2_t hi_out;
    hi_out.val[0] = vrev64_s16(vmovn_s32(vhsubq_s32(sum_re, rot_re)));
    hi_out.val[1] = vrev64_s16(vmovn_s32(vhsubq_s32(rot_im, dif_im)));

    // Keep this order: both windows were loaded before either store, and the
    // left store comes first. On the last iteration the windows share one
    // complex element (with samples == 8 the left window is data[2..9] and
    // the right one data[8..15]), so the right store decides its final value.
    vst2_s16(lo, lo_out);
    vst2_s16(hi, hi_out);

    lo += 8;
    samples -= 8;
  } while (samples != 0);
}
|