File size: 6,019 Bytes
8b7c501 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 |
// Auto-generated file. Do not edit!
// Template: src/cs16-fftr/scalar.c.in
// Generator: tools/xngen
//
// Copyright 2022 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <xnnpack/math.h>
#include <xnnpack/fft.h>
// In-place pass over `data`, which is read and written as interleaved
// (real, imaginary) int16 pairs, combining mirrored pairs with per-pair
// twiddle factors. Main loop handles 4 pairs per iteration.
// NOTE(review): presumably the real-FFT post-processing step of a complex
// int16 FFT (going by the cs16-fftr template name) - confirm against
// xnnpack/fft.h.
//
// samples - non-zero and even. Elements data[0 .. 2 * samples + 1] are
//           accessed, i.e. samples + 1 (real, imaginary) pairs.
// data    - buffer updated in place.
// twiddle - interleaved (real, imaginary) int16 factors; one pair is consumed
//           for each of the samples / 2 mirrored pairs processed.
//
// math_asr_s32 comes from xnnpack/math.h and is assumed to be an arithmetic
// (sign-preserving) right shift of an int32 value - TODO confirm.
void xnn_cs16_fftr_ukernel__scalar_x4(
    size_t samples,
    int16_t* data,
    const int16_t* twiddle)
{
  assert(samples != 0);
  assert(samples % 2 == 0);
  assert(data != NULL);
  assert(twiddle != NULL);

  // Every input is multiplied by 16383 / 2^15 (just under one half); 16384 is
  // half of 2^15 and is added before each 15-bit shift as a rounding term.
  const int32_t scale = 16383;
  const int32_t rounding = 16384;

  // `lo` walks upward from the first pair, `hi` walks downward from the pair
  // located `samples` pairs after it.
  int16_t* lo = data;
  int16_t* hi = data + samples * 2;

  // First and last pair: real outputs only, imaginary parts are set to zero.
  const int32_t dc_re = math_asr_s32((int32_t) lo[0] * scale + rounding, 15);
  const int32_t dc_im = math_asr_s32((int32_t) lo[1] * scale + rounding, 15);
  lo[0] = (int16_t) (dc_re + dc_im);
  lo[1] = 0;
  lo += 2;
  hi[0] = (int16_t) (dc_re - dc_im);
  hi[1] = 0;

  // Number of (lo, hi) pairs still to be combined.
  size_t pairs = samples >> 1;

  // The last pair processed has lo == hi (both at complex index samples / 2),
  // so every load must precede the stores, and the hi stores must follow the
  // lo stores, exactly as ordered below.
  while (pairs >= 4) {
    hi -= 4 * 2;

    // Lane k pairs lo[2k], lo[2k + 1] with hi[6 - 2k], hi[7 - 2k]: the hi
    // side is consumed in descending address order.
    const int32_t lo_re0 = math_asr_s32((int32_t) lo[0] * scale + rounding, 15);
    const int32_t lo_im0 = math_asr_s32((int32_t) lo[1] * scale + rounding, 15);
    const int32_t lo_re1 = math_asr_s32((int32_t) lo[2] * scale + rounding, 15);
    const int32_t lo_im1 = math_asr_s32((int32_t) lo[3] * scale + rounding, 15);
    const int32_t lo_re2 = math_asr_s32((int32_t) lo[4] * scale + rounding, 15);
    const int32_t lo_im2 = math_asr_s32((int32_t) lo[5] * scale + rounding, 15);
    const int32_t lo_re3 = math_asr_s32((int32_t) lo[6] * scale + rounding, 15);
    const int32_t lo_im3 = math_asr_s32((int32_t) lo[7] * scale + rounding, 15);

    const int32_t hi_re0 = math_asr_s32((int32_t) hi[6] * scale + rounding, 15);
    const int32_t hi_im0 = math_asr_s32((int32_t) hi[7] * scale + rounding, 15);
    const int32_t hi_re1 = math_asr_s32((int32_t) hi[4] * scale + rounding, 15);
    const int32_t hi_im1 = math_asr_s32((int32_t) hi[5] * scale + rounding, 15);
    const int32_t hi_re2 = math_asr_s32((int32_t) hi[2] * scale + rounding, 15);
    const int32_t hi_im2 = math_asr_s32((int32_t) hi[3] * scale + rounding, 15);
    const int32_t hi_re3 = math_asr_s32((int32_t) hi[0] * scale + rounding, 15);
    const int32_t hi_im3 = math_asr_s32((int32_t) hi[1] * scale + rounding, 15);

    const int32_t tw_re0 = twiddle[0];
    const int32_t tw_im0 = twiddle[1];
    const int32_t tw_re1 = twiddle[2];
    const int32_t tw_im1 = twiddle[3];
    const int32_t tw_re2 = twiddle[4];
    const int32_t tw_im2 = twiddle[5];
    const int32_t tw_re3 = twiddle[6];
    const int32_t tw_im3 = twiddle[7];

    // Sum / difference terms of each mirrored pair.
    const int32_t sum_re0 = lo_re0 + hi_re0;
    const int32_t dif_im0 = lo_im0 - hi_im0;
    const int32_t dif_re0 = lo_re0 - hi_re0;
    const int32_t sum_im0 = lo_im0 + hi_im0;
    const int32_t sum_re1 = lo_re1 + hi_re1;
    const int32_t dif_im1 = lo_im1 - hi_im1;
    const int32_t dif_re1 = lo_re1 - hi_re1;
    const int32_t sum_im1 = lo_im1 + hi_im1;
    const int32_t sum_re2 = lo_re2 + hi_re2;
    const int32_t dif_im2 = lo_im2 - hi_im2;
    const int32_t dif_re2 = lo_re2 - hi_re2;
    const int32_t sum_im2 = lo_im2 + hi_im2;
    const int32_t sum_re3 = lo_re3 + hi_re3;
    const int32_t dif_im3 = lo_im3 - hi_im3;
    const int32_t dif_re3 = lo_re3 - hi_re3;
    const int32_t sum_im3 = lo_im3 + hi_im3;

    // Complex product (dif_re + i * sum_im) * (tw_re + i * tw_im), rounded
    // and shifted right by 15 bits.
    const int32_t rot_re0 = math_asr_s32(dif_re0 * tw_re0 - sum_im0 * tw_im0 + rounding, 15);
    const int32_t rot_im0 = math_asr_s32(dif_re0 * tw_im0 + sum_im0 * tw_re0 + rounding, 15);
    const int32_t rot_re1 = math_asr_s32(dif_re1 * tw_re1 - sum_im1 * tw_im1 + rounding, 15);
    const int32_t rot_im1 = math_asr_s32(dif_re1 * tw_im1 + sum_im1 * tw_re1 + rounding, 15);
    const int32_t rot_re2 = math_asr_s32(dif_re2 * tw_re2 - sum_im2 * tw_im2 + rounding, 15);
    const int32_t rot_im2 = math_asr_s32(dif_re2 * tw_im2 + sum_im2 * tw_re2 + rounding, 15);
    const int32_t rot_re3 = math_asr_s32(dif_re3 * tw_re3 - sum_im3 * tw_im3 + rounding, 15);
    const int32_t rot_im3 = math_asr_s32(dif_re3 * tw_im3 + sum_im3 * tw_re3 + rounding, 15);

    // Low-side outputs first ...
    lo[0] = (int16_t) math_asr_s32(sum_re0 + rot_re0, 1);
    lo[1] = (int16_t) math_asr_s32(dif_im0 + rot_im0, 1);
    lo[2] = (int16_t) math_asr_s32(sum_re1 + rot_re1, 1);
    lo[3] = (int16_t) math_asr_s32(dif_im1 + rot_im1, 1);
    lo[4] = (int16_t) math_asr_s32(sum_re2 + rot_re2, 1);
    lo[5] = (int16_t) math_asr_s32(dif_im2 + rot_im2, 1);
    lo[6] = (int16_t) math_asr_s32(sum_re3 + rot_re3, 1);
    lo[7] = (int16_t) math_asr_s32(dif_im3 + rot_im3, 1);

    // ... then the mirrored high-side outputs (imaginary part uses the
    // opposite subtraction order).
    hi[6] = (int16_t) math_asr_s32(sum_re0 - rot_re0, 1);
    hi[7] = (int16_t) math_asr_s32(rot_im0 - dif_im0, 1);
    hi[4] = (int16_t) math_asr_s32(sum_re1 - rot_re1, 1);
    hi[5] = (int16_t) math_asr_s32(rot_im1 - dif_im1, 1);
    hi[2] = (int16_t) math_asr_s32(sum_re2 - rot_re2, 1);
    hi[3] = (int16_t) math_asr_s32(rot_im2 - dif_im2, 1);
    hi[0] = (int16_t) math_asr_s32(sum_re3 - rot_re3, 1);
    hi[1] = (int16_t) math_asr_s32(rot_im3 - dif_im3, 1);

    twiddle += 4 * 2;
    lo += 4 * 2;
    pairs -= 4;
  }

  // Remainder: 1 to 3 leftover pairs, handled one at a time.
  if XNN_UNLIKELY(pairs != 0) {
    for (; pairs != 0; pairs--) {
      hi -= 2;

      const int32_t lo_re = math_asr_s32((int32_t) lo[0] * scale + rounding, 15);
      const int32_t lo_im = math_asr_s32((int32_t) lo[1] * scale + rounding, 15);
      const int32_t hi_re = math_asr_s32((int32_t) hi[0] * scale + rounding, 15);
      const int32_t hi_im = math_asr_s32((int32_t) hi[1] * scale + rounding, 15);

      const int32_t tw_re = twiddle[0];
      const int32_t tw_im = twiddle[1];
      twiddle += 2;

      const int32_t sum_re = lo_re + hi_re;
      const int32_t dif_im = lo_im - hi_im;
      const int32_t dif_re = lo_re - hi_re;
      const int32_t sum_im = lo_im + hi_im;

      const int32_t rot_re = math_asr_s32(dif_re * tw_re - sum_im * tw_im + rounding, 15);
      const int32_t rot_im = math_asr_s32(dif_re * tw_im + sum_im * tw_re + rounding, 15);

      // lo stores before hi stores: both may refer to the same element.
      lo[0] = (int16_t) math_asr_s32(sum_re + rot_re, 1);
      lo[1] = (int16_t) math_asr_s32(dif_im + rot_im, 1);
      hi[0] = (int16_t) math_asr_s32(sum_re - rot_re, 1);
      hi[1] = (int16_t) math_asr_s32(rot_im - dif_im, 1);

      lo += 2;
    }
  }
}
|