|
|
|
|
|
|
|
|
|
#pragma OPENCL EXTENSION cl_khr_fp16 : enable |
|
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable |
|
|
|
|
|
#define USE_ATOMICS 0 |
|
|
|
void atomic_add_global(volatile __global float *source, const float operand) |
|
{ |
|
union { |
|
unsigned int intVal; |
|
float floatVal; |
|
} newVal; |
|
union { |
|
unsigned int intVal; |
|
float floatVal; |
|
} prevVal; |
|
|
|
do { |
|
prevVal.floatVal = *source; |
|
newVal.floatVal = prevVal.floatVal + operand; |
|
} while (atomic_cmpxchg((volatile __global unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal); |
|
} |
|
|
|
__kernel void reduction_mean( |
|
__global const half *restrict src, |
|
__global float *restrict mean, |
|
__global float *restrict variance, |
|
int W, |
|
int H, |
|
int across_channels) |
|
{ |
|
__local half src_line[4 * 1024]; |
|
event_t e; |
|
|
|
e = async_work_group_copy_2D2D( |
|
src_line, |
|
src + get_group_id(1) * get_local_size(1) * W |
|
+ get_group_id(2) * get_local_size(2) * W * get_global_size(1), |
|
W * get_local_size(1), |
|
get_local_size(2), |
|
W * (get_global_size(1) - get_local_size(1)), |
|
0, |
|
0); |
|
|
|
wait_group_events(1, &e); |
|
|
|
int h = get_global_id(1); |
|
int c = get_global_id(2); |
|
|
|
const int MAX_LOCAL_SIZE = 8; |
|
|
|
__local float mbuf[MAX_LOCAL_SIZE]; |
|
__local float vbuf[MAX_LOCAL_SIZE]; |
|
|
|
mbuf[get_local_id(1)] = 0; |
|
vbuf[get_local_id(1)] = 0; |
|
|
|
if (h < H) { |
|
float sum = 0.f; |
|
float sum2 = 0.f; |
|
|
|
float8 sum4 = (float8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; |
|
float8 sum24 = (float8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; |
|
|
|
const __local half8 *restrict lsrc = ((const __local half8 *)(src_line + get_local_id(1) * W)); |
|
|
|
#pragma unroll 16 |
|
for (size_t w = 0; w < W / 8; w++) { |
|
half8 sh = lsrc[w]; |
|
float8 valf = convert_float8(sh); |
|
|
|
sum4 += valf; |
|
sum24 += valf * valf; |
|
} |
|
|
|
for (size_t w = W / 8 * 8; w < W; w++) { |
|
float val = (float)src_line[get_local_id(1) * W + w]; |
|
sum += val; |
|
sum2 += val * val; |
|
} |
|
|
|
mbuf[get_local_id(1)] = sum4.s0 + sum4.s1 + sum4.s2 + sum4.s3 + sum4.s4 + sum4.s5 + sum4.s6 + sum4.s7 + sum; |
|
vbuf[get_local_id(1)] = |
|
sum24.s0 + sum24.s1 + sum24.s2 + sum24.s3 + sum24.s4 + sum24.s5 + sum24.s6 + sum24.s7 + sum2; |
|
} |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
if (get_local_id(1) == 0) { |
|
float res = 0; |
|
float res2 = 0; |
|
|
|
for (int i = 0; i < get_local_size(1); i++) { |
|
res += mbuf[i]; |
|
res2 += vbuf[i]; |
|
} |
|
|
|
|
|
#if USE_ATOMICS |
|
int idx = (across_channels == 0) ? c : 0; |
|
|
|
atomic_add_global(mean + idx, res); |
|
atomic_add_global(variance + idx, res2); |
|
#else |
|
int idx = c * get_num_groups(1) + get_group_id(1); |
|
|
|
mean[idx] = res; |
|
variance[idx] = res2; |
|
#endif |
|
} |
|
} |
|
|