Spaces:

FaceOnLive
/

Face-Liveness-Detection-SDK

Running

App Files Files Community

Face-Liveness-Detection-SDK / openvino /vpu_custom_kernels /mvn_reduction.cl

Zhu-FaceOnLive

Initial commit.

2ded60b about 1 year ago

raw

history blame

3.25 kB

	// Copyright (C) 2018-2022 Intel Corporation
	// SPDX-License-Identifier: Apache-2.0
	//

	#pragma OPENCL EXTENSION cl_khr_fp16 : enable
	#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

	// Set to 1 only if output is zerroed before kernel execution
	#define USE_ATOMICS 0

	void atomic_add_global(volatile __global float *source, const float operand)
	{
	union {
	unsigned int intVal;
	float floatVal;
	} newVal;
	union {
	unsigned int intVal;
	float floatVal;
	} prevVal;

	do {
	prevVal.floatVal = *source;
	newVal.floatVal = prevVal.floatVal + operand;
	} while (atomic_cmpxchg((volatile __global unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);
	}

	__kernel void reduction_mean(
	__global const half *restrict src,
	__global float *restrict mean,
	__global float *restrict variance,
	int W,
	int H,
	int across_channels)
	{
	__local half src_line[4 * 1024];
	event_t e;

	e = async_work_group_copy_2D2D(
	src_line, // dst
	src + get_group_id(1) * get_local_size(1) * W
	+ get_group_id(2) * get_local_size(2) * W * get_global_size(1), // src
	W * get_local_size(1), // num_elements_per_line,
	get_local_size(2), // num_lines,
	W * (get_global_size(1) - get_local_size(1)), // src_line_stride,
	0, // dst_line_stride,
	0);

	wait_group_events(1, &e);

	int h = get_global_id(1);
	int c = get_global_id(2);

	const int MAX_LOCAL_SIZE = 8;

	__local float mbuf[MAX_LOCAL_SIZE];
	__local float vbuf[MAX_LOCAL_SIZE];

	mbuf[get_local_id(1)] = 0;
	vbuf[get_local_id(1)] = 0;

	if (h < H) {
	float sum = 0.f;
	float sum2 = 0.f;

	float8 sum4 = (float8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
	float8 sum24 = (float8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};

	const __local half8 restrict lsrc = ((const __local half8 )(src_line + get_local_id(1) * W));

	#pragma unroll 16
	for (size_t w = 0; w < W / 8; w++) {
	half8 sh = lsrc[w];
	float8 valf = convert_float8(sh);

	sum4 += valf;
	sum24 += valf * valf;
	}

	for (size_t w = W / 8 * 8; w < W; w++) {
	float val = (float)src_line[get_local_id(1) * W + w];
	sum += val;
	sum2 += val * val;
	}

	mbuf[get_local_id(1)] = sum4.s0 + sum4.s1 + sum4.s2 + sum4.s3 + sum4.s4 + sum4.s5 + sum4.s6 + sum4.s7 + sum;
	vbuf[get_local_id(1)] =
	sum24.s0 + sum24.s1 + sum24.s2 + sum24.s3 + sum24.s4 + sum24.s5 + sum24.s6 + sum24.s7 + sum2;
	}

	barrier(CLK_LOCAL_MEM_FENCE);

	if (get_local_id(1) == 0) {
	float res = 0;
	float res2 = 0;

	for (int i = 0; i < get_local_size(1); i++) {
	res += mbuf[i];
	res2 += vbuf[i];
	}

	// requires memory reset before layer execution
	#if USE_ATOMICS
	int idx = (across_channels == 0) ? c : 0;

	atomic_add_global(mean + idx, res);
	atomic_add_global(variance + idx, res2);
	#else
	int idx = c * get_num_groups(1) + get_group_id(1);

	mean[idx] = res;
	variance[idx] = res2;
	#endif
	}
	}