File size: 1,331 Bytes
8b7c501
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
from peachpy import *
from peachpy.x86_64 import *


def fp16_alt_xmm_to_fp32_xmm(xmm_half):
	xmm_zero = XMMRegister()
	VPXOR(xmm_zero, xmm_zero, xmm_zero)

	xmm_word = XMMRegister()
	VPUNPCKLWD(xmm_word, xmm_zero, xmm_half)

	xmm_shl1_half = XMMRegister()
	VPADDW(xmm_shl1_half, xmm_half, xmm_half)

	xmm_shl1_nonsign = XMMRegister()
	VPADDD(xmm_shl1_nonsign, xmm_word, xmm_word)

	sign_mask = Constant.float32x4(-0.0)

	xmm_sign = XMMRegister()
	VANDPS(xmm_sign, xmm_word, sign_mask)

	xmm_shr3_nonsign = XMMRegister()
	VPSRLD(xmm_shr3_nonsign, xmm_shl1_nonsign, 4)

	exp_offset = Constant.uint32x4(0x38000000)

	xmm_norm_nonsign = XMMRegister()
	VPADDD(xmm_norm_nonsign, xmm_shr3_nonsign, exp_offset)

	magic_mask = Constant.uint16x8(0x3E80)
	xmm_denorm_nonsign = XMMRegister()
	VPUNPCKLWD(xmm_denorm_nonsign, xmm_shl1_half, magic_mask)

	magic_bias = Constant.float32x4(0.25)
	VSUBPS(xmm_denorm_nonsign, xmm_denorm_nonsign, magic_bias)

	xmm_denorm_cutoff = XMMRegister()
	VMOVDQA(xmm_denorm_cutoff, Constant.uint32x4(0x00800000))
	
	xmm_denorm_mask = XMMRegister()
	VPCMPGTD(xmm_denorm_mask, xmm_denorm_cutoff, xmm_shr3_nonsign)

	xmm_nonsign = XMMRegister()
	VBLENDVPS(xmm_nonsign, xmm_norm_nonsign, xmm_denorm_nonsign, xmm_denorm_mask)

	xmm_float = XMMRegister()
	VORPS(xmm_float, xmm_nonsign, xmm_sign)

	return xmm_float