File size: 1,331 Bytes
8b7c501 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
from peachpy import *
from peachpy.x86_64 import *
def fp16_alt_xmm_to_fp32_xmm(xmm_half):
    """Emit a branch-free AVX sequence widening 16-bit floats to float32.

    Only the low four 16-bit lanes of ``xmm_half`` contribute to the result,
    because both VPUNPCKLWD instructions consume just the low 64 bits of
    their sources.  Two candidate magnitudes are computed per lane -- one
    valid when the half exponent field is non-zero, one valid when it is
    zero -- and the right one is selected with a per-lane mask before the
    sign bit is OR-ed back in.

    The maximal exponent field receives no special treatment: it is rebiased
    like any other non-zero exponent.
    NOTE(review): "alt" presumably refers to the ARM alternative
    half-precision format (no Inf/NaN) -- confirm against callers.

    Args:
        xmm_half: XMM register operand, used only as a source here.
            Presumably holds half-precision bit patterns in its low four
            16-bit lanes -- confirm against callers.

    Returns:
        A newly allocated XMMRegister receiving the four float32 results.
    """
    # All-zero register: supplies the low 16 bits of every widened lane.
    xmm_zero = XMMRegister()
    VPXOR(xmm_zero, xmm_zero, xmm_zero)

    # Interleave zero (low word) with the input (high word), so each 32-bit
    # lane becomes half << 16: sign at bit 31, exponent at 30..26, mantissa
    # at 25..16.
    xmm_word = XMMRegister()
    VPUNPCKLWD(xmm_word, xmm_zero, xmm_half)

    # x + x == x << 1 within each 16-bit lane: drops the sign bit of the
    # half.  Consumed only by the zero-exponent (subnormal) path below.
    xmm_shl1_half = XMMRegister()
    VPADDW(xmm_shl1_half, xmm_half, xmm_half)

    # Same trick on the 32-bit lanes: shifts the sign bit out of the top.
    xmm_shl1_nonsign = XMMRegister()
    VPADDD(xmm_shl1_nonsign, xmm_word, xmm_word)

    # -0.0 is the bit pattern 0x80000000, so the AND keeps only the sign.
    sign_mask = Constant.float32x4(-0.0)
    xmm_sign = XMMRegister()
    VANDPS(xmm_sign, xmm_word, sign_mask)

    # Left by 1 then right by 4 is a net right shift by 3 with the sign
    # cleared: exponent lands at bits 27..23 and mantissa at 22..13, i.e.
    # in the float32 field positions.
    xmm_shr3_nonsign = XMMRegister()
    VPSRLD(xmm_shr3_nonsign, xmm_shl1_nonsign, 4)

    # 0x38000000 == (127 - 15) << 23: rebias the exponent from the half
    # bias (15) to the float32 bias (127).  Valid for non-zero exponents.
    exp_offset = Constant.uint32x4(0x38000000)
    xmm_norm_nonsign = XMMRegister()
    VPADDD(xmm_norm_nonsign, xmm_shr3_nonsign, exp_offset)

    # Zero-exponent path: put 0x3E80 in the high word and (half << 1) in the
    # low word.  0x3E800000 is 0.25f, so when the exponent field is zero the
    # lane reads as 0.25 + mantissa * 2**-24.
    magic_mask = Constant.uint16x8(0x3E80)
    xmm_denorm_nonsign = XMMRegister()
    VPUNPCKLWD(xmm_denorm_nonsign, xmm_shl1_half, magic_mask)

    # Removing the 0.25 bias leaves mantissa * 2**-24, with the
    # floating-point subtraction performing the renormalisation.
    magic_bias = Constant.float32x4(0.25)
    VSUBPS(xmm_denorm_nonsign, xmm_denorm_nonsign, magic_bias)

    # 0x00800000 == 1 << 23: anything below it has an all-zero exponent
    # field (zero or subnormal input).
    xmm_denorm_cutoff = XMMRegister()
    VMOVDQA(xmm_denorm_cutoff, Constant.uint32x4(0x00800000))

    # Signed compare is safe: xmm_shr3_nonsign is always below 2**28.
    # Lanes become all-ones where cutoff > value.
    xmm_denorm_mask = XMMRegister()
    VPCMPGTD(xmm_denorm_mask, xmm_denorm_cutoff, xmm_shr3_nonsign)

    # Take the subnormal result where the mask is set, otherwise the
    # rebiased normal result.
    xmm_nonsign = XMMRegister()
    VBLENDVPS(xmm_nonsign, xmm_norm_nonsign, xmm_denorm_nonsign, xmm_denorm_mask)

    # Re-attach the sign bit extracted earlier.
    xmm_float = XMMRegister()
    VORPS(xmm_float, xmm_nonsign, xmm_sign)
    return xmm_float
|