android13/external/FP16/include/fp16/avx2.py

54 lines
1.4 KiB
Python

from peachpy import *
from peachpy.x86_64 import *
def fp16_alt_xmm_to_fp32_ymm(xmm_half):
ymm_half = YMMRegister()
VPERMQ(ymm_half, xmm_half.as_ymm, 0b01010000)
ymm_zero = YMMRegister()
VPXOR(ymm_zero.as_xmm, ymm_zero.as_xmm, ymm_zero.as_xmm)
ymm_word = YMMRegister()
VPUNPCKLWD(ymm_word, ymm_zero, ymm_half)
ymm_shl1_half = YMMRegister()
VPADDW(ymm_shl1_half, ymm_half, ymm_half)
ymm_shl1_nonsign = YMMRegister()
VPADDD(ymm_shl1_nonsign, ymm_word, ymm_word)
sign_mask = Constant.float32x8(-0.0)
ymm_sign = YMMRegister()
VANDPS(ymm_sign, ymm_word, sign_mask)
ymm_shr3_nonsign = YMMRegister()
VPSRLD(ymm_shr3_nonsign, ymm_shl1_nonsign, 4)
exp_offset = Constant.uint32x8(0x38000000)
ymm_norm_nonsign = YMMRegister()
VPADDD(ymm_norm_nonsign, ymm_shr3_nonsign, exp_offset)
magic_mask = Constant.uint16x16(0x3E80)
ymm_denorm_nonsign = YMMRegister()
VPUNPCKLWD(ymm_denorm_nonsign, ymm_shl1_half, magic_mask)
magic_bias = Constant.float32x8(0.25)
VSUBPS(ymm_denorm_nonsign, ymm_denorm_nonsign, magic_bias)
ymm_denorm_cutoff = YMMRegister()
VMOVDQA(ymm_denorm_cutoff, Constant.uint32x8(0x00800000))
ymm_denorm_mask = YMMRegister()
VPCMPGTD(ymm_denorm_mask, ymm_denorm_cutoff, ymm_shr3_nonsign)
ymm_nonsign = YMMRegister()
VBLENDVPS(ymm_nonsign, ymm_norm_nonsign, ymm_denorm_nonsign, ymm_denorm_mask)
ymm_float = YMMRegister()
VORPS(ymm_float, ymm_nonsign, ymm_sign)
return ymm_float