54 lines
		
	
	
		
			1.4 KiB
		
	
	
	
		
			Python
		
	
	
	
			
		
		
	
	
			54 lines
		
	
	
		
			1.4 KiB
		
	
	
	
		
			Python
		
	
	
	
| from peachpy import *
 | |
| from peachpy.x86_64 import *
 | |
| 
 | |
| 
 | |
| def fp16_alt_xmm_to_fp32_ymm(xmm_half):
 | |
| 	ymm_half = YMMRegister()
 | |
| 	VPERMQ(ymm_half, xmm_half.as_ymm, 0b01010000)
 | |
| 
 | |
| 	ymm_zero = YMMRegister()
 | |
| 	VPXOR(ymm_zero.as_xmm, ymm_zero.as_xmm, ymm_zero.as_xmm)
 | |
| 
 | |
| 	ymm_word = YMMRegister()
 | |
| 	VPUNPCKLWD(ymm_word, ymm_zero, ymm_half)
 | |
| 
 | |
| 	ymm_shl1_half = YMMRegister()
 | |
| 	VPADDW(ymm_shl1_half, ymm_half, ymm_half)
 | |
| 
 | |
| 	ymm_shl1_nonsign = YMMRegister()
 | |
| 	VPADDD(ymm_shl1_nonsign, ymm_word, ymm_word)
 | |
| 
 | |
| 	sign_mask = Constant.float32x8(-0.0)
 | |
| 
 | |
| 	ymm_sign = YMMRegister()
 | |
| 	VANDPS(ymm_sign, ymm_word, sign_mask)
 | |
| 
 | |
| 	ymm_shr3_nonsign = YMMRegister()
 | |
| 	VPSRLD(ymm_shr3_nonsign, ymm_shl1_nonsign, 4)
 | |
| 
 | |
| 	exp_offset = Constant.uint32x8(0x38000000)
 | |
| 
 | |
| 	ymm_norm_nonsign = YMMRegister()
 | |
| 	VPADDD(ymm_norm_nonsign, ymm_shr3_nonsign, exp_offset)
 | |
| 
 | |
| 	magic_mask = Constant.uint16x16(0x3E80)
 | |
| 	ymm_denorm_nonsign = YMMRegister()
 | |
| 	VPUNPCKLWD(ymm_denorm_nonsign, ymm_shl1_half, magic_mask)
 | |
| 
 | |
| 	magic_bias = Constant.float32x8(0.25)
 | |
| 	VSUBPS(ymm_denorm_nonsign, ymm_denorm_nonsign, magic_bias)
 | |
| 
 | |
| 	ymm_denorm_cutoff = YMMRegister()
 | |
| 	VMOVDQA(ymm_denorm_cutoff, Constant.uint32x8(0x00800000))
 | |
| 	
 | |
| 	ymm_denorm_mask = YMMRegister()
 | |
| 	VPCMPGTD(ymm_denorm_mask, ymm_denorm_cutoff, ymm_shr3_nonsign)
 | |
| 
 | |
| 	ymm_nonsign = YMMRegister()
 | |
| 	VBLENDVPS(ymm_nonsign, ymm_norm_nonsign, ymm_denorm_nonsign, ymm_denorm_mask)
 | |
| 
 | |
| 	ymm_float = YMMRegister()
 | |
| 	VORPS(ymm_float, ymm_nonsign, ymm_sign)
 | |
| 
 | |
| 	return ymm_float
 |