51 lines
		
	
	
		
			1.3 KiB
		
	
	
	
		
			Python
		
	
	
	
			
		
		
	
	
			51 lines
		
	
	
		
			1.3 KiB
		
	
	
	
		
			Python
		
	
	
	
| from peachpy import *
 | |
| from peachpy.x86_64 import *
 | |
| 
 | |
| 
 | |
| def fp16_alt_xmm_to_fp32_xmm(xmm_half):
 | |
| 	xmm_zero = XMMRegister()
 | |
| 	VPXOR(xmm_zero, xmm_zero, xmm_zero)
 | |
| 
 | |
| 	xmm_word = XMMRegister()
 | |
| 	VPUNPCKLWD(xmm_word, xmm_zero, xmm_half)
 | |
| 
 | |
| 	xmm_shl1_half = XMMRegister()
 | |
| 	VPADDW(xmm_shl1_half, xmm_half, xmm_half)
 | |
| 
 | |
| 	xmm_shl1_nonsign = XMMRegister()
 | |
| 	VPADDD(xmm_shl1_nonsign, xmm_word, xmm_word)
 | |
| 
 | |
| 	sign_mask = Constant.float32x4(-0.0)
 | |
| 
 | |
| 	xmm_sign = XMMRegister()
 | |
| 	VANDPS(xmm_sign, xmm_word, sign_mask)
 | |
| 
 | |
| 	xmm_shr3_nonsign = XMMRegister()
 | |
| 	VPSRLD(xmm_shr3_nonsign, xmm_shl1_nonsign, 4)
 | |
| 
 | |
| 	exp_offset = Constant.uint32x4(0x38000000)
 | |
| 
 | |
| 	xmm_norm_nonsign = XMMRegister()
 | |
| 	VPADDD(xmm_norm_nonsign, xmm_shr3_nonsign, exp_offset)
 | |
| 
 | |
| 	magic_mask = Constant.uint16x8(0x3E80)
 | |
| 	xmm_denorm_nonsign = XMMRegister()
 | |
| 	VPUNPCKLWD(xmm_denorm_nonsign, xmm_shl1_half, magic_mask)
 | |
| 
 | |
| 	magic_bias = Constant.float32x4(0.25)
 | |
| 	VSUBPS(xmm_denorm_nonsign, xmm_denorm_nonsign, magic_bias)
 | |
| 
 | |
| 	xmm_denorm_cutoff = XMMRegister()
 | |
| 	VMOVDQA(xmm_denorm_cutoff, Constant.uint32x4(0x00800000))
 | |
| 	
 | |
| 	xmm_denorm_mask = XMMRegister()
 | |
| 	VPCMPGTD(xmm_denorm_mask, xmm_denorm_cutoff, xmm_shr3_nonsign)
 | |
| 
 | |
| 	xmm_nonsign = XMMRegister()
 | |
| 	VBLENDVPS(xmm_nonsign, xmm_norm_nonsign, xmm_denorm_nonsign, xmm_denorm_mask)
 | |
| 
 | |
| 	xmm_float = XMMRegister()
 | |
| 	VORPS(xmm_float, xmm_nonsign, xmm_sign)
 | |
| 
 | |
| 	return xmm_float
 |