/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "asm_support_x86_64.S"

#define MEMCMP  __memcmp16

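/*
 * __memcmp16 compares two arrays of unsigned 16-bit values (%rdi = s0,
 * %rsi = s1, %rdx = count in 16-bit units) and returns the difference of
 * the first pair of elements that differ, or 0 if the arrays are equal.
 * A rough C sketch of those semantics, for orientation only (the exact
 * prototype is defined by the caller, not by this file):
 *
 *   int32_t memcmp16_sketch(const uint16_t* s0, const uint16_t* s1,
 *                           size_t count) {
 *     for (size_t i = 0; i < count; ++i) {
 *       if (s0[i] != s1[i])
 *         return (int32_t) s0[i] - (int32_t) s1[i];
 *     }
 *     return 0;
 *   }
 */
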
/*
 * Half of the Silvermont L1 data cache size
 * (see the original cache.h in bionic/libc/arch-x86_64/).
 * This value is used to pick an optimization strategy for large lengths.
 */
#define DATA_CACHE_SIZE_HALF    (12*1024)

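/*
 * A note on how this constant is used below: the large-size paths compare
 * the byte count against DATA_CACHE_SIZE_HALF + DATA_CACHE_SIZE_HALF/2
 * (12 KiB * 1.5 = 18 KiB); anything larger is streamed through the
 * prefetchnta loops to reduce cache pollution.
 */
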
#ifndef L
# define L(label)    .L##label
#endif

#ifndef ALIGN
# define ALIGN(n)    .p2align n
#endif

#define JMPTBL(I, B)    (I - B)

#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)        \
  lea        TABLE(%rip), %r11;                \
  movslq    (%r11, INDEX, SCALE), %rcx;            \
  add        %r11, %rcx;                    \
  jmp        *%rcx;                        \
  ud2

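/*
 * The jump table below stores each target as a 32-bit offset from the table
 * base (see JMPTBL), which keeps the table position-independent: movslq
 * sign-extends the entry and the base is added back before the indirect
 * jump; the trailing ud2 traps if control ever falls through.  Callers
 * first advance %rsi/%rdi past the tail, so every L(Nbytes) target reads
 * its data at negative offsets from the end.  With SCALE = 2 and an even
 * byte count in INDEX, the selected entry is the INDEX/2-th 4-byte int,
 * covering tails of 0, 2, 4, ..., 78 bytes.
 */
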
DEFINE_FUNCTION MEMCMP
    pxor      %xmm0, %xmm0
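    /*
     * %xmm0 stays all-zero throughout.  For "ptest %xmm2, %xmm0", CF is
     * set iff %xmm2 AND NOT %xmm0 is all zero, i.e. iff %xmm2 itself is
     * zero; since %xmm2 always holds the XOR of two 16-byte chunks, every
     * "jnc" below means "a difference was found in this chunk".
     */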
    shl       $1, %rdx
    cmp       $79, %rdx
    ja        L(79bytesormore)
    add       %rdx, %rsi
    add       %rdx, %rdi
    BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)

    ALIGN (4)
L(79bytesormore):
    movdqu    (%rsi), %xmm1
    movdqu    (%rdi), %xmm2
    pxor      %xmm1, %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(16bytesin256)
    mov       %rsi, %rcx
    and       $-16, %rsi
    add       $16, %rsi
    sub       %rsi, %rcx

    sub       %rcx, %rdi
    add       %rcx, %rdx
    test      $0xf, %rdi
    jz        L(2aligned)

    cmp       $128, %rdx
    ja        L(128bytesormore)
L(less128bytes):
    sub       $64, %rdx

    movdqu    (%rdi), %xmm2
    pxor      (%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(16bytesin256)

    movdqu    16(%rdi), %xmm2
    pxor      16(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(32bytesin256)

    movdqu    32(%rdi), %xmm2
    pxor      32(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(48bytesin256)

    movdqu    48(%rdi), %xmm2
    pxor      48(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(64bytesin256)
    cmp       $32, %rdx
    jb        L(less32bytesin64)

    movdqu    64(%rdi), %xmm2
    pxor      64(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(80bytesin256)

    movdqu    80(%rdi), %xmm2
    pxor      80(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(96bytesin256)
    sub       $32, %rdx
    add       $32, %rdi
    add       $32, %rsi
L(less32bytesin64):
    add       $64, %rdi
    add       $64, %rsi
    add       %rdx, %rsi
    add       %rdx, %rdi
    BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)

L(128bytesormore):
    cmp       $512, %rdx
    ja        L(512bytesormore)
    cmp       $256, %rdx
    ja        L(less512bytes)
L(less256bytes):
    sub       $128, %rdx

    movdqu    (%rdi), %xmm2
    pxor      (%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(16bytesin256)

    movdqu    16(%rdi), %xmm2
    pxor      16(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(32bytesin256)

    movdqu    32(%rdi), %xmm2
    pxor      32(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(48bytesin256)

    movdqu    48(%rdi), %xmm2
    pxor      48(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(64bytesin256)

    movdqu    64(%rdi), %xmm2
    pxor      64(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(80bytesin256)

    movdqu    80(%rdi), %xmm2
    pxor      80(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(96bytesin256)

    movdqu    96(%rdi), %xmm2
    pxor      96(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(112bytesin256)

    movdqu    112(%rdi), %xmm2
    pxor      112(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(128bytesin256)

    add       $128, %rsi
    add       $128, %rdi

    cmp       $64, %rdx
    jae       L(less128bytes)

    cmp       $32, %rdx
    jb        L(less32bytesin128)

    movdqu    (%rdi), %xmm2
    pxor      (%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(16bytesin256)

    movdqu    16(%rdi), %xmm2
    pxor      16(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(32bytesin256)
    sub       $32, %rdx
    add       $32, %rdi
    add       $32, %rsi
L(less32bytesin128):
    add       %rdx, %rsi
    add       %rdx, %rdi
    BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)

L(less512bytes):
    sub       $256, %rdx
    movdqu    (%rdi), %xmm2
    pxor      (%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(16bytesin256)

    movdqu    16(%rdi), %xmm2
    pxor      16(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(32bytesin256)

    movdqu    32(%rdi), %xmm2
    pxor      32(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(48bytesin256)

    movdqu    48(%rdi), %xmm2
    pxor      48(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(64bytesin256)

    movdqu    64(%rdi), %xmm2
    pxor      64(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(80bytesin256)

    movdqu    80(%rdi), %xmm2
    pxor      80(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(96bytesin256)

    movdqu    96(%rdi), %xmm2
    pxor      96(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(112bytesin256)

    movdqu    112(%rdi), %xmm2
    pxor      112(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(128bytesin256)

    movdqu    128(%rdi), %xmm2
    pxor      128(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(144bytesin256)

    movdqu    144(%rdi), %xmm2
    pxor      144(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(160bytesin256)

    movdqu    160(%rdi), %xmm2
    pxor      160(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(176bytesin256)

    movdqu    176(%rdi), %xmm2
    pxor      176(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(192bytesin256)

    movdqu    192(%rdi), %xmm2
    pxor      192(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(208bytesin256)

    movdqu    208(%rdi), %xmm2
    pxor      208(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(224bytesin256)

    movdqu    224(%rdi), %xmm2
    pxor      224(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(240bytesin256)

    movdqu    240(%rdi), %xmm2
    pxor      240(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(256bytesin256)

    add       $256, %rsi
    add       $256, %rdi

    cmp       $128, %rdx
    jae       L(less256bytes)

    cmp       $64, %rdx
    jae       L(less128bytes)

    cmp       $32, %rdx
    jb        L(less32bytesin256)

    movdqu    (%rdi), %xmm2
    pxor      (%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(16bytesin256)

    movdqu    16(%rdi), %xmm2
    pxor      16(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(32bytesin256)
    sub       $32, %rdx
    add       $32, %rdi
    add       $32, %rsi
L(less32bytesin256):
    add       %rdx, %rsi
    add       %rdx, %rdi
    BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)

    ALIGN (4)
L(512bytesormore):
#ifdef DATA_CACHE_SIZE_HALF
    mov       $DATA_CACHE_SIZE_HALF, %r8
#else
    mov       __x86_64_data_cache_size_half(%rip), %r8
#endif
    mov       %r8, %r9
    shr       $1, %r8
    add       %r9, %r8
    cmp       %r8, %rdx
    ja        L(L2_L3_cache_unaligned)
    sub       $64, %rdx
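    /*
     * The loop below XORs four 16-byte chunks and ORs the results into
     * %xmm1, so one ptest per 64 bytes detects whether any chunk differed;
     * L(64bytesormore_loop_end) then re-tests the saved %xmm2/%xmm3/%xmm4
     * to locate the first differing chunk.
     */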
    ALIGN (4)
L(64bytesormore_loop):
    movdqu    (%rdi), %xmm2
    pxor      (%rsi), %xmm2
    movdqa    %xmm2, %xmm1

    movdqu    16(%rdi), %xmm3
    pxor      16(%rsi), %xmm3
    por       %xmm3, %xmm1

    movdqu    32(%rdi), %xmm4
    pxor      32(%rsi), %xmm4
    por       %xmm4, %xmm1

    movdqu    48(%rdi), %xmm5
    pxor      48(%rsi), %xmm5
    por       %xmm5, %xmm1

    ptest     %xmm1, %xmm0
    jnc       L(64bytesormore_loop_end)
    add       $64, %rsi
    add       $64, %rdi
    sub       $64, %rdx
    jae       L(64bytesormore_loop)

    add       $64, %rdx
    add       %rdx, %rsi
    add       %rdx, %rdi
    BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)

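/*
 * Inputs larger than 1.5x DATA_CACHE_SIZE_HALF take this variant, which
 * adds prefetchnta 0x1c0 bytes ahead of both cursors: the non-temporal
 * hint pulls the lines in without displacing the existing working set.
 */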
L(L2_L3_cache_unaligned):
    sub       $64, %rdx
    ALIGN (4)
L(L2_L3_unaligned_128bytes_loop):
    prefetchnta 0x1c0(%rdi)
    prefetchnta 0x1c0(%rsi)
    movdqu    (%rdi), %xmm2
    pxor      (%rsi), %xmm2
    movdqa    %xmm2, %xmm1

    movdqu    16(%rdi), %xmm3
    pxor      16(%rsi), %xmm3
    por       %xmm3, %xmm1

    movdqu    32(%rdi), %xmm4
    pxor      32(%rsi), %xmm4
    por       %xmm4, %xmm1

    movdqu    48(%rdi), %xmm5
    pxor      48(%rsi), %xmm5
    por       %xmm5, %xmm1

    ptest     %xmm1, %xmm0
    jnc       L(64bytesormore_loop_end)
    add       $64, %rsi
    add       $64, %rdi
    sub       $64, %rdx
    jae       L(L2_L3_unaligned_128bytes_loop)

    add       $64, %rdx
    add       %rdx, %rsi
    add       %rdx, %rdi
    BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)

/*
 * This case is for machines which are sensitive to unaligned accesses.
 */
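/*
 * %rsi was rounded up to a 16-byte boundary above and %rdi was advanced by
 * the same amount; when %rdi also ends up 16-byte aligned, the blocks below
 * can use aligned movdqa loads (and aligned pxor memory operands) instead
 * of movdqu.
 */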
    ALIGN (4)
L(2aligned):
    cmp       $128, %rdx
    ja        L(128bytesormorein2aligned)
L(less128bytesin2aligned):
    sub       $64, %rdx

    movdqa    (%rdi), %xmm2
    pxor      (%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(16bytesin256)

    movdqa    16(%rdi), %xmm2
    pxor      16(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(32bytesin256)

    movdqa    32(%rdi), %xmm2
    pxor      32(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(48bytesin256)

    movdqa    48(%rdi), %xmm2
    pxor      48(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(64bytesin256)
    cmp       $32, %rdx
    jb        L(less32bytesin64in2aligned)

    movdqa    64(%rdi), %xmm2
    pxor      64(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(80bytesin256)

    movdqa    80(%rdi), %xmm2
    pxor      80(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(96bytesin256)
    sub       $32, %rdx
    add       $32, %rdi
    add       $32, %rsi
L(less32bytesin64in2aligned):
    add       $64, %rdi
    add       $64, %rsi
    add       %rdx, %rsi
    add       %rdx, %rdi
    BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)

    ALIGN (4)
L(128bytesormorein2aligned):
    cmp       $512, %rdx
    ja        L(512bytesormorein2aligned)
    cmp       $256, %rdx
    ja        L(256bytesormorein2aligned)
L(less256bytesin2aligned):
    sub       $128, %rdx

    movdqa    (%rdi), %xmm2
    pxor      (%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(16bytesin256)

    movdqa    16(%rdi), %xmm2
    pxor      16(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(32bytesin256)

    movdqa    32(%rdi), %xmm2
    pxor      32(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(48bytesin256)

    movdqa    48(%rdi), %xmm2
    pxor      48(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(64bytesin256)

    movdqa    64(%rdi), %xmm2
    pxor      64(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(80bytesin256)

    movdqa    80(%rdi), %xmm2
    pxor      80(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(96bytesin256)

    movdqa    96(%rdi), %xmm2
    pxor      96(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(112bytesin256)

    movdqa    112(%rdi), %xmm2
    pxor      112(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(128bytesin256)

    add       $128, %rsi
    add       $128, %rdi

    cmp       $64, %rdx
    jae       L(less128bytesin2aligned)

    cmp       $32, %rdx
    jb        L(less32bytesin128in2aligned)

    movdqu    (%rdi), %xmm2
    pxor      (%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(16bytesin256)

    movdqu    16(%rdi), %xmm2
    pxor      16(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(32bytesin256)
    sub       $32, %rdx
    add       $32, %rdi
    add       $32, %rsi
L(less32bytesin128in2aligned):
    add       %rdx, %rsi
    add       %rdx, %rdi
    BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)

    ALIGN (4)
L(256bytesormorein2aligned):

    sub       $256, %rdx
    movdqa    (%rdi), %xmm2
    pxor      (%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(16bytesin256)

    movdqa    16(%rdi), %xmm2
    pxor      16(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(32bytesin256)

    movdqa    32(%rdi), %xmm2
    pxor      32(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(48bytesin256)

    movdqa    48(%rdi), %xmm2
    pxor      48(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(64bytesin256)

    movdqa    64(%rdi), %xmm2
    pxor      64(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(80bytesin256)

    movdqa    80(%rdi), %xmm2
    pxor      80(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(96bytesin256)

    movdqa    96(%rdi), %xmm2
    pxor      96(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(112bytesin256)

    movdqa    112(%rdi), %xmm2
    pxor      112(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(128bytesin256)

    movdqa    128(%rdi), %xmm2
    pxor      128(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(144bytesin256)

    movdqa    144(%rdi), %xmm2
    pxor      144(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(160bytesin256)

    movdqa    160(%rdi), %xmm2
    pxor      160(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(176bytesin256)

    movdqa    176(%rdi), %xmm2
    pxor      176(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(192bytesin256)

    movdqa    192(%rdi), %xmm2
    pxor      192(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(208bytesin256)

    movdqa    208(%rdi), %xmm2
    pxor      208(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(224bytesin256)

    movdqa    224(%rdi), %xmm2
    pxor      224(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(240bytesin256)

    movdqa    240(%rdi), %xmm2
    pxor      240(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(256bytesin256)

    add       $256, %rsi
    add       $256, %rdi

    cmp       $128, %rdx
    jae       L(less256bytesin2aligned)

    cmp       $64, %rdx
    jae       L(less128bytesin2aligned)

    cmp       $32, %rdx
    jb        L(less32bytesin256in2aligned)

    movdqa    (%rdi), %xmm2
    pxor      (%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(16bytesin256)

    movdqa    16(%rdi), %xmm2
    pxor      16(%rsi), %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(32bytesin256)
    sub       $32, %rdx
    add       $32, %rdi
    add       $32, %rsi
L(less32bytesin256in2aligned):
    add       %rdx, %rsi
    add       %rdx, %rdi
    BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)

    ALIGN (4)
L(512bytesormorein2aligned):
#ifdef DATA_CACHE_SIZE_HALF
    mov       $DATA_CACHE_SIZE_HALF, %r8
#else
    mov       __x86_64_data_cache_size_half(%rip), %r8
#endif
    mov       %r8, %r9
    shr       $1, %r8
    add       %r9, %r8
    cmp       %r8, %rdx
    ja        L(L2_L3_cache_aligned)

    sub       $64, %rdx
    ALIGN (4)
L(64bytesormore_loopin2aligned):
    movdqa    (%rdi), %xmm2
    pxor      (%rsi), %xmm2
    movdqa    %xmm2, %xmm1

    movdqa    16(%rdi), %xmm3
    pxor      16(%rsi), %xmm3
    por       %xmm3, %xmm1

    movdqa    32(%rdi), %xmm4
    pxor      32(%rsi), %xmm4
    por       %xmm4, %xmm1

    movdqa    48(%rdi), %xmm5
    pxor      48(%rsi), %xmm5
    por       %xmm5, %xmm1

    ptest     %xmm1, %xmm0
    jnc       L(64bytesormore_loop_end)
    add       $64, %rsi
    add       $64, %rdi
    sub       $64, %rdx
    jae       L(64bytesormore_loopin2aligned)

    add       $64, %rdx
    add       %rdx, %rsi
    add       %rdx, %rdi
    BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)
L(L2_L3_cache_aligned):
    sub       $64, %rdx
    ALIGN (4)
L(L2_L3_aligned_128bytes_loop):
    prefetchnta 0x1c0(%rdi)
    prefetchnta 0x1c0(%rsi)
    movdqa    (%rdi), %xmm2
    pxor      (%rsi), %xmm2
    movdqa    %xmm2, %xmm1

    movdqa    16(%rdi), %xmm3
    pxor      16(%rsi), %xmm3
    por       %xmm3, %xmm1

    movdqa    32(%rdi), %xmm4
    pxor      32(%rsi), %xmm4
    por       %xmm4, %xmm1

    movdqa    48(%rdi), %xmm5
    pxor      48(%rsi), %xmm5
    por       %xmm5, %xmm1

    ptest     %xmm1, %xmm0
    jnc       L(64bytesormore_loop_end)
    add       $64, %rsi
    add       $64, %rdi
    sub       $64, %rdx
    jae       L(L2_L3_aligned_128bytes_loop)

    add       $64, %rdx
    add       %rdx, %rsi
    add       %rdx, %rdi
    BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)


    ALIGN (4)
L(64bytesormore_loop_end):
    add       $16, %rdi
    add       $16, %rsi
    ptest     %xmm2, %xmm0
    jnc       L(16bytes)

    add       $16, %rdi
    add       $16, %rsi
    ptest     %xmm3, %xmm0
    jnc       L(16bytes)

    add       $16, %rdi
    add       $16, %rsi
    ptest     %xmm4, %xmm0
    jnc       L(16bytes)

    add       $16, %rdi
    add       $16, %rsi
    jmp       L(16bytes)

L(256bytesin256):
    add       $256, %rdi
    add       $256, %rsi
    jmp       L(16bytes)
L(240bytesin256):
    add       $240, %rdi
    add       $240, %rsi
    jmp       L(16bytes)
L(224bytesin256):
    add       $224, %rdi
    add       $224, %rsi
    jmp       L(16bytes)
L(208bytesin256):
    add       $208, %rdi
    add       $208, %rsi
    jmp       L(16bytes)
L(192bytesin256):
    add       $192, %rdi
    add       $192, %rsi
    jmp       L(16bytes)
L(176bytesin256):
    add       $176, %rdi
    add       $176, %rsi
    jmp       L(16bytes)
L(160bytesin256):
    add       $160, %rdi
    add       $160, %rsi
    jmp       L(16bytes)
L(144bytesin256):
    add       $144, %rdi
    add       $144, %rsi
    jmp       L(16bytes)
L(128bytesin256):
    add       $128, %rdi
    add       $128, %rsi
    jmp       L(16bytes)
L(112bytesin256):
    add       $112, %rdi
    add       $112, %rsi
    jmp       L(16bytes)
L(96bytesin256):
    add       $96, %rdi
    add       $96, %rsi
    jmp       L(16bytes)
L(80bytesin256):
    add       $80, %rdi
    add       $80, %rsi
    jmp       L(16bytes)
L(64bytesin256):
    add       $64, %rdi
    add       $64, %rsi
    jmp       L(16bytes)
L(48bytesin256):
    add       $16, %rdi
    add       $16, %rsi
L(32bytesin256):
    add       $16, %rdi
    add       $16, %rsi
L(16bytesin256):
    add       $16, %rdi
    add       $16, %rsi
L(16bytes):
    mov       -16(%rdi), %rax
    mov       -16(%rsi), %rcx
    cmp       %rax, %rcx
    jne       L(diffin8bytes)
L(8bytes):
    mov       -8(%rdi), %rax
    mov       -8(%rsi), %rcx
    cmp       %rax, %rcx
    jne       L(diffin8bytes)
    xor       %eax, %eax
    ret

    ALIGN (4)
L(12bytes):
    mov       -12(%rdi), %rax
    mov       -12(%rsi), %rcx
    cmp       %rax, %rcx
    jne       L(diffin8bytes)
L(4bytes):
    mov       -4(%rsi), %ecx
    mov       -4(%rdi), %eax
    cmp       %eax, %ecx
    jne       L(diffin4bytes)
L(0bytes):
    xor       %eax, %eax
    ret

    ALIGN (4)
L(66bytes):
    movdqu    -66(%rdi), %xmm1
    movdqu    -66(%rsi), %xmm2
    mov       $-66, %dl
    pxor      %xmm1, %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(less16bytes)
L(50bytes):
    movdqu    -50(%rdi), %xmm1
    movdqu    -50(%rsi), %xmm2
    mov       $-50, %dl
    pxor      %xmm1, %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(less16bytes)
L(34bytes):
    movdqu    -34(%rdi), %xmm1
    movdqu    -34(%rsi), %xmm2
    mov       $-34, %dl
    pxor      %xmm1, %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(less16bytes)
L(18bytes):
    mov       -18(%rdi), %rax
    mov       -18(%rsi), %rcx
    cmp       %rax, %rcx
    jne       L(diffin8bytes)
L(10bytes):
    mov       -10(%rdi), %rax
    mov       -10(%rsi), %rcx
    cmp       %rax, %rcx
    jne       L(diffin8bytes)
    movzwl    -2(%rdi), %eax
    movzwl    -2(%rsi), %ecx
    cmp       %cl, %al
    jne       L(end)
    and       $0xffff, %eax
    and       $0xffff, %ecx
    sub       %ecx, %eax
    ret

    ALIGN (4)
L(14bytes):
    mov       -14(%rdi), %rax
    mov       -14(%rsi), %rcx
    cmp       %rax, %rcx
    jne       L(diffin8bytes)
    mov       -8(%rdi), %rax
    mov       -8(%rsi), %rcx
    cmp       %rax, %rcx
    jne       L(diffin8bytes)
    xor       %eax, %eax
    ret

    ALIGN (4)
L(6bytes):
    mov       -6(%rdi), %eax
    mov       -6(%rsi), %ecx
    cmp       %eax, %ecx
    jne       L(diffin4bytes)
L(2bytes):
    movzwl    -2(%rsi), %ecx
    movzwl    -2(%rdi), %eax
    cmp       %cl, %al
    jne       L(end)
    and       $0xffff, %eax
    and       $0xffff, %ecx
    sub       %ecx, %eax
    ret

    ALIGN (4)
L(68bytes):
    movdqu    -68(%rdi), %xmm2
    movdqu    -68(%rsi), %xmm1
    mov       $-68, %dl
    pxor      %xmm1, %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(less16bytes)
L(52bytes):
    movdqu    -52(%rdi), %xmm2
    movdqu    -52(%rsi), %xmm1
    mov       $-52, %dl
    pxor      %xmm1, %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(less16bytes)
L(36bytes):
    movdqu    -36(%rdi), %xmm2
    movdqu    -36(%rsi), %xmm1
    mov       $-36, %dl
    pxor      %xmm1, %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(less16bytes)
L(20bytes):
    movdqu    -20(%rdi), %xmm2
    movdqu    -20(%rsi), %xmm1
    mov       $-20, %dl
    pxor      %xmm1, %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(less16bytes)
    mov       -4(%rdi), %eax
    mov       -4(%rsi), %ecx
    cmp       %eax, %ecx
    jne       L(diffin4bytes)
    xor       %eax, %eax
    ret

    ALIGN (4)
L(70bytes):
    movdqu    -70(%rsi), %xmm1
    movdqu    -70(%rdi), %xmm2
    mov       $-70, %dl
    pxor      %xmm1, %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(less16bytes)
L(54bytes):
    movdqu    -54(%rsi), %xmm1
    movdqu    -54(%rdi), %xmm2
    mov       $-54, %dl
    pxor      %xmm1, %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(less16bytes)
L(38bytes):
    movdqu    -38(%rsi), %xmm1
    movdqu    -38(%rdi), %xmm2
    mov       $-38, %dl
    pxor      %xmm1, %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(less16bytes)
L(22bytes):
    movdqu    -22(%rsi), %xmm1
    movdqu    -22(%rdi), %xmm2
    mov       $-22, %dl
    pxor      %xmm1, %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(less16bytes)
    mov       -8(%rdi), %rax
    mov       -8(%rsi), %rcx
    cmp       %rax, %rcx
    jne       L(diffin8bytes)
    xor       %eax, %eax
    ret

    ALIGN (4)
L(72bytes):
    movdqu    -72(%rsi), %xmm1
    movdqu    -72(%rdi), %xmm2
    mov       $-72, %dl
    pxor      %xmm1, %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(less16bytes)
L(56bytes):
    movdqu    -56(%rdi), %xmm2
    movdqu    -56(%rsi), %xmm1
    mov       $-56, %dl
    pxor      %xmm1, %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(less16bytes)
L(40bytes):
    movdqu    -40(%rdi), %xmm2
    movdqu    -40(%rsi), %xmm1
    mov       $-40, %dl
    pxor      %xmm1, %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(less16bytes)
L(24bytes):
    movdqu    -24(%rdi), %xmm2
    movdqu    -24(%rsi), %xmm1
    mov       $-24, %dl
    pxor      %xmm1, %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(less16bytes)
    mov       -8(%rdi), %rax
    mov       -8(%rsi), %rcx
    cmp       %rax, %rcx
    jne       L(diffin8bytes)
    xor       %eax, %eax
    ret

    ALIGN (4)
L(74bytes):
    movdqu    -74(%rsi), %xmm1
    movdqu    -74(%rdi), %xmm2
    mov       $-74, %dl
    pxor      %xmm1, %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(less16bytes)
L(58bytes):
    movdqu    -58(%rdi), %xmm2
    movdqu    -58(%rsi), %xmm1
    mov       $-58, %dl
    pxor      %xmm1, %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(less16bytes)
L(42bytes):
    movdqu    -42(%rdi), %xmm2
    movdqu    -42(%rsi), %xmm1
    mov       $-42, %dl
    pxor      %xmm1, %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(less16bytes)
L(26bytes):
    movdqu    -26(%rdi), %xmm2
    movdqu    -26(%rsi), %xmm1
    mov       $-26, %dl
    pxor      %xmm1, %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(less16bytes)
    mov       -10(%rdi), %rax
    mov       -10(%rsi), %rcx
    cmp       %rax, %rcx
    jne       L(diffin8bytes)
    movzwl    -2(%rdi), %eax
    movzwl    -2(%rsi), %ecx
    jmp       L(end)

    ALIGN (4)
L(76bytes):
    movdqu    -76(%rsi), %xmm1
    movdqu    -76(%rdi), %xmm2
    mov       $-76, %dl
    pxor      %xmm1, %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(less16bytes)
L(60bytes):
    movdqu    -60(%rdi), %xmm2
    movdqu    -60(%rsi), %xmm1
    mov       $-60, %dl
    pxor      %xmm1, %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(less16bytes)
L(44bytes):
    movdqu    -44(%rdi), %xmm2
    movdqu    -44(%rsi), %xmm1
    mov       $-44, %dl
    pxor      %xmm1, %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(less16bytes)
L(28bytes):
    movdqu    -28(%rdi), %xmm2
    movdqu    -28(%rsi), %xmm1
    mov       $-28, %dl
    pxor      %xmm1, %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(less16bytes)
    mov       -12(%rdi), %rax
    mov       -12(%rsi), %rcx
    cmp       %rax, %rcx
    jne       L(diffin8bytes)
    mov       -4(%rdi), %eax
    mov       -4(%rsi), %ecx
    cmp       %eax, %ecx
    jne       L(diffin4bytes)
    xor       %eax, %eax
    ret

    ALIGN (4)
L(78bytes):
    movdqu    -78(%rsi), %xmm1
    movdqu    -78(%rdi), %xmm2
    mov       $-78, %dl
    pxor      %xmm1, %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(less16bytes)
L(62bytes):
    movdqu    -62(%rdi), %xmm2
    movdqu    -62(%rsi), %xmm1
    mov       $-62, %dl
    pxor      %xmm1, %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(less16bytes)
L(46bytes):
    movdqu    -46(%rdi), %xmm2
    movdqu    -46(%rsi), %xmm1
    mov       $-46, %dl
    pxor      %xmm1, %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(less16bytes)
L(30bytes):
    movdqu    -30(%rdi), %xmm2
    movdqu    -30(%rsi), %xmm1
    mov       $-30, %dl
    pxor      %xmm1, %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(less16bytes)
    mov       -14(%rdi), %rax
    mov       -14(%rsi), %rcx
    cmp       %rax, %rcx
    jne       L(diffin8bytes)
    mov       -8(%rdi), %rax
    mov       -8(%rsi), %rcx
    cmp       %rax, %rcx
    jne       L(diffin8bytes)
    xor       %eax, %eax
    ret

    ALIGN (4)
L(64bytes):
    movdqu    -64(%rdi), %xmm2
    movdqu    -64(%rsi), %xmm1
    mov       $-64, %dl
    pxor      %xmm1, %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(less16bytes)
L(48bytes):
    movdqu    -48(%rdi), %xmm2
    movdqu    -48(%rsi), %xmm1
    mov       $-48, %dl
    pxor      %xmm1, %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(less16bytes)
L(32bytes):
    movdqu    -32(%rdi), %xmm2
    movdqu    -32(%rsi), %xmm1
    mov       $-32, %dl
    pxor      %xmm1, %xmm2
    ptest     %xmm2, %xmm0
    jnc       L(less16bytes)

    mov       -16(%rdi), %rax
    mov       -16(%rsi), %rcx
    cmp       %rax, %rcx
    jne       L(diffin8bytes)

    mov       -8(%rdi), %rax
    mov       -8(%rsi), %rcx
    cmp       %rax, %rcx
    jne       L(diffin8bytes)
    xor       %eax, %eax
    ret

/*
 * Aligned to 8 bytes to avoid two taken branches in one 16-byte-aligned
 * code block.
 */
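/*
 * L(less16bytes) receives the chunk offset as a negative byte count in %dl
 * (set by the L(Nbytes) entries above); movsbq sign-extends it so the
 * differing 16-byte chunk can be reloaded as two 8-byte words relative to
 * the end pointers.  The loads are little-endian, so comparing the low 32
 * bits and then the low 16 bits walks toward the first differing 16-bit
 * element; L(end) returns the arithmetic difference of those two elements.
 */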
    ALIGN (3)
L(less16bytes):
    movsbq    %dl, %rdx
    mov       (%rsi, %rdx), %rcx
    mov       (%rdi, %rdx), %rax
    cmp       %rax, %rcx
    jne       L(diffin8bytes)
    mov       8(%rsi, %rdx), %rcx
    mov       8(%rdi, %rdx), %rax
L(diffin8bytes):
    cmp       %eax, %ecx
    jne       L(diffin4bytes)
    shr       $32, %rcx
    shr       $32, %rax
L(diffin4bytes):
    cmp       %cx, %ax
    jne       L(end)
    shr       $16, %ecx
    shr       $16, %eax
    jmp       L(end)

    ALIGN (4)
L(end):
    and       $0xffff, %eax
    and       $0xffff, %ecx
    sub       %ecx, %eax
    ret

END_FUNCTION MEMCMP

    ALIGN (3)
L(table_64bytes):
    .int    JMPTBL (L(0bytes), L(table_64bytes))
    .int    JMPTBL (L(2bytes), L(table_64bytes))
    .int    JMPTBL (L(4bytes), L(table_64bytes))
    .int    JMPTBL (L(6bytes), L(table_64bytes))
    .int    JMPTBL (L(8bytes), L(table_64bytes))
    .int    JMPTBL (L(10bytes), L(table_64bytes))
    .int    JMPTBL (L(12bytes), L(table_64bytes))
    .int    JMPTBL (L(14bytes), L(table_64bytes))
    .int    JMPTBL (L(16bytes), L(table_64bytes))
    .int    JMPTBL (L(18bytes), L(table_64bytes))
    .int    JMPTBL (L(20bytes), L(table_64bytes))
    .int    JMPTBL (L(22bytes), L(table_64bytes))
    .int    JMPTBL (L(24bytes), L(table_64bytes))
    .int    JMPTBL (L(26bytes), L(table_64bytes))
    .int    JMPTBL (L(28bytes), L(table_64bytes))
    .int    JMPTBL (L(30bytes), L(table_64bytes))
    .int    JMPTBL (L(32bytes), L(table_64bytes))
    .int    JMPTBL (L(34bytes), L(table_64bytes))
    .int    JMPTBL (L(36bytes), L(table_64bytes))
    .int    JMPTBL (L(38bytes), L(table_64bytes))
    .int    JMPTBL (L(40bytes), L(table_64bytes))
    .int    JMPTBL (L(42bytes), L(table_64bytes))
    .int    JMPTBL (L(44bytes), L(table_64bytes))
    .int    JMPTBL (L(46bytes), L(table_64bytes))
    .int    JMPTBL (L(48bytes), L(table_64bytes))
    .int    JMPTBL (L(50bytes), L(table_64bytes))
    .int    JMPTBL (L(52bytes), L(table_64bytes))
    .int    JMPTBL (L(54bytes), L(table_64bytes))
    .int    JMPTBL (L(56bytes), L(table_64bytes))
    .int    JMPTBL (L(58bytes), L(table_64bytes))
    .int    JMPTBL (L(60bytes), L(table_64bytes))
    .int    JMPTBL (L(62bytes), L(table_64bytes))
    .int    JMPTBL (L(64bytes), L(table_64bytes))
    .int    JMPTBL (L(66bytes), L(table_64bytes))
    .int    JMPTBL (L(68bytes), L(table_64bytes))
    .int    JMPTBL (L(70bytes), L(table_64bytes))
    .int    JMPTBL (L(72bytes), L(table_64bytes))
    .int    JMPTBL (L(74bytes), L(table_64bytes))
    .int    JMPTBL (L(76bytes), L(table_64bytes))
    .int    JMPTBL (L(78bytes), L(table_64bytes))