/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 */

#ifndef ART_RUNTIME_ARCH_ARM64_MEMCMP16_ARM64_S_
#define ART_RUNTIME_ARCH_ARM64_MEMCMP16_ARM64_S_

#include "asm_support_arm64.S"

/* Parameters and result.  */
#define src1        x0
#define src2        x1
#define limit       x2
#define result      x0
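
// For reference, the contract this routine appears to implement, written as a
// C prototype (inferred from the code below: the lsl #1 at entry treats limit
// as a count of 16-bit units, and result is an alias of x0):
//
//   int32_t __memcmp16(const uint16_t* src1,   // x0
//                      const uint16_t* src2,   // x1
//                      size_t count);          // x2, count of half-words
//
// Returns 0 when the buffers match, otherwise the signed difference of the
// first differing half-word pair.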

/* Internal variables.  */
#define data1       x3
#define data1w      w3
#define data2       x4
#define data2w      w4
#define has_nul     x5
#define diff        x6
#define endloop     x7
#define tmp1        x8
#define tmp2        x9
#define tmp3        x10
#define limit_wd    x12
#define mask        x13

// WARNING: If you change this code to use x14 and x15, you must also change
//          art_quick_string_compareto, which relies on these temps being unused.

ENTRY __memcmp16
  cbz     limit, .Lret0
  lsl     limit, limit, #1  /* Half-words to bytes.  */
  eor     tmp1, src1, src2
  tst     tmp1, #7
  b.ne    .Lmisaligned8
  ands    tmp1, src1, #7
  b.ne    .Lmutual_align
  add     limit_wd, limit, #7
  lsr     limit_wd, limit_wd, #3
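  /* limit_wd counts Dwords: (limit + 7) >> 3 == ceil(limit / 8), so a
     trailing partial Dword is processed as a whole one and its extra
     bytes are masked off after the loop (the "Limit % 8" fix-up).  */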
  /* Start of performance-critical section  -- one 64B cache line.  */
.Lloop_aligned:
  ldr     data1, [src1], #8
  ldr     data2, [src2], #8
.Lstart_realigned:
  subs    limit_wd, limit_wd, #1
  eor     diff, data1, data2  /* Non-zero if differences found.  */
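  /* The csinv below yields diff while Dwords remain (ne), and ~0 (all
     ones) once the counter hits zero; the single cbz therefore keeps
     looping only while the data still matches AND words remain.  */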
  csinv   endloop, diff, xzr, ne  /* Last Dword or differences.  */
  cbz     endloop, .Lloop_aligned
  /* End of performance-critical section  -- one 64B cache line.  */

  /* Limit not reached: we must have found a diff.  */
  cbnz    limit_wd, .Lnot_limit

  /* Limit % 8 == 0 => all bytes significant.  */
  ands    limit, limit, #7
  b.eq    .Lnot_limit

  lsl     limit, limit, #3  /* Bytes -> bits.  */
  mov     mask, #~0
  lsl     mask, mask, limit
  bic     data1, data1, mask
  bic     data2, data2, mask
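  /* Worked example: with 2 significant bytes remaining (limit == 2
     before the shift), mask == ~0 << 16 == 0xFFFFFFFFFFFF0000, and the
     bics keep only the low half-word of each Dword for the final
     comparison.  */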

.Lnot_limit:

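  /* Worked example (little-endian): if the lowest differing half-word
     sits at bits [16:31], the lowest non-zero bytes of diff are bytes
     2-3; rev moves them to bits [32:47], so clz lands in [16, 31], and
     clearing the low four bits rounds that down to 16: exactly the
     shift that brings the differing half-word down to bit 0.  */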
  // Swap the byte order of diff. An exact bit reverse is not needed; we only have to locate
  // the differing half-word.
  rev     diff, diff
  // The leading zero count of DIFF now locates the least significant differing byte of
  // DATA1/2.
  clz     diff, diff
  // Clear the low four bits to round down to a half-word boundary. Why does ARM64 not have
  // BIC with immediate?!?!
  bfi     diff, xzr, #0, #4
  // Create a 16-bit mask.
  mov     mask, #0xFFFF
  // Shift the differing half-word down to bit 0.
  lsr     data1, data1, diff
  lsr     data2, data2, diff
  // Mask the lowest half-word.
  and     data1, data1, mask
  and     data2, data2, mask
  // Compute the difference.
  sub     result, data1, data2
  ret

.Lmutual_align:
  /* Sources are mutually aligned, but are not currently at an
     alignment boundary.  Round down the addresses and then mask off
     the bytes that precede the start point.  */
  bic     src1, src1, #7
  bic     src2, src2, #7
  add     limit, limit, tmp1  /* Adjust the limit for the extra.  */
  lsl     tmp1, tmp1, #3    /* Bytes beyond alignment -> bits.  */
  ldr     data1, [src1], #8
  neg     tmp1, tmp1    /* Bits to alignment -64.  */
  ldr     data2, [src2], #8
  mov     tmp2, #~0
  /* Little-endian.  Early bytes are at LSB.  */
  lsr     tmp2, tmp2, tmp1  /* Shift (tmp1 & 63).  */
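  /* Example: sources 2 bytes past an 8-byte boundary give tmp1 == -16,
     so the shift is (-16 & 63) == 48 and tmp2 == ~0 >> 48 == 0xFFFF;
     or-ing that into both Dwords forces the two preceding bytes to
     compare equal.  */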
  add     limit_wd, limit, #7
  orr     data1, data1, tmp2
  orr     data2, data2, tmp2
  lsr     limit_wd, limit_wd, #3
  b       .Lstart_realigned

.Lret0:
  mov     result, #0
  ret

  .p2align 6
.Lmisaligned8:
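  /* Byte-misaligned case: fall back to a simple half-word-at-a-time
     loop.  Pre-decrementing limit by one makes the subs below borrow
     (carry clear) on exactly the iteration that loads the last pair;
     while carry is set, ccmp compares the half-words, and once it
     clears, ccmp forces NZCV to 0b0000 (Z clear) so b.eq falls through
     with data1/data2 holding the final pair.  */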
  sub     limit, limit, #1
1:
  /* Perhaps we can do better than this.  */
  ldrh    data1w, [src1], #2
  ldrh    data2w, [src2], #2
  subs    limit, limit, #2
  ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
  b.eq    1b
  sub     result, data1, data2
  ret
END __memcmp16

#endif  // ART_RUNTIME_ARCH_ARM64_MEMCMP16_ARM64_S_