mirror of
https://github.com/intel/isa-l.git
synced 2025-01-05 22:59:52 +01:00
1187583a97
- It should be fine to enable pmull always on Apple Silicon - macOS 12+ is required for PMULL instruction. - Changed the conditional macro to __APPLE__ - Rewritten dispatcher using sysctlbyname - Use __USER_LABEL_PREFIX__ - Use __TEXT,__const as readonly section - use ASM_DEF_RODATA macro - fix func decl Change-Id: I800593f21085d8187b480c8bb3ab2bd70c4a6974 Signed-off-by: Taiju Yamada <tyamada@bi.a.u-tokyo.ac.jp>
248 lines
5.4 KiB
ArmAsm
248 lines
5.4 KiB
ArmAsm
########################################################################
|
|
# Copyright(c) 2019 Arm Corporation All rights reserved.
|
|
#
|
|
# Redistribution and use in source and binary forms, with or without
|
|
# modification, are permitted provided that the following conditions
|
|
# are met:
|
|
# * Redistributions of source code must retain the above copyright
|
|
# notice, this list of conditions and the following disclaimer.
|
|
# * Redistributions in binary form must reproduce the above copyright
|
|
# notice, this list of conditions and the following disclaimer in
|
|
# the documentation and/or other materials provided with the
|
|
# distribution.
|
|
# * Neither the name of Arm Corporation nor the names of its
|
|
# contributors may be used to endorse or promote products derived
|
|
# from this software without specific prior written permission.
|
|
#
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
#########################################################################
|
|
|
|
#include "../include/aarch64_label.h"
|
|
|
|
.text
.arch armv8-a

/*
 * int mem_zero_detect_neon(void *buf, size_t n)
 *
 * Scans n bytes starting at buf and reports whether any byte is non-zero.
 *
 * ABI:   AAPCS64, leaf function, no stack usage
 * In:    x0 = buf
 *        x1 = n (byte count, 0 is accepted)
 * Out:   w0 = 0           every byte is zero (also the result for n == 0)
 *        w0 = 0xffffffff  a non-zero byte was found (-1 when read as int)
 * Clobb: x0-x9, v0-v7, v16-v31, condition flags
 *
 * v8-v15 (callee-saved under AAPCS64) are not used, so nothing has to be
 * saved or restored.
 *
 * The buffer is consumed in progressively smaller steps:
 *   384-byte blocks -> 128-byte blocks -> 64-byte blocks -> 8-byte words
 *   -> a final tail of 0..7 bytes.
 * A step is only entered while at least that many bytes remain, so every
 * load stays inside [buf, buf + n).
 *
 * cdecl() comes from aarch64_label.h (presumably it applies the platform
 * symbol prefix; confirm in that header).
 */

.global cdecl(mem_zero_detect_neon)
#ifndef __APPLE__
.type mem_zero_detect_neon, %function
#endif

cdecl(mem_zero_detect_neon):
        cmp     x1, #(16*24-1)
        b.ls    .Lblk384_done           // fewer than 384 bytes: skip this stage

/* ---- 24 x 16-byte vectors per iteration ------------------------------ */
.Lblk384:
        ldr     q0,  [x0]
        ldr     q1,  [x0, #16]
        ldr     q2,  [x0, #(16*2)]
        ldr     q3,  [x0, #(16*3)]
        ldr     q4,  [x0, #(16*4)]
        ldr     q5,  [x0, #(16*5)]
        ldr     q6,  [x0, #(16*6)]
        ldr     q7,  [x0, #(16*7)]
        ldr     q16, [x0, #(16*8)]
        ldr     q17, [x0, #(16*9)]
        ldr     q18, [x0, #(16*10)]
        ldr     q19, [x0, #(16*11)]
        ldr     q20, [x0, #(16*12)]
        ldr     q21, [x0, #(16*13)]
        ldr     q22, [x0, #(16*14)]
        ldr     q23, [x0, #(16*15)]
        ldr     q24, [x0, #(16*16)]
        ldr     q25, [x0, #(16*17)]
        ldr     q26, [x0, #(16*18)]
        ldr     q27, [x0, #(16*19)]
        ldr     q28, [x0, #(16*20)]
        ldr     q29, [x0, #(16*21)]
        ldr     q30, [x0, #(16*22)]
        ldr     q31, [x0, #(16*23)]

        add     x0, x0, #(16*24)        // buf += 384

        // Pairwise OR reduction: 24 -> 12 -> 6 -> 3 -> 1 vector (v0).
        orr     v0.16b,  v0.16b,  v1.16b
        orr     v2.16b,  v2.16b,  v3.16b
        orr     v4.16b,  v4.16b,  v5.16b
        orr     v6.16b,  v6.16b,  v7.16b
        orr     v16.16b, v16.16b, v17.16b
        orr     v18.16b, v18.16b, v19.16b
        orr     v20.16b, v20.16b, v21.16b
        orr     v22.16b, v22.16b, v23.16b
        orr     v24.16b, v24.16b, v25.16b
        orr     v26.16b, v26.16b, v27.16b
        orr     v28.16b, v28.16b, v29.16b
        orr     v30.16b, v30.16b, v31.16b

        orr     v0.16b,  v0.16b,  v2.16b
        orr     v4.16b,  v4.16b,  v6.16b
        orr     v16.16b, v16.16b, v18.16b
        orr     v20.16b, v20.16b, v22.16b
        orr     v24.16b, v24.16b, v26.16b
        orr     v28.16b, v28.16b, v30.16b

        orr     v0.16b,  v0.16b,  v4.16b
        orr     v16.16b, v16.16b, v20.16b
        orr     v24.16b, v24.16b, v28.16b

        orr     v0.16b,  v0.16b,  v16.16b
        orr     v0.16b,  v0.16b,  v24.16b

        // Fold both 64-bit halves of v0 into x2; non-zero means a set bit
        // somewhere in the 384 bytes just read.
        mov     x3, v0.d[0]
        mov     x2, v0.d[1]
        orr     x2, x3, x2
        cbnz    x2, .Lnonzero

        sub     x1, x1, #(16*24)        // n -= 384
        cmp     x1, #(16*24-1)
        b.hi    .Lblk384                // another full 384-byte block remains

.Lblk384_done:
        cmp     x1, #(16*8-1)
        b.ls    .Lblk128_done           // fewer than 128 bytes remain

/* ---- 8 x 16-byte vectors per iteration ------------------------------- */
.Lblk128:
        ldr     q0, [x0]
        ldr     q1, [x0, #16]
        ldr     q2, [x0, #(16*2)]
        ldr     q3, [x0, #(16*3)]
        ldr     q4, [x0, #(16*4)]
        ldr     q5, [x0, #(16*5)]
        ldr     q6, [x0, #(16*6)]
        ldr     q7, [x0, #(16*7)]

        add     x0, x0, #(16*8)         // buf += 128

        orr     v0.16b, v0.16b, v1.16b
        orr     v2.16b, v2.16b, v3.16b
        orr     v4.16b, v4.16b, v5.16b
        orr     v6.16b, v6.16b, v7.16b

        orr     v0.16b, v0.16b, v2.16b
        orr     v4.16b, v4.16b, v6.16b
        orr     v0.16b, v0.16b, v4.16b

        mov     x3, v0.d[0]
        mov     x2, v0.d[1]
        orr     x2, x3, x2
        cbnz    x2, .Lnonzero

        sub     x1, x1, #(16*8)         // n -= 128
        cmp     x1, #(16*8-1)
        b.hi    .Lblk128

.Lblk128_done:
        cmp     x1, #(8*8-1)
        b.ls    .Lblk64_done            // fewer than 64 bytes remain

/* ---- 8 x 8-byte general-purpose registers per iteration -------------- */
.Lblk64:
        ldp     x2, x3, [x0]
        ldp     x4, x5, [x0, #16]
        ldp     x6, x7, [x0, #32]
        ldp     x8, x9, [x0, #48]

        add     x0, x0, #(8*8)          // buf += 64

        orr     x2, x2, x3
        orr     x4, x4, x5
        orr     x6, x6, x7
        orr     x8, x8, x9
        orr     x2, x2, x4
        orr     x6, x6, x8
        orr     x2, x2, x6

        cbnz    x2, .Lnonzero

        sub     x1, x1, #(8*8)          // n -= 64
        cmp     x1, #(8*8-1)
        b.hi    .Lblk64

.Lblk64_done:
        cmp     x1, #(8-1)
        b.ls    .Ltail                  // fewer than 8 bytes remain

/* ---- one 8-byte word per iteration ----------------------------------- */
.Lword8:
        ldr     x2, [x0]
        add     x0, x0, #8
        cbnz    x2, .Lnonzero

        sub     x1, x1, #8
        cmp     x1, #7
        b.hi    .Lword8

/* ---- tail: x1 is 0..7 here ------------------------------------------- */
/*
 * The remaining count is decomposed by its bits: bit 2 -> one 4-byte load,
 * bit 1 -> one 2-byte load, bit 0 -> one 1-byte load. Exactly x1 bytes are
 * read and OR-ed into w2.
 */
.Ltail:
        mov     w2, #0                  // accumulator for the tail bytes

        tbz     w1, #2, .Ltail_half
        ldr     w3, [x0], #4
        orr     w2, w2, w3
.Ltail_half:
        tbz     w1, #1, .Ltail_byte
        ldrh    w3, [x0], #2
        orr     w2, w2, w3
.Ltail_byte:
        tbz     w1, #0, .Ltail_done
        ldrb    w3, [x0]
        orr     w2, w2, w3

.Ltail_done:
        cbz     w2, .Lall_zero

.Lnonzero:
        mov     w0, #0xffffffff         // at least one non-zero byte
        ret

.Lall_zero:
        mov     w0, #0x0                // whole buffer is zero
        ret

#ifndef __APPLE__
.size mem_zero_detect_neon, . - mem_zero_detect_neon
#endif
|