diff --git a/libc/arch-arm64/arm64.mk b/libc/arch-arm64/arm64.mk
index 20d9cf148..314358e1f 100644
--- a/libc/arch-arm64/arm64.mk
+++ b/libc/arch-arm64/arm64.mk
@@ -1,7 +1,6 @@
 # arm64 specific configs
 
 libc_common_src_files_arm64 := \
-    bionic/memchr.c \
     bionic/memrchr.c \
     bionic/strrchr.cpp \
     upstream-freebsd/lib/libc/string/wcscat.c \
diff --git a/libc/arch-arm64/denver64/denver64.mk b/libc/arch-arm64/denver64/denver64.mk
index c6ddb3f41..d619c1157 100644
--- a/libc/arch-arm64/denver64/denver64.mk
+++ b/libc/arch-arm64/denver64/denver64.mk
@@ -1,4 +1,5 @@
 libc_bionic_src_files_arm64 += \
+    arch-arm64/generic/bionic/memchr.S \
     arch-arm64/generic/bionic/memcmp.S \
     arch-arm64/denver64/bionic/memcpy.S \
     arch-arm64/generic/bionic/memmove.S \
diff --git a/libc/arch-arm64/generic-neon/generic-neon.mk b/libc/arch-arm64/generic-neon/generic-neon.mk
index f3bde5ce4..77e3861ea 100644
--- a/libc/arch-arm64/generic-neon/generic-neon.mk
+++ b/libc/arch-arm64/generic-neon/generic-neon.mk
@@ -1,4 +1,5 @@
 libc_bionic_src_files_arm64 += \
+    arch-arm64/generic/bionic/memchr.S \
     arch-arm64/generic/bionic/memcmp.S \
     arch-arm64/generic/bionic/memmove.S \
     arch-arm64/generic/bionic/memset.S \
diff --git a/libc/arch-arm64/generic/bionic/memchr.S b/libc/arch-arm64/generic/bionic/memchr.S
new file mode 100644
index 000000000..fbb00caaa
--- /dev/null
+++ b/libc/arch-arm64/generic/bionic/memchr.S
@@ -0,0 +1,160 @@
+/*
+ * memchr - find a character in a memory zone
+ *
+ * Copyright (c) 2014, ARM Limited
+ * All rights Reserved.
+ * Copyright (c) 2014, Linaro Ltd.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the company nor the names of its contributors
+ *       may be used to endorse or promote products derived from this
+ *       software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+#include <private/bionic_asm.h>
+
+/* Arguments and results.  */
+#define srcin		x0
+#define chrin		w1
+#define cntin		x2
+
+#define result		x0
+
+#define src		x3
+#define tmp		x4
+#define wtmp2		w5
+#define synd		x6
+#define soff		x9
+#define cntrem		x10
+
+#define vrepchr		v0
+#define vdata1		v1
+#define vdata2		v2
+#define vhas_chr1	v3
+#define vhas_chr2	v4
+#define vrepmask	v5
+#define vend		v6
+
+/*
+ * Core algorithm:
+ *
+ * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
+ * per byte. For each tuple, bit 0 is set if the relevant byte matched the
+ * requested character and bit 1 is not used (faster than using a 32-bit
+ * syndrome). Since the bits in the syndrome reflect exactly the order in which
+ * things occur in the original string, counting trailing zeros allows us to
+ * identify exactly which byte has matched.
+ */
+
+ENTRY(memchr)
+	/*
+	 * Magic constant 0x40100401 allows us to identify which lane matches
+	 * the requested byte.
+	 */
+	mov	wtmp2, #0x0401
+	movk	wtmp2, #0x4010, lsl #16
+	dup	vrepchr.16b, chrin
+	/* Work with aligned 32-byte chunks */
+	bic	src, srcin, #31
+	dup	vrepmask.4s, wtmp2
+	ands	soff, srcin, #31
+	and	cntrem, cntin, #31
+	b.eq	.Lloop
+
+	/*
+	 * Input string is not 32-byte aligned. We calculate the syndrome
+	 * value for the aligned 32-byte block containing the first bytes
+	 * and mask the irrelevant part.
+	 */
+
+	ld1	{vdata1.16b, vdata2.16b}, [src], #32
+	sub	tmp, soff, #32
+	adds	cntin, cntin, tmp
+	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
+	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
+	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
+	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
+	mov	synd, vend.2d[0]
+	/* Clear the soff*2 lower bits */
+	lsl	tmp, soff, #1
+	lsr	synd, synd, tmp
+	lsl	synd, synd, tmp
+	/* The first block can also be the last */
+	b.ls	.Lmasklast
+	/* Have we found something already? */
+	cbnz	synd, .Ltail
+
+.Lloop:
+	ld1	{vdata1.16b, vdata2.16b}, [src], #32
+	subs	cntin, cntin, #32
+	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
+	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
+	/* If we're out of data we finish regardless of the result */
+	b.ls	.Lend
+	/* Use a fast check for the termination condition */
+	orr	vend.16b, vhas_chr1.16b, vhas_chr2.16b
+	addp	vend.2d, vend.2d, vend.2d
+	mov	synd, vend.2d[0]
+	/* We're not out of data, loop if we haven't found the character */
+	cbz	synd, .Lloop
+
+.Lend:
+	/* Termination condition found, let's calculate the syndrome value */
+	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
+	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
+	mov	synd, vend.2d[0]
+	/* Only do the clear for the last possible block */
+	b.hi	.Ltail
+
+.Lmasklast:
+	/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
+	add	tmp, cntrem, soff
+	and	tmp, tmp, #31
+	sub	tmp, tmp, #32
+	neg	tmp, tmp, lsl #1
+	lsl	synd, synd, tmp
+	lsr	synd, synd, tmp
+
+.Ltail:
+	/* Count the trailing zeros using bit reversing */
+	rbit	synd, synd
+	/* Compensate the last post-increment */
+	sub	src, src, #32
+	/* Check that we have found a character */
+	cmp	synd, #0
+	/* And count the leading zeros */
+	clz	synd, synd
+	/* Compute the potential result */
+	add	result, src, synd, lsr #1
+	/* Select result or NULL */
+	csel	result, xzr, result, eq
+	ret
+END(memchr)
diff --git a/libc/arch-arm64/generic/generic.mk b/libc/arch-arm64/generic/generic.mk
index 878dcdf3d..1b595aacb 100644
--- a/libc/arch-arm64/generic/generic.mk
+++ b/libc/arch-arm64/generic/generic.mk
@@ -1,4 +1,5 @@
 libc_bionic_src_files_arm64 += \
+    arch-arm64/generic/bionic/memchr.S \
     arch-arm64/generic/bionic/memcmp.S \
     arch-arm64/generic/bionic/memcpy.S \
     arch-arm64/generic/bionic/memmove.S \
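
Note (not part of the patch): for readers unfamiliar with the syndrome trick described in the "Core algorithm" comment, the following is a minimal scalar C sketch of the same idea. The function name memchr_sketch and its chunk loop are illustrative assumptions, not code from this change; the real implementation does the per-byte comparison 32 bytes at a time with NEON (cmeq), selects one syndrome bit per byte with the 0x40100401 mask, and collapses the two vectors to a 64-bit word with two addp reductions.

/* Scalar sketch of the 2-bits-per-byte syndrome used by the assembly above. */
#include <stdint.h>
#include <stddef.h>

void *memchr_sketch(const void *s, int c, size_t n) {
    const unsigned char *p = (const unsigned char *)s;
    const unsigned char ch = (unsigned char)c;

    for (size_t base = 0; base < n; base += 32) {
        size_t chunk = (n - base < 32) ? (n - base) : 32;
        uint64_t synd = 0;

        /* Two syndrome bits per byte; only bit 0 of each pair is used. */
        for (size_t i = 0; i < chunk; i++) {
            if (p[base + i] == ch)
                synd |= 1ULL << (2 * i);
        }

        if (synd != 0) {
            /* Trailing-zero count divided by 2 is the first matching byte. */
            unsigned tz = (unsigned)__builtin_ctzll(synd);
            return (void *)(p + base + tz / 2);
        }
    }
    return NULL;
}

In the assembly, the equivalent of __builtin_ctzll is rbit followed by clz, since AArch64 has no count-trailing-zeros instruction, and the division by two is folded into the final add as "synd, lsr #1". The chunk bound in the sketch corresponds to the lsl/lsr masking: the lower-bit clear after the unaligned first load discards bytes before the start pointer, and the .Lmasklast upper-bit clear discards bytes beyond the requested length.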