From 8c20c13100d159ff505af9e6e19cab30f368a074 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bernhard=20Rosenkr=C3=A4nzer?=
 <Bernhard.Rosenkranzer@linaro.org>
Date: Fri, 11 Jul 2014 00:17:07 +0200
Subject: [PATCH] Add optimized memchr implementation from newlib
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a NEON-optimized memchr adapted from newlib

Change-Id: I91b2fafa243e4ab35fa56bb6171d48433c947cfd
Signed-off-by: Bernhard Rosenkränzer <Bernhard.Rosenkranzer@linaro.org>
---
 libc/arch-arm64/arm64.mk                     |   1 -
 libc/arch-arm64/denver64/denver64.mk         |   1 +
 libc/arch-arm64/generic-neon/generic-neon.mk |   1 +
 libc/arch-arm64/generic/bionic/memchr.S      | 160 +++++++++++++++++++
 libc/arch-arm64/generic/generic.mk           |   1 +
 5 files changed, 163 insertions(+), 1 deletion(-)
 create mode 100644 libc/arch-arm64/generic/bionic/memchr.S

diff --git a/libc/arch-arm64/arm64.mk b/libc/arch-arm64/arm64.mk
index 20d9cf148..314358e1f 100644
--- a/libc/arch-arm64/arm64.mk
+++ b/libc/arch-arm64/arm64.mk
@@ -1,7 +1,6 @@
 # arm64 specific configs
 
 libc_common_src_files_arm64 := \
-    bionic/memchr.c \
     bionic/memrchr.c \
     bionic/strrchr.cpp \
     upstream-freebsd/lib/libc/string/wcscat.c \
diff --git a/libc/arch-arm64/denver64/denver64.mk b/libc/arch-arm64/denver64/denver64.mk
index c6ddb3f41..d619c1157 100644
--- a/libc/arch-arm64/denver64/denver64.mk
+++ b/libc/arch-arm64/denver64/denver64.mk
@@ -1,4 +1,5 @@
 libc_bionic_src_files_arm64 += \
+    arch-arm64/generic/bionic/memchr.S \
     arch-arm64/generic/bionic/memcmp.S \
     arch-arm64/denver64/bionic/memcpy.S \
     arch-arm64/generic/bionic/memmove.S \
diff --git a/libc/arch-arm64/generic-neon/generic-neon.mk b/libc/arch-arm64/generic-neon/generic-neon.mk
index f3bde5ce4..77e3861ea 100644
--- a/libc/arch-arm64/generic-neon/generic-neon.mk
+++ b/libc/arch-arm64/generic-neon/generic-neon.mk
@@ -1,4 +1,5 @@
 libc_bionic_src_files_arm64 += \
+    arch-arm64/generic/bionic/memchr.S \
     arch-arm64/generic/bionic/memcmp.S \
     arch-arm64/generic/bionic/memmove.S \
     arch-arm64/generic/bionic/memset.S \
diff --git a/libc/arch-arm64/generic/bionic/memchr.S b/libc/arch-arm64/generic/bionic/memchr.S
new file mode 100644
index 000000000..fbb00caaa
--- /dev/null
+++ b/libc/arch-arm64/generic/bionic/memchr.S
@@ -0,0 +1,160 @@
+/*
+ * memchr - find a character in a memory zone
+ *
+ * Copyright (c) 2014, ARM Limited
+ * All rights reserved.
+ * Copyright (c) 2014, Linaro Ltd.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the company nor the names of its contributors
+ *       may be used to endorse or promote products derived from this
+ *       software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-A, AArch64.
+ * NEON available.
+ */
+
+#include <private/bionic_asm.h>
+
+/* Arguments and results.  */
+#define srcin		x0
+#define chrin		w1
+#define cntin		x2
+
+#define result		x0
+
+#define src		x3
+#define tmp		x4
+#define wtmp2		w5
+#define synd		x6
+#define soff		x9
+#define cntrem		x10
+
+#define vrepchr		v0
+#define vdata1		v1
+#define vdata2		v2
+#define vhas_chr1	v3
+#define vhas_chr2	v4
+#define vrepmask	v5
+#define vend		v6
+
+/*
+ * Core algorithm:
+ *
+ * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
+ * per byte. Within each two-bit tuple, bit 0 is set if the relevant byte
+ * matched the requested character and bit 1 is unused (this is faster than
+ * using a 32-bit syndrome). Since the bits in the syndrome reflect exactly
+ * the order in which things occur in the original string, counting trailing
+ * zeros identifies exactly which byte matched; a rough C sketch follows.
+ */
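+
+/*
+ * Illustration only, not part of the code: a rough C sketch of the
+ * syndrome idea above (names are hypothetical; __builtin_ctzll is the
+ * GCC/Clang count-trailing-zeros builtin):
+ *
+ *	uint64_t synd = 0;
+ *	for (int i = 0; i < 32; i++)
+ *		if (chunk[i] == c)
+ *			synd |= 1ULL << (2 * i);
+ *	if (synd != 0)
+ *		return (void *)(chunk + (__builtin_ctzll(synd) >> 1));
+ */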
+
+ENTRY(memchr)
+	/*
+	 * The magic constant 0x40100401 gives each byte within a 4-byte
+	 * group a distinct bit (0x01, 0x04, 0x10, 0x40); ANDed with the
+	 * 0xff/0x00 compare results, it lets the pairwise adds below fold
+	 * 32 comparison bytes into a 64-bit syndrome.
+	 */
+	mov	wtmp2, #0x0401
+	movk	wtmp2, #0x4010, lsl #16
+	dup	vrepchr.16b, chrin
+	/* Work with aligned 32-byte chunks */
+	bic	src, srcin, #31
+	dup	vrepmask.4s, wtmp2
+	ands	soff, srcin, #31
+	and	cntrem, cntin, #31
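+	/*
+	 * soff is the byte offset of srcin within its 32-byte block;
+	 * cntrem (cntin % 32) is only consumed by .Lmasklast below.
+	 */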
+	b.eq	.Lloop
+
+	/*
+	 * The input string is not 32-byte aligned. We calculate the
+	 * syndrome value for the aligned 32-byte block containing the
+	 * first bytes and mask off the irrelevant part.
+	 */
+
+	ld1	{vdata1.16b, vdata2.16b}, [src], #32
+	sub	tmp, soff, #32
+	adds	cntin, cntin, tmp
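+	/*
+	 * cntin -= (32 - soff): bytes remaining after this first block.
+	 * The intervening instructions do not touch the flags, so the
+	 * b.ls below still sees this result (the first block may also
+	 * be the last).
+	 */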
+	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
+	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
+	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
+	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
+	mov	synd, vend.2d[0]
+	/* Clear the soff*2 lower bits */
+	lsl	tmp, soff, #1
+	lsr	synd, synd, tmp
+	lsl	synd, synd, tmp
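+	/* Equivalent to synd &= ~0ULL << (soff * 2) in C */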
+	/* The first block can also be the last */
+	b.ls	.Lmasklast
+	/* Have we found something already? */
+	cbnz	synd, .Ltail
+
+.Lloop:
+	ld1	{vdata1.16b, vdata2.16b}, [src], #32
+	subs	cntin, cntin, #32
+	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
+	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
+	/* If we're out of data we finish regardless of the result */
+	b.ls	.Lend
+	/* Fast termination check: nonzero iff some byte in the chunk matched */
+	orr	vend.16b, vhas_chr1.16b, vhas_chr2.16b
+	addp	vend.2d, vend.2d, vend.2d
+	mov	synd, vend.2d[0]
+	/* We're not out of data; loop if we haven't found the character */
+	cbz	synd, .Lloop
+
+.Lend:
+	/* Termination condition found; calculate the full syndrome value */
+	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
+	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
+	mov	synd, vend.2d[0]
+	/* Only do the clear for the last possible block */
+	b.hi	.Ltail
+
+.Lmasklast:
+	/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
+	add	tmp, cntrem, soff
+	and	tmp, tmp, #31
+	sub	tmp, tmp, #32
+	neg	tmp, tmp, lsl #1
+	lsl	synd, synd, tmp
+	lsr	synd, synd, tmp
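+	/* Equivalent to synd &= ~0ULL >> tmp: discard matches past the buffer end */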
+
+.Ltail:
+	/* Count the trailing zeros using bit reversal */
+	rbit	synd, synd
+	/* Compensate for the last post-increment */
+	sub	src, src, #32
+	/* Check that we have found a character */
+	cmp	synd, #0
+	/* And count the leading zeros */
+	clz	synd, synd
+	/*
+	 * Compute the potential result. The syndrome has two bits per byte,
+	 * so the bit index is halved to get the byte offset.
+	 */
+	add	result, src, synd, lsr #1
+	/* Select result or NULL */
+	csel	result, xzr, result, eq
+	ret
+END(memchr)
diff --git a/libc/arch-arm64/generic/generic.mk b/libc/arch-arm64/generic/generic.mk
index 878dcdf3d..1b595aacb 100644
--- a/libc/arch-arm64/generic/generic.mk
+++ b/libc/arch-arm64/generic/generic.mk
@@ -1,4 +1,5 @@
 libc_bionic_src_files_arm64 += \
+    arch-arm64/generic/bionic/memchr.S \
     arch-arm64/generic/bionic/memcmp.S \
     arch-arm64/generic/bionic/memcpy.S \
     arch-arm64/generic/bionic/memmove.S \