am 199e830a: am 1abb1b93: Merge "Add 64-bit Silvermont-optimized string/memory functions."

* commit '199e830a88548acf49f472ffa412b9d2d66bd4b1': Add 64-bit Silvermont-optimized string/memory functions.
2014-05-13 00:54:52 +00:00
parent 19e15c5bf6 199e830a88
commit 66d0e68514
17 changed files with 7418 additions and 14 deletions
--- a/libc/arch-x86_64/string/cache.h
+++ b/libc/arch-x86_64/string/cache.h
@@ -0,0 +1,36 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* Values are optimized for Silvermont */
+#define SHARED_CACHE_SIZE	(1024*1024)			/* Silvermont L2 Cache */
+#define DATA_CACHE_SIZE		(24*1024)			/* Silvermont L1 Data Cache */
+
+#define SHARED_CACHE_SIZE_HALF	(SHARED_CACHE_SIZE / 2)
+#define DATA_CACHE_SIZE_HALF	(DATA_CACHE_SIZE / 2)
--- a/libc/arch-x86_64/string/sse2-bcopy-slm.S
+++ b/libc/arch-x86_64/string/sse2-bcopy-slm.S
@@ -0,0 +1,33 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#define USE_AS_BCOPY
+#define MEMMOVE		bcopy
+#include "sse2-memmove-slm.S"
--- a/libc/arch-x86_64/string/sse2-bzero-slm.S
+++ b/libc/arch-x86_64/string/sse2-bzero-slm.S
@@ -0,0 +1,33 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#define USE_AS_BZERO_P
+#define MEMSET		bzero
+#include "sse2-memset-slm.S"
--- a/libc/arch-x86_64/string/sse2-memcpy-slm.S
+++ b/libc/arch-x86_64/string/sse2-memcpy-slm.S
@@ -0,0 +1,299 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "cache.h"
+
+#ifndef MEMCPY
+# define MEMCPY		memcpy
+#endif
+
+#ifndef L
+# define L(label)	.L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc	.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc	.cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg)	.cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)		\
+	.type name,  @function;		\
+	.globl name;		\
+	.p2align 4;		\
+name:		\
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)		\
+	cfi_endproc;		\
+	.size name, .-name
+#endif
+
+#define CFI_PUSH(REG)		\
+	cfi_adjust_cfa_offset (4);		\
+	cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)		\
+	cfi_adjust_cfa_offset (-4);		\
+	cfi_restore (REG)
+
+#define PUSH(REG)	push REG;
+#define POP(REG)	pop REG;
+
+#define ENTRANCE	PUSH (%rbx);
+#define RETURN_END	POP (%rbx); ret
+#define RETURN		RETURN_END;
+
+	.section .text.sse2,"ax",@progbits
+ENTRY (MEMCPY)
+	ENTRANCE
+	cmp	%rsi, %rdi
+	je	L(return)
+
+	cmp	$16, %rdx
+	jbe	L(len_0_16_bytes)
+
+	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
+	jae	L(large_page)
+
+	movdqu	(%rsi), %xmm0
+	movdqu	-16(%rsi, %rdx), %xmm1
+	cmp	$32, %rdx
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm1, -16(%rdi, %rdx)
+	jbe	L(return)
+
+	movdqu	16(%rsi), %xmm0
+	movdqu	-32(%rsi, %rdx), %xmm1
+	cmp	$64, %rdx
+	movdqu	%xmm0, 16(%rdi)
+	movdqu	%xmm1, -32(%rdi, %rdx)
+	jbe	L(return)
+
+	movdqu	32(%rsi), %xmm0
+	movdqu	48(%rsi), %xmm1
+	movdqu	-48(%rsi, %rdx), %xmm2
+	movdqu	-64(%rsi, %rdx), %xmm3
+	cmp	$128, %rdx
+	movdqu	%xmm0, 32(%rdi)
+	movdqu	%xmm1, 48(%rdi)
+	movdqu	%xmm2, -48(%rdi, %rdx)
+	movdqu	%xmm3, -64(%rdi, %rdx)
+	jbe	L(return)
+
+/* Now the main loop: we align the address of the destination.  */
+	lea	64(%rdi), %r8
+	and	$-64, %r8
+
+	add	%rdi, %rdx
+	and	$-64, %rdx
+
+	sub	%rdi, %rsi
+
+/* We should stop two iterations before the termination
+	(in order not to misprefetch).  */
+	sub	$64, %rdx
+	cmp	%r8, %rdx
+	je	L(main_loop_just_one_iteration)
+
+	sub	$64, %rdx
+	cmp	%r8, %rdx
+	je	L(main_loop_last_two_iterations)
+
+
+	.p2align 4
+L(main_loop_cache):
+
+	prefetcht0 128(%r8, %rsi)
+
+	movdqu	(%r8, %rsi), %xmm0
+	movdqu	16(%r8, %rsi), %xmm1
+	movdqu	32(%r8, %rsi), %xmm2
+	movdqu	48(%r8, %rsi), %xmm3
+	movdqa	%xmm0, (%r8)
+	movdqa	%xmm1, 16(%r8)
+	movdqa	%xmm2, 32(%r8)
+	movdqa	%xmm3, 48(%r8)
+	lea	64(%r8), %r8
+	cmp	%r8, %rdx
+	jne	L(main_loop_cache)
+
+L(main_loop_last_two_iterations):
+	movdqu	(%r8, %rsi), %xmm0
+	movdqu	16(%r8, %rsi), %xmm1
+	movdqu	32(%r8, %rsi), %xmm2
+	movdqu	48(%r8, %rsi), %xmm3
+	movdqu	64(%r8, %rsi), %xmm4
+	movdqu	80(%r8, %rsi), %xmm5
+	movdqu	96(%r8, %rsi), %xmm6
+	movdqu	112(%r8, %rsi), %xmm7
+	movdqa	%xmm0, (%r8)
+	movdqa	%xmm1, 16(%r8)
+	movdqa	%xmm2, 32(%r8)
+	movdqa	%xmm3, 48(%r8)
+	movdqa	%xmm4, 64(%r8)
+	movdqa	%xmm5, 80(%r8)
+	movdqa	%xmm6, 96(%r8)
+	movdqa	%xmm7, 112(%r8)
+	jmp	L(return)
+
+L(main_loop_just_one_iteration):
+	movdqu	(%r8, %rsi), %xmm0
+	movdqu	16(%r8, %rsi), %xmm1
+	movdqu	32(%r8, %rsi), %xmm2
+	movdqu	48(%r8, %rsi), %xmm3
+	movdqa	%xmm0, (%r8)
+	movdqa	%xmm1, 16(%r8)
+	movdqa	%xmm2, 32(%r8)
+	movdqa	%xmm3, 48(%r8)
+	jmp	L(return)
+
+L(large_page):
+	movdqu	(%rsi), %xmm0
+	movdqu	16(%rsi), %xmm1
+	movdqu	32(%rsi), %xmm2
+	movdqu	48(%rsi), %xmm3
+	movdqu	-64(%rsi, %rdx), %xmm4
+	movdqu	-48(%rsi, %rdx), %xmm5
+	movdqu	-32(%rsi, %rdx), %xmm6
+	movdqu	-16(%rsi, %rdx), %xmm7
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm1, 16(%rdi)
+	movdqu	%xmm2, 32(%rdi)
+	movdqu	%xmm3, 48(%rdi)
+	movdqu	%xmm4, -64(%rdi, %rdx)
+	movdqu	%xmm5, -48(%rdi, %rdx)
+	movdqu	%xmm6, -32(%rdi, %rdx)
+	movdqu	%xmm7, -16(%rdi, %rdx)
+
+	movdqu	64(%rsi), %xmm0
+	movdqu	80(%rsi), %xmm1
+	movdqu	96(%rsi), %xmm2
+	movdqu	112(%rsi), %xmm3
+	movdqu	-128(%rsi, %rdx), %xmm4
+	movdqu	-112(%rsi, %rdx), %xmm5
+	movdqu	-96(%rsi, %rdx), %xmm6
+	movdqu	-80(%rsi, %rdx), %xmm7
+	movdqu	%xmm0, 64(%rdi)
+	movdqu	%xmm1, 80(%rdi)
+	movdqu	%xmm2, 96(%rdi)
+	movdqu	%xmm3, 112(%rdi)
+	movdqu	%xmm4, -128(%rdi, %rdx)
+	movdqu	%xmm5, -112(%rdi, %rdx)
+	movdqu	%xmm6, -96(%rdi, %rdx)
+	movdqu	%xmm7, -80(%rdi, %rdx)
+
+/* Now the main loop with non temporal stores. We align
+	the address of the destination.  */
+	lea	128(%rdi), %r8
+	and	$-128, %r8
+
+	add	%rdi, %rdx
+	and	$-128, %rdx
+
+	sub	%rdi, %rsi
+
+	.p2align 4
+L(main_loop_large_page):
+	movdqu	(%r8, %rsi), %xmm0
+	movdqu	16(%r8, %rsi), %xmm1
+	movdqu	32(%r8, %rsi), %xmm2
+	movdqu	48(%r8, %rsi), %xmm3
+	movdqu	64(%r8, %rsi), %xmm4
+	movdqu	80(%r8, %rsi), %xmm5
+	movdqu	96(%r8, %rsi), %xmm6
+	movdqu	112(%r8, %rsi), %xmm7
+	movntdq	%xmm0, (%r8)
+	movntdq	%xmm1, 16(%r8)
+	movntdq	%xmm2, 32(%r8)
+	movntdq	%xmm3, 48(%r8)
+	movntdq	%xmm4, 64(%r8)
+	movntdq	%xmm5, 80(%r8)
+	movntdq	%xmm6, 96(%r8)
+	movntdq	%xmm7, 112(%r8)
+	lea	128(%r8), %r8
+	cmp	%r8, %rdx
+	jne	L(main_loop_large_page)
+	sfence
+	jmp	L(return)
+
+L(len_0_16_bytes):
+	testb	$24, %dl
+	jne	L(len_9_16_bytes)
+	testb	$4, %dl
+	.p2align 4,,5
+	jne	L(len_5_8_bytes)
+	test	%rdx, %rdx
+	.p2align 4,,2
+	je	L(return)
+	movzbl	(%rsi), %ebx
+	testb	$2, %dl
+	movb	%bl, (%rdi)
+	je	L(return)
+	movzwl	-2(%rsi,%rdx), %ebx
+	movw	%bx, -2(%rdi,%rdx)
+	jmp	L(return)
+
+L(len_9_16_bytes):
+	movq	(%rsi), %xmm0
+	movq	-8(%rsi, %rdx), %xmm1
+	movq	%xmm0, (%rdi)
+	movq	%xmm1, -8(%rdi, %rdx)
+	jmp	L(return)
+
+L(len_5_8_bytes):
+	movl	(%rsi), %ebx
+	movl	%ebx, (%rdi)
+	movl	-4(%rsi,%rdx), %ebx
+	movl	%ebx, -4(%rdi,%rdx)
+	jmp	L(return)
+
+L(return):
+	mov 	%rdi, %rax
+	RETURN
+
+END (MEMCPY)
--- a/libc/arch-x86_64/string/sse2-memmove-slm.S
+++ b/libc/arch-x86_64/string/sse2-memmove-slm.S
@@ -0,0 +1,635 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "cache.h"
+
+#ifndef MEMMOVE
+# define MEMMOVE		memmove
+#endif
+
+#ifndef L
+# define L(label)	.L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc	.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc	.cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg)	.cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)		\
+	.type name,  @function;		\
+	.globl name;		\
+	.p2align 4;		\
+name:		\
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)		\
+	cfi_endproc;		\
+	.size name, .-name
+#endif
+
+#define CFI_PUSH(REG)		\
+	cfi_adjust_cfa_offset (4);		\
+	cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)		\
+	cfi_adjust_cfa_offset (-4);		\
+	cfi_restore (REG)
+
+#define PUSH(REG)	push REG;
+#define POP(REG)	pop REG;
+
+#define ENTRANCE	PUSH (%rbx);
+#define RETURN_END	POP (%rbx); ret
+#define RETURN		RETURN_END;
+
+	.section .text.sse2,"ax",@progbits
+ENTRY (MEMMOVE)
+	ENTRANCE
+#ifdef USE_AS_BCOPY
+	xchg	%rsi, %rdi
+#endif
+	mov	%rdi, %rax
+
+/* Check whether we should copy backward or forward.  */
+	cmp	%rsi, %rdi
+	je	L(mm_return)
+	ja	L(mm_len_0_or_more_backward)
+
+/* Now do checks for lengths. We do [0..16], [0..32], [0..64], [0..128]
+	separately.  */
+	cmp	$16, %rdx
+	jbe	L(mm_len_0_16_bytes_forward)
+
+	cmp	$32, %rdx
+	jg	L(mm_len_32_or_more_forward)
+
+/* Copy [0..32] and return.  */
+	movdqu	(%rsi), %xmm0
+	movdqu	-16(%rsi, %rdx), %xmm1
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm1, -16(%rdi, %rdx)
+	jmp	L(mm_return)
+
+L(mm_len_32_or_more_forward):
+	cmp	$64, %rdx
+	jg	L(mm_len_64_or_more_forward)
+
+/* Copy [0..64] and return.  */
+	movdqu	(%rsi), %xmm0
+	movdqu	16(%rsi), %xmm1
+	movdqu	-16(%rsi, %rdx), %xmm2
+	movdqu	-32(%rsi, %rdx), %xmm3
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm1, 16(%rdi)
+	movdqu	%xmm2, -16(%rdi, %rdx)
+	movdqu	%xmm3, -32(%rdi, %rdx)
+	jmp	L(mm_return)
+
+L(mm_len_64_or_more_forward):
+	cmp	$128, %rdx
+	jg	L(mm_len_128_or_more_forward)
+
+/* Copy [0..128] and return.  */
+	movdqu	(%rsi), %xmm0
+	movdqu	16(%rsi), %xmm1
+	movdqu	32(%rsi), %xmm2
+	movdqu	48(%rsi), %xmm3
+	movdqu	-64(%rsi, %rdx), %xmm4
+	movdqu	-48(%rsi, %rdx), %xmm5
+	movdqu	-32(%rsi, %rdx), %xmm6
+	movdqu	-16(%rsi, %rdx), %xmm7
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm1, 16(%rdi)
+	movdqu	%xmm2, 32(%rdi)
+	movdqu	%xmm3, 48(%rdi)
+	movdqu	%xmm4, -64(%rdi, %rdx)
+	movdqu	%xmm5, -48(%rdi, %rdx)
+	movdqu	%xmm6, -32(%rdi, %rdx)
+	movdqu	%xmm7, -16(%rdi, %rdx)
+	jmp	L(mm_return)
+
+L(mm_len_128_or_more_forward):
+
+	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
+	jae	L(mm_large_page_forward)
+
+	mov	%rsi, %r8  // copy src to r8
+	mov	%rdi, %r9  // copy dst to r9
+
+/* Aligning the address of destination.  */
+/*  save first unaligned 64 bytes */
+	movdqu	(%rsi), %xmm0
+	movdqu	16(%rsi), %xmm1
+	movdqu	32(%rsi), %xmm2
+	movdqu	48(%rsi), %xmm3
+
+	lea	64(%r9), %rdi
+	and	$-64, %rdi  /* rdi now aligned to next 64 byte boundary */
+
+	sub	%r9, %rsi /* rsi = src - dst = diff */
+
+	movdqu	(%rdi, %rsi), %xmm4
+	movdqu	16(%rdi, %rsi), %xmm5
+	movdqu	32(%rdi, %rsi), %xmm6
+	movdqu	48(%rdi, %rsi), %xmm7
+
+	movdqu	%xmm0, (%r9)
+	movdqu	%xmm1, 16(%r9)
+	movdqu	%xmm2, 32(%r9)
+	movdqu	%xmm3, 48(%r9)
+	movdqa	%xmm4, (%rdi)
+	movdqa	%xmm5, 16(%rdi)
+	movdqa	%xmm6, 32(%rdi)
+	movdqa	%xmm7, 48(%rdi)
+	add	$64, %rdi
+
+	lea	(%r9, %rdx), %rbx
+	and	$-64, %rbx
+
+	cmp	%rdi, %rbx
+	jbe	L(mm_copy_remaining_forward)
+
+	.p2align 4
+L(mm_main_loop_forward):
+
+	prefetcht0 128(%rdi, %rsi)
+
+	movdqu	(%rdi, %rsi), %xmm0
+	movdqu	16(%rdi, %rsi), %xmm1
+	movdqu	32(%rdi, %rsi), %xmm2
+	movdqu	48(%rdi, %rsi), %xmm3
+	movdqa	%xmm0, (%rdi)
+	movdqa	%xmm1, 16(%rdi)
+	movdqa	%xmm2, 32(%rdi)
+	movdqa	%xmm3, 48(%rdi)
+	lea	64(%rdi), %rdi
+	cmp	%rdi, %rbx
+	ja	L(mm_main_loop_forward)
+
+L(mm_copy_remaining_forward):
+	add	%r9, %rdx
+	sub	%rdi, %rdx
+/* We copied all up till %rdi position in the dst.
+	In %rdx now is how many bytes are left to copy.
+	Now we need to advance %r8. */
+	lea	(%rdi, %rsi), %r8
+
+L(mm_remaining_0_64_bytes_forward):
+	cmp	$32, %rdx
+	ja	L(mm_remaining_33_64_bytes_forward)
+	cmp	$16, %rdx
+	ja	L(mm_remaining_17_32_bytes_forward)
+	test	%rdx, %rdx
+	.p2align 4,,2
+	je	L(mm_return)
+
+	cmpb	$8, %dl
+	ja	L(mm_remaining_9_16_bytes_forward)
+	cmpb	$4, %dl
+	.p2align 4,,5
+	ja	L(mm_remaining_5_8_bytes_forward)
+	cmpb	$2, %dl
+	.p2align 4,,1
+	ja	L(mm_remaining_3_4_bytes_forward)
+	movzbl	-1(%r8,%rdx), %esi
+	movzbl	(%r8), %ebx
+	movb	%sil, -1(%rdi,%rdx)
+	movb	%bl, (%rdi)
+	jmp	L(mm_return)
+
+L(mm_remaining_33_64_bytes_forward):
+	movdqu	(%r8), %xmm0
+	movdqu	16(%r8), %xmm1
+	movdqu	-32(%r8, %rdx), %xmm2
+	movdqu	-16(%r8, %rdx), %xmm3
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm1, 16(%rdi)
+	movdqu	%xmm2, -32(%rdi, %rdx)
+	movdqu	%xmm3, -16(%rdi, %rdx)
+	jmp	L(mm_return)
+
+L(mm_remaining_17_32_bytes_forward):
+	movdqu	(%r8), %xmm0
+	movdqu	-16(%r8, %rdx), %xmm1
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm1, -16(%rdi, %rdx)
+	jmp	L(mm_return)
+
+L(mm_remaining_3_4_bytes_forward):
+	movzwl	-2(%r8,%rdx), %esi
+	movzwl	(%r8), %ebx
+	movw	%si, -2(%rdi,%rdx)
+	movw	%bx, (%rdi)
+	jmp	L(mm_return)
+
+L(mm_remaining_5_8_bytes_forward):
+	movl	(%r8), %esi
+	movl	-4(%r8,%rdx), %ebx
+	movl	%esi, (%rdi)
+	movl	%ebx, -4(%rdi,%rdx)
+	jmp	L(mm_return)
+
+L(mm_remaining_9_16_bytes_forward):
+	mov	(%r8), %rsi
+	mov	-8(%r8, %rdx), %rbx
+	mov	%rsi, (%rdi)
+	mov	%rbx, -8(%rdi, %rdx)
+	jmp	L(mm_return)
+
+L(mm_len_0_16_bytes_forward):
+	testb	$24, %dl
+	jne	L(mm_len_9_16_bytes_forward)
+	testb	$4, %dl
+	.p2align 4,,5
+	jne	L(mm_len_5_8_bytes_forward)
+	test	%rdx, %rdx
+	.p2align 4,,2
+	je	L(mm_return)
+	testb	$2, %dl
+	.p2align 4,,1
+	jne	L(mm_len_2_4_bytes_forward)
+	movzbl	-1(%rsi,%rdx), %ebx
+	movzbl	(%rsi), %esi
+	movb	%bl, -1(%rdi,%rdx)
+	movb	%sil, (%rdi)
+	jmp	L(mm_return)
+
+L(mm_len_2_4_bytes_forward):
+	movzwl	-2(%rsi,%rdx), %ebx
+	movzwl	(%rsi), %esi
+	movw	%bx, -2(%rdi,%rdx)
+	movw	%si, (%rdi)
+	jmp	L(mm_return)
+
+L(mm_len_5_8_bytes_forward):
+	movl	(%rsi), %ebx
+	movl	-4(%rsi,%rdx), %esi
+	movl	%ebx, (%rdi)
+	movl	%esi, -4(%rdi,%rdx)
+	jmp	L(mm_return)
+
+L(mm_len_9_16_bytes_forward):
+	mov	(%rsi), %rbx
+	mov	-8(%rsi, %rdx), %rsi
+	mov	%rbx, (%rdi)
+	mov	%rsi, -8(%rdi, %rdx)
+	jmp	L(mm_return)
+
+/* The code for copying backwards.  */
+L(mm_len_0_or_more_backward):
+
+/* Now do checks for lengths. We do [0..16], [0..32], [0..64], [0..128]
+	separately.  */
+	cmp	$16, %rdx
+	jbe	L(mm_len_0_16_bytes_backward)
+
+	cmp	$32, %rdx
+	jg	L(mm_len_32_or_more_backward)
+
+/* Copy [0..32] and return.  */
+	movdqu	(%rsi), %xmm0
+	movdqu	-16(%rsi, %rdx), %xmm1
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm1, -16(%rdi, %rdx)
+	jmp	L(mm_return)
+
+L(mm_len_32_or_more_backward):
+	cmp	$64, %rdx
+	jg	L(mm_len_64_or_more_backward)
+
+/* Copy [0..64] and return.  */
+	movdqu	(%rsi), %xmm0
+	movdqu	16(%rsi), %xmm1
+	movdqu	-16(%rsi, %rdx), %xmm2
+	movdqu	-32(%rsi, %rdx), %xmm3
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm1, 16(%rdi)
+	movdqu	%xmm2, -16(%rdi, %rdx)
+	movdqu	%xmm3, -32(%rdi, %rdx)
+	jmp	L(mm_return)
+
+L(mm_len_64_or_more_backward):
+	cmp	$128, %rdx
+	jg	L(mm_len_128_or_more_backward)
+
+/* Copy [0..128] and return.  */
+	movdqu	(%rsi), %xmm0
+	movdqu	16(%rsi), %xmm1
+	movdqu	32(%rsi), %xmm2
+	movdqu	48(%rsi), %xmm3
+	movdqu	-64(%rsi, %rdx), %xmm4
+	movdqu	-48(%rsi, %rdx), %xmm5
+	movdqu	-32(%rsi, %rdx), %xmm6
+	movdqu	-16(%rsi, %rdx), %xmm7
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm1, 16(%rdi)
+	movdqu	%xmm2, 32(%rdi)
+	movdqu	%xmm3, 48(%rdi)
+	movdqu	%xmm4, -64(%rdi, %rdx)
+	movdqu	%xmm5, -48(%rdi, %rdx)
+	movdqu	%xmm6, -32(%rdi, %rdx)
+	movdqu	%xmm7, -16(%rdi, %rdx)
+	jmp	L(mm_return)
+
+L(mm_len_128_or_more_backward):
+
+	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
+	jae	L(mm_large_page_backward)
+
+/* Aligning the address of destination. We need to save
+	16 bits from the source in order not to overwrite them.  */
+	movdqu	-16(%rsi, %rdx), %xmm0
+	movdqu	-32(%rsi, %rdx), %xmm1
+	movdqu	-48(%rsi, %rdx), %xmm2
+	movdqu	-64(%rsi, %rdx), %xmm3
+
+	lea	(%rdi, %rdx), %r9
+	and	$-64, %r9 /* r9 = aligned dst */
+
+	mov	%rsi, %r8
+	sub	%rdi, %r8 /* r8 = src - dst, diff */
+
+	movdqu	-16(%r9, %r8), %xmm4
+	movdqu	-32(%r9, %r8), %xmm5
+	movdqu	-48(%r9, %r8), %xmm6
+	movdqu	-64(%r9, %r8), %xmm7
+
+	movdqu	%xmm0, -16(%rdi, %rdx)
+	movdqu	%xmm1, -32(%rdi, %rdx)
+	movdqu	%xmm2, -48(%rdi, %rdx)
+	movdqu	%xmm3, -64(%rdi, %rdx)
+	movdqa	%xmm4, -16(%r9)
+	movdqa	%xmm5, -32(%r9)
+	movdqa	%xmm6, -48(%r9)
+	movdqa	%xmm7, -64(%r9)
+	lea	-64(%r9), %r9
+
+	lea	64(%rdi), %rbx
+	and	$-64, %rbx
+
+/* Compute in %rdx how many bytes are left to copy after
+	the main loop stops.  */
+	mov 	%rbx, %rdx
+	sub 	%rdi, %rdx
+
+	cmp	%r9, %rbx
+	jb	L(mm_main_loop_backward)
+	jmp	L(mm_len_0_or_more_backward)
+
+	.p2align 4
+L(mm_main_loop_backward):
+
+	prefetcht0 -128(%r9, %r8)
+
+	movdqu	-64(%r9, %r8), %xmm0
+	movdqu	-48(%r9, %r8), %xmm1
+	movdqu	-32(%r9, %r8), %xmm2
+	movdqu	-16(%r9, %r8), %xmm3
+	movdqa	%xmm0, -64(%r9)
+	movdqa	%xmm1, -48(%r9)
+	movdqa	%xmm2, -32(%r9)
+	movdqa	%xmm3, -16(%r9)
+	lea	-64(%r9), %r9
+	cmp	%r9, %rbx
+	jb	L(mm_main_loop_backward)
+	jmp	L(mm_len_0_or_more_backward)
+
+/* Copy [0..16] and return.  */
+L(mm_len_0_16_bytes_backward):
+	testb	$24, %dl
+	jnz	L(mm_len_9_16_bytes_backward)
+	testb	$4, %dl
+	.p2align 4,,5
+	jnz	L(mm_len_5_8_bytes_backward)
+	test	%rdx, %rdx
+	.p2align 4,,2
+	je	L(mm_return)
+	testb	$2, %dl
+	.p2align 4,,1
+	jne	L(mm_len_3_4_bytes_backward)
+	movzbl	-1(%rsi,%rdx), %ebx
+	movzbl	(%rsi), %ecx
+	movb	%bl, -1(%rdi,%rdx)
+	movb	%cl, (%rdi)
+	jmp	L(mm_return)
+
+L(mm_len_3_4_bytes_backward):
+	movzwl	-2(%rsi,%rdx), %ebx
+	movzwl	(%rsi), %ecx
+	movw	%bx, -2(%rdi,%rdx)
+	movw	%cx, (%rdi)
+	jmp	L(mm_return)
+
+L(mm_len_9_16_bytes_backward):
+	movl	-4(%rsi,%rdx), %ebx
+	movl	-8(%rsi,%rdx), %ecx
+	movl	%ebx, -4(%rdi,%rdx)
+	movl	%ecx, -8(%rdi,%rdx)
+	sub	$8, %rdx
+	jmp	L(mm_len_0_16_bytes_backward)
+
+L(mm_len_5_8_bytes_backward):
+	movl	(%rsi), %ebx
+	movl	-4(%rsi,%rdx), %ecx
+	movl	%ebx, (%rdi)
+	movl	%ecx, -4(%rdi,%rdx)
+
+L(mm_return):
+	RETURN
+
+/* Big length copy forward part.  */
+
+L(mm_large_page_forward):
+/* Aligning the address of destination. We need to save
+	16 bits from the source in order not to overwrite them.  */
+
+	mov	%rsi, %r8
+	mov	%rdi, %r9
+
+	movdqu	(%rsi), %xmm0
+	movdqu	16(%rsi), %xmm1
+	movdqu	32(%rsi), %xmm2
+	movdqu	48(%rsi), %xmm3
+
+	lea	64(%r9), %rdi
+	and	$-64, %rdi      /* rdi = aligned dst */
+
+	sub	%r9, %rsi        /* rsi = diff */
+
+	movdqu	(%rdi, %rsi), %xmm4
+	movdqu	16(%rdi, %rsi), %xmm5
+	movdqu	32(%rdi, %rsi), %xmm6
+	movdqu	48(%rdi, %rsi), %xmm7
+
+	movdqu	%xmm0, (%r9)
+	movdqu	%xmm1, 16(%r9)
+	movdqu	%xmm2, 32(%r9)
+	movdqu	%xmm3, 48(%r9)
+	movntdq	%xmm4, (%rdi)
+	movntdq	%xmm5, 16(%rdi)
+	movntdq	%xmm6, 32(%rdi)
+	movntdq	%xmm7, 48(%rdi)
+	add	$64, %rdi
+
+	lea	(%r9, %rdx), %rbx
+	and	$-128, %rbx
+
+	cmp	%rdi, %rbx
+	jbe	L(mm_copy_remaining_forward)
+
+	.p2align 4
+L(mm_large_page_loop_forward):
+	movdqu	(%rdi, %rsi), %xmm0
+	movdqu	16(%rdi, %rsi), %xmm1
+	movdqu	32(%rdi, %rsi), %xmm2
+	movdqu	48(%rdi, %rsi), %xmm3
+	movdqu	64(%rdi, %rsi), %xmm4
+	movdqu	80(%rdi, %rsi), %xmm5
+	movdqu	96(%rdi, %rsi), %xmm6
+	movdqu	112(%rdi, %rsi), %xmm7
+	movntdq	%xmm0, (%rdi)
+	movntdq	%xmm1, 16(%rdi)
+	movntdq	%xmm2, 32(%rdi)
+	movntdq	%xmm3, 48(%rdi)
+	movntdq	%xmm4, 64(%rdi)
+	movntdq	%xmm5, 80(%rdi)
+	movntdq	%xmm6, 96(%rdi)
+	movntdq	%xmm7, 112(%rdi)
+	lea 	128(%rdi), %rdi
+	cmp	%rdi, %rbx
+	ja	L(mm_large_page_loop_forward)
+	sfence
+
+	add 	%r9, %rdx
+	sub 	%rdi, %rdx
+/* We copied all up till %rdi position in the dst.
+	In %rdx now is how many bytes are left to copy.
+	Now we need to advance %r8. */
+	lea 	(%rdi, %rsi), %r8
+
+	cmp	$64, %rdx
+	jb	L(mm_remaining_0_64_bytes_forward)
+
+	movdqu	(%r8), %xmm0
+	movdqu	16(%r8), %xmm1
+	movdqu	32(%r8), %xmm2
+	movdqu	48(%r8), %xmm3
+	movdqu	-64(%r8, %rdx), %xmm4
+	movdqu	-48(%r8, %rdx), %xmm5
+	movdqu	-32(%r8, %rdx), %xmm6
+	movdqu	-16(%r8, %rdx), %xmm7
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm1, 16(%rdi)
+	movdqu	%xmm2, 32(%rdi)
+	movdqu	%xmm3, 48(%rdi)
+	movdqu	%xmm4, -64(%rdi, %rdx)
+	movdqu	%xmm5, -48(%rdi, %rdx)
+	movdqu	%xmm6, -32(%rdi, %rdx)
+	movdqu	%xmm7, -16(%rdi, %rdx)
+	jmp	L(mm_return)
+
+
+/* Big length copy backward part.  */
+L(mm_large_page_backward):
+/* Aligning the address of destination. We need to save
+	16 bits from the source in order not to overwrite them.  */
+
+	movdqu	-16(%rsi, %rdx), %xmm0
+	movdqu	-32(%rsi, %rdx), %xmm1
+	movdqu	-48(%rsi, %rdx), %xmm2
+	movdqu	-64(%rsi, %rdx), %xmm3
+
+	lea	(%rdi, %rdx), %r9
+	and	$-64, %r9
+
+	mov 	%rsi, %r8
+	sub 	%rdi, %r8
+
+	movdqu	-16(%r9, %r8), %xmm4
+	movdqu	-32(%r9, %r8), %xmm5
+	movdqu	-48(%r9, %r8), %xmm6
+	movdqu	-64(%r9, %r8), %xmm7
+
+	movdqu	%xmm0, -16(%rdi, %rdx)
+	movdqu	%xmm1, -32(%rdi, %rdx)
+	movdqu	%xmm2, -48(%rdi, %rdx)
+	movdqu	%xmm3, -64(%rdi, %rdx)
+	movntdq	%xmm4, -16(%r9)
+	movntdq	%xmm5, -32(%r9)
+	movntdq	%xmm6, -48(%r9)
+	movntdq	%xmm7, -64(%r9)
+	lea 	-64(%r9), %r9
+
+	lea 	128(%rdi), %rbx
+	and 	$-64, %rbx
+
+/* Compute in %rdx how many bytes are left to copy after
+	the main loop stops.  */
+	mov 	%rbx, %rdx
+	sub 	%rdi, %rdx
+
+	cmp	%r9, %rbx
+	jae	L(mm_len_0_or_more_backward)
+
+	.p2align 4
+L(mm_large_page_loop_backward):
+	movdqu	-64(%r9, %r8), %xmm0
+	movdqu	-48(%r9, %r8), %xmm1
+	movdqu	-32(%r9, %r8), %xmm2
+	movdqu	-16(%r9, %r8), %xmm3
+	movntdq	%xmm0, -64(%r9)
+	movntdq	%xmm1, -48(%r9)
+	movntdq	%xmm2, -32(%r9)
+	movntdq	%xmm3, -16(%r9)
+	lea 	-64(%r9), %r9
+	cmp	%r9, %rbx
+	jb	L(mm_large_page_loop_backward)
+	jmp	L(mm_len_0_or_more_backward)
+
+END (MEMMOVE)
--- a/libc/arch-x86_64/string/sse2-memset-slm.S
+++ b/libc/arch-x86_64/string/sse2-memset-slm.S
@@ -0,0 +1,173 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "cache.h"
+
+#ifndef MEMSET
+# define MEMSET		memset
+#endif
+
+#ifndef L
+# define L(label)	.L##label
+#endif
+
+#ifndef ALIGN
+# define ALIGN(n)	.p2align n
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc			.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc			.cfi_endproc
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)			\
+	.type name,  @function;	\
+	.globl name;			\
+name:					\
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)			\
+	cfi_endproc;			\
+	.size name, .-name
+#endif
+
+	.section .text.sse2,"ax",@progbits
+ENTRY (MEMSET)
+	movq	%rdi, %rax
+#ifdef USE_AS_BZERO_P
+	mov	%rsi, %rdx
+	xor	%rcx, %rcx
+#else
+	and	$0xff, %rsi
+	mov	$0x0101010101010101, %rcx
+	imul	%rsi, %rcx
+#endif
+	cmpq	$16, %rdx
+	jae	L(16bytesormore)
+	testb	$8, %dl
+	jnz	L(8_15bytes)
+	testb	$4, %dl
+	jnz	L(4_7bytes)
+	testb	$2, %dl
+	jnz	L(2_3bytes)
+	testb	$1, %dl
+	jz	L(return)
+	movb	%cl, (%rdi)
+L(return):
+	ret
+
+L(8_15bytes):
+	movq	%rcx, (%rdi)
+	movq	%rcx, -8(%rdi, %rdx)
+	ret
+
+L(4_7bytes):
+	movl	%ecx, (%rdi)
+	movl	%ecx, -4(%rdi, %rdx)
+	ret
+
+L(2_3bytes):
+	movw	%cx, (%rdi)
+	movw	%cx, -2(%rdi, %rdx)
+	ret
+
+	ALIGN (4)
+L(16bytesormore):
+#ifdef USE_AS_BZERO_P
+	pxor	%xmm0, %xmm0
+#else
+	movd	%rcx, %xmm0
+	pshufd	$0, %xmm0, %xmm0
+#endif
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm0, -16(%rdi, %rdx)
+	cmpq	$32, %rdx
+	jbe	L(32bytesless)
+	movdqu	%xmm0, 16(%rdi)
+	movdqu	%xmm0, -32(%rdi, %rdx)
+	cmpq	$64, %rdx
+	jbe	L(64bytesless)
+	movdqu	%xmm0, 32(%rdi)
+	movdqu	%xmm0, 48(%rdi)
+	movdqu	%xmm0, -64(%rdi, %rdx)
+	movdqu	%xmm0, -48(%rdi, %rdx)
+	cmpq	$128, %rdx
+	ja	L(128bytesmore)
+L(32bytesless):
+L(64bytesless):
+	ret
+
+	ALIGN (4)
+L(128bytesmore):
+	leaq	64(%rdi), %rcx
+	andq	$-64, %rcx
+	movq	%rdx, %r8
+	addq	%rdi, %rdx
+	andq	$-64, %rdx
+	cmpq	%rcx, %rdx
+	je	L(return)
+
+#ifdef SHARED_CACHE_SIZE
+	cmp	$SHARED_CACHE_SIZE, %r8
+#else
+	cmp	__x86_64_shared_cache_size(%rip), %r8
+#endif
+	ja	L(128bytesmore_nt)
+
+	ALIGN (4)
+L(128bytesmore_normal):
+	movdqa	%xmm0, (%rcx)
+	movaps	%xmm0, 0x10(%rcx)
+	movaps	%xmm0, 0x20(%rcx)
+	movaps	%xmm0, 0x30(%rcx)
+	addq	$64, %rcx
+	cmpq	%rcx, %rdx
+	jne	L(128bytesmore_normal)
+	ret
+
+	ALIGN (4)
+L(128bytesmore_nt):
+	movntdq	%xmm0, (%rcx)
+	movntdq	%xmm0, 0x10(%rcx)
+	movntdq	%xmm0, 0x20(%rcx)
+	movntdq	%xmm0, 0x30(%rcx)
+	leaq	64(%rcx), %rcx
+	cmpq	%rcx, %rdx
+	jne	L(128bytesmore_nt)
+	sfence
+	ret
+
+END (MEMSET)
--- a/libc/arch-x86_64/string/sse2-stpcpy-slm.S
+++ b/libc/arch-x86_64/string/sse2-stpcpy-slm.S
@@ -0,0 +1,33 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#define USE_AS_STPCPY
+#define STRCPY		stpcpy
+#include "sse2-strcpy-slm.S"
--- a/libc/arch-x86_64/string/sse2-stpncpy-slm.S
+++ b/libc/arch-x86_64/string/sse2-stpncpy-slm.S
@@ -0,0 +1,34 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#define USE_AS_STRNCPY
+#define USE_AS_STPCPY
+#define STRCPY		stpncpy
+#include "sse2-strcpy-slm.S"
--- a/libc/arch-x86_64/string/sse2-strcat-slm.S
+++ b/libc/arch-x86_64/string/sse2-strcat-slm.S
@@ -0,0 +1,87 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef STRCAT
+# define STRCAT		strcat
+#endif
+
+#ifndef L
+# define L(label)		.L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc		 .cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc		.cfi_endproc
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)		\
+	.type name,  @function;		\
+	.globl name;		\
+	.p2align 4;		\
+name:		\
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)		\
+	cfi_endproc;		\
+	.size name, .-name
+#endif
+
+#define USE_AS_STRCAT
+
+.text
+ENTRY (STRCAT)
+	mov	%rdi, %r9
+#ifdef USE_AS_STRNCAT
+	mov	%rdx, %r8
+#endif
+
+#define RETURN jmp L(Strcpy)
+#include "sse2-strlen-slm.S"
+
+#undef RETURN
+#define RETURN ret
+
+L(Strcpy):
+	lea	(%r9, %rax), %rdi
+	mov	%rsi, %rcx
+	mov	%r9, %rax	/* save result */
+
+#ifdef USE_AS_STRNCAT
+	test	%r8, %r8
+	jz	L(ExitZero)
+# define USE_AS_STRNCPY
+#endif
+#include "sse2-strcpy-slm.S"
--- a/libc/arch-x86_64/string/sse2-strcpy-slm.S
+++ b/libc/arch-x86_64/string/sse2-strcpy-slm.S
--- a/libc/arch-x86_64/string/sse2-strlen-slm.S
+++ b/libc/arch-x86_64/string/sse2-strlen-slm.S
@@ -0,0 +1,294 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef USE_AS_STRCAT
+
+#ifndef STRLEN
+# define STRLEN		strlen
+#endif
+
+#ifndef L
+# define L(label)	.L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc			.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc			.cfi_endproc
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)			\
+	.type name,  @function; 	\
+	.globl name;			\
+	.p2align 4;			\
+name:					\
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)			\
+	cfi_endproc;			\
+	.size name, .-name
+#endif
+#define RETURN ret
+	.section .text.sse2,"ax",@progbits
+ENTRY (STRLEN)
+/* end ifndef USE_AS_STRCAT */
+#endif
+	xor	%rax, %rax
+	mov	%edi, %ecx
+	and	$0x3f, %ecx
+	pxor	%xmm0, %xmm0
+	cmp	$0x30, %ecx
+	ja	L(next)
+	movdqu	(%rdi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit_less16)
+	mov	%rdi, %rax
+	and	$-16, %rax
+	jmp	L(align16_start)
+L(next):
+	mov	%rdi, %rax
+	and	$-16, %rax
+	pcmpeqb	(%rax), %xmm0
+	mov	$-1, %r10d
+	sub	%rax, %rcx
+	shl	%cl, %r10d
+	pmovmskb %xmm0, %edx
+	and	%r10d, %edx
+	jnz	L(exit)
+L(align16_start):
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
+	pcmpeqb	16(%rax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	jnz	L(exit64)
+
+	pcmpeqb	80(%rax), %xmm0
+	add	$64, %rax
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	jnz	L(exit64)
+
+	pcmpeqb	80(%rax), %xmm0
+	add	$64, %rax
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	jnz	L(exit64)
+	
+	pcmpeqb	80(%rax), %xmm0
+	add	$64, %rax
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	jnz	L(exit64)
+	
+
+	test	$0x3f, %rax
+	jz	L(align64_loop)
+
+	pcmpeqb	80(%rax), %xmm0
+	add	$80, %rax
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	test	$0x3f, %rax
+	jz	L(align64_loop)
+
+	pcmpeqb	16(%rax), %xmm1
+	add	$16, %rax
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	test	$0x3f, %rax
+	jz	L(align64_loop)
+
+	pcmpeqb	16(%rax), %xmm2
+	add	$16, %rax
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	test	$0x3f, %rax
+	jz	L(align64_loop)
+
+	pcmpeqb	16(%rax), %xmm3
+	add	$16, %rax
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	add	$16, %rax
+	.p2align 4
+	L(align64_loop):
+	movaps	(%rax),	%xmm4
+	pminub	16(%rax), 	%xmm4
+	movaps	32(%rax), 	%xmm5
+	pminub	48(%rax), 	%xmm5
+	add	$64, 	%rax
+	pminub	%xmm4,	%xmm5
+	pcmpeqb	%xmm0,	%xmm5
+	pmovmskb %xmm5,	%edx
+	test	%edx,	%edx
+	jz	L(align64_loop)
+
+
+	pcmpeqb	-64(%rax), %xmm0
+	sub	$80, 	%rax
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$64, %rax
+	RETURN
+
+	.p2align 4
+L(exit):
+	sub	%rdi, %rax
+L(exit_less16):
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	RETURN
+	.p2align 4
+L(exit16):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$16, %rax
+	RETURN
+	.p2align 4
+L(exit32):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$32, %rax
+	RETURN
+	.p2align 4
+L(exit48):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$48, %rax
+	RETURN
+	.p2align 4
+L(exit64):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$64, %rax
+#ifndef USE_AS_STRCAT
+	RETURN
+
+END (STRLEN)
+#endif
--- a/libc/arch-x86_64/string/sse2-strncat-slm.S
+++ b/libc/arch-x86_64/string/sse2-strncat-slm.S
@@ -0,0 +1,33 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#define USE_AS_STRNCAT
+#define STRCAT		strncat
+#include "sse2-strcat-slm.S"
--- a/libc/arch-x86_64/string/sse2-strncpy-slm.S
+++ b/libc/arch-x86_64/string/sse2-strncpy-slm.S
@@ -0,0 +1,33 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#define USE_AS_STRNCPY
+#define STRCPY		strncpy
+#include "sse2-strcpy-slm.S"
--- a/libc/arch-x86_64/string/sse4-memcmp-slm.S
+++ b/libc/arch-x86_64/string/sse4-memcmp-slm.S
--- a/libc/arch-x86_64/string/ssse3-strcmp-slm.S
+++ b/libc/arch-x86_64/string/ssse3-strcmp-slm.S
--- a/libc/arch-x86_64/string/ssse3-strncmp-slm.S
+++ b/libc/arch-x86_64/string/ssse3-strncmp-slm.S
@@ -0,0 +1,33 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#define USE_AS_STRNCMP
+#define STRCMP		strncmp
+#include "ssse3-strcmp-slm.S"
--- a/libc/arch-x86_64/x86_64.mk
+++ b/libc/arch-x86_64/x86_64.mk
@@ -3,11 +3,7 @@
 libc_common_src_files_x86_64 := \
    bionic/index.cpp \
    bionic/memchr.c \
-    bionic/memcmp.c \
-    bionic/memcpy.cpp \
-    bionic/memmove.c \
    bionic/memrchr.c \
-    bionic/memset.c \
    bionic/strchr.cpp \
    bionic/strnlen.c \
    bionic/strrchr.cpp \
@@ -18,18 +14,8 @@ libc_common_src_files_x86_64 := \
    upstream-freebsd/lib/libc/string/wcslen.c \
    upstream-freebsd/lib/libc/string/wcsrchr.c \
    upstream-freebsd/lib/libc/string/wmemcmp.c \
-    upstream-openbsd/lib/libc/string/bcopy.c \
-    upstream-openbsd/lib/libc/string/stpcpy.c \
-    upstream-openbsd/lib/libc/string/stpncpy.c \
-    upstream-openbsd/lib/libc/string/strcat.c \
-    upstream-openbsd/lib/libc/string/strcmp.c \
-    upstream-openbsd/lib/libc/string/strcpy.c \
    upstream-openbsd/lib/libc/string/strlcat.c \
    upstream-openbsd/lib/libc/string/strlcpy.c \
-    upstream-openbsd/lib/libc/string/strlen.c \
-    upstream-openbsd/lib/libc/string/strncat.c \
-    upstream-openbsd/lib/libc/string/strncmp.c \
-    upstream-openbsd/lib/libc/string/strncpy.c \

 # Fortify implementations of libc functions.
 libc_common_src_files_x86_64 += \
@@ -55,6 +41,23 @@ libc_bionic_src_files_x86_64 := \
    arch-x86_64/bionic/vfork.S \
    bionic/__memcmp16.cpp \

+libc_bionic_src_files_x86_64 += \
+    arch-x86_64/string/sse2-bcopy-slm.S \
+    arch-x86_64/string/sse2-bzero-slm.S \
+    arch-x86_64/string/sse2-memcpy-slm.S \
+    arch-x86_64/string/sse2-memmove-slm.S \
+    arch-x86_64/string/sse2-memset-slm.S \
+    arch-x86_64/string/sse2-stpcpy-slm.S \
+    arch-x86_64/string/sse2-stpncpy-slm.S \
+    arch-x86_64/string/sse2-strcat-slm.S \
+    arch-x86_64/string/sse2-strcpy-slm.S \
+    arch-x86_64/string/sse2-strlen-slm.S \
+    arch-x86_64/string/sse2-strncat-slm.S \
+    arch-x86_64/string/sse2-strncpy-slm.S \
+    arch-x86_64/string/sse4-memcmp-slm.S \
+    arch-x86_64/string/ssse3-strcmp-slm.S \
+    arch-x86_64/string/ssse3-strncmp-slm.S \
+
 libc_crt_target_cflags_x86_64 += \
    -m64 \
    -I$(LOCAL_PATH)/arch-x86_64/include