diff --git a/libc/Android.mk b/libc/Android.mk index 2ec12811c..a2a106aa7 100644 --- a/libc/Android.mk +++ b/libc/Android.mk @@ -448,8 +448,6 @@ libc_upstream_openbsd_src_files := \ upstream-openbsd/lib/libc/stdlib/strtoull.c \ upstream-openbsd/lib/libc/stdlib/strtoumax.c \ upstream-openbsd/lib/libc/stdlib/system.c \ - upstream-openbsd/lib/libc/string/stpcpy.c \ - upstream-openbsd/lib/libc/string/stpncpy.c \ upstream-openbsd/lib/libc/string/strcasecmp.c \ upstream-openbsd/lib/libc/string/strcspn.c \ upstream-openbsd/lib/libc/string/strdup.c \ diff --git a/libc/arch-arm/arm.mk b/libc/arch-arm/arm.mk index 2dbcb568a..06b167575 100644 --- a/libc/arch-arm/arm.mk +++ b/libc/arch-arm/arm.mk @@ -26,6 +26,8 @@ libc_common_src_files_arm += \ upstream-freebsd/lib/libc/string/wcsrchr.c \ upstream-freebsd/lib/libc/string/wmemcmp.c \ upstream-openbsd/lib/libc/string/bcopy.c \ + upstream-openbsd/lib/libc/string/stpcpy.c \ + upstream-openbsd/lib/libc/string/stpncpy.c \ upstream-openbsd/lib/libc/string/strlcat.c \ upstream-openbsd/lib/libc/string/strlcpy.c \ upstream-openbsd/lib/libc/string/strncat.c \ diff --git a/libc/arch-arm64/arm64.mk b/libc/arch-arm64/arm64.mk index 14cc3f48d..7a9eb4ebd 100644 --- a/libc/arch-arm64/arm64.mk +++ b/libc/arch-arm64/arm64.mk @@ -15,6 +15,8 @@ libc_common_src_files_arm64 := \ upstream-freebsd/lib/libc/string/wcsrchr.c \ upstream-freebsd/lib/libc/string/wmemcmp.c \ upstream-openbsd/lib/libc/string/bcopy.c \ + upstream-openbsd/lib/libc/string/stpcpy.c \ + upstream-openbsd/lib/libc/string/stpncpy.c \ upstream-openbsd/lib/libc/string/strcat.c \ upstream-openbsd/lib/libc/string/strcpy.c \ upstream-openbsd/lib/libc/string/strlcat.c \ diff --git a/libc/arch-mips/mips.mk b/libc/arch-mips/mips.mk index bffd89778..d7d1df417 100644 --- a/libc/arch-mips/mips.mk +++ b/libc/arch-mips/mips.mk @@ -27,6 +27,8 @@ libc_common_src_files_mips += \ upstream-freebsd/lib/libc/string/wcsrchr.c \ upstream-freebsd/lib/libc/string/wmemcmp.c \ upstream-openbsd/lib/libc/string/bcopy.c \ + upstream-openbsd/lib/libc/string/stpcpy.c \ + upstream-openbsd/lib/libc/string/stpncpy.c \ upstream-openbsd/lib/libc/string/strcat.c \ upstream-openbsd/lib/libc/string/strcmp.c \ upstream-openbsd/lib/libc/string/strcpy.c \ diff --git a/libc/arch-mips64/mips64.mk b/libc/arch-mips64/mips64.mk index cc177cf74..b6e0209a6 100644 --- a/libc/arch-mips64/mips64.mk +++ b/libc/arch-mips64/mips64.mk @@ -17,6 +17,8 @@ libc_common_src_files_mips64 := \ upstream-freebsd/lib/libc/string/wcsrchr.c \ upstream-freebsd/lib/libc/string/wmemcmp.c \ upstream-openbsd/lib/libc/string/bcopy.c \ + upstream-openbsd/lib/libc/string/stpcpy.c \ + upstream-openbsd/lib/libc/string/stpncpy.c \ upstream-openbsd/lib/libc/string/strcat.c \ upstream-openbsd/lib/libc/string/strcmp.c \ upstream-openbsd/lib/libc/string/strcpy.c \ diff --git a/libc/arch-x86/atom/atom.mk b/libc/arch-x86/atom/atom.mk new file mode 100644 index 000000000..bf408b466 --- /dev/null +++ b/libc/arch-x86/atom/atom.mk @@ -0,0 +1,34 @@ +libc_bionic_src_files_x86 += \ + arch-x86/atom/string/sse2-bzero-atom.S \ + arch-x86/atom/string/sse2-index-atom.S \ + arch-x86/atom/string/sse2-memchr-atom.S \ + arch-x86/atom/string/sse2-memrchr-atom.S \ + arch-x86/atom/string/sse2-memset-atom.S \ + arch-x86/atom/string/sse2-strchr-atom.S \ + arch-x86/atom/string/sse2-strlen-atom.S \ + arch-x86/atom/string/sse2-strnlen-atom.S \ + arch-x86/atom/string/sse2-strrchr-atom.S \ + arch-x86/atom/string/sse2-wcschr-atom.S \ + arch-x86/atom/string/sse2-wcsrchr-atom.S \ + 
arch-x86/atom/string/sse2-wcslen-atom.S \ + arch-x86/atom/string/sse2-wcscmp-atom.S \ + arch-x86/atom/string/ssse3-bcopy-atom.S \ + arch-x86/atom/string/ssse3-memcmp-atom.S \ + arch-x86/atom/string/ssse3-memcmp16-atom.S \ + arch-x86/atom/string/ssse3-memcpy-atom.S \ + arch-x86/atom/string/ssse3-memmove-atom.S \ + arch-x86/atom/string/ssse3-strcat-atom.S \ + arch-x86/atom/string/ssse3-strcmp-atom.S \ + arch-x86/atom/string/ssse3-strcpy-atom.S \ + arch-x86/atom/string/ssse3-strlcat-atom.S \ + arch-x86/atom/string/ssse3-strlcpy-atom.S \ + arch-x86/atom/string/ssse3-strncat-atom.S \ + arch-x86/atom/string/ssse3-strncmp-atom.S \ + arch-x86/atom/string/ssse3-strncpy-atom.S \ + arch-x86/atom/string/ssse3-wcscat-atom.S \ + arch-x86/atom/string/ssse3-wcscpy-atom.S \ + arch-x86/atom/string/ssse3-wmemcmp-atom.S + +libc_bionic_src_files_x86 += \ + arch-x86/silvermont/string/sse2-stpcpy-slm.S \ + arch-x86/silvermont/string/sse2-stpncpy-slm.S diff --git a/libc/arch-x86/atom/string/cache.h b/libc/arch-x86/atom/string/cache.h new file mode 100644 index 000000000..823bb1e39 --- /dev/null +++ b/libc/arch-x86/atom/string/cache.h @@ -0,0 +1,36 @@ +/* +Copyright (c) 2010, Intel Corporation +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +/* Values are optimized for Atom */ +#define SHARED_CACHE_SIZE (512*1024) /* Atom L2 Cache */ +#define DATA_CACHE_SIZE (24*1024) /* Atom L1 Data Cache */ + +#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2) +#define DATA_CACHE_SIZE_HALF (DATA_CACHE_SIZE / 2) diff --git a/libc/arch-x86/string/sse2-bzero-atom.S b/libc/arch-x86/atom/string/sse2-bzero-atom.S similarity index 100% rename from libc/arch-x86/string/sse2-bzero-atom.S rename to libc/arch-x86/atom/string/sse2-bzero-atom.S diff --git a/libc/arch-x86/string/sse2-index-atom.S b/libc/arch-x86/atom/string/sse2-index-atom.S similarity index 100% rename from libc/arch-x86/string/sse2-index-atom.S rename to libc/arch-x86/atom/string/sse2-index-atom.S diff --git a/libc/arch-x86/string/sse2-memchr-atom.S b/libc/arch-x86/atom/string/sse2-memchr-atom.S similarity index 100% rename from libc/arch-x86/string/sse2-memchr-atom.S rename to libc/arch-x86/atom/string/sse2-memchr-atom.S diff --git a/libc/arch-x86/string/sse2-memrchr-atom.S b/libc/arch-x86/atom/string/sse2-memrchr-atom.S similarity index 100% rename from libc/arch-x86/string/sse2-memrchr-atom.S rename to libc/arch-x86/atom/string/sse2-memrchr-atom.S diff --git a/libc/arch-x86/string/sse2-memset-atom.S b/libc/arch-x86/atom/string/sse2-memset-atom.S similarity index 98% rename from libc/arch-x86/string/sse2-memset-atom.S rename to libc/arch-x86/atom/string/sse2-memset-atom.S index a54bf5145..b0963a187 100644 --- a/libc/arch-x86/string/sse2-memset-atom.S +++ b/libc/arch-x86/atom/string/sse2-memset-atom.S @@ -29,7 +29,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "cache.h" -#undef __i686 #ifndef L # define L(label) .L##label @@ -107,7 +106,7 @@ name: \ jump table with relative offsets. */ # define BRANCH_TO_JMPTBL_ENTRY(TABLE) \ /* We first load PC into EBX. */ \ - call __i686.get_pc_thunk.bx; \ + call __x86.get_pc_thunk.bx; \ /* Get the address of the jump table. */ \ add $(TABLE - .), %ebx; \ /* Get the entry and convert the relative offset to the \ @@ -117,12 +116,12 @@ name: \ /* We loaded the jump table and adjuested EDX. Go. 
*/ \ jmp *%ebx - .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits - .globl __i686.get_pc_thunk.bx - .hidden __i686.get_pc_thunk.bx + .section .gnu.linkonce.t.__x86.get_pc_thunk.bx,"ax",@progbits + .globl __x86.get_pc_thunk.bx + .hidden __x86.get_pc_thunk.bx ALIGN (4) - .type __i686.get_pc_thunk.bx,@function -__i686.get_pc_thunk.bx: + .type __x86.get_pc_thunk.bx,@function +__x86.get_pc_thunk.bx: movl (%esp), %ebx ret #else @@ -321,7 +320,7 @@ L(128bytesormore): mov $SHARED_CACHE_SIZE, %ebx #else # if (defined SHARED || defined __PIC__) - call __i686.get_pc_thunk.bx + call __x86.get_pc_thunk.bx add $_GLOBAL_OFFSET_TABLE_, %ebx mov __x86_shared_cache_size@GOTOFF(%ebx), %ebx # else @@ -340,7 +339,7 @@ L(128bytesormore): #else # if (defined SHARED || defined __PIC__) # define RESTORE_EBX_STATE - call __i686.get_pc_thunk.bx + call __x86.get_pc_thunk.bx add $_GLOBAL_OFFSET_TABLE_, %ebx cmp __x86_data_cache_size@GOTOFF(%ebx), %ecx # else diff --git a/libc/arch-x86/string/sse2-strchr-atom.S b/libc/arch-x86/atom/string/sse2-strchr-atom.S similarity index 100% rename from libc/arch-x86/string/sse2-strchr-atom.S rename to libc/arch-x86/atom/string/sse2-strchr-atom.S diff --git a/libc/arch-x86/string/sse2-strlen-atom.S b/libc/arch-x86/atom/string/sse2-strlen-atom.S similarity index 100% rename from libc/arch-x86/string/sse2-strlen-atom.S rename to libc/arch-x86/atom/string/sse2-strlen-atom.S diff --git a/libc/arch-x86/string/sse2-strnlen-atom.S b/libc/arch-x86/atom/string/sse2-strnlen-atom.S similarity index 100% rename from libc/arch-x86/string/sse2-strnlen-atom.S rename to libc/arch-x86/atom/string/sse2-strnlen-atom.S diff --git a/libc/arch-x86/string/sse2-strrchr-atom.S b/libc/arch-x86/atom/string/sse2-strrchr-atom.S similarity index 100% rename from libc/arch-x86/string/sse2-strrchr-atom.S rename to libc/arch-x86/atom/string/sse2-strrchr-atom.S diff --git a/libc/arch-x86/string/sse2-wcschr-atom.S b/libc/arch-x86/atom/string/sse2-wcschr-atom.S similarity index 100% rename from libc/arch-x86/string/sse2-wcschr-atom.S rename to libc/arch-x86/atom/string/sse2-wcschr-atom.S diff --git a/libc/arch-x86/string/sse2-wcscmp-atom.S b/libc/arch-x86/atom/string/sse2-wcscmp-atom.S similarity index 100% rename from libc/arch-x86/string/sse2-wcscmp-atom.S rename to libc/arch-x86/atom/string/sse2-wcscmp-atom.S diff --git a/libc/arch-x86/string/sse2-wcslen-atom.S b/libc/arch-x86/atom/string/sse2-wcslen-atom.S similarity index 100% rename from libc/arch-x86/string/sse2-wcslen-atom.S rename to libc/arch-x86/atom/string/sse2-wcslen-atom.S diff --git a/libc/arch-x86/string/sse2-wcsrchr-atom.S b/libc/arch-x86/atom/string/sse2-wcsrchr-atom.S similarity index 100% rename from libc/arch-x86/string/sse2-wcsrchr-atom.S rename to libc/arch-x86/atom/string/sse2-wcsrchr-atom.S diff --git a/libc/arch-x86/string/ssse3-bcopy-atom.S b/libc/arch-x86/atom/string/ssse3-bcopy-atom.S similarity index 100% rename from libc/arch-x86/string/ssse3-bcopy-atom.S rename to libc/arch-x86/atom/string/ssse3-bcopy-atom.S diff --git a/libc/arch-x86/string/ssse3-memcmp-atom.S b/libc/arch-x86/atom/string/ssse3-memcmp-atom.S similarity index 100% rename from libc/arch-x86/string/ssse3-memcmp-atom.S rename to libc/arch-x86/atom/string/ssse3-memcmp-atom.S diff --git a/libc/arch-x86/string/ssse3-memcmp16-atom.S b/libc/arch-x86/atom/string/ssse3-memcmp16-atom.S similarity index 100% rename from libc/arch-x86/string/ssse3-memcmp16-atom.S rename to libc/arch-x86/atom/string/ssse3-memcmp16-atom.S diff --git 
a/libc/arch-x86/string/ssse3-memcpy-atom.S b/libc/arch-x86/atom/string/ssse3-memcpy-atom.S similarity index 99% rename from libc/arch-x86/string/ssse3-memcpy-atom.S rename to libc/arch-x86/atom/string/ssse3-memcpy-atom.S index 1080a3849..ac5ec2d4b 100644 --- a/libc/arch-x86/string/ssse3-memcpy-atom.S +++ b/libc/arch-x86/atom/string/ssse3-memcpy-atom.S @@ -29,7 +29,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "cache.h" -#undef __i686 #ifndef MEMCPY # define MEMCPY memcpy @@ -101,9 +100,8 @@ name: \ # define RETURN_END POP (%ebx); ret # define RETURN RETURN_END; CFI_PUSH (%ebx) # define JMPTBL(I, B) I - B -# undef __i686 -# define SETUP_PIC_REG(x) call __i686.get_pc_thunk.x +# define SETUP_PIC_REG(x) call __x86.get_pc_thunk.x /* Load an entry in a jump table into EBX and branch to it. TABLE is a jump table with relative offsets. INDEX is a register contains the diff --git a/libc/arch-x86/string/ssse3-memmove-atom.S b/libc/arch-x86/atom/string/ssse3-memmove-atom.S similarity index 100% rename from libc/arch-x86/string/ssse3-memmove-atom.S rename to libc/arch-x86/atom/string/ssse3-memmove-atom.S diff --git a/libc/arch-x86/string/ssse3-strcat-atom.S b/libc/arch-x86/atom/string/ssse3-strcat-atom.S similarity index 100% rename from libc/arch-x86/string/ssse3-strcat-atom.S rename to libc/arch-x86/atom/string/ssse3-strcat-atom.S diff --git a/libc/arch-x86/string/ssse3-strcmp-atom.S b/libc/arch-x86/atom/string/ssse3-strcmp-atom.S similarity index 100% rename from libc/arch-x86/string/ssse3-strcmp-atom.S rename to libc/arch-x86/atom/string/ssse3-strcmp-atom.S diff --git a/libc/arch-x86/string/ssse3-strcpy-atom.S b/libc/arch-x86/atom/string/ssse3-strcpy-atom.S similarity index 100% rename from libc/arch-x86/string/ssse3-strcpy-atom.S rename to libc/arch-x86/atom/string/ssse3-strcpy-atom.S diff --git a/libc/arch-x86/string/ssse3-strlcat-atom.S b/libc/arch-x86/atom/string/ssse3-strlcat-atom.S similarity index 100% rename from libc/arch-x86/string/ssse3-strlcat-atom.S rename to libc/arch-x86/atom/string/ssse3-strlcat-atom.S diff --git a/libc/arch-x86/string/ssse3-strlcpy-atom.S b/libc/arch-x86/atom/string/ssse3-strlcpy-atom.S similarity index 100% rename from libc/arch-x86/string/ssse3-strlcpy-atom.S rename to libc/arch-x86/atom/string/ssse3-strlcpy-atom.S diff --git a/libc/arch-x86/string/ssse3-strncat-atom.S b/libc/arch-x86/atom/string/ssse3-strncat-atom.S similarity index 100% rename from libc/arch-x86/string/ssse3-strncat-atom.S rename to libc/arch-x86/atom/string/ssse3-strncat-atom.S diff --git a/libc/arch-x86/string/ssse3-strncmp-atom.S b/libc/arch-x86/atom/string/ssse3-strncmp-atom.S similarity index 100% rename from libc/arch-x86/string/ssse3-strncmp-atom.S rename to libc/arch-x86/atom/string/ssse3-strncmp-atom.S diff --git a/libc/arch-x86/string/ssse3-strncpy-atom.S b/libc/arch-x86/atom/string/ssse3-strncpy-atom.S similarity index 100% rename from libc/arch-x86/string/ssse3-strncpy-atom.S rename to libc/arch-x86/atom/string/ssse3-strncpy-atom.S diff --git a/libc/arch-x86/string/ssse3-wcscat-atom.S b/libc/arch-x86/atom/string/ssse3-wcscat-atom.S similarity index 100% rename from libc/arch-x86/string/ssse3-wcscat-atom.S rename to libc/arch-x86/atom/string/ssse3-wcscat-atom.S diff --git a/libc/arch-x86/string/ssse3-wcscpy-atom.S b/libc/arch-x86/atom/string/ssse3-wcscpy-atom.S similarity index 100% rename from libc/arch-x86/string/ssse3-wcscpy-atom.S rename to libc/arch-x86/atom/string/ssse3-wcscpy-atom.S diff --git a/libc/arch-x86/string/ssse3-wmemcmp-atom.S 
b/libc/arch-x86/atom/string/ssse3-wmemcmp-atom.S similarity index 100% rename from libc/arch-x86/string/ssse3-wmemcmp-atom.S rename to libc/arch-x86/atom/string/ssse3-wmemcmp-atom.S diff --git a/libc/arch-x86/generic/generic.mk b/libc/arch-x86/generic/generic.mk new file mode 100644 index 000000000..c8b40ee0c --- /dev/null +++ b/libc/arch-x86/generic/generic.mk @@ -0,0 +1,55 @@ +libc_bionic_src_files_x86 += \ + arch-x86/atom/string/sse2-index-atom.S \ + arch-x86/atom/string/sse2-memchr-atom.S \ + arch-x86/atom/string/sse2-memrchr-atom.S \ + arch-x86/atom/string/sse2-strchr-atom.S \ + arch-x86/atom/string/sse2-strnlen-atom.S \ + arch-x86/atom/string/sse2-strrchr-atom.S \ + arch-x86/atom/string/sse2-wcschr-atom.S \ + arch-x86/atom/string/sse2-wcsrchr-atom.S \ + arch-x86/atom/string/sse2-wcslen-atom.S \ + arch-x86/atom/string/sse2-wcscmp-atom.S \ + arch-x86/silvermont/string/sse2-bcopy-slm.S \ + arch-x86/silvermont/string/sse2-bzero-slm.S \ + arch-x86/silvermont/string/sse2-memcpy-slm.S \ + arch-x86/silvermont/string/sse2-memmove-slm.S \ + arch-x86/silvermont/string/sse2-memset-slm.S \ + arch-x86/silvermont/string/sse2-stpcpy-slm.S \ + arch-x86/silvermont/string/sse2-stpncpy-slm.S \ + arch-x86/silvermont/string/sse2-strcpy-slm.S \ + arch-x86/silvermont/string/sse2-strlen-slm.S \ + arch-x86/silvermont/string/sse2-strncpy-slm.S + +ifeq ($(ARCH_X86_HAVE_SSSE3),true) +libc_bionic_src_files_x86 += \ + arch-x86/atom/string/ssse3-strncat-atom.S \ + arch-x86/atom/string/ssse3-strlcat-atom.S \ + arch-x86/atom/string/ssse3-strlcpy-atom.S \ + arch-x86/atom/string/ssse3-strcmp-atom.S \ + arch-x86/atom/string/ssse3-strncmp-atom.S \ + arch-x86/atom/string/ssse3-strcat-atom.S \ + arch-x86/atom/string/ssse3-memcmp16-atom.S \ + arch-x86/atom/string/ssse3-wcscat-atom.S \ + arch-x86/atom/string/ssse3-wcscpy-atom.S +else +libc_bionic_src_files_x86 += \ + arch-x86/generic/string/strcmp.S \ + arch-x86/generic/string/strncmp.S \ + arch-x86/generic/string/strcat.S \ + bionic/__memcmp16.cpp \ + upstream-freebsd/lib/libc/string/wcscpy.c \ + upstream-freebsd/lib/libc/string/wcscat.c \ + upstream-openbsd/lib/libc/string/strlcat.c \ + upstream-openbsd/lib/libc/string/strlcpy.c \ + upstream-openbsd/lib/libc/string/strncat.c +endif + +ifeq ($(ARCH_X86_HAVE_SSE4),true) + libc_bionic_src_files_x86 += \ + arch-x86/silvermont/string/sse4-memcmp-slm.S \ + arch-x86/silvermont/string/sse4-wmemcmp-slm.S +else +libc_bionic_src_files_x86 += \ + arch-x86/generic/string/memcmp.S \ + upstream-freebsd/lib/libc/string/wmemcmp.c +endif diff --git a/libc/arch-x86/string/bcopy.S b/libc/arch-x86/generic/string/bcopy.S similarity index 100% rename from libc/arch-x86/string/bcopy.S rename to libc/arch-x86/generic/string/bcopy.S diff --git a/libc/arch-x86/string/memcmp.S b/libc/arch-x86/generic/string/memcmp.S similarity index 100% rename from libc/arch-x86/string/memcmp.S rename to libc/arch-x86/generic/string/memcmp.S diff --git a/libc/arch-x86/string/memcpy.S b/libc/arch-x86/generic/string/memcpy.S similarity index 100% rename from libc/arch-x86/string/memcpy.S rename to libc/arch-x86/generic/string/memcpy.S diff --git a/libc/arch-x86/string/memmove.S b/libc/arch-x86/generic/string/memmove.S similarity index 100% rename from libc/arch-x86/string/memmove.S rename to libc/arch-x86/generic/string/memmove.S diff --git a/libc/arch-x86/string/strcat.S b/libc/arch-x86/generic/string/strcat.S similarity index 100% rename from libc/arch-x86/string/strcat.S rename to libc/arch-x86/generic/string/strcat.S diff --git a/libc/arch-x86/string/strcmp.S 
b/libc/arch-x86/generic/string/strcmp.S similarity index 100% rename from libc/arch-x86/string/strcmp.S rename to libc/arch-x86/generic/string/strcmp.S diff --git a/libc/arch-x86/string/strncmp.S b/libc/arch-x86/generic/string/strncmp.S similarity index 100% rename from libc/arch-x86/string/strncmp.S rename to libc/arch-x86/generic/string/strncmp.S diff --git a/libc/arch-x86/string/swab.S b/libc/arch-x86/generic/string/swab.S similarity index 100% rename from libc/arch-x86/string/swab.S rename to libc/arch-x86/generic/string/swab.S diff --git a/libc/arch-x86/silvermont/silvermont.mk b/libc/arch-x86/silvermont/silvermont.mk new file mode 100644 index 000000000..b951ad5af --- /dev/null +++ b/libc/arch-x86/silvermont/silvermont.mk @@ -0,0 +1,34 @@ +libc_bionic_src_files_x86 += \ + arch-x86/silvermont/string/sse2-bcopy-slm.S \ + arch-x86/silvermont/string/sse2-bzero-slm.S \ + arch-x86/silvermont/string/sse2-memcpy-slm.S \ + arch-x86/silvermont/string/sse2-memmove-slm.S \ + arch-x86/silvermont/string/sse2-memset-slm.S \ + arch-x86/silvermont/string/sse2-stpcpy-slm.S \ + arch-x86/silvermont/string/sse2-stpncpy-slm.S \ + arch-x86/silvermont/string/sse2-strcpy-slm.S \ + arch-x86/silvermont/string/sse2-strlen-slm.S \ + arch-x86/silvermont/string/sse2-strncpy-slm.S \ + arch-x86/silvermont/string/sse4-memcmp-slm.S \ + arch-x86/silvermont/string/sse4-wmemcmp-slm.S + +libc_bionic_src_files_x86 += \ + arch-x86/atom/string/sse2-memchr-atom.S \ + arch-x86/atom/string/sse2-memrchr-atom.S \ + arch-x86/atom/string/sse2-strchr-atom.S \ + arch-x86/atom/string/sse2-strrchr-atom.S \ + arch-x86/atom/string/sse2-index-atom.S \ + arch-x86/atom/string/sse2-strnlen-atom.S \ + arch-x86/atom/string/sse2-wcschr-atom.S \ + arch-x86/atom/string/sse2-wcsrchr-atom.S \ + arch-x86/atom/string/sse2-wcslen-atom.S \ + arch-x86/atom/string/sse2-wcscmp-atom.S \ + arch-x86/atom/string/ssse3-strncat-atom.S \ + arch-x86/atom/string/ssse3-strlcat-atom.S \ + arch-x86/atom/string/ssse3-strlcpy-atom.S \ + arch-x86/atom/string/ssse3-strcmp-atom.S \ + arch-x86/atom/string/ssse3-strncmp-atom.S \ + arch-x86/atom/string/ssse3-strcat-atom.S \ + arch-x86/atom/string/ssse3-memcmp16-atom.S \ + arch-x86/atom/string/ssse3-wcscat-atom.S \ + arch-x86/atom/string/ssse3-wcscpy-atom.S diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86/silvermont/string/cache.h similarity index 90% rename from libc/arch-x86/string/cache.h rename to libc/arch-x86/silvermont/string/cache.h index 9d0a563ca..c342b1c33 100644 --- a/libc/arch-x86/string/cache.h +++ b/libc/arch-x86/silvermont/string/cache.h @@ -28,15 +28,9 @@ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#if defined(__slm__) /* Values are optimized for Silvermont */ #define SHARED_CACHE_SIZE (1024*1024) /* Silvermont L2 Cache */ #define DATA_CACHE_SIZE (24*1024) /* Silvermont L1 Data Cache */ -#else -/* Values are optimized for Atom */ -#define SHARED_CACHE_SIZE (512*1024) /* Atom L2 Cache */ -#define DATA_CACHE_SIZE (24*1024) /* Atom L1 Data Cache */ -#endif #define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2) #define DATA_CACHE_SIZE_HALF (DATA_CACHE_SIZE / 2) diff --git a/libc/arch-x86/silvermont/string/sse2-bcopy-slm.S b/libc/arch-x86/silvermont/string/sse2-bcopy-slm.S new file mode 100644 index 000000000..190d52f29 --- /dev/null +++ b/libc/arch-x86/silvermont/string/sse2-bcopy-slm.S @@ -0,0 +1,34 @@ +/* +Copyright (c) 2014, Intel Corporation +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + + +#define MEMMOVE bcopy +#define USE_AS_BCOPY +#include "sse2-memmove-slm.S" diff --git a/libc/arch-x86/silvermont/string/sse2-bzero-slm.S b/libc/arch-x86/silvermont/string/sse2-bzero-slm.S new file mode 100644 index 000000000..b682ed686 --- /dev/null +++ b/libc/arch-x86/silvermont/string/sse2-bzero-slm.S @@ -0,0 +1,33 @@ +/* +Copyright (c) 2014, Intel Corporation +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#define USE_AS_BZERO +#define MEMSET bzero +#include "sse2-memset-slm.S" diff --git a/libc/arch-x86/silvermont/string/sse2-memcpy-slm.S b/libc/arch-x86/silvermont/string/sse2-memcpy-slm.S new file mode 100644 index 000000000..1b305c790 --- /dev/null +++ b/libc/arch-x86/silvermont/string/sse2-memcpy-slm.S @@ -0,0 +1,308 @@ +/* +Copyright (c) 2014, Intel Corporation +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "cache.h" + +#ifndef MEMCPY +# define MEMCPY memcpy +#endif + +#ifndef L +# define L(label) .L##label +#endif + +#ifndef cfi_startproc +# define cfi_startproc .cfi_startproc +#endif + +#ifndef cfi_endproc +# define cfi_endproc .cfi_endproc +#endif + +#ifndef cfi_rel_offset +# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off +#endif + +#ifndef cfi_restore +# define cfi_restore(reg) .cfi_restore reg +#endif + +#ifndef cfi_adjust_cfa_offset +# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off +#endif + +#ifndef ENTRY +# define ENTRY(name) \ + .type name, @function; \ + .globl name; \ + .p2align 4; \ +name: \ + cfi_startproc +#endif + +#ifndef END +# define END(name) \ + cfi_endproc; \ + .size name, .-name +#endif + +#define DEST PARMS +#define SRC DEST+4 +#define LEN SRC+4 + +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) + +#define PARMS 8 /* Preserve EBX. 
*/ +#define ENTRANCE PUSH (%ebx); +#define RETURN_END POP (%ebx); ret +#define RETURN RETURN_END; CFI_PUSH (%ebx) + + .section .text.sse2,"ax",@progbits +ENTRY (MEMCPY) + ENTRANCE + movl LEN(%esp), %ecx + movl SRC(%esp), %eax + movl DEST(%esp), %edx + + cmp %eax, %edx + je L(return) + + cmp $16, %ecx + jbe L(len_0_16_bytes) + + cmp $SHARED_CACHE_SIZE_HALF, %ecx + jae L(large_page) + + movdqu (%eax), %xmm0 + movdqu -16(%eax, %ecx), %xmm1 + cmpl $32, %ecx + movdqu %xmm0, (%edx) + movdqu %xmm1, -16(%edx, %ecx) + jbe L(return) + + movdqu 16(%eax), %xmm0 + movdqu -32(%eax, %ecx), %xmm1 + cmpl $64, %ecx + movdqu %xmm0, 16(%edx) + movdqu %xmm1, -32(%edx, %ecx) + jbe L(return) + + movdqu 32(%eax), %xmm0 + movdqu 48(%eax), %xmm1 + movdqu -48(%eax, %ecx), %xmm2 + movdqu -64(%eax, %ecx), %xmm3 + cmpl $128, %ecx + movdqu %xmm0, 32(%edx) + movdqu %xmm1, 48(%edx) + movdqu %xmm2, -48(%edx, %ecx) + movdqu %xmm3, -64(%edx, %ecx) + jbe L(return) + +/* Now the main loop: we align the address of the destination. */ + leal 64(%edx), %ebx + andl $-64, %ebx + + addl %edx, %ecx + andl $-64, %ecx + + subl %edx, %eax + +/* We should stop two iterations before the termination + (in order not to misprefetch). */ + subl $64, %ecx + cmpl %ebx, %ecx + je L(main_loop_just_one_iteration) + + subl $64, %ecx + cmpl %ebx, %ecx + je L(main_loop_last_two_iterations) + + + .p2align 4 +L(main_loop_cache): + + prefetcht0 128(%ebx, %eax) + + movdqu (%ebx, %eax), %xmm0 + movdqu 16(%ebx, %eax), %xmm1 + movdqu 32(%ebx, %eax), %xmm2 + movdqu 48(%ebx, %eax), %xmm3 + movdqa %xmm0, (%ebx) + movdqa %xmm1, 16(%ebx) + movdqa %xmm2, 32(%ebx) + movdqa %xmm3, 48(%ebx) + lea 64(%ebx), %ebx + cmpl %ebx, %ecx + jne L(main_loop_cache) + +L(main_loop_last_two_iterations): + movdqu (%ebx, %eax), %xmm0 + movdqu 16(%ebx, %eax), %xmm1 + movdqu 32(%ebx, %eax), %xmm2 + movdqu 48(%ebx, %eax), %xmm3 + movdqu 64(%ebx, %eax), %xmm4 + movdqu 80(%ebx, %eax), %xmm5 + movdqu 96(%ebx, %eax), %xmm6 + movdqu 112(%ebx, %eax), %xmm7 + movdqa %xmm0, (%ebx) + movdqa %xmm1, 16(%ebx) + movdqa %xmm2, 32(%ebx) + movdqa %xmm3, 48(%ebx) + movdqa %xmm4, 64(%ebx) + movdqa %xmm5, 80(%ebx) + movdqa %xmm6, 96(%ebx) + movdqa %xmm7, 112(%ebx) + jmp L(return) + +L(main_loop_just_one_iteration): + movdqu (%ebx, %eax), %xmm0 + movdqu 16(%ebx, %eax), %xmm1 + movdqu 32(%ebx, %eax), %xmm2 + movdqu 48(%ebx, %eax), %xmm3 + movdqa %xmm0, (%ebx) + movdqa %xmm1, 16(%ebx) + movdqa %xmm2, 32(%ebx) + movdqa %xmm3, 48(%ebx) + jmp L(return) + +L(large_page): + movdqu (%eax), %xmm0 + movdqu 16(%eax), %xmm1 + movdqu 32(%eax), %xmm2 + movdqu 48(%eax), %xmm3 + movdqu -64(%eax, %ecx), %xmm4 + movdqu -48(%eax, %ecx), %xmm5 + movdqu -32(%eax, %ecx), %xmm6 + movdqu -16(%eax, %ecx), %xmm7 + movdqu %xmm0, (%edx) + movdqu %xmm1, 16(%edx) + movdqu %xmm2, 32(%edx) + movdqu %xmm3, 48(%edx) + movdqu %xmm4, -64(%edx, %ecx) + movdqu %xmm5, -48(%edx, %ecx) + movdqu %xmm6, -32(%edx, %ecx) + movdqu %xmm7, -16(%edx, %ecx) + + movdqu 64(%eax), %xmm0 + movdqu 80(%eax), %xmm1 + movdqu 96(%eax), %xmm2 + movdqu 112(%eax), %xmm3 + movdqu -128(%eax, %ecx), %xmm4 + movdqu -112(%eax, %ecx), %xmm5 + movdqu -96(%eax, %ecx), %xmm6 + movdqu -80(%eax, %ecx), %xmm7 + movdqu %xmm0, 64(%edx) + movdqu %xmm1, 80(%edx) + movdqu %xmm2, 96(%edx) + movdqu %xmm3, 112(%edx) + movdqu %xmm4, -128(%edx, %ecx) + movdqu %xmm5, -112(%edx, %ecx) + movdqu %xmm6, -96(%edx, %ecx) + movdqu %xmm7, -80(%edx, %ecx) + +/* Now the main loop with non temporal stores. We align + the address of the destination. 
*/ + leal 128(%edx), %ebx + andl $-128, %ebx + + addl %edx, %ecx + andl $-128, %ecx + + subl %edx, %eax + + .p2align 4 +L(main_loop_large_page): + movdqu (%ebx, %eax), %xmm0 + movdqu 16(%ebx, %eax), %xmm1 + movdqu 32(%ebx, %eax), %xmm2 + movdqu 48(%ebx, %eax), %xmm3 + movdqu 64(%ebx, %eax), %xmm4 + movdqu 80(%ebx, %eax), %xmm5 + movdqu 96(%ebx, %eax), %xmm6 + movdqu 112(%ebx, %eax), %xmm7 + movntdq %xmm0, (%ebx) + movntdq %xmm1, 16(%ebx) + movntdq %xmm2, 32(%ebx) + movntdq %xmm3, 48(%ebx) + movntdq %xmm4, 64(%ebx) + movntdq %xmm5, 80(%ebx) + movntdq %xmm6, 96(%ebx) + movntdq %xmm7, 112(%ebx) + lea 128(%ebx), %ebx + cmpl %ebx, %ecx + jne L(main_loop_large_page) + sfence + jmp L(return) + +L(len_0_16_bytes): + testb $24, %cl + jne L(len_9_16_bytes) + testb $4, %cl + .p2align 4,,5 + jne L(len_5_8_bytes) + testl %ecx, %ecx + .p2align 4,,2 + je L(return) + movzbl (%eax), %ebx + testb $2, %cl + movb %bl, (%edx) + je L(return) + movzwl -2(%eax,%ecx), %ebx + movw %bx, -2(%edx,%ecx) + jmp L(return) + +L(len_9_16_bytes): + movq (%eax), %xmm0 + movq -8(%eax, %ecx), %xmm1 + movq %xmm0, (%edx) + movq %xmm1, -8(%edx, %ecx) + jmp L(return) + +L(len_5_8_bytes): + movl (%eax), %ebx + movl %ebx, (%edx) + movl -4(%eax,%ecx), %ebx + movl %ebx, -4(%edx,%ecx) + jmp L(return) + +L(return): + movl %edx, %eax + RETURN + +END (MEMCPY) diff --git a/libc/arch-x86/silvermont/string/sse2-memmove-slm.S b/libc/arch-x86/silvermont/string/sse2-memmove-slm.S new file mode 100644 index 000000000..79a0a367e --- /dev/null +++ b/libc/arch-x86/silvermont/string/sse2-memmove-slm.S @@ -0,0 +1,673 @@ +/* +Copyright (c) 2014, Intel Corporation +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#include "cache.h" + +#ifndef MEMMOVE +# define MEMMOVE memmove +#endif + +#ifndef L +# define L(label) .L##label +#endif + +#ifndef cfi_startproc +# define cfi_startproc .cfi_startproc +#endif + +#ifndef cfi_endproc +# define cfi_endproc .cfi_endproc +#endif + +#ifndef cfi_rel_offset +# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off +#endif + +#ifndef cfi_restore +# define cfi_restore(reg) .cfi_restore reg +#endif + +#ifndef cfi_adjust_cfa_offset +# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off +#endif + +#ifndef ENTRY +# define ENTRY(name) \ + .type name, @function; \ + .globl name; \ + .p2align 4; \ +name: \ + cfi_startproc +#endif + +#ifndef END +# define END(name) \ + cfi_endproc; \ + .size name, .-name +#endif + +#ifdef USE_AS_BCOPY +# define SRC PARMS +# define DEST SRC+4 +# define LEN DEST+4 +#else +# define DEST PARMS +# define SRC DEST+4 +# define LEN SRC+4 +#endif + +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) + +#define PARMS 8 /* Preserve EBX. */ +#define ENTRANCE PUSH (%ebx); +#define RETURN_END POP (%ebx); ret +#define RETURN RETURN_END; CFI_PUSH (%ebx) + + .section .text.sse2,"ax",@progbits +ENTRY (MEMMOVE) + ENTRANCE + movl LEN(%esp), %ecx + movl SRC(%esp), %eax + movl DEST(%esp), %edx + +/* Check whether we should copy backward or forward. */ + cmp %eax, %edx + je L(mm_return) + ja L(mm_len_0_or_more_backward) + +/* Now do checks for lengths. We do [0..16], [0..32], [0..64], [0..128] + separately. */ + cmp $16, %ecx + jbe L(mm_len_0_16_bytes_forward) + + cmpl $32, %ecx + jg L(mm_len_32_or_more_forward) + +/* Copy [0..32] and return. */ + movdqu (%eax), %xmm0 + movdqu -16(%eax, %ecx), %xmm1 + movdqu %xmm0, (%edx) + movdqu %xmm1, -16(%edx, %ecx) + jmp L(mm_return) + +L(mm_len_32_or_more_forward): + cmpl $64, %ecx + jg L(mm_len_64_or_more_forward) + +/* Copy [0..64] and return. */ + movdqu (%eax), %xmm0 + movdqu 16(%eax), %xmm1 + movdqu -16(%eax, %ecx), %xmm2 + movdqu -32(%eax, %ecx), %xmm3 + movdqu %xmm0, (%edx) + movdqu %xmm1, 16(%edx) + movdqu %xmm2, -16(%edx, %ecx) + movdqu %xmm3, -32(%edx, %ecx) + jmp L(mm_return) + +L(mm_len_64_or_more_forward): + cmpl $128, %ecx + jg L(mm_len_128_or_more_forward) + +/* Copy [0..128] and return. */ + movdqu (%eax), %xmm0 + movdqu 16(%eax), %xmm1 + movdqu 32(%eax), %xmm2 + movdqu 48(%eax), %xmm3 + movdqu -64(%eax, %ecx), %xmm4 + movdqu -48(%eax, %ecx), %xmm5 + movdqu -32(%eax, %ecx), %xmm6 + movdqu -16(%eax, %ecx), %xmm7 + movdqu %xmm0, (%edx) + movdqu %xmm1, 16(%edx) + movdqu %xmm2, 32(%edx) + movdqu %xmm3, 48(%edx) + movdqu %xmm4, -64(%edx, %ecx) + movdqu %xmm5, -48(%edx, %ecx) + movdqu %xmm6, -32(%edx, %ecx) + movdqu %xmm7, -16(%edx, %ecx) + jmp L(mm_return) + +L(mm_len_128_or_more_forward): + + cmp $SHARED_CACHE_SIZE_HALF, %ecx + jae L(mm_large_page_forward) + + PUSH (%esi) + PUSH (%edi) + movl %eax, %esi + movl %edx, %edi + +/* Aligning the address of destination. 
*/ + movdqu (%esi), %xmm0 + movdqu 16(%esi), %xmm1 + movdqu 32(%esi), %xmm2 + movdqu 48(%esi), %xmm3 + + leal 64(%edi), %edx + andl $-64, %edx + + movl %esi, %eax + subl %edi, %eax + + movdqu (%edx, %eax), %xmm4 + movdqu 16(%edx, %eax), %xmm5 + movdqu 32(%edx, %eax), %xmm6 + movdqu 48(%edx, %eax), %xmm7 + + movdqu %xmm0, (%edi) + movdqu %xmm1, 16(%edi) + movdqu %xmm2, 32(%edi) + movdqu %xmm3, 48(%edi) + movdqa %xmm4, (%edx) + movdqa %xmm5, 16(%edx) + movdqa %xmm6, 32(%edx) + movdqa %xmm7, 48(%edx) + addl $64, %edx + + leal (%edi, %ecx), %ebx + andl $-64, %ebx + + cmp %edx, %ebx + jbe L(mm_copy_remaining_forward) + + .p2align 4 +L(mm_main_loop_forward): + + prefetcht0 128(%edx, %eax) + + movdqu (%edx, %eax), %xmm0 + movdqu 16(%edx, %eax), %xmm1 + movdqu 32(%edx, %eax), %xmm2 + movdqu 48(%edx, %eax), %xmm3 + movdqa %xmm0, (%edx) + movdqa %xmm1, 16(%edx) + movdqa %xmm2, 32(%edx) + movdqa %xmm3, 48(%edx) + leal 64(%edx), %edx + cmp %edx, %ebx + ja L(mm_main_loop_forward) + +L(mm_copy_remaining_forward): + addl %edi, %ecx + subl %edx, %ecx +/* We copied all up till %edx position in the dst. + In %ecx now is how many bytes are left to copy. + Now we need to advance %esi. */ + leal (%edx, %eax), %esi + +L(mm_remaining_0_64_bytes_forward): + cmp $32, %ecx + ja L(mm_remaining_33_64_bytes_forward) + cmp $16, %ecx + ja L(mm_remaining_17_32_bytes_forward) + testl %ecx, %ecx + .p2align 4,,2 + je L(mm_return_pop_all) + + cmpb $8, %cl + ja L(mm_remaining_9_16_bytes_forward) + cmpb $4, %cl + .p2align 4,,5 + ja L(mm_remaining_5_8_bytes_forward) + cmpb $2, %cl + .p2align 4,,1 + ja L(mm_remaining_3_4_bytes_forward) + movzbl -1(%esi,%ecx), %eax + movzbl (%esi), %ebx + movb %al, -1(%edx,%ecx) + movb %bl, (%edx) + jmp L(mm_return_pop_all) + +L(mm_remaining_33_64_bytes_forward): + movdqu (%esi), %xmm0 + movdqu 16(%esi), %xmm1 + movdqu -32(%esi, %ecx), %xmm2 + movdqu -16(%esi, %ecx), %xmm3 + movdqu %xmm0, (%edx) + movdqu %xmm1, 16(%edx) + movdqu %xmm2, -32(%edx, %ecx) + movdqu %xmm3, -16(%edx, %ecx) + jmp L(mm_return_pop_all) + +L(mm_remaining_17_32_bytes_forward): + movdqu (%esi), %xmm0 + movdqu -16(%esi, %ecx), %xmm1 + movdqu %xmm0, (%edx) + movdqu %xmm1, -16(%edx, %ecx) + jmp L(mm_return_pop_all) + +L(mm_remaining_3_4_bytes_forward): + movzwl -2(%esi,%ecx), %eax + movzwl (%esi), %ebx + movw %ax, -2(%edx,%ecx) + movw %bx, (%edx) + jmp L(mm_return_pop_all) + +L(mm_remaining_5_8_bytes_forward): + movl (%esi), %eax + movl -4(%esi,%ecx), %ebx + movl %eax, (%edx) + movl %ebx, -4(%edx,%ecx) + jmp L(mm_return_pop_all) + +L(mm_remaining_9_16_bytes_forward): + movq (%esi), %xmm0 + movq -8(%esi, %ecx), %xmm1 + movq %xmm0, (%edx) + movq %xmm1, -8(%edx, %ecx) + jmp L(mm_return_pop_all) + + +L(mm_len_0_16_bytes_forward): + testb $24, %cl + jne L(mm_len_9_16_bytes_forward) + testb $4, %cl + .p2align 4,,5 + jne L(mm_len_5_8_bytes_forward) + testl %ecx, %ecx + .p2align 4,,2 + je L(mm_return) + testb $2, %cl + .p2align 4,,1 + jne L(mm_len_2_4_bytes_forward) + movzbl -1(%eax,%ecx), %ebx + movzbl (%eax), %eax + movb %bl, -1(%edx,%ecx) + movb %al, (%edx) + jmp L(mm_return) + +L(mm_len_2_4_bytes_forward): + movzwl -2(%eax,%ecx), %ebx + movzwl (%eax), %eax + movw %bx, -2(%edx,%ecx) + movw %ax, (%edx) + jmp L(mm_return) + +L(mm_len_5_8_bytes_forward): + movl (%eax), %ebx + movl -4(%eax,%ecx), %eax + movl %ebx, (%edx) + movl %eax, -4(%edx,%ecx) + jmp L(mm_return) + +L(mm_len_9_16_bytes_forward): + movq (%eax), %xmm0 + movq -8(%eax, %ecx), %xmm1 + movq %xmm0, (%edx) + movq %xmm1, -8(%edx, %ecx) + jmp L(mm_return) + +/* The code for 
copying backwards. */ +L(mm_len_0_or_more_backward): + +/* Now do checks for lengths. We do [0..16], [0..32], [0..64], [0..128] + separately. */ + cmp $16, %ecx + jbe L(mm_len_0_16_bytes_backward) + + cmpl $32, %ecx + jg L(mm_len_32_or_more_backward) + +/* Copy [0..32] and return. */ + movdqu (%eax), %xmm0 + movdqu -16(%eax, %ecx), %xmm1 + movdqu %xmm0, (%edx) + movdqu %xmm1, -16(%edx, %ecx) + jmp L(mm_return) + +L(mm_len_32_or_more_backward): + cmpl $64, %ecx + jg L(mm_len_64_or_more_backward) + +/* Copy [0..64] and return. */ + movdqu (%eax), %xmm0 + movdqu 16(%eax), %xmm1 + movdqu -16(%eax, %ecx), %xmm2 + movdqu -32(%eax, %ecx), %xmm3 + movdqu %xmm0, (%edx) + movdqu %xmm1, 16(%edx) + movdqu %xmm2, -16(%edx, %ecx) + movdqu %xmm3, -32(%edx, %ecx) + jmp L(mm_return) + +L(mm_len_64_or_more_backward): + cmpl $128, %ecx + jg L(mm_len_128_or_more_backward) + +/* Copy [0..128] and return. */ + movdqu (%eax), %xmm0 + movdqu 16(%eax), %xmm1 + movdqu 32(%eax), %xmm2 + movdqu 48(%eax), %xmm3 + movdqu -64(%eax, %ecx), %xmm4 + movdqu -48(%eax, %ecx), %xmm5 + movdqu -32(%eax, %ecx), %xmm6 + movdqu -16(%eax, %ecx), %xmm7 + movdqu %xmm0, (%edx) + movdqu %xmm1, 16(%edx) + movdqu %xmm2, 32(%edx) + movdqu %xmm3, 48(%edx) + movdqu %xmm4, -64(%edx, %ecx) + movdqu %xmm5, -48(%edx, %ecx) + movdqu %xmm6, -32(%edx, %ecx) + movdqu %xmm7, -16(%edx, %ecx) + jmp L(mm_return) + +L(mm_len_128_or_more_backward): + + cmp $SHARED_CACHE_SIZE_HALF, %ecx + jae L(mm_large_page_backward) + + PUSH (%esi) + PUSH (%edi) + +/* Aligning the address of destination. We need to save + 16 bits from the source in order not to overwrite them. */ + movdqu -16(%eax, %ecx), %xmm0 + movdqu -32(%eax, %ecx), %xmm1 + movdqu -48(%eax, %ecx), %xmm2 + movdqu -64(%eax, %ecx), %xmm3 + + leal (%edx, %ecx), %edi + andl $-64, %edi + + movl %eax, %esi + subl %edx, %esi + + movdqu -16(%edi, %esi), %xmm4 + movdqu -32(%edi, %esi), %xmm5 + movdqu -48(%edi, %esi), %xmm6 + movdqu -64(%edi, %esi), %xmm7 + + movdqu %xmm0, -16(%edx, %ecx) + movdqu %xmm1, -32(%edx, %ecx) + movdqu %xmm2, -48(%edx, %ecx) + movdqu %xmm3, -64(%edx, %ecx) + movdqa %xmm4, -16(%edi) + movdqa %xmm5, -32(%edi) + movdqa %xmm6, -48(%edi) + movdqa %xmm7, -64(%edi) + leal -64(%edi), %edi + + leal 64(%edx), %ebx + andl $-64, %ebx + +/* Compute in %ecx how many bytes are left to copy after + the main loop stops. */ + movl %ebx, %ecx + subl %edx, %ecx + + cmp %edi, %ebx + jb L(mm_main_loop_backward) + + POP (%edi) + POP (%esi) + jmp L(mm_len_0_or_more_backward) + + .p2align 4 +L(mm_main_loop_backward): + + prefetcht0 -128(%edi, %esi) + + movdqu -64(%edi, %esi), %xmm0 + movdqu -48(%edi, %esi), %xmm1 + movdqu -32(%edi, %esi), %xmm2 + movdqu -16(%edi, %esi), %xmm3 + movdqa %xmm0, -64(%edi) + movdqa %xmm1, -48(%edi) + movdqa %xmm2, -32(%edi) + movdqa %xmm3, -16(%edi) + leal -64(%edi), %edi + cmp %edi, %ebx + jb L(mm_main_loop_backward) + POP (%edi) + POP (%esi) + jmp L(mm_len_0_or_more_backward) + +/* Copy [0..16] and return. 
*/ +L(mm_len_0_16_bytes_backward): + testb $24, %cl + jnz L(mm_len_9_16_bytes_backward) + testb $4, %cl + .p2align 4,,5 + jnz L(mm_len_5_8_bytes_backward) + testl %ecx, %ecx + .p2align 4,,2 + je L(mm_return) + testb $2, %cl + .p2align 4,,1 + jne L(mm_len_3_4_bytes_backward) + movzbl -1(%eax,%ecx), %ebx + movzbl (%eax), %eax + movb %bl, -1(%edx,%ecx) + movb %al, (%edx) + jmp L(mm_return) + +L(mm_len_3_4_bytes_backward): + movzwl -2(%eax,%ecx), %ebx + movzwl (%eax), %eax + movw %bx, -2(%edx,%ecx) + movw %ax, (%edx) + jmp L(mm_return) + +L(mm_len_9_16_bytes_backward): + PUSH (%esi) + movl -4(%eax,%ecx), %ebx + movl -8(%eax,%ecx), %esi + movl %ebx, -4(%edx,%ecx) + movl %esi, -8(%edx,%ecx) + subl $8, %ecx + POP (%esi) + jmp L(mm_len_0_16_bytes_backward) + +L(mm_len_5_8_bytes_backward): + movl (%eax), %ebx + movl -4(%eax,%ecx), %eax + movl %ebx, (%edx) + movl %eax, -4(%edx,%ecx) + +L(mm_return): + movl %edx, %eax + RETURN + +L(mm_return_pop_all): + movl %edi, %eax + POP (%edi) + POP (%esi) + RETURN + +/* Big length copy forward part. */ + +L(mm_large_page_forward): +/* Aligning the address of destination. We need to save + 16 bits from the source in order not to overwrite them. */ + + PUSH (%esi) + PUSH (%edi) + movl %eax, %esi + movl %edx, %edi + + movdqu (%esi), %xmm0 + movdqu 16(%esi), %xmm1 + movdqu 32(%esi), %xmm2 + movdqu 48(%esi), %xmm3 + + leal 64(%edi), %edx + andl $-64, %edx + + movl %esi, %eax + subl %edi, %eax + + movdqu (%edx, %eax), %xmm4 + movdqu 16(%edx, %eax), %xmm5 + movdqu 32(%edx, %eax), %xmm6 + movdqu 48(%edx, %eax), %xmm7 + + movdqu %xmm0, (%edi) + movdqu %xmm1, 16(%edi) + movdqu %xmm2, 32(%edi) + movdqu %xmm3, 48(%edi) + movntdq %xmm4, (%edx) + movntdq %xmm5, 16(%edx) + movntdq %xmm6, 32(%edx) + movntdq %xmm7, 48(%edx) + addl $64, %edx + + leal (%edi, %ecx), %ebx + andl $-128, %ebx + + cmp %edx, %ebx + jbe L(mm_copy_remaining_forward) + + .p2align 4 +L(mm_large_page_loop_forward): + movdqu (%edx, %eax), %xmm0 + movdqu 16(%edx, %eax), %xmm1 + movdqu 32(%edx, %eax), %xmm2 + movdqu 48(%edx, %eax), %xmm3 + movdqu 64(%edx, %eax), %xmm4 + movdqu 80(%edx, %eax), %xmm5 + movdqu 96(%edx, %eax), %xmm6 + movdqu 112(%edx, %eax), %xmm7 + movntdq %xmm0, (%edx) + movntdq %xmm1, 16(%edx) + movntdq %xmm2, 32(%edx) + movntdq %xmm3, 48(%edx) + movntdq %xmm4, 64(%edx) + movntdq %xmm5, 80(%edx) + movntdq %xmm6, 96(%edx) + movntdq %xmm7, 112(%edx) + leal 128(%edx), %edx + cmp %edx, %ebx + ja L(mm_large_page_loop_forward) + sfence + + addl %edi, %ecx + subl %edx, %ecx +/* We copied all up till %edx position in the dst. + In %ecx now is how many bytes are left to copy. + Now we need to advance %esi. */ + leal (%edx, %eax), %esi + + cmp $64, %ecx + jb L(mm_remaining_0_64_bytes_forward) + + movdqu (%esi), %xmm0 + movdqu 16(%esi), %xmm1 + movdqu 32(%esi), %xmm2 + movdqu 48(%esi), %xmm3 + movdqu -64(%esi, %ecx), %xmm4 + movdqu -48(%esi, %ecx), %xmm5 + movdqu -32(%esi, %ecx), %xmm6 + movdqu -16(%esi, %ecx), %xmm7 + movdqu %xmm0, (%edx) + movdqu %xmm1, 16(%edx) + movdqu %xmm2, 32(%edx) + movdqu %xmm3, 48(%edx) + movdqu %xmm4, -64(%edx, %ecx) + movdqu %xmm5, -48(%edx, %ecx) + movdqu %xmm6, -32(%edx, %ecx) + movdqu %xmm7, -16(%edx, %ecx) + jmp L(mm_return_pop_all) + + +/* Big length copy backward part. */ +L(mm_large_page_backward): +/* Aligning the address of destination. We need to save + 16 bits from the source in order not to overwrite them. 
*/ + + PUSH (%esi) + PUSH (%edi) + + movdqu -16(%eax, %ecx), %xmm0 + movdqu -32(%eax, %ecx), %xmm1 + movdqu -48(%eax, %ecx), %xmm2 + movdqu -64(%eax, %ecx), %xmm3 + + leal (%edx, %ecx), %edi + andl $-64, %edi + + movl %eax, %esi + subl %edx, %esi + + movdqu -16(%edi, %esi), %xmm4 + movdqu -32(%edi, %esi), %xmm5 + movdqu -48(%edi, %esi), %xmm6 + movdqu -64(%edi, %esi), %xmm7 + + movdqu %xmm0, -16(%edx, %ecx) + movdqu %xmm1, -32(%edx, %ecx) + movdqu %xmm2, -48(%edx, %ecx) + movdqu %xmm3, -64(%edx, %ecx) + movntdq %xmm4, -16(%edi) + movntdq %xmm5, -32(%edi) + movntdq %xmm6, -48(%edi) + movntdq %xmm7, -64(%edi) + leal -64(%edi), %edi + + leal 128(%edx), %ebx + andl $-64, %ebx + +/* Compute in %ecx how many bytes are left to copy after + the main loop stops. */ + movl %ebx, %ecx + subl %edx, %ecx + + cmp %edi, %ebx + jae L(mm_len_0_or_more_backward) + + .p2align 4 +L(mm_large_page_loop_backward): + movdqu -64(%edi, %esi), %xmm0 + movdqu -48(%edi, %esi), %xmm1 + movdqu -32(%edi, %esi), %xmm2 + movdqu -16(%edi, %esi), %xmm3 + movntdq %xmm0, -64(%edi) + movntdq %xmm1, -48(%edi) + movntdq %xmm2, -32(%edi) + movntdq %xmm3, -16(%edi) + leal -64(%edi), %edi + cmp %edi, %ebx + jb L(mm_large_page_loop_backward) + POP (%edi) + POP (%esi) + jmp L(mm_len_0_or_more_backward) + +END (MEMMOVE) diff --git a/libc/arch-x86/silvermont/string/sse2-memset-slm.S b/libc/arch-x86/silvermont/string/sse2-memset-slm.S new file mode 100644 index 000000000..c30bf7464 --- /dev/null +++ b/libc/arch-x86/silvermont/string/sse2-memset-slm.S @@ -0,0 +1,841 @@ +/* +Copyright (c) 2014, Intel Corporation +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#include "cache.h" + +#ifndef MEMSET +# define MEMSET memset +#endif + +#ifndef L +# define L(label) .L##label +#endif + +#ifndef ALIGN +# define ALIGN(n) .p2align n +#endif + +#ifndef cfi_startproc +# define cfi_startproc .cfi_startproc +#endif + +#ifndef cfi_endproc +# define cfi_endproc .cfi_endproc +#endif + +#ifndef cfi_rel_offset +# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off +#endif + +#ifndef cfi_restore +# define cfi_restore(reg) .cfi_restore reg +#endif + +#ifndef cfi_adjust_cfa_offset +# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off +#endif + +#ifndef ENTRY +# define ENTRY(name) \ + .type name, @function; \ + .globl name; \ + .p2align 4; \ +name: \ + cfi_startproc +#endif + +#ifndef END +# define END(name) \ + cfi_endproc; \ + .size name, .-name +#endif + +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) + +#ifdef USE_AS_BZERO +# define DEST PARMS +# define LEN DEST+4 +# define SETRTNVAL +#else +# define DEST PARMS +# define CHR DEST+4 +# define LEN CHR+4 +# define SETRTNVAL movl DEST(%esp), %eax +#endif + +#if (defined SHARED || defined __PIC__) +# define ENTRANCE PUSH (%ebx); +# define RETURN_END POP (%ebx); ret +# define RETURN RETURN_END; CFI_PUSH (%ebx) +# define PARMS 8 /* Preserve EBX. */ +# define JMPTBL(I, B) I - B + +/* Load an entry in a jump table into EBX and branch to it. TABLE is a + jump table with relative offsets. */ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \ + /* We first load PC into EBX. */ \ + call __x86.get_pc_thunk.bx; \ + /* Get the address of the jump table. */ \ + add $(TABLE - .), %ebx; \ + /* Get the entry and convert the relative offset to the \ + absolute address. */ \ + add (%ebx,%ecx,4), %ebx; \ + add %ecx, %edx; \ + /* We loaded the jump table and adjuested EDX. Go. */ \ + jmp *%ebx + + .section .gnu.linkonce.t.__x86.get_pc_thunk.bx,"ax",@progbits + .globl __x86.get_pc_thunk.bx + .hidden __x86.get_pc_thunk.bx + ALIGN (4) + .type __x86.get_pc_thunk.bx,@function +__x86.get_pc_thunk.bx: + movl (%esp), %ebx + ret +#else +# define ENTRANCE +# define RETURN_END ret +# define RETURN RETURN_END +# define PARMS 4 +# define JMPTBL(I, B) I + +/* Branch to an entry in a jump table. TABLE is a jump table with + absolute offsets. */ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \ + add %ecx, %edx; \ + jmp *TABLE(,%ecx,4) +#endif + + .section .text.sse2,"ax",@progbits + ALIGN (4) +ENTRY (MEMSET) + ENTRANCE + + movl LEN(%esp), %ecx + cmp $0, %ecx + ja L(1byteormore) + SETRTNVAL + RETURN + +L(1byteormore): +#ifdef USE_AS_BZERO + xor %eax, %eax +#else + movzbl CHR(%esp), %eax + movb %al, %ah + /* Fill the whole EAX with pattern. 
*/ + movl %eax, %edx + shl $16, %eax + or %edx, %eax +#endif + movl DEST(%esp), %edx + cmp $1, %ecx + je L(1byte) + cmp $16, %ecx + jae L(16bytesormore) + + cmp $4, %ecx + jb L(4bytesless) + movl %eax, (%edx) + movl %eax, -4(%edx, %ecx) + cmp $8, %ecx + jb L(8bytesless) + movl %eax, 4(%edx) + movl %eax, -8(%edx, %ecx) +L(8bytesless): + SETRTNVAL + RETURN + +L(4bytesless): + movw %ax, (%edx) + movw %ax, -2(%edx, %ecx) + SETRTNVAL + RETURN + +L(1byte): + movb %al, (%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(16bytesormore): +#ifdef USE_AS_BZERO + pxor %xmm0, %xmm0 +#else + movd %eax, %xmm0 + pshufd $0, %xmm0, %xmm0 +#endif + + cmp $64, %ecx + ja L(64bytesmore) + movdqu %xmm0, (%edx) + movdqu %xmm0, -16(%edx, %ecx) + cmp $32, %ecx + jbe L(32bytesless) + movdqu %xmm0, 16(%edx) + movdqu %xmm0, -32(%edx, %ecx) +L(32bytesless): + SETRTNVAL + RETURN + +L(64bytesmore): + testl $0xf, %edx + jz L(aligned_16) +L(not_aligned_16): + movdqu %xmm0, (%edx) + movl %edx, %eax + and $-16, %edx + add $16, %edx + sub %edx, %eax + add %eax, %ecx + movd %xmm0, %eax + + ALIGN (4) +L(aligned_16): + cmp $128, %ecx + jae L(128bytesormore) + +L(aligned_16_less128bytes): + BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) + + ALIGN (4) +L(128bytesormore): +#ifdef SHARED_CACHE_SIZE + PUSH (%ebx) + mov $SHARED_CACHE_SIZE, %ebx +#else +# if (defined SHARED || defined __PIC__) + call __x86.get_pc_thunk.bx + add $_GLOBAL_OFFSET_TABLE_, %ebx + mov $__x86_shared_cache_size@GOTOFF(%ebx), %ebx +# else + PUSH (%ebx) + mov $__x86_shared_cache_size, %ebx +# endif +#endif + cmp %ebx, %ecx + jae L(128bytesormore_nt_start) + + POP (%ebx) + +#ifdef DATA_CACHE_SIZE + PUSH (%ebx) + mov $DATA_CACHE_SIZE, %ebx +#else +# if (defined SHARED || defined __PIC__) + call __x86.get_pc_thunk.bx + add $_GLOBAL_OFFSET_TABLE_, %ebx + mov $__x86_data_cache_size@GOTOFF(%ebx), %ebx +# else + PUSH (%ebx) + mov $__x86_data_cache_size, %ebx +# endif +#endif + + cmp %ebx, %ecx + jae L(128bytes_L2_normal) + subl $128, %ecx +L(128bytesormore_normal): + sub $128, %ecx + movdqa %xmm0, (%edx) + movaps %xmm0, 0x10(%edx) + movaps %xmm0, 0x20(%edx) + movaps %xmm0, 0x30(%edx) + movaps %xmm0, 0x40(%edx) + movaps %xmm0, 0x50(%edx) + movaps %xmm0, 0x60(%edx) + movaps %xmm0, 0x70(%edx) + lea 128(%edx), %edx + jb L(128bytesless_normal) + + + sub $128, %ecx + movdqa %xmm0, (%edx) + movaps %xmm0, 0x10(%edx) + movaps %xmm0, 0x20(%edx) + movaps %xmm0, 0x30(%edx) + movaps %xmm0, 0x40(%edx) + movaps %xmm0, 0x50(%edx) + movaps %xmm0, 0x60(%edx) + movaps %xmm0, 0x70(%edx) + lea 128(%edx), %edx + jae L(128bytesormore_normal) + +L(128bytesless_normal): + lea 128(%ecx), %ecx +#if defined DATA_CACHE_SIZE || !(defined SHARED || defined __PIC__) + POP (%ebx) +#endif + BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) + + ALIGN (4) +L(128bytes_L2_normal): + prefetchnta 0x380(%edx) + prefetchnta 0x3c0(%edx) + sub $128, %ecx + movdqa %xmm0, (%edx) + movaps %xmm0, 0x10(%edx) + movaps %xmm0, 0x20(%edx) + movaps %xmm0, 0x30(%edx) + movaps %xmm0, 0x40(%edx) + movaps %xmm0, 0x50(%edx) + movaps %xmm0, 0x60(%edx) + movaps %xmm0, 0x70(%edx) + add $128, %edx + cmp $128, %ecx + jae L(128bytes_L2_normal) + +L(128bytesless_L2_normal): +#if defined DATA_CACHE_SIZE || !(defined SHARED || defined __PIC__) + POP (%ebx) +#endif + BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) + +L(128bytesormore_nt_start): + sub %ebx, %ecx + ALIGN (4) +L(128bytesormore_shared_cache_loop): + prefetchnta 0x3c0(%edx) + prefetchnta 0x380(%edx) + sub $0x80, %ebx + movdqa %xmm0, (%edx) + movaps %xmm0, 0x10(%edx) + movaps %xmm0, 
0x20(%edx) + movaps %xmm0, 0x30(%edx) + movaps %xmm0, 0x40(%edx) + movaps %xmm0, 0x50(%edx) + movaps %xmm0, 0x60(%edx) + movaps %xmm0, 0x70(%edx) + add $0x80, %edx + cmp $0x80, %ebx + jae L(128bytesormore_shared_cache_loop) + cmp $0x80, %ecx + jb L(shared_cache_loop_end) + ALIGN (4) +L(128bytesormore_nt): + sub $0x80, %ecx + movntdq %xmm0, (%edx) + movntdq %xmm0, 0x10(%edx) + movntdq %xmm0, 0x20(%edx) + movntdq %xmm0, 0x30(%edx) + movntdq %xmm0, 0x40(%edx) + movntdq %xmm0, 0x50(%edx) + movntdq %xmm0, 0x60(%edx) + movntdq %xmm0, 0x70(%edx) + add $0x80, %edx + cmp $0x80, %ecx + jae L(128bytesormore_nt) + sfence +L(shared_cache_loop_end): +#if defined SHARED_CACHE_SIZE || !(defined SHARED || defined __PIC__) + POP (%ebx) +#endif + BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) + + + .pushsection .rodata.sse2,"a",@progbits + ALIGN (2) +L(table_16_128bytes): + .int JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes)) 
+ .int JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes)) + .int 
JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes)) + .popsection + + ALIGN (4) +L(aligned_16_112bytes): + movdqa %xmm0, -112(%edx) +L(aligned_16_96bytes): + movdqa %xmm0, -96(%edx) +L(aligned_16_80bytes): + movdqa %xmm0, -80(%edx) +L(aligned_16_64bytes): + movdqa %xmm0, -64(%edx) +L(aligned_16_48bytes): + movdqa %xmm0, -48(%edx) +L(aligned_16_32bytes): + movdqa %xmm0, -32(%edx) +L(aligned_16_16bytes): + movdqa %xmm0, -16(%edx) +L(aligned_16_0bytes): + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_113bytes): + movdqa %xmm0, -113(%edx) +L(aligned_16_97bytes): + movdqa %xmm0, -97(%edx) +L(aligned_16_81bytes): + movdqa %xmm0, -81(%edx) +L(aligned_16_65bytes): + movdqa %xmm0, -65(%edx) +L(aligned_16_49bytes): + movdqa %xmm0, -49(%edx) +L(aligned_16_33bytes): + movdqa %xmm0, -33(%edx) +L(aligned_16_17bytes): + movdqa %xmm0, -17(%edx) +L(aligned_16_1bytes): + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_114bytes): + movdqa %xmm0, -114(%edx) +L(aligned_16_98bytes): + movdqa %xmm0, -98(%edx) +L(aligned_16_82bytes): + movdqa %xmm0, -82(%edx) +L(aligned_16_66bytes): + movdqa %xmm0, -66(%edx) +L(aligned_16_50bytes): + movdqa %xmm0, -50(%edx) +L(aligned_16_34bytes): + movdqa %xmm0, -34(%edx) +L(aligned_16_18bytes): + movdqa %xmm0, -18(%edx) +L(aligned_16_2bytes): + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_115bytes): + movdqa %xmm0, -115(%edx) +L(aligned_16_99bytes): + movdqa %xmm0, -99(%edx) +L(aligned_16_83bytes): + movdqa %xmm0, -83(%edx) +L(aligned_16_67bytes): + movdqa %xmm0, -67(%edx) +L(aligned_16_51bytes): + movdqa %xmm0, -51(%edx) +L(aligned_16_35bytes): + movdqa %xmm0, -35(%edx) +L(aligned_16_19bytes): + movdqa %xmm0, -19(%edx) +L(aligned_16_3bytes): + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_116bytes): + movdqa %xmm0, -116(%edx) +L(aligned_16_100bytes): + movdqa %xmm0, -100(%edx) +L(aligned_16_84bytes): + movdqa %xmm0, -84(%edx) +L(aligned_16_68bytes): + movdqa %xmm0, -68(%edx) +L(aligned_16_52bytes): + movdqa %xmm0, -52(%edx) +L(aligned_16_36bytes): + movdqa %xmm0, -36(%edx) +L(aligned_16_20bytes): + movdqa %xmm0, 
-20(%edx) +L(aligned_16_4bytes): + movl %eax, -4(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_117bytes): + movdqa %xmm0, -117(%edx) +L(aligned_16_101bytes): + movdqa %xmm0, -101(%edx) +L(aligned_16_85bytes): + movdqa %xmm0, -85(%edx) +L(aligned_16_69bytes): + movdqa %xmm0, -69(%edx) +L(aligned_16_53bytes): + movdqa %xmm0, -53(%edx) +L(aligned_16_37bytes): + movdqa %xmm0, -37(%edx) +L(aligned_16_21bytes): + movdqa %xmm0, -21(%edx) +L(aligned_16_5bytes): + movl %eax, -5(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_118bytes): + movdqa %xmm0, -118(%edx) +L(aligned_16_102bytes): + movdqa %xmm0, -102(%edx) +L(aligned_16_86bytes): + movdqa %xmm0, -86(%edx) +L(aligned_16_70bytes): + movdqa %xmm0, -70(%edx) +L(aligned_16_54bytes): + movdqa %xmm0, -54(%edx) +L(aligned_16_38bytes): + movdqa %xmm0, -38(%edx) +L(aligned_16_22bytes): + movdqa %xmm0, -22(%edx) +L(aligned_16_6bytes): + movl %eax, -6(%edx) + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_119bytes): + movdqa %xmm0, -119(%edx) +L(aligned_16_103bytes): + movdqa %xmm0, -103(%edx) +L(aligned_16_87bytes): + movdqa %xmm0, -87(%edx) +L(aligned_16_71bytes): + movdqa %xmm0, -71(%edx) +L(aligned_16_55bytes): + movdqa %xmm0, -55(%edx) +L(aligned_16_39bytes): + movdqa %xmm0, -39(%edx) +L(aligned_16_23bytes): + movdqa %xmm0, -23(%edx) +L(aligned_16_7bytes): + movl %eax, -7(%edx) + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_120bytes): + movdqa %xmm0, -120(%edx) +L(aligned_16_104bytes): + movdqa %xmm0, -104(%edx) +L(aligned_16_88bytes): + movdqa %xmm0, -88(%edx) +L(aligned_16_72bytes): + movdqa %xmm0, -72(%edx) +L(aligned_16_56bytes): + movdqa %xmm0, -56(%edx) +L(aligned_16_40bytes): + movdqa %xmm0, -40(%edx) +L(aligned_16_24bytes): + movdqa %xmm0, -24(%edx) +L(aligned_16_8bytes): + movq %xmm0, -8(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_121bytes): + movdqa %xmm0, -121(%edx) +L(aligned_16_105bytes): + movdqa %xmm0, -105(%edx) +L(aligned_16_89bytes): + movdqa %xmm0, -89(%edx) +L(aligned_16_73bytes): + movdqa %xmm0, -73(%edx) +L(aligned_16_57bytes): + movdqa %xmm0, -57(%edx) +L(aligned_16_41bytes): + movdqa %xmm0, -41(%edx) +L(aligned_16_25bytes): + movdqa %xmm0, -25(%edx) +L(aligned_16_9bytes): + movq %xmm0, -9(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_122bytes): + movdqa %xmm0, -122(%edx) +L(aligned_16_106bytes): + movdqa %xmm0, -106(%edx) +L(aligned_16_90bytes): + movdqa %xmm0, -90(%edx) +L(aligned_16_74bytes): + movdqa %xmm0, -74(%edx) +L(aligned_16_58bytes): + movdqa %xmm0, -58(%edx) +L(aligned_16_42bytes): + movdqa %xmm0, -42(%edx) +L(aligned_16_26bytes): + movdqa %xmm0, -26(%edx) +L(aligned_16_10bytes): + movq %xmm0, -10(%edx) + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_123bytes): + movdqa %xmm0, -123(%edx) +L(aligned_16_107bytes): + movdqa %xmm0, -107(%edx) +L(aligned_16_91bytes): + movdqa %xmm0, -91(%edx) +L(aligned_16_75bytes): + movdqa %xmm0, -75(%edx) +L(aligned_16_59bytes): + movdqa %xmm0, -59(%edx) +L(aligned_16_43bytes): + movdqa %xmm0, -43(%edx) +L(aligned_16_27bytes): + movdqa %xmm0, -27(%edx) +L(aligned_16_11bytes): + movq %xmm0, -11(%edx) + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_124bytes): + movdqa %xmm0, -124(%edx) +L(aligned_16_108bytes): + movdqa %xmm0, -108(%edx) +L(aligned_16_92bytes): + movdqa %xmm0, -92(%edx) +L(aligned_16_76bytes): + movdqa %xmm0, -76(%edx) +L(aligned_16_60bytes): + movdqa %xmm0, 
-60(%edx) +L(aligned_16_44bytes): + movdqa %xmm0, -44(%edx) +L(aligned_16_28bytes): + movdqa %xmm0, -28(%edx) +L(aligned_16_12bytes): + movq %xmm0, -12(%edx) + movl %eax, -4(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_125bytes): + movdqa %xmm0, -125(%edx) +L(aligned_16_109bytes): + movdqa %xmm0, -109(%edx) +L(aligned_16_93bytes): + movdqa %xmm0, -93(%edx) +L(aligned_16_77bytes): + movdqa %xmm0, -77(%edx) +L(aligned_16_61bytes): + movdqa %xmm0, -61(%edx) +L(aligned_16_45bytes): + movdqa %xmm0, -45(%edx) +L(aligned_16_29bytes): + movdqa %xmm0, -29(%edx) +L(aligned_16_13bytes): + movq %xmm0, -13(%edx) + movl %eax, -5(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_126bytes): + movdqa %xmm0, -126(%edx) +L(aligned_16_110bytes): + movdqa %xmm0, -110(%edx) +L(aligned_16_94bytes): + movdqa %xmm0, -94(%edx) +L(aligned_16_78bytes): + movdqa %xmm0, -78(%edx) +L(aligned_16_62bytes): + movdqa %xmm0, -62(%edx) +L(aligned_16_46bytes): + movdqa %xmm0, -46(%edx) +L(aligned_16_30bytes): + movdqa %xmm0, -30(%edx) +L(aligned_16_14bytes): + movq %xmm0, -14(%edx) + movl %eax, -6(%edx) + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_127bytes): + movdqa %xmm0, -127(%edx) +L(aligned_16_111bytes): + movdqa %xmm0, -111(%edx) +L(aligned_16_95bytes): + movdqa %xmm0, -95(%edx) +L(aligned_16_79bytes): + movdqa %xmm0, -79(%edx) +L(aligned_16_63bytes): + movdqa %xmm0, -63(%edx) +L(aligned_16_47bytes): + movdqa %xmm0, -47(%edx) +L(aligned_16_31bytes): + movdqa %xmm0, -31(%edx) +L(aligned_16_15bytes): + movq %xmm0, -15(%edx) + movl %eax, -7(%edx) + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN_END + +END (MEMSET) diff --git a/libc/arch-x86/silvermont/string/sse2-stpcpy-slm.S b/libc/arch-x86/silvermont/string/sse2-stpcpy-slm.S new file mode 100755 index 000000000..5c43fa50c --- /dev/null +++ b/libc/arch-x86/silvermont/string/sse2-stpcpy-slm.S @@ -0,0 +1,33 @@ +/* +Copyright (c) 2014, Intel Corporation +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
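The wrapper that follows builds stpcpy out of the shared sse2-strcpy-slm.S body purely through preprocessor defines, and sse2-stpncpy-slm.S later does the same with USE_AS_STRNCPY added. The same pattern, sketched in C with hypothetical file names and a deliberately naive copy loop:

    /* strcpy_template.c (hypothetical): one body, several entry points. */
    char *STRCPY(char *dst, const char *src) {
        char *d = dst;
        while ((*d = *src++) != '\0')
            d++;
    #ifdef USE_AS_STPCPY
        return d;     /* stpcpy flavour: pointer to the terminating NUL */
    #else
        return dst;   /* plain strcpy: pointer to the destination */
    #endif
    }

    /* stpcpy.c (hypothetical wrapper, analogous to sse2-stpcpy-slm.S): */
    #define USE_AS_STPCPY
    #define STRCPY stpcpy
    #include "strcpy_template.c"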
+*/ + +#define USE_AS_STPCPY +#define STRCPY stpcpy +#include "sse2-strcpy-slm.S" diff --git a/libc/arch-x86/silvermont/string/sse2-stpncpy-slm.S b/libc/arch-x86/silvermont/string/sse2-stpncpy-slm.S new file mode 100644 index 000000000..af5c0d362 --- /dev/null +++ b/libc/arch-x86/silvermont/string/sse2-stpncpy-slm.S @@ -0,0 +1,34 @@ +/* +Copyright (c) 2014, Intel Corporation +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#define USE_AS_STRNCPY +#define USE_AS_STPCPY +#define STRCPY stpncpy +#include "sse2-strcpy-slm.S" diff --git a/libc/arch-x86/silvermont/string/sse2-strcpy-slm.S b/libc/arch-x86/silvermont/string/sse2-strcpy-slm.S new file mode 100755 index 000000000..b5d84b51f --- /dev/null +++ b/libc/arch-x86/silvermont/string/sse2-strcpy-slm.S @@ -0,0 +1,2157 @@ +/* +Copyright (c) 2014, Intel Corporation +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef L +# define L(label) .L##label +#endif + +#ifndef cfi_startproc +# define cfi_startproc .cfi_startproc +#endif + +#ifndef cfi_endproc +# define cfi_endproc .cfi_endproc +#endif + +#ifndef cfi_rel_offset +# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off +#endif + +#ifndef cfi_restore +# define cfi_restore(reg) .cfi_restore reg +#endif + +#ifndef cfi_adjust_cfa_offset +# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off +#endif + +#ifndef ENTRY +# define ENTRY(name) \ + .type name, @function; \ + .globl name; \ + .p2align 4; \ +name: \ + cfi_startproc +#endif + +#ifndef END +# define END(name) \ + cfi_endproc; \ + .size name, .-name +#endif + +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) + +#ifndef STRCPY +# define STRCPY strcpy +#endif + +#ifdef USE_AS_STPNCPY +# define USE_AS_STRNCPY +# define USE_AS_STPCPY +#endif + +#ifdef USE_AS_STRNCPY +# define PARMS 16 +# define ENTRANCE PUSH(%ebx); PUSH(%esi); PUSH(%edi) +# define RETURN POP(%edi); POP(%esi); POP(%ebx); ret; CFI_PUSH(%ebx); CFI_PUSH(%edi); CFI_PUSH(%edi); +#else +# define PARMS 12 +# define ENTRANCE PUSH(%esi); PUSH(%edi) +# define RETURN POP(%edi); POP(%esi); ret; CFI_PUSH(%esi); CFI_PUSH(%edi); +#endif + +#define STR1 PARMS +#define STR2 STR1+4 +#define LEN STR2+4 + + +#if (defined SHARED || defined __PIC__) +# define JMPTBL(I, B) I - B + +/* Load an entry in a jump table into ECX and branch to it. TABLE is a + jump table with relative offsets. INDEX is a register contains the + index into the jump table. SCALE is the scale of INDEX. */ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + /* We first load PC into ECX. */ \ + call __x86.get_pc_thunk.cx; \ + /* Get the address of the jump table. */ \ + addl $(TABLE - .), %ecx; \ + /* Get the entry and convert the relative offset to the \ + absolute address. */ \ + addl (%ecx,INDEX,SCALE), %ecx; \ + /* We loaded the jump table and adjuested ECX. Go. */ \ + jmp *%ecx +#else +# define JMPTBL(I, B) I + +/* Branch to an entry in a jump table. TABLE is a jump table with + absolute offsets. INDEX is a register contains the index into the + jump table. SCALE is the scale of INDEX. 
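   In the PIC case above each table entry is stored as JMPTBL(I, B) = I - B, i.e. as an offset relative to the table itself, and the absolute target is recomputed at run time from the table address obtained via __x86.get_pc_thunk.cx; the macro that follows here simply jumps through a table of absolute addresses. A rough C analogue of the relative-offset dispatch (illustrative only; names are invented, and the pointer/integer casts are merely a sketch for common 32-bit ABIs):

       #include <stdint.h>
       #include <stdio.h>

       static void exit1(void) { puts("tail of 1 byte");  }
       static void exit2(void) { puts("tail of 2 bytes"); }
       static void exit3(void) { puts("tail of 3 bytes"); }

       typedef void (*handler)(void);
       static intptr_t table[3];           /* each entry holds target - table */

       int main(void) {
           handler targets[] = { exit1, exit2, exit3 };
           for (int i = 0; i < 3; ++i)
               table[i] = (intptr_t)targets[i] - (intptr_t)table;

           int index = 1;                  /* plays the role of INDEX */
           handler h = (handler)((intptr_t)table + table[index]);
           h();                            /* add (%ecx,INDEX,SCALE), %ecx; jmp *%ecx */
           return 0;
       }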
*/ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + jmp *TABLE(,INDEX,SCALE) +#endif + +.text +ENTRY (STRCPY) + ENTRANCE + mov STR1(%esp), %edi + mov STR2(%esp), %esi +#ifdef USE_AS_STRNCPY + movl LEN(%esp), %ebx + test %ebx, %ebx + jz L(ExitZero) +#endif + + mov %esi, %ecx +#ifndef USE_AS_STPCPY + mov %edi, %eax /* save result */ +#endif + and $15, %ecx + jz L(SourceStringAlignmentZero) + + and $-16, %esi + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + pcmpeqb (%esi), %xmm1 +#ifdef USE_AS_STRNCPY + add %ecx, %ebx +#endif + pmovmskb %xmm1, %edx + shr %cl, %edx +#ifdef USE_AS_STRNCPY +#ifdef USE_AS_STPCPY + cmp $16, %ebx + jbe L(CopyFrom1To16BytesTailCase2OrCase3) +#else + cmp $17, %ebx + jbe L(CopyFrom1To16BytesTailCase2OrCase3) +#endif +#endif + test %edx, %edx + jnz L(CopyFrom1To16BytesTail) + + pcmpeqb 16(%esi), %xmm0 + pmovmskb %xmm0, %edx +#ifdef USE_AS_STRNCPY +#ifdef USE_AS_STPCPY + cmp $32, %ebx + jbe L(CopyFrom1To32BytesCase2OrCase3) +#else + cmp $33, %ebx + jbe L(CopyFrom1To32BytesCase2OrCase3) +#endif +#endif + test %edx, %edx + jnz L(CopyFrom1To32Bytes) + + movdqu (%esi, %ecx), %xmm1 /* copy 16 bytes */ + movdqu %xmm1, (%edi) + + sub %ecx, %edi + mov %edi, %edx + mov $16, %ecx + and $15, %edx + jz L(Align16Both) + +/* If source adress alignment != destination adress alignment */ + .p2align 4 +L(Unalign16Both): + movdqa (%esi, %ecx), %xmm1 + movaps 16(%esi, %ecx), %xmm2 + movdqu %xmm1, (%edi, %ecx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx +#ifdef USE_AS_STRNCPY + sub $48, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm2) +#else + test %edx, %edx + jnz L(CopyFrom1To16Bytes) +#endif + + movaps 16(%esi, %ecx), %xmm3 + movdqu %xmm2, (%edi, %ecx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx +#ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm3) +#else + test %edx, %edx + jnz L(CopyFrom1To16Bytes) +#endif + + movaps 16(%esi, %ecx), %xmm4 + movdqu %xmm3, (%edi, %ecx) + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx +#ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm4) +#else + test %edx, %edx + jnz L(CopyFrom1To16Bytes) +#endif + + movaps 16(%esi, %ecx), %xmm1 + movdqu %xmm4, (%edi, %ecx) + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx +#ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm1) +#else + test %edx, %edx + jnz L(CopyFrom1To16Bytes) +#endif + + movaps 16(%esi, %ecx), %xmm2 + movdqu %xmm1, (%edi, %ecx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx +#ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm2) +#else + test %edx, %edx + jnz L(CopyFrom1To16Bytes) +#endif + + movaps 16(%esi, %ecx), %xmm3 + movdqu %xmm2, (%edi, %ecx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx +#ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm3) +#else + test %edx, %edx + jnz L(CopyFrom1To16Bytes) +#endif + + movdqu %xmm3, (%edi, %ecx) + mov %esi, %edx + lea 16(%esi, %ecx), %esi + and $-0x40, %esi + sub %esi, %edx + sub %edx, %edi +#ifdef USE_AS_STRNCPY + lea 64+64(%ebx, %edx), %ebx +#endif +L(Unaligned64Loop): + movaps (%esi), %xmm2 + 
movaps %xmm2, %xmm4 + movaps 16(%esi), %xmm5 + movaps 32(%esi), %xmm3 + movaps %xmm3, %xmm6 + movaps 48(%esi), %xmm7 + pminub %xmm5, %xmm2 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx +#ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(UnalignedLeaveCase2OrCase3) +#endif + test %edx, %edx + jnz L(Unaligned64Leave) + +L(Unaligned64Loop_start): + add $64, %edi + add $64, %esi + movdqu %xmm4, -64(%edi) + movaps (%esi), %xmm2 + movdqa %xmm2, %xmm4 + movdqu %xmm5, -48(%edi) + movaps 16(%esi), %xmm5 + pminub %xmm5, %xmm2 + movaps 32(%esi), %xmm3 + movdqu %xmm6, -32(%edi) + movaps %xmm3, %xmm6 + movdqu %xmm7, -16(%edi) + movaps 48(%esi), %xmm7 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx +#ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(UnalignedLeaveCase2OrCase3) +#endif + test %edx, %edx + jz L(Unaligned64Loop_start) + +L(Unaligned64Leave): + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + pcmpeqb %xmm4, %xmm0 + pcmpeqb %xmm5, %xmm1 + pmovmskb %xmm0, %edx + pmovmskb %xmm1, %ecx + test %edx, %edx + jnz L(CopyFrom1To16BytesUnaligned_0) + test %ecx, %ecx + jnz L(CopyFrom1To16BytesUnaligned_16) + + pcmpeqb %xmm6, %xmm0 + pcmpeqb %xmm7, %xmm1 + pmovmskb %xmm0, %edx + pmovmskb %xmm1, %ecx + test %edx, %edx + jnz L(CopyFrom1To16BytesUnaligned_32) + + bsf %ecx, %edx + movdqu %xmm4, (%edi) + movdqu %xmm5, 16(%edi) + movdqu %xmm6, 32(%edi) +#ifdef USE_AS_STRNCPY +#ifdef USE_AS_STPCPY + lea 48(%edi, %edx), %eax +#endif + movdqu %xmm7, 48(%edi) + add $15, %ebx + sub %edx, %ebx + lea 49(%edi, %edx), %edi + jmp L(StrncpyFillTailWithZero) +#else + add $48, %esi + add $48, %edi + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) +#endif + +/* If source adress alignment == destination adress alignment */ + +L(SourceStringAlignmentZero): + pxor %xmm0, %xmm0 + movdqa (%esi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx + +#ifdef USE_AS_STRNCPY +#ifdef USE_AS_STPCPY + cmp $16, %ebx + jbe L(CopyFrom1To16BytesTail1Case2OrCase3) +#else + cmp $17, %ebx + jbe L(CopyFrom1To16BytesTail1Case2OrCase3) +#endif +#endif + test %edx, %edx + jnz L(CopyFrom1To16BytesTail1) + + pcmpeqb 16(%esi), %xmm0 + movdqu %xmm1, (%edi) + pmovmskb %xmm0, %edx + +#ifdef USE_AS_STRNCPY +#ifdef USE_AS_STPCPY + cmp $32, %ebx + jbe L(CopyFrom1To32Bytes1Case2OrCase3) +#else + cmp $33, %ebx + jbe L(CopyFrom1To32Bytes1Case2OrCase3) +#endif +#endif + test %edx, %edx + jnz L(CopyFrom1To32Bytes1) + + mov %edi, %edx + mov $16, %ecx + and $15, %edx + jnz L(Unalign16Both) + +L(Align16Both): + movdqa (%esi, %ecx), %xmm1 + movdqa 16(%esi, %ecx), %xmm2 + movdqa %xmm1, (%edi, %ecx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx +#ifdef USE_AS_STRNCPY + sub $48, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesXmm2) +#else + test %edx, %edx + jnz L(CopyFrom1To16Bytes) +#endif + + movdqa 16(%esi, %ecx), %xmm3 + movdqa %xmm2, (%edi, %ecx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + lea 16(%ecx), %ecx +#ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesXmm3) +#else + test %edx, %edx + jnz L(CopyFrom1To16Bytes) +#endif + + movdqa 16(%esi, %ecx), %xmm4 + movdqa %xmm3, (%edi, %ecx) + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %edx + lea 16(%ecx), %ecx +#ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesXmm4) +#else + test %edx, %edx + jnz L(CopyFrom1To16Bytes) +#endif + + movdqa 16(%esi, %ecx), 
%xmm1 + movdqa %xmm4, (%edi, %ecx) + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx + lea 16(%ecx), %ecx +#ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesXmm1) +#else + test %edx, %edx + jnz L(CopyFrom1To16Bytes) +#endif + + movdqa 16(%esi, %ecx), %xmm2 + movdqa %xmm1, (%edi, %ecx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %edx + lea 16(%ecx), %ecx +#ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesXmm2) +#else + test %edx, %edx + jnz L(CopyFrom1To16Bytes) +#endif + + movdqa 16(%esi, %ecx), %xmm3 + movdqa %xmm2, (%edi, %ecx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + lea 16(%ecx), %ecx +#ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesXmm3) +#else + test %edx, %edx + jnz L(CopyFrom1To16Bytes) +#endif + + movdqa %xmm3, (%edi, %ecx) + mov %esi, %edx + lea 16(%esi, %ecx), %esi + and $-0x40, %esi + sub %esi, %edx + sub %edx, %edi +#ifdef USE_AS_STRNCPY + lea 64+64(%ebx, %edx), %ebx +#endif +L(Aligned64Loop): + movdqa (%esi), %xmm2 + movdqa %xmm2, %xmm4 + movaps 16(%esi), %xmm5 + movdqa 32(%esi), %xmm3 + movdqa %xmm3, %xmm6 + movaps 48(%esi), %xmm7 + pminub %xmm5, %xmm2 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx +#ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(AlignedLeaveCase2OrCase3) +#endif + test %edx, %edx + jnz L(Aligned64Leave) + +L(Aligned64Loop_start): + add $64, %esi + add $64, %edi + movaps %xmm4, -64(%edi) + movdqa (%esi), %xmm2 + movdqa %xmm2, %xmm4 + movaps %xmm5, -48(%edi) + movaps 16(%esi), %xmm5 + pminub %xmm5, %xmm2 + movaps 32(%esi), %xmm3 + movaps %xmm6, -32(%edi) + movdqa %xmm3, %xmm6 + movaps %xmm7, -16(%edi) + movaps 48(%esi), %xmm7 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx +#ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(AlignedLeaveCase2OrCase3) +#endif + test %edx, %edx + jz L(Aligned64Loop_start) + +L(Aligned64Leave): + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + pcmpeqb %xmm4, %xmm0 + pcmpeqb %xmm5, %xmm1 + pmovmskb %xmm0, %edx + pmovmskb %xmm1, %ecx + test %edx, %edx + jnz L(CopyFrom1To16Bytes_0) + test %ecx, %ecx + jnz L(CopyFrom1To16Bytes_16) + + pcmpeqb %xmm6, %xmm0 + pcmpeqb %xmm7, %xmm1 + pmovmskb %xmm0, %edx + pmovmskb %xmm1, %ecx + test %edx, %edx + jnz L(CopyFrom1To16Bytes_32) + + bsf %ecx, %edx + movdqa %xmm4, (%edi) + movdqa %xmm5, 16(%edi) + movdqa %xmm6, 32(%edi) +#ifdef USE_AS_STRNCPY +#ifdef USE_AS_STPCPY + lea 48(%edi, %edx), %eax +#endif + movdqa %xmm7, 48(%edi) + add $15, %ebx + sub %edx, %ebx + lea 49(%edi, %edx), %edi + jmp L(StrncpyFillTailWithZero) +#else + add $48, %esi + add $48, %edi + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) +#endif + +/*----------------------------------------------------*/ + +/* Case1 */ +#ifndef USE_AS_STRNCPY + .p2align 4 +L(CopyFrom1To16Bytes): + add %ecx, %edi + add %ecx, %esi + bsf %edx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) +#endif + .p2align 4 +L(CopyFrom1To16BytesTail): +#ifdef USE_AS_STRNCPY + sub %ecx, %ebx +#endif + add %ecx, %esi + bsf %edx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1): + add $16, %esi + add $16, %edi +#ifdef USE_AS_STRNCPY + sub $16, %ebx +#endif +L(CopyFrom1To16BytesTail1): + bsf %edx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes): +#ifdef USE_AS_STRNCPY + sub %ecx, %ebx 
+#endif + bsf %edx, %edx + add %ecx, %esi + add $16, %edx + sub %ecx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To16Bytes_0): + bsf %edx, %edx +#ifdef USE_AS_STRNCPY +#ifdef USE_AS_STPCPY + lea (%edi, %edx), %eax +#endif + movdqa %xmm4, (%edi) + add $63, %ebx + sub %edx, %ebx + lea 1(%edi, %edx), %edi + jmp L(StrncpyFillTailWithZero) +#else + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) +#endif + + .p2align 4 +L(CopyFrom1To16Bytes_16): + bsf %ecx, %edx + movdqa %xmm4, (%edi) +#ifdef USE_AS_STRNCPY +#ifdef USE_AS_STPCPY + lea 16(%edi, %edx), %eax +#endif + movdqa %xmm5, 16(%edi) + add $47, %ebx + sub %edx, %ebx + lea 17(%edi, %edx), %edi + jmp L(StrncpyFillTailWithZero) +#else + add $16, %esi + add $16, %edi + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) +#endif + + .p2align 4 +L(CopyFrom1To16Bytes_32): + bsf %edx, %edx + movdqa %xmm4, (%edi) + movdqa %xmm5, 16(%edi) +#ifdef USE_AS_STRNCPY +#ifdef USE_AS_STPCPY + lea 32(%edi, %edx), %eax +#endif + movdqa %xmm6, 32(%edi) + add $31, %ebx + sub %edx, %ebx + lea 33(%edi, %edx), %edi + jmp L(StrncpyFillTailWithZero) +#else + add $32, %esi + add $32, %edi + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) +#endif + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_0): + bsf %edx, %edx +#ifdef USE_AS_STRNCPY +#ifdef USE_AS_STPCPY + lea (%edi, %edx), %eax +#endif + movdqu %xmm4, (%edi) + add $63, %ebx + sub %edx, %ebx + lea 1(%edi, %edx), %edi + jmp L(StrncpyFillTailWithZero) +#else + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) +#endif + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_16): + bsf %ecx, %edx + movdqu %xmm4, (%edi) +#ifdef USE_AS_STRNCPY +#ifdef USE_AS_STPCPY + lea 16(%edi, %edx), %eax +#endif + movdqu %xmm5, 16(%edi) + add $47, %ebx + sub %edx, %ebx + lea 17(%edi, %edx), %edi + jmp L(StrncpyFillTailWithZero) +#else + add $16, %esi + add $16, %edi + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) +#endif + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_32): + bsf %edx, %edx + movdqu %xmm4, (%edi) + movdqu %xmm5, 16(%edi) +#ifdef USE_AS_STRNCPY +#ifdef USE_AS_STPCPY + lea 32(%edi, %edx), %eax +#endif + movdqu %xmm6, 32(%edi) + add $31, %ebx + sub %edx, %ebx + lea 33(%edi, %edx), %edi + jmp L(StrncpyFillTailWithZero) +#else + add $32, %esi + add $32, %edi + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) +#endif + +#ifdef USE_AS_STRNCPY + .p2align 4 +L(CopyFrom1To16BytesXmm6): + movdqa %xmm6, (%edi, %ecx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesXmm5): + movdqa %xmm5, (%edi, %ecx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesXmm4): + movdqa %xmm4, (%edi, %ecx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesXmm3): + movdqa %xmm3, (%edi, %ecx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesXmm2): + movdqa %xmm2, (%edi, %ecx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesXmm1): + movdqa %xmm1, (%edi, %ecx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm6): + movdqu %xmm6, (%edi, %ecx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm5): + movdqu %xmm5, (%edi, %ecx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm4): + movdqu %xmm4, (%edi, %ecx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm3): + movdqu %xmm3, (%edi, %ecx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm1): + movdqu %xmm1, (%edi, %ecx) + jmp 
L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesExit): + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + +/* Case2 */ + + .p2align 4 +L(CopyFrom1To16BytesCase2): + add $16, %ebx + add %ecx, %edi + add %ecx, %esi + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2): + sub %ecx, %ebx + add %ecx, %esi + bsf %edx, %edx + add $16, %edx + sub %ecx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + +L(CopyFrom1To16BytesTailCase2): + sub %ecx, %ebx + add %ecx, %esi + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + +L(CopyFrom1To16BytesTail1Case2): + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + +/* Case2 or Case3, Case3 */ + + .p2align 4 +L(CopyFrom1To16BytesCase2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To16BytesCase2) +L(CopyFrom1To16BytesCase3): + add $16, %ebx + add %ecx, %edi + add %ecx, %esi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To32BytesCase2) + sub %ecx, %ebx + add %ecx, %esi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To16BytesTailCase2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To16BytesTailCase2) + sub %ecx, %ebx + add %ecx, %esi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1Case2OrCase3): + add $16, %edi + add $16, %esi + sub $16, %ebx +L(CopyFrom1To16BytesTail1Case2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To16BytesTail1Case2) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + +#endif + +/*-----------------------------------------------------------------*/ + .p2align 4 +L(Exit0): +#ifdef USE_AS_STPCPY + mov %edi, %eax +#endif + RETURN + + .p2align 4 +L(Exit1): + movb %dh, (%edi) +#ifdef USE_AS_STPCPY + lea (%edi), %eax +#endif +#ifdef USE_AS_STRNCPY + sub $1, %ebx + lea 1(%edi), %edi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit2): + movw (%esi), %dx + movw %dx, (%edi) +#ifdef USE_AS_STPCPY + lea 1(%edi), %eax +#endif +#ifdef USE_AS_STRNCPY + sub $2, %ebx + lea 2(%edi), %edi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit3): + movw (%esi), %cx + movw %cx, (%edi) + movb %dh, 2(%edi) +#ifdef USE_AS_STPCPY + lea 2(%edi), %eax +#endif +#ifdef USE_AS_STRNCPY + sub $3, %ebx + lea 3(%edi), %edi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit4): + movl (%esi), %edx + movl %edx, (%edi) +#ifdef USE_AS_STPCPY + lea 3(%edi), %eax +#endif +#ifdef USE_AS_STRNCPY + sub $4, %ebx + lea 4(%edi), %edi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit5): + movl (%esi), %ecx + movb %dh, 4(%edi) + movl %ecx, (%edi) +#ifdef USE_AS_STPCPY + lea 4(%edi), %eax +#endif +#ifdef USE_AS_STRNCPY + sub $5, %ebx + lea 5(%edi), %edi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit6): + movl (%esi), %ecx + movw 4(%esi), %dx + movl %ecx, (%edi) + movw %dx, 4(%edi) +#ifdef USE_AS_STPCPY + lea 5(%edi), %eax +#endif +#ifdef USE_AS_STRNCPY + sub $6, %ebx + lea 6(%edi), %edi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit7): + movl (%esi), %ecx + movl 3(%esi), %edx + movl %ecx, (%edi) + movl %edx, 3(%edi) +#ifdef USE_AS_STPCPY + lea 6(%edi), %eax 
+#endif +#ifdef USE_AS_STRNCPY + sub $7, %ebx + lea 7(%edi), %edi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit8): + movlpd (%esi), %xmm0 + movlpd %xmm0, (%edi) +#ifdef USE_AS_STPCPY + lea 7(%edi), %eax +#endif +#ifdef USE_AS_STRNCPY + sub $8, %ebx + lea 8(%edi), %edi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit9): + movlpd (%esi), %xmm0 + movb %dh, 8(%edi) + movlpd %xmm0, (%edi) +#ifdef USE_AS_STPCPY + lea 8(%edi), %eax +#endif +#ifdef USE_AS_STRNCPY + sub $9, %ebx + lea 9(%edi), %edi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit10): + movlpd (%esi), %xmm0 + movw 8(%esi), %dx + movlpd %xmm0, (%edi) + movw %dx, 8(%edi) +#ifdef USE_AS_STPCPY + lea 9(%edi), %eax +#endif +#ifdef USE_AS_STRNCPY + sub $10, %ebx + lea 10(%edi), %edi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit11): + movlpd (%esi), %xmm0 + movl 7(%esi), %edx + movlpd %xmm0, (%edi) + movl %edx, 7(%edi) +#ifdef USE_AS_STPCPY + lea 10(%edi), %eax +#endif +#ifdef USE_AS_STRNCPY + sub $11, %ebx + lea 11(%edi), %edi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit12): + movlpd (%esi), %xmm0 + movl 8(%esi), %edx + movlpd %xmm0, (%edi) + movl %edx, 8(%edi) +#ifdef USE_AS_STPCPY + lea 11(%edi), %eax +#endif +#ifdef USE_AS_STRNCPY + sub $12, %ebx + lea 12(%edi), %edi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit13): + movlpd (%esi), %xmm0 + movlpd 5(%esi), %xmm1 + movlpd %xmm0, (%edi) + movlpd %xmm1, 5(%edi) +#ifdef USE_AS_STPCPY + lea 12(%edi), %eax +#endif +#ifdef USE_AS_STRNCPY + sub $13, %ebx + lea 13(%edi), %edi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit14): + movlpd (%esi), %xmm0 + movlpd 6(%esi), %xmm1 + movlpd %xmm0, (%edi) + movlpd %xmm1, 6(%edi) +#ifdef USE_AS_STPCPY + lea 13(%edi), %eax +#endif +#ifdef USE_AS_STRNCPY + sub $14, %ebx + lea 14(%edi), %edi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit15): + movlpd (%esi), %xmm0 + movlpd 7(%esi), %xmm1 + movlpd %xmm0, (%edi) + movlpd %xmm1, 7(%edi) +#ifdef USE_AS_STPCPY + lea 14(%edi), %eax +#endif +#ifdef USE_AS_STRNCPY + sub $15, %ebx + lea 15(%edi), %edi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit16): + movdqu (%esi), %xmm0 + movdqu %xmm0, (%edi) +#ifdef USE_AS_STPCPY + lea 15(%edi), %eax +#endif +#ifdef USE_AS_STRNCPY + sub $16, %ebx + lea 16(%edi), %edi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit17): + movdqu (%esi), %xmm0 + xor %cl, %cl + movdqu %xmm0, (%edi) + movb %cl, 16(%edi) +#ifdef USE_AS_STPCPY + lea 16(%edi), %eax +#endif +#ifdef USE_AS_STRNCPY + sub $17, %ebx + lea 17(%edi), %edi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit18): + movdqu (%esi), %xmm0 + movw 16(%esi), %cx + movdqu %xmm0, (%edi) + movw %cx, 16(%edi) +#ifdef USE_AS_STPCPY + lea 17(%edi), %eax +#endif +#ifdef USE_AS_STRNCPY + sub $18, %ebx + lea 18(%edi), %edi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit19): + movdqu (%esi), %xmm0 + movl 15(%esi), %ecx + movdqu %xmm0, (%edi) + movl %ecx, 15(%edi) +#ifdef USE_AS_STPCPY + lea 18(%edi), %eax +#endif +#ifdef USE_AS_STRNCPY + sub $19, %ebx + lea 19(%edi), %edi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit20): + movdqu (%esi), %xmm0 + movl 16(%esi), %ecx + movdqu %xmm0, (%edi) + movl %ecx, 16(%edi) +#ifdef USE_AS_STPCPY + lea 19(%edi), %eax +#endif +#ifdef USE_AS_STRNCPY + sub $20, %ebx + lea 20(%edi), %edi + 
jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit21): + movdqu (%esi), %xmm0 + movl 16(%esi), %ecx + xor %dl, %dl + movdqu %xmm0, (%edi) + movl %ecx, 16(%edi) + movb %dl, 20(%edi) +#ifdef USE_AS_STPCPY + lea 20(%edi), %eax +#endif +#ifdef USE_AS_STRNCPY + sub $21, %ebx + lea 21(%edi), %edi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit22): + movdqu (%esi), %xmm0 + movlpd 14(%esi), %xmm3 + movdqu %xmm0, (%edi) + movlpd %xmm3, 14(%edi) +#ifdef USE_AS_STPCPY + lea 21(%edi), %eax +#endif +#ifdef USE_AS_STRNCPY + sub $22, %ebx + lea 22(%edi), %edi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit23): + movdqu (%esi), %xmm0 + movlpd 15(%esi), %xmm3 + movdqu %xmm0, (%edi) + movlpd %xmm3, 15(%edi) +#ifdef USE_AS_STPCPY + lea 22(%edi), %eax +#endif +#ifdef USE_AS_STRNCPY + sub $23, %ebx + lea 23(%edi), %edi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit24): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) +#ifdef USE_AS_STPCPY + lea 23(%edi), %eax +#endif +#ifdef USE_AS_STRNCPY + sub $24, %ebx + lea 24(%edi), %edi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit25): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + xor %cl, %cl + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movb %cl, 24(%edi) +#ifdef USE_AS_STPCPY + lea 24(%edi), %eax +#endif +#ifdef USE_AS_STRNCPY + sub $25, %ebx + lea 25(%edi), %edi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit26): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movw 24(%esi), %cx + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movw %cx, 24(%edi) +#ifdef USE_AS_STPCPY + lea 25(%edi), %eax +#endif +#ifdef USE_AS_STRNCPY + sub $26, %ebx + lea 26(%edi), %edi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit27): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movl 23(%esi), %ecx + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movl %ecx, 23(%edi) +#ifdef USE_AS_STPCPY + lea 26(%edi), %eax +#endif +#ifdef USE_AS_STRNCPY + sub $27, %ebx + lea 27(%edi), %edi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit28): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movl 24(%esi), %ecx + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movl %ecx, 24(%edi) +#ifdef USE_AS_STPCPY + lea 27(%edi), %eax +#endif +#ifdef USE_AS_STRNCPY + sub $28, %ebx + lea 28(%edi), %edi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit29): + movdqu (%esi), %xmm0 + movdqu 13(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 13(%edi) +#ifdef USE_AS_STPCPY + lea 28(%edi), %eax +#endif +#ifdef USE_AS_STRNCPY + sub $29, %ebx + lea 29(%edi), %edi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit30): + movdqu (%esi), %xmm0 + movdqu 14(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 14(%edi) +#ifdef USE_AS_STPCPY + lea 29(%edi), %eax +#endif +#ifdef USE_AS_STRNCPY + sub $30, %ebx + lea 30(%edi), %edi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + + .p2align 4 +L(Exit31): + movdqu (%esi), %xmm0 + movdqu 15(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 15(%edi) +#ifdef USE_AS_STPCPY + lea 30(%edi), %eax +#endif +#ifdef USE_AS_STRNCPY + sub $31, %ebx + lea 31(%edi), %edi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit32): + movdqu (%esi), %xmm0 + movdqu 16(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 16(%edi) +#ifdef USE_AS_STPCPY + lea 31(%edi), %eax +#endif +#ifdef 
USE_AS_STRNCPY + sub $32, %ebx + lea 32(%edi), %edi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + +#ifdef USE_AS_STRNCPY + + .p2align 4 +L(StrncpyExit1): + movb (%esi), %dl + movb %dl, (%edi) +#ifdef USE_AS_STPCPY + lea 1(%edi), %eax +#endif + RETURN + + .p2align 4 +L(StrncpyExit2): + movw (%esi), %dx + movw %dx, (%edi) +#ifdef USE_AS_STPCPY + lea 2(%edi), %eax +#endif + RETURN + .p2align 4 +L(StrncpyExit3): + movw (%esi), %cx + movb 2(%esi), %dl + movw %cx, (%edi) + movb %dl, 2(%edi) +#ifdef USE_AS_STPCPY + lea 3(%edi), %eax +#endif + RETURN + + .p2align 4 +L(StrncpyExit4): + movl (%esi), %edx + movl %edx, (%edi) +#ifdef USE_AS_STPCPY + lea 4(%edi), %eax +#endif + RETURN + + .p2align 4 +L(StrncpyExit5): + movl (%esi), %ecx + movb 4(%esi), %dl + movl %ecx, (%edi) + movb %dl, 4(%edi) +#ifdef USE_AS_STPCPY + lea 5(%edi), %eax +#endif + RETURN + + .p2align 4 +L(StrncpyExit6): + movl (%esi), %ecx + movw 4(%esi), %dx + movl %ecx, (%edi) + movw %dx, 4(%edi) +#ifdef USE_AS_STPCPY + lea 6(%edi), %eax +#endif + RETURN + + .p2align 4 +L(StrncpyExit7): + movl (%esi), %ecx + movl 3(%esi), %edx + movl %ecx, (%edi) + movl %edx, 3(%edi) +#ifdef USE_AS_STPCPY + lea 7(%edi), %eax +#endif + RETURN + + .p2align 4 +L(StrncpyExit8): + movlpd (%esi), %xmm0 + movlpd %xmm0, (%edi) +#ifdef USE_AS_STPCPY + lea 8(%edi), %eax +#endif + RETURN + + .p2align 4 +L(StrncpyExit9): + movlpd (%esi), %xmm0 + movb 8(%esi), %dl + movlpd %xmm0, (%edi) + movb %dl, 8(%edi) +#ifdef USE_AS_STPCPY + lea 9(%edi), %eax +#endif + RETURN + + .p2align 4 +L(StrncpyExit10): + movlpd (%esi), %xmm0 + movw 8(%esi), %dx + movlpd %xmm0, (%edi) + movw %dx, 8(%edi) +#ifdef USE_AS_STPCPY + lea 10(%edi), %eax +#endif + RETURN + + .p2align 4 +L(StrncpyExit11): + movlpd (%esi), %xmm0 + movl 7(%esi), %edx + movlpd %xmm0, (%edi) + movl %edx, 7(%edi) +#ifdef USE_AS_STPCPY + lea 11(%edi), %eax +#endif + RETURN + + .p2align 4 +L(StrncpyExit12): + movlpd (%esi), %xmm0 + movl 8(%esi), %edx + movlpd %xmm0, (%edi) + movl %edx, 8(%edi) +#ifdef USE_AS_STPCPY + lea 12(%edi), %eax +#endif + RETURN + + .p2align 4 +L(StrncpyExit13): + movlpd (%esi), %xmm0 + movlpd 5(%esi), %xmm1 + movlpd %xmm0, (%edi) + movlpd %xmm1, 5(%edi) +#ifdef USE_AS_STPCPY + lea 13(%edi), %eax +#endif + RETURN + + .p2align 4 +L(StrncpyExit14): + movlpd (%esi), %xmm0 + movlpd 6(%esi), %xmm1 + movlpd %xmm0, (%edi) + movlpd %xmm1, 6(%edi) +#ifdef USE_AS_STPCPY + lea 14(%edi), %eax +#endif + RETURN + + .p2align 4 +L(StrncpyExit15): + movlpd (%esi), %xmm0 + movlpd 7(%esi), %xmm1 + movlpd %xmm0, (%edi) + movlpd %xmm1, 7(%edi) +#ifdef USE_AS_STPCPY + lea 15(%edi), %eax +#endif + RETURN + + .p2align 4 +L(StrncpyExit16): + movdqu (%esi), %xmm0 + movdqu %xmm0, (%edi) +#ifdef USE_AS_STPCPY + lea 16(%edi), %eax +#endif + RETURN + + .p2align 4 +L(StrncpyExit17): + movdqu (%esi), %xmm0 + movb 16(%esi), %cl + movdqu %xmm0, (%edi) + movb %cl, 16(%edi) +#ifdef USE_AS_STPCPY + lea 17(%edi), %eax +#endif + RETURN + + .p2align 4 +L(StrncpyExit18): + movdqu (%esi), %xmm0 + movw 16(%esi), %cx + movdqu %xmm0, (%edi) + movw %cx, 16(%edi) +#ifdef USE_AS_STPCPY + lea 18(%edi), %eax +#endif + RETURN + + .p2align 4 +L(StrncpyExit19): + movdqu (%esi), %xmm0 + movl 15(%esi), %ecx + movdqu %xmm0, (%edi) + movl %ecx, 15(%edi) +#ifdef USE_AS_STPCPY + lea 19(%edi), %eax +#endif + RETURN + + .p2align 4 +L(StrncpyExit20): + movdqu (%esi), %xmm0 + movl 16(%esi), %ecx + movdqu %xmm0, (%edi) + movl %ecx, 16(%edi) +#ifdef USE_AS_STPCPY + lea 20(%edi), %eax +#endif + RETURN + + .p2align 4 +L(StrncpyExit21): + movdqu (%esi), 
%xmm0 + movl 16(%esi), %ecx + movb 20(%esi), %dl + movdqu %xmm0, (%edi) + movl %ecx, 16(%edi) + movb %dl, 20(%edi) +#ifdef USE_AS_STPCPY + lea 21(%edi), %eax +#endif + RETURN + + .p2align 4 +L(StrncpyExit22): + movdqu (%esi), %xmm0 + movlpd 14(%esi), %xmm3 + movdqu %xmm0, (%edi) + movlpd %xmm3, 14(%edi) +#ifdef USE_AS_STPCPY + lea 22(%edi), %eax +#endif + RETURN + + .p2align 4 +L(StrncpyExit23): + movdqu (%esi), %xmm0 + movlpd 15(%esi), %xmm3 + movdqu %xmm0, (%edi) + movlpd %xmm3, 15(%edi) +#ifdef USE_AS_STPCPY + lea 23(%edi), %eax +#endif + RETURN + + .p2align 4 +L(StrncpyExit24): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) +#ifdef USE_AS_STPCPY + lea 24(%edi), %eax +#endif + RETURN + + .p2align 4 +L(StrncpyExit25): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movb 24(%esi), %cl + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movb %cl, 24(%edi) +#ifdef USE_AS_STPCPY + lea 25(%edi), %eax +#endif + RETURN + + .p2align 4 +L(StrncpyExit26): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movw 24(%esi), %cx + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movw %cx, 24(%edi) +#ifdef USE_AS_STPCPY + lea 26(%edi), %eax +#endif + RETURN + + .p2align 4 +L(StrncpyExit27): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movl 23(%esi), %ecx + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movl %ecx, 23(%edi) +#ifdef USE_AS_STPCPY + lea 27(%edi), %eax +#endif + RETURN + + .p2align 4 +L(StrncpyExit28): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movl 24(%esi), %ecx + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movl %ecx, 24(%edi) +#ifdef USE_AS_STPCPY + lea 28(%edi), %eax +#endif + RETURN + + .p2align 4 +L(StrncpyExit29): + movdqu (%esi), %xmm0 + movdqu 13(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 13(%edi) +#ifdef USE_AS_STPCPY + lea 29(%edi), %eax +#endif + RETURN + + .p2align 4 +L(StrncpyExit30): + movdqu (%esi), %xmm0 + movdqu 14(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 14(%edi) +#ifdef USE_AS_STPCPY + lea 30(%edi), %eax +#endif + RETURN + + .p2align 4 +L(StrncpyExit31): + movdqu (%esi), %xmm0 + movdqu 15(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 15(%edi) +#ifdef USE_AS_STPCPY + lea 31(%edi), %eax +#endif + RETURN + + .p2align 4 +L(StrncpyExit32): + movdqu (%esi), %xmm0 + movdqu 16(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 16(%edi) +#ifdef USE_AS_STPCPY + lea 32(%edi), %eax +#endif + RETURN + + .p2align 4 +L(StrncpyExit33): + movdqu (%esi), %xmm0 + movdqu 16(%esi), %xmm2 + movb 32(%esi), %cl + movdqu %xmm0, (%edi) + movdqu %xmm2, 16(%edi) + movb %cl, 32(%edi) + RETURN + + .p2align 4 +L(Fill0): + RETURN + + .p2align 4 +L(Fill1): + movb %dl, (%edi) + RETURN + + .p2align 4 +L(Fill2): + movw %dx, (%edi) + RETURN + + .p2align 4 +L(Fill3): + movl %edx, -1(%edi) + RETURN + + .p2align 4 +L(Fill4): + movl %edx, (%edi) + RETURN + + .p2align 4 +L(Fill5): + movl %edx, (%edi) + movb %dl, 4(%edi) + RETURN + + .p2align 4 +L(Fill6): + movl %edx, (%edi) + movw %dx, 4(%edi) + RETURN + + .p2align 4 +L(Fill7): + movlpd %xmm0, -1(%edi) + RETURN + + .p2align 4 +L(Fill8): + movlpd %xmm0, (%edi) + RETURN + + .p2align 4 +L(Fill9): + movlpd %xmm0, (%edi) + movb %dl, 8(%edi) + RETURN + + .p2align 4 +L(Fill10): + movlpd %xmm0, (%edi) + movw %dx, 8(%edi) + RETURN + + .p2align 4 +L(Fill11): + movlpd %xmm0, (%edi) + movl %edx, 7(%edi) + RETURN + + .p2align 4 +L(Fill12): + movlpd %xmm0, (%edi) + movl %edx, 8(%edi) + RETURN + + .p2align 4 +L(Fill13): + movlpd %xmm0, (%edi) + movlpd %xmm0, 5(%edi) + RETURN + + .p2align 
4 +L(Fill14): + movlpd %xmm0, (%edi) + movlpd %xmm0, 6(%edi) + RETURN + + .p2align 4 +L(Fill15): + movdqu %xmm0, -1(%edi) + RETURN + + .p2align 4 +L(Fill16): + movdqu %xmm0, (%edi) + RETURN + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm2): + movdqu %xmm2, (%edi, %ecx) + + .p2align 4 +L(CopyFrom1To16BytesXmmExit): + bsf %edx, %edx + add $15, %ebx + add %ecx, %edi +#ifdef USE_AS_STPCPY + lea (%edi, %edx), %eax +#endif + sub %edx, %ebx + lea 1(%edi, %edx), %edi + + .p2align 4 +L(StrncpyFillTailWithZero): + pxor %xmm0, %xmm0 + xor %edx, %edx + sub $16, %ebx + jbe L(StrncpyFillExit) + + movdqu %xmm0, (%edi) + add $16, %edi + + mov %edi, %esi + and $0xf, %esi + sub %esi, %edi + add %esi, %ebx + sub $64, %ebx + jb L(StrncpyFillLess64) + +L(StrncpyFillLoopMovdqa): + movdqa %xmm0, (%edi) + movdqa %xmm0, 16(%edi) + movdqa %xmm0, 32(%edi) + movdqa %xmm0, 48(%edi) + add $64, %edi + sub $64, %ebx + jae L(StrncpyFillLoopMovdqa) + +L(StrncpyFillLess64): + add $32, %ebx + jl L(StrncpyFillLess32) + movdqa %xmm0, (%edi) + movdqa %xmm0, 16(%edi) + add $32, %edi + sub $16, %ebx + jl L(StrncpyFillExit) + movdqa %xmm0, (%edi) + add $16, %edi + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4) + +L(StrncpyFillLess32): + add $16, %ebx + jl L(StrncpyFillExit) + movdqa %xmm0, (%edi) + add $16, %edi + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4) + +L(StrncpyFillExit): + add $16, %ebx + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4) + + .p2align 4 +L(AlignedLeaveCase2OrCase3): + test %edx, %edx + jnz L(Aligned64LeaveCase2) +L(Aligned64LeaveCase3): + lea 64(%ebx), %ecx + and $-16, %ecx + add $48, %ebx + jl L(CopyFrom1To16BytesCase3) + movdqa %xmm4, (%edi) + sub $16, %ebx + jb L(CopyFrom1To16BytesCase3) + movdqa %xmm5, 16(%edi) + sub $16, %ebx + jb L(CopyFrom1To16BytesCase3) + movdqa %xmm6, 32(%edi) + sub $16, %ebx + jb L(CopyFrom1To16BytesCase3) + movdqa %xmm7, 48(%edi) +#ifdef USE_AS_STPCPY + lea 64(%edi), %eax +#endif + RETURN + + .p2align 4 +L(Aligned64LeaveCase2): + pxor %xmm0, %xmm0 + xor %ecx, %ecx + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %edx + add $48, %ebx + jle L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesXmm4) + + pcmpeqb %xmm5, %xmm0 + pmovmskb %xmm0, %edx + movdqa %xmm4, (%edi) + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesXmm5) + + pcmpeqb %xmm6, %xmm0 + pmovmskb %xmm0, %edx + movdqa %xmm5, 16(%edi) + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesXmm6) + + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %edx + movdqa %xmm6, 32(%edi) + lea 16(%edi, %ecx), %edi + lea 16(%esi, %ecx), %esi + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + + .p2align 4 +L(UnalignedLeaveCase2OrCase3): + test %edx, %edx + jnz L(Unaligned64LeaveCase2) +L(Unaligned64LeaveCase3): + lea 64(%ebx), %ecx + and $-16, %ecx + add $48, %ebx + jl L(CopyFrom1To16BytesCase3) + movdqu %xmm4, (%edi) + sub $16, %ebx + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm5, 16(%edi) + sub $16, %ebx + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm6, 32(%edi) + sub $16, %ebx + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm7, 48(%edi) +#ifdef USE_AS_STPCPY + lea 64(%edi), %eax +#endif + RETURN + + .p2align 4 +L(Unaligned64LeaveCase2): + pxor %xmm0, %xmm0 + xor %ecx, %ecx + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %edx + add $48, %ebx + jle L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm4) + + 
pcmpeqb %xmm5, %xmm0 + pmovmskb %xmm0, %edx + movdqu %xmm4, (%edi) + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm5) + + pcmpeqb %xmm6, %xmm0 + pmovmskb %xmm0, %edx + movdqu %xmm5, 16(%edi) + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm6) + + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %edx + movdqu %xmm6, 32(%edi) + lea 16(%edi, %ecx), %edi + lea 16(%esi, %ecx), %esi + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + + .p2align 4 +L(ExitZero): + movl %edi, %eax + RETURN +#endif + +END (STRCPY) + + .p2align 4 + .section .rodata +L(ExitTable): + .int JMPTBL(L(Exit1), L(ExitTable)) + .int JMPTBL(L(Exit2), L(ExitTable)) + .int JMPTBL(L(Exit3), L(ExitTable)) + .int JMPTBL(L(Exit4), L(ExitTable)) + .int JMPTBL(L(Exit5), L(ExitTable)) + .int JMPTBL(L(Exit6), L(ExitTable)) + .int JMPTBL(L(Exit7), L(ExitTable)) + .int JMPTBL(L(Exit8), L(ExitTable)) + .int JMPTBL(L(Exit9), L(ExitTable)) + .int JMPTBL(L(Exit10), L(ExitTable)) + .int JMPTBL(L(Exit11), L(ExitTable)) + .int JMPTBL(L(Exit12), L(ExitTable)) + .int JMPTBL(L(Exit13), L(ExitTable)) + .int JMPTBL(L(Exit14), L(ExitTable)) + .int JMPTBL(L(Exit15), L(ExitTable)) + .int JMPTBL(L(Exit16), L(ExitTable)) + .int JMPTBL(L(Exit17), L(ExitTable)) + .int JMPTBL(L(Exit18), L(ExitTable)) + .int JMPTBL(L(Exit19), L(ExitTable)) + .int JMPTBL(L(Exit20), L(ExitTable)) + .int JMPTBL(L(Exit21), L(ExitTable)) + .int JMPTBL(L(Exit22), L(ExitTable)) + .int JMPTBL(L(Exit23), L(ExitTable)) + .int JMPTBL(L(Exit24), L(ExitTable)) + .int JMPTBL(L(Exit25), L(ExitTable)) + .int JMPTBL(L(Exit26), L(ExitTable)) + .int JMPTBL(L(Exit27), L(ExitTable)) + .int JMPTBL(L(Exit28), L(ExitTable)) + .int JMPTBL(L(Exit29), L(ExitTable)) + .int JMPTBL(L(Exit30), L(ExitTable)) + .int JMPTBL(L(Exit31), L(ExitTable)) + .int JMPTBL(L(Exit32), L(ExitTable)) +#ifdef USE_AS_STRNCPY +L(ExitStrncpyTable): + .int JMPTBL(L(Exit0), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable)) + .int 
JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable)) + + .p2align 4 +L(FillTable): + .int JMPTBL(L(Fill0), L(FillTable)) + .int JMPTBL(L(Fill1), L(FillTable)) + .int JMPTBL(L(Fill2), L(FillTable)) + .int JMPTBL(L(Fill3), L(FillTable)) + .int JMPTBL(L(Fill4), L(FillTable)) + .int JMPTBL(L(Fill5), L(FillTable)) + .int JMPTBL(L(Fill6), L(FillTable)) + .int JMPTBL(L(Fill7), L(FillTable)) + .int JMPTBL(L(Fill8), L(FillTable)) + .int JMPTBL(L(Fill9), L(FillTable)) + .int JMPTBL(L(Fill10), L(FillTable)) + .int JMPTBL(L(Fill11), L(FillTable)) + .int JMPTBL(L(Fill12), L(FillTable)) + .int JMPTBL(L(Fill13), L(FillTable)) + .int JMPTBL(L(Fill14), L(FillTable)) + .int JMPTBL(L(Fill15), L(FillTable)) + .int JMPTBL(L(Fill16), L(FillTable)) +#endif diff --git a/libc/arch-x86/silvermont/string/sse2-strlen-slm.S b/libc/arch-x86/silvermont/string/sse2-strlen-slm.S new file mode 100755 index 000000000..27cc0258c --- /dev/null +++ b/libc/arch-x86/silvermont/string/sse2-strlen-slm.S @@ -0,0 +1,328 @@ +/* +Copyright (c) 2014, Intel Corporation +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifndef STRLEN +# define STRLEN strlen +#endif + +#ifndef L +# define L(label) .L##label +#endif + +#ifndef cfi_startproc +# define cfi_startproc .cfi_startproc +#endif + +#ifndef cfi_endproc +# define cfi_endproc .cfi_endproc +#endif + +#ifndef cfi_rel_offset +# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off +#endif + +#ifndef cfi_restore +# define cfi_restore(reg) .cfi_restore reg +#endif + +#ifndef cfi_adjust_cfa_offset +# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off +#endif + +#ifndef ENTRY +# define ENTRY(name) \ + .type name, @function; \ + .globl name; \ + .p2align 4; \ +name: \ + cfi_startproc +#endif + +#ifndef END +# define END(name) \ + cfi_endproc; \ + .size name, .-name +#endif + +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) + + .section .text.sse2,"ax",@progbits +ENTRY (STRLEN) + mov 4(%esp), %edx + mov %edx, %ecx + and $0x3f, %ecx + pxor %xmm0, %xmm0 + cmp $0x30, %ecx + ja L(next) + movdqu (%edx), %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %ecx + test %ecx, %ecx + jnz L(exit_less16) + mov %edx, %eax + and $-16, %eax + jmp L(align16_start) +L(next): + mov %edx, %eax + and $-16, %eax + PUSH (%edi) + pcmpeqb (%eax), %xmm0 + mov $-1, %edi + sub %eax, %ecx + shl %cl, %edi + pmovmskb %xmm0, %ecx + and %edi, %ecx + POP (%edi) + jnz L(exit_unaligned) + pxor %xmm0, %xmm0 +L(align16_start): + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + pcmpeqb 16(%eax), %xmm0 + pmovmskb %xmm0, %ecx + test %ecx, %ecx + jnz L(exit16) + + pcmpeqb 32(%eax), %xmm1 + pmovmskb %xmm1, %ecx + test %ecx, %ecx + jnz L(exit32) + + pcmpeqb 48(%eax), %xmm2 + pmovmskb %xmm2, %ecx + test %ecx, %ecx + jnz L(exit48) + + pcmpeqb 64(%eax), %xmm3 + pmovmskb %xmm3, %ecx + test %ecx, %ecx + jnz L(exit64) + + pcmpeqb 80(%eax), %xmm0 + add $64, %eax + pmovmskb %xmm0, %ecx + test %ecx, %ecx + jnz L(exit16) + + pcmpeqb 32(%eax), %xmm1 + pmovmskb %xmm1, %ecx + test %ecx, %ecx + jnz L(exit32) + + pcmpeqb 48(%eax), %xmm2 + pmovmskb %xmm2, %ecx + test %ecx, %ecx + jnz L(exit48) + + pcmpeqb 64(%eax), %xmm3 + pmovmskb %xmm3, %ecx + test %ecx, %ecx + jnz L(exit64) + + pcmpeqb 80(%eax), %xmm0 + add $64, %eax + pmovmskb %xmm0, %ecx + test %ecx, %ecx + jnz L(exit16) + + pcmpeqb 32(%eax), %xmm1 + pmovmskb %xmm1, %ecx + test %ecx, %ecx + jnz L(exit32) + + pcmpeqb 48(%eax), %xmm2 + pmovmskb %xmm2, %ecx + test %ecx, %ecx + jnz L(exit48) + + pcmpeqb 64(%eax), %xmm3 + pmovmskb %xmm3, %ecx + test %ecx, %ecx + jnz L(exit64) + + pcmpeqb 80(%eax), %xmm0 + add $64, %eax + pmovmskb %xmm0, %ecx + test %ecx, %ecx + jnz L(exit16) + + pcmpeqb 32(%eax), %xmm1 + pmovmskb %xmm1, %ecx + test %ecx, %ecx + jnz L(exit32) + + pcmpeqb 48(%eax), %xmm2 + pmovmskb %xmm2, %ecx + test %ecx, %ecx + jnz L(exit48) + + pcmpeqb 64(%eax), %xmm3 + pmovmskb %xmm3, %ecx + test %ecx, %ecx + jnz L(exit64) + + + test $0x3f, %eax + jz L(align64_loop) + + pcmpeqb 80(%eax), %xmm0 + add $80, %eax + pmovmskb %xmm0, %ecx + test %ecx, %ecx + jnz L(exit) + + test $0x3f, %eax + jz L(align64_loop) + + pcmpeqb 16(%eax), %xmm1 + add $16, %eax + pmovmskb %xmm1, %ecx + test %ecx, %ecx + jnz L(exit) + + test $0x3f, %eax + jz L(align64_loop) + + pcmpeqb 16(%eax), %xmm2 + add $16, %eax + pmovmskb %xmm2, %ecx + test %ecx, %ecx + jnz L(exit) + + test $0x3f, %eax + jz L(align64_loop) + + pcmpeqb 16(%eax), %xmm3 + add $16, %eax + pmovmskb %xmm3, 
%ecx + test %ecx, %ecx + jnz L(exit) + + add $16, %eax + .p2align 4 +L(align64_loop): + movaps (%eax), %xmm4 + pminub 16(%eax), %xmm4 + movaps 32(%eax), %xmm5 + pminub 48(%eax), %xmm5 + add $64, %eax + pminub %xmm4, %xmm5 + pcmpeqb %xmm0, %xmm5 + pmovmskb %xmm5, %ecx + test %ecx, %ecx + jz L(align64_loop) + + + pcmpeqb -64(%eax), %xmm0 + sub $80, %eax + pmovmskb %xmm0, %ecx + test %ecx, %ecx + jnz L(exit16) + + pcmpeqb 32(%eax), %xmm1 + pmovmskb %xmm1, %ecx + test %ecx, %ecx + jnz L(exit32) + + pcmpeqb 48(%eax), %xmm2 + pmovmskb %xmm2, %ecx + test %ecx, %ecx + jnz L(exit48) + + pcmpeqb 64(%eax), %xmm3 + pmovmskb %xmm3, %ecx + sub %edx, %eax + bsf %ecx, %ecx + add %ecx, %eax + add $64, %eax + ret + + .p2align 4 +L(exit): + sub %edx, %eax + bsf %ecx, %ecx + add %ecx, %eax + ret + +L(exit_less16): + bsf %ecx, %eax + ret + + .p2align 4 +L(exit_unaligned): + sub %edx, %eax + bsf %ecx, %ecx + add %ecx, %eax + ret + + .p2align 4 +L(exit16): + sub %edx, %eax + bsf %ecx, %ecx + add %ecx, %eax + add $16, %eax + ret + + .p2align 4 +L(exit32): + sub %edx, %eax + bsf %ecx, %ecx + add %ecx, %eax + add $32, %eax + ret + + .p2align 4 +L(exit48): + sub %edx, %eax + bsf %ecx, %ecx + add %ecx, %eax + add $48, %eax + ret + + .p2align 4 +L(exit64): + sub %edx, %eax + bsf %ecx, %ecx + add %ecx, %eax + add $64, %eax + ret + +END (STRLEN) + diff --git a/libc/arch-x86/silvermont/string/sse2-strncpy-slm.S b/libc/arch-x86/silvermont/string/sse2-strncpy-slm.S new file mode 100755 index 000000000..591419fcf --- /dev/null +++ b/libc/arch-x86/silvermont/string/sse2-strncpy-slm.S @@ -0,0 +1,33 @@ +/* +Copyright (c) 2014, Intel Corporation +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#define USE_AS_STRNCPY +#define STRCPY strncpy +#include "sse2-strcpy-slm.S" diff --git a/libc/arch-x86/silvermont/string/sse4-memcmp-slm.S b/libc/arch-x86/silvermont/string/sse4-memcmp-slm.S new file mode 100755 index 000000000..b30288349 --- /dev/null +++ b/libc/arch-x86/silvermont/string/sse4-memcmp-slm.S @@ -0,0 +1,1277 @@ +/* +Copyright (c) 2014, Intel Corporation +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef L +# define L(label) .L##label +#endif + +#ifndef cfi_startproc +# define cfi_startproc .cfi_startproc +#endif + +#ifndef cfi_endproc +# define cfi_endproc .cfi_endproc +#endif + +#ifndef cfi_rel_offset +# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off +#endif + +#ifndef cfi_restore +# define cfi_restore(reg) .cfi_restore reg +#endif + +#ifndef cfi_adjust_cfa_offset +# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off +#endif + +#ifndef cfi_remember_state +# define cfi_remember_state .cfi_remember_state +#endif + +#ifndef cfi_restore_state +# define cfi_restore_state .cfi_restore_state +#endif + +#ifndef ENTRY +# define ENTRY(name) \ + .type name, @function; \ + .globl name; \ + .p2align 4; \ +name: \ + cfi_startproc +#endif + +#ifndef END +# define END(name) \ + cfi_endproc; \ + .size name, .-name +#endif + +#ifndef MEMCMP +# define MEMCMP memcmp +#endif + +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) + +#define PARMS 4 +#define BLK1 PARMS +#define BLK2 BLK1 + 4 +#define LEN BLK2 + 4 +#define RETURN POP (%ebx); ret; CFI_PUSH (%ebx) + + +#if (defined SHARED || defined __PIC__) +# define JMPTBL(I, B) I - B + +/* Load an entry in a jump table into EBX and branch to it. TABLE is a + jump table with relative offsets. INDEX is a register contains the + index into the jump table. SCALE is the scale of INDEX. */ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ +/* We first load PC into EBX. */ \ + call __x86.get_pc_thunk.bx; \ +/* Get the address of the jump table. */ \ + addl $(TABLE - .), %ebx; \ +/* Get the entry and convert the relative offset to the \ + absolute address. */ \ + addl (%ebx,INDEX,SCALE), %ebx; \ +/* We loaded the jump table and adjuested EDX/ESI. Go. */ \ + jmp *%ebx +#else +# define JMPTBL(I, B) I + +/* Load an entry in a jump table into EBX and branch to it. 
TABLE is a + jump table with relative offsets. INDEX is a register contains the + index into the jump table. SCALE is the scale of INDEX. */ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + jmp *TABLE(,INDEX,SCALE) +#endif + + +/* Warning! + wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elemnts. +*/ + + .section .text.sse4.2,"ax",@progbits +ENTRY (MEMCMP) + movl BLK1(%esp), %eax + movl BLK2(%esp), %edx + movl LEN(%esp), %ecx + +#ifdef USE_AS_WMEMCMP + shl $2, %ecx + test %ecx, %ecx + jz L(return0) +#else + cmp $1, %ecx + jbe L(less1bytes) +#endif + + pxor %xmm0, %xmm0 + cmp $64, %ecx + ja L(64bytesormore) + cmp $8, %ecx + +#ifndef USE_AS_WMEMCMP + PUSH (%ebx) + jb L(less8bytes) +#else + jb L(less8bytes) + PUSH (%ebx) +#endif + + add %ecx, %edx + add %ecx, %eax + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4) + +#ifndef USE_AS_WMEMCMP + .p2align 4 +L(less8bytes): + mov (%eax), %bl + cmpb (%edx), %bl + jne L(nonzero) + + mov 1(%eax), %bl + cmpb 1(%edx), %bl + jne L(nonzero) + + cmp $2, %ecx + jz L(0bytes) + + mov 2(%eax), %bl + cmpb 2(%edx), %bl + jne L(nonzero) + + cmp $3, %ecx + jz L(0bytes) + + mov 3(%eax), %bl + cmpb 3(%edx), %bl + jne L(nonzero) + + cmp $4, %ecx + jz L(0bytes) + + mov 4(%eax), %bl + cmpb 4(%edx), %bl + jne L(nonzero) + + cmp $5, %ecx + jz L(0bytes) + + mov 5(%eax), %bl + cmpb 5(%edx), %bl + jne L(nonzero) + + cmp $6, %ecx + jz L(0bytes) + + mov 6(%eax), %bl + cmpb 6(%edx), %bl + je L(0bytes) + +L(nonzero): + POP (%ebx) + mov $1, %eax + ja L(above) + neg %eax +L(above): + ret + CFI_PUSH (%ebx) +#endif + + .p2align 4 +L(0bytes): + POP (%ebx) + xor %eax, %eax + ret + +#ifdef USE_AS_WMEMCMP + +/* for wmemcmp, case N == 1 */ + + .p2align 4 +L(less8bytes): + mov (%eax), %ecx + cmp (%edx), %ecx + je L(return0) + mov $1, %eax + jg L(find_diff_bigger) + neg %eax + ret + + .p2align 4 +L(find_diff_bigger): + ret + + .p2align 4 +L(return0): + xor %eax, %eax + ret +#endif + +#ifndef USE_AS_WMEMCMP + .p2align 4 +L(less1bytes): + jb L(0bytesend) + movzbl (%eax), %eax + movzbl (%edx), %edx + sub %edx, %eax + ret + + .p2align 4 +L(0bytesend): + xor %eax, %eax + ret +#endif + .p2align 4 +L(64bytesormore): + PUSH (%ebx) + mov %ecx, %ebx + mov $64, %ecx + sub $64, %ebx +L(64bytesormore_loop): + movdqu (%eax), %xmm1 + movdqu (%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(find_16diff) + + movdqu 16(%eax), %xmm1 + movdqu 16(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(find_32diff) + + movdqu 32(%eax), %xmm1 + movdqu 32(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(find_48diff) + + movdqu 48(%eax), %xmm1 + movdqu 48(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(find_64diff) + add %ecx, %eax + add %ecx, %edx + sub %ecx, %ebx + jae L(64bytesormore_loop) + add %ebx, %ecx + add %ecx, %edx + add %ecx, %eax + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4) + +#ifdef USE_AS_WMEMCMP + +/* Label needs only for table_64bytes filling */ +L(unreal_case): +/* no code here */ + +#endif + .p2align 4 +L(find_16diff): + sub $16, %ecx +L(find_32diff): + sub $16, %ecx +L(find_48diff): + sub $16, %ecx +L(find_64diff): + add %ecx, %edx + add %ecx, %eax + +#ifndef USE_AS_WMEMCMP + .p2align 4 +L(16bytes): + mov -16(%eax), %ecx + mov -16(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(12bytes): + mov -12(%eax), %ecx + mov -12(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(8bytes): + mov -8(%eax), %ecx + mov -8(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(4bytes): + mov -4(%eax), 
%ecx + mov -4(%edx), %ebx + cmp %ebx, %ecx + mov $0, %eax + jne L(find_diff) + RETURN +#else + .p2align 4 +L(16bytes): + mov -16(%eax), %ecx + cmp -16(%edx), %ecx + jne L(find_diff) +L(12bytes): + mov -12(%eax), %ecx + cmp -12(%edx), %ecx + jne L(find_diff) +L(8bytes): + mov -8(%eax), %ecx + cmp -8(%edx), %ecx + jne L(find_diff) +L(4bytes): + mov -4(%eax), %ecx + cmp -4(%edx), %ecx + mov $0, %eax + jne L(find_diff) + RETURN +#endif + +#ifndef USE_AS_WMEMCMP + .p2align 4 +L(49bytes): + movdqu -49(%eax), %xmm1 + movdqu -49(%edx), %xmm2 + mov $-49, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(33bytes): + movdqu -33(%eax), %xmm1 + movdqu -33(%edx), %xmm2 + mov $-33, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(17bytes): + mov -17(%eax), %ecx + mov -17(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(13bytes): + mov -13(%eax), %ecx + mov -13(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(9bytes): + mov -9(%eax), %ecx + mov -9(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(5bytes): + mov -5(%eax), %ecx + mov -5(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzbl -1(%eax), %ecx + cmp -1(%edx), %cl + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(50bytes): + mov $-50, %ebx + movdqu -50(%eax), %xmm1 + movdqu -50(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(34bytes): + mov $-34, %ebx + movdqu -34(%eax), %xmm1 + movdqu -34(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(18bytes): + mov -18(%eax), %ecx + mov -18(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(14bytes): + mov -14(%eax), %ecx + mov -14(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(10bytes): + mov -10(%eax), %ecx + mov -10(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(6bytes): + mov -6(%eax), %ecx + mov -6(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(2bytes): + movzwl -2(%eax), %ecx + movzwl -2(%edx), %ebx + cmp %bl, %cl + jne L(end) + cmp %bh, %ch + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(51bytes): + mov $-51, %ebx + movdqu -51(%eax), %xmm1 + movdqu -51(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(35bytes): + mov $-35, %ebx + movdqu -35(%eax), %xmm1 + movdqu -35(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(19bytes): + movl -19(%eax), %ecx + movl -19(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(15bytes): + movl -15(%eax), %ecx + movl -15(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(11bytes): + movl -11(%eax), %ecx + movl -11(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(7bytes): + movl -7(%eax), %ecx + movl -7(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(3bytes): + movzwl -3(%eax), %ecx + movzwl -3(%edx), %ebx + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) +L(1bytes): + movzbl -1(%eax), %eax + cmpb -1(%edx), %al + mov $0, %eax + jne L(end) + RETURN +#endif + .p2align 4 +L(52bytes): + movdqu -52(%eax), %xmm1 + movdqu -52(%edx), %xmm2 + mov $-52, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(36bytes): + movdqu -36(%eax), %xmm1 + movdqu -36(%edx), %xmm2 + mov $-36, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(20bytes): + movdqu -20(%eax), %xmm1 + movdqu -20(%edx), %xmm2 + mov $-20, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -4(%eax), %ecx +#ifndef USE_AS_WMEMCMP + mov -4(%edx), %ebx + cmp %ebx, %ecx +#else + cmp -4(%edx), %ecx +#endif + mov $0, %eax + jne L(find_diff) + RETURN + +#ifndef USE_AS_WMEMCMP 
+ .p2align 4 +L(53bytes): + movdqu -53(%eax), %xmm1 + movdqu -53(%edx), %xmm2 + mov $-53, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(37bytes): + mov $-37, %ebx + movdqu -37(%eax), %xmm1 + movdqu -37(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(21bytes): + mov $-21, %ebx + movdqu -21(%eax), %xmm1 + movdqu -21(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -5(%eax), %ecx + mov -5(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzbl -1(%eax), %ecx + cmp -1(%edx), %cl + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(54bytes): + movdqu -54(%eax), %xmm1 + movdqu -54(%edx), %xmm2 + mov $-54, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(38bytes): + mov $-38, %ebx + movdqu -38(%eax), %xmm1 + movdqu -38(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(22bytes): + mov $-22, %ebx + movdqu -22(%eax), %xmm1 + movdqu -22(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -6(%eax), %ecx + mov -6(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzwl -2(%eax), %ecx + movzwl -2(%edx), %ebx + cmp %bl, %cl + jne L(end) + cmp %bh, %ch + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(55bytes): + movdqu -55(%eax), %xmm1 + movdqu -55(%edx), %xmm2 + mov $-55, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(39bytes): + mov $-39, %ebx + movdqu -39(%eax), %xmm1 + movdqu -39(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(23bytes): + mov $-23, %ebx + movdqu -23(%eax), %xmm1 + movdqu -23(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + movl -7(%eax), %ecx + movl -7(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzwl -3(%eax), %ecx + movzwl -3(%edx), %ebx + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + movzbl -1(%eax), %eax + cmpb -1(%edx), %al + mov $0, %eax + jne L(end) + RETURN +#endif + .p2align 4 +L(56bytes): + movdqu -56(%eax), %xmm1 + movdqu -56(%edx), %xmm2 + mov $-56, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(40bytes): + mov $-40, %ebx + movdqu -40(%eax), %xmm1 + movdqu -40(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(24bytes): + mov $-24, %ebx + movdqu -24(%eax), %xmm1 + movdqu -24(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -8(%eax), %ecx +#ifndef USE_AS_WMEMCMP + mov -8(%edx), %ebx + cmp %ebx, %ecx +#else + cmp -8(%edx), %ecx +#endif + jne L(find_diff) + + mov -4(%eax), %ecx +#ifndef USE_AS_WMEMCMP + mov -4(%edx), %ebx + cmp %ebx, %ecx +#else + cmp -4(%edx), %ecx +#endif + mov $0, %eax + jne L(find_diff) + RETURN + +#ifndef USE_AS_WMEMCMP + .p2align 4 +L(57bytes): + movdqu -57(%eax), %xmm1 + movdqu -57(%edx), %xmm2 + mov $-57, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(41bytes): + mov $-41, %ebx + movdqu -41(%eax), %xmm1 + movdqu -41(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(25bytes): + mov $-25, %ebx + movdqu -25(%eax), %xmm1 + movdqu -25(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -9(%eax), %ecx + mov -9(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + mov -5(%eax), %ecx + mov -5(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzbl -1(%eax), %ecx + cmp -1(%edx), %cl + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(58bytes): + movdqu -58(%eax), %xmm1 + movdqu -58(%edx), %xmm2 + mov $-58, %ebx + pxor %xmm1, %xmm2 
+ ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(42bytes): + mov $-42, %ebx + movdqu -42(%eax), %xmm1 + movdqu -42(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(26bytes): + mov $-26, %ebx + movdqu -26(%eax), %xmm1 + movdqu -26(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -10(%eax), %ecx + mov -10(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov -6(%eax), %ecx + mov -6(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + movzwl -2(%eax), %ecx + movzwl -2(%edx), %ebx + cmp %bl, %cl + jne L(end) + cmp %bh, %ch + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(59bytes): + movdqu -59(%eax), %xmm1 + movdqu -59(%edx), %xmm2 + mov $-59, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(43bytes): + mov $-43, %ebx + movdqu -43(%eax), %xmm1 + movdqu -43(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(27bytes): + mov $-27, %ebx + movdqu -27(%eax), %xmm1 + movdqu -27(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + movl -11(%eax), %ecx + movl -11(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movl -7(%eax), %ecx + movl -7(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzwl -3(%eax), %ecx + movzwl -3(%edx), %ebx + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + movzbl -1(%eax), %eax + cmpb -1(%edx), %al + mov $0, %eax + jne L(end) + RETURN +#endif + .p2align 4 +L(60bytes): + movdqu -60(%eax), %xmm1 + movdqu -60(%edx), %xmm2 + mov $-60, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(44bytes): + mov $-44, %ebx + movdqu -44(%eax), %xmm1 + movdqu -44(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(28bytes): + mov $-28, %ebx + movdqu -28(%eax), %xmm1 + movdqu -28(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -12(%eax), %ecx +#ifndef USE_AS_WMEMCMP + mov -12(%edx), %ebx + cmp %ebx, %ecx +#else + cmp -12(%edx), %ecx +#endif + jne L(find_diff) + + mov -8(%eax), %ecx +#ifndef USE_AS_WMEMCMP + mov -8(%edx), %ebx + cmp %ebx, %ecx +#else + cmp -8(%edx), %ecx +#endif + jne L(find_diff) + + mov -4(%eax), %ecx +#ifndef USE_AS_WMEMCMP + mov -4(%edx), %ebx + cmp %ebx, %ecx +#else + cmp -4(%edx), %ecx +#endif + mov $0, %eax + jne L(find_diff) + RETURN + +#ifndef USE_AS_WMEMCMP + .p2align 4 +L(61bytes): + movdqu -61(%eax), %xmm1 + movdqu -61(%edx), %xmm2 + mov $-61, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(45bytes): + mov $-45, %ebx + movdqu -45(%eax), %xmm1 + movdqu -45(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(29bytes): + mov $-29, %ebx + movdqu -29(%eax), %xmm1 + movdqu -29(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -13(%eax), %ecx + mov -13(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov -9(%eax), %ecx + mov -9(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov -5(%eax), %ecx + mov -5(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzbl -1(%eax), %ecx + cmp -1(%edx), %cl + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(62bytes): + movdqu -62(%eax), %xmm1 + movdqu -62(%edx), %xmm2 + mov $-62, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(46bytes): + mov $-46, %ebx + movdqu -46(%eax), %xmm1 + movdqu -46(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(30bytes): + mov $-30, %ebx + movdqu -30(%eax), %xmm1 + movdqu -30(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc 
L(less16bytes) + mov -14(%eax), %ecx + mov -14(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + mov -10(%eax), %ecx + mov -10(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + mov -6(%eax), %ecx + mov -6(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzwl -2(%eax), %ecx + movzwl -2(%edx), %ebx + cmp %bl, %cl + jne L(end) + cmp %bh, %ch + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(63bytes): + movdqu -63(%eax), %xmm1 + movdqu -63(%edx), %xmm2 + mov $-63, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(47bytes): + mov $-47, %ebx + movdqu -47(%eax), %xmm1 + movdqu -47(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(31bytes): + mov $-31, %ebx + movdqu -31(%eax), %xmm1 + movdqu -31(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + movl -15(%eax), %ecx + movl -15(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movl -11(%eax), %ecx + movl -11(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movl -7(%eax), %ecx + movl -7(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzwl -3(%eax), %ecx + movzwl -3(%edx), %ebx + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + movzbl -1(%eax), %eax + cmpb -1(%edx), %al + mov $0, %eax + jne L(end) + RETURN +#endif + + .p2align 4 +L(64bytes): + movdqu -64(%eax), %xmm1 + movdqu -64(%edx), %xmm2 + mov $-64, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(48bytes): + movdqu -48(%eax), %xmm1 + movdqu -48(%edx), %xmm2 + mov $-48, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(32bytes): + movdqu -32(%eax), %xmm1 + movdqu -32(%edx), %xmm2 + mov $-32, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -16(%eax), %ecx +#ifndef USE_AS_WMEMCMP + mov -16(%edx), %ebx + cmp %ebx, %ecx +#else + cmp -16(%edx), %ecx +#endif + jne L(find_diff) + + mov -12(%eax), %ecx +#ifndef USE_AS_WMEMCMP + mov -12(%edx), %ebx + cmp %ebx, %ecx +#else + cmp -12(%edx), %ecx +#endif + jne L(find_diff) + + mov -8(%eax), %ecx +#ifndef USE_AS_WMEMCMP + mov -8(%edx), %ebx + cmp %ebx, %ecx +#else + cmp -8(%edx), %ecx +#endif + jne L(find_diff) + + mov -4(%eax), %ecx +#ifndef USE_AS_WMEMCMP + mov -4(%edx), %ebx + cmp %ebx, %ecx +#else + cmp -4(%edx), %ecx +#endif + mov $0, %eax + jne L(find_diff) + RETURN + +#ifndef USE_AS_WMEMCMP + .p2align 4 +L(less16bytes): + add %ebx, %eax + add %ebx, %edx + + mov (%eax), %ecx + mov (%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov 4(%eax), %ecx + mov 4(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov 8(%eax), %ecx + mov 8(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov 12(%eax), %ecx + mov 12(%edx), %ebx + cmp %ebx, %ecx + mov $0, %eax + jne L(find_diff) + RETURN +#else + .p2align 4 +L(less16bytes): + add %ebx, %eax + add %ebx, %edx + + mov (%eax), %ecx + cmp (%edx), %ecx + jne L(find_diff) + + mov 4(%eax), %ecx + cmp 4(%edx), %ecx + jne L(find_diff) + + mov 8(%eax), %ecx + cmp 8(%edx), %ecx + jne L(find_diff) + + mov 12(%eax), %ecx + cmp 12(%edx), %ecx + + mov $0, %eax + jne L(find_diff) + RETURN +#endif + + .p2align 4 +L(find_diff): +#ifndef USE_AS_WMEMCMP + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + shr $16,%ecx + shr $16,%ebx + cmp %bl, %cl + jne L(end) + cmp %bx, %cx +L(end): + POP (%ebx) + mov $1, %eax + ja L(bigger) + neg %eax +L(bigger): + ret +#else + POP (%ebx) + mov $1, %eax + jg L(bigger) + neg %eax + ret + + .p2align 4 +L(bigger): + ret +#endif +END (MEMCMP) + + .section .rodata.sse4.2,"a",@progbits + .p2align 2 + .type L(table_64bytes), 
@object +#ifndef USE_AS_WMEMCMP +L(table_64bytes): + .int JMPTBL (L(0bytes), L(table_64bytes)) + .int JMPTBL (L(1bytes), L(table_64bytes)) + .int JMPTBL (L(2bytes), L(table_64bytes)) + .int JMPTBL (L(3bytes), L(table_64bytes)) + .int JMPTBL (L(4bytes), L(table_64bytes)) + .int JMPTBL (L(5bytes), L(table_64bytes)) + .int JMPTBL (L(6bytes), L(table_64bytes)) + .int JMPTBL (L(7bytes), L(table_64bytes)) + .int JMPTBL (L(8bytes), L(table_64bytes)) + .int JMPTBL (L(9bytes), L(table_64bytes)) + .int JMPTBL (L(10bytes), L(table_64bytes)) + .int JMPTBL (L(11bytes), L(table_64bytes)) + .int JMPTBL (L(12bytes), L(table_64bytes)) + .int JMPTBL (L(13bytes), L(table_64bytes)) + .int JMPTBL (L(14bytes), L(table_64bytes)) + .int JMPTBL (L(15bytes), L(table_64bytes)) + .int JMPTBL (L(16bytes), L(table_64bytes)) + .int JMPTBL (L(17bytes), L(table_64bytes)) + .int JMPTBL (L(18bytes), L(table_64bytes)) + .int JMPTBL (L(19bytes), L(table_64bytes)) + .int JMPTBL (L(20bytes), L(table_64bytes)) + .int JMPTBL (L(21bytes), L(table_64bytes)) + .int JMPTBL (L(22bytes), L(table_64bytes)) + .int JMPTBL (L(23bytes), L(table_64bytes)) + .int JMPTBL (L(24bytes), L(table_64bytes)) + .int JMPTBL (L(25bytes), L(table_64bytes)) + .int JMPTBL (L(26bytes), L(table_64bytes)) + .int JMPTBL (L(27bytes), L(table_64bytes)) + .int JMPTBL (L(28bytes), L(table_64bytes)) + .int JMPTBL (L(29bytes), L(table_64bytes)) + .int JMPTBL (L(30bytes), L(table_64bytes)) + .int JMPTBL (L(31bytes), L(table_64bytes)) + .int JMPTBL (L(32bytes), L(table_64bytes)) + .int JMPTBL (L(33bytes), L(table_64bytes)) + .int JMPTBL (L(34bytes), L(table_64bytes)) + .int JMPTBL (L(35bytes), L(table_64bytes)) + .int JMPTBL (L(36bytes), L(table_64bytes)) + .int JMPTBL (L(37bytes), L(table_64bytes)) + .int JMPTBL (L(38bytes), L(table_64bytes)) + .int JMPTBL (L(39bytes), L(table_64bytes)) + .int JMPTBL (L(40bytes), L(table_64bytes)) + .int JMPTBL (L(41bytes), L(table_64bytes)) + .int JMPTBL (L(42bytes), L(table_64bytes)) + .int JMPTBL (L(43bytes), L(table_64bytes)) + .int JMPTBL (L(44bytes), L(table_64bytes)) + .int JMPTBL (L(45bytes), L(table_64bytes)) + .int JMPTBL (L(46bytes), L(table_64bytes)) + .int JMPTBL (L(47bytes), L(table_64bytes)) + .int JMPTBL (L(48bytes), L(table_64bytes)) + .int JMPTBL (L(49bytes), L(table_64bytes)) + .int JMPTBL (L(50bytes), L(table_64bytes)) + .int JMPTBL (L(51bytes), L(table_64bytes)) + .int JMPTBL (L(52bytes), L(table_64bytes)) + .int JMPTBL (L(53bytes), L(table_64bytes)) + .int JMPTBL (L(54bytes), L(table_64bytes)) + .int JMPTBL (L(55bytes), L(table_64bytes)) + .int JMPTBL (L(56bytes), L(table_64bytes)) + .int JMPTBL (L(57bytes), L(table_64bytes)) + .int JMPTBL (L(58bytes), L(table_64bytes)) + .int JMPTBL (L(59bytes), L(table_64bytes)) + .int JMPTBL (L(60bytes), L(table_64bytes)) + .int JMPTBL (L(61bytes), L(table_64bytes)) + .int JMPTBL (L(62bytes), L(table_64bytes)) + .int JMPTBL (L(63bytes), L(table_64bytes)) + .int JMPTBL (L(64bytes), L(table_64bytes)) +#else +L(table_64bytes): + .int JMPTBL (L(0bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(4bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(8bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), 
L(table_64bytes)) + .int JMPTBL (L(12bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(16bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(20bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(24bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(28bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(32bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(36bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(40bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(44bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(48bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(52bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(56bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(60bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(64bytes), L(table_64bytes)) +#endif diff --git a/libc/arch-x86/silvermont/string/sse4-wmemcmp-slm.S b/libc/arch-x86/silvermont/string/sse4-wmemcmp-slm.S new file mode 100755 index 000000000..2c350bb6a --- /dev/null +++ b/libc/arch-x86/silvermont/string/sse4-wmemcmp-slm.S @@ -0,0 +1,33 @@ +/* +Copyright (c) 2014, Intel Corporation +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ + * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#define USE_AS_WMEMCMP +#define MEMCMP wmemcmp +#include "sse4-memcmp-slm.S" diff --git a/libc/arch-x86/x86.mk b/libc/arch-x86/x86.mk index a1d55f0e3..34da0ced1 100644 --- a/libc/arch-x86/x86.mk +++ b/libc/arch-x86/x86.mk @@ -32,60 +32,15 @@ libc_bionic_src_files_x86 += \ arch-x86/bionic/syscall.S \ arch-x86/bionic/vfork.S \ -ifeq ($(ARCH_X86_HAVE_SSSE3),true) -libc_bionic_src_files_x86 += \ - arch-x86/string/ssse3-memcpy-atom.S \ - arch-x86/string/ssse3-memmove-atom.S \ - arch-x86/string/ssse3-bcopy-atom.S \ - arch-x86/string/ssse3-strncat-atom.S \ - arch-x86/string/ssse3-strncpy-atom.S \ - arch-x86/string/ssse3-strlcat-atom.S \ - arch-x86/string/ssse3-strlcpy-atom.S \ - arch-x86/string/ssse3-strcmp-atom.S \ - arch-x86/string/ssse3-strncmp-atom.S \ - arch-x86/string/ssse3-strcat-atom.S \ - arch-x86/string/ssse3-strcpy-atom.S \ - arch-x86/string/ssse3-memcmp-atom.S \ - arch-x86/string/ssse3-wmemcmp-atom.S \ - arch-x86/string/ssse3-memcmp16-atom.S \ - arch-x86/string/ssse3-wcscat-atom.S \ - arch-x86/string/ssse3-wcscpy-atom.S -else -libc_bionic_src_files_x86 += \ - arch-x86/string/memcpy.S \ - arch-x86/string/memmove.S \ - arch-x86/string/bcopy.S \ - arch-x86/string/strcmp.S \ - arch-x86/string/strncmp.S \ - arch-x86/string/strcat.S \ - arch-x86/string/memcmp.S \ - bionic/__memcmp16.cpp \ - upstream-freebsd/lib/libc/string/wcscpy.c \ - upstream-freebsd/lib/libc/string/wcscat.c \ - upstream-freebsd/lib/libc/string/wmemcmp.c \ - upstream-openbsd/lib/libc/string/strcpy.c \ - upstream-openbsd/lib/libc/string/strlcat.c \ - upstream-openbsd/lib/libc/string/strlcpy.c \ - upstream-openbsd/lib/libc/string/strncat.c \ - upstream-openbsd/lib/libc/string/strncpy.c \ - +## ARCH variant specific source files +arch_variant_mk := $(LOCAL_PATH)/arch-x86/$(TARGET_ARCH_VARIANT)/$(TARGET_ARCH_VARIANT).mk +ifeq ($(wildcard $(arch_variant_mk)),) + arch_variant_mk := $(LOCAL_PATH)/arch-x86/generic/generic.mk endif +include $(arch_variant_mk) +libc_common_additional_dependencies += $(arch_variant_mk) -libc_bionic_src_files_x86 += \ - arch-x86/string/sse2-memset-atom.S \ - arch-x86/string/sse2-bzero-atom.S \ - arch-x86/string/sse2-memchr-atom.S \ - arch-x86/string/sse2-memrchr-atom.S \ - arch-x86/string/sse2-strchr-atom.S \ - arch-x86/string/sse2-strrchr-atom.S \ - arch-x86/string/sse2-index-atom.S \ - arch-x86/string/sse2-strlen-atom.S \ - arch-x86/string/sse2-strnlen-atom.S \ - arch-x86/string/sse2-wcschr-atom.S \ - arch-x86/string/sse2-wcsrchr-atom.S \ - arch-x86/string/sse2-wcslen-atom.S \ - arch-x86/string/sse2-wcscmp-atom.S \ - +arch_variant_mk := 
libc_crt_target_cflags_x86 := \ -m32 \ diff --git a/libc/arch-x86_64/x86_64.mk b/libc/arch-x86_64/x86_64.mk index 9bce065b3..f0010f3a7 100644 --- a/libc/arch-x86_64/x86_64.mk +++ b/libc/arch-x86_64/x86_64.mk @@ -19,6 +19,8 @@ libc_common_src_files_x86_64 := \ upstream-freebsd/lib/libc/string/wcsrchr.c \ upstream-freebsd/lib/libc/string/wmemcmp.c \ upstream-openbsd/lib/libc/string/bcopy.c \ + upstream-openbsd/lib/libc/string/stpcpy.c \ + upstream-openbsd/lib/libc/string/stpncpy.c \ upstream-openbsd/lib/libc/string/strcat.c \ upstream-openbsd/lib/libc/string/strcmp.c \ upstream-openbsd/lib/libc/string/strcpy.c \
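
Note on the stpcpy/stpncpy return values: the sse2-strcpy-slm.S code above carries #ifdef USE_AS_STPCPY blocks (for example the `lea (%edi, %edx), %eax` before the exit paths) because stpcpy returns a pointer to the terminating NUL it wrote rather than to the start of the destination, and stpncpy returns dst + strlen(src) when the source fits in n bytes, or dst + n when it does not. A minimal portable sketch of those semantics (reference code only, not the bionic sources; ref_stpcpy and ref_stpncpy are illustrative names):

#include <stddef.h>

/* Reference stpcpy: copy src, including its NUL, into dst and return
   a pointer to the NUL just written. */
char *ref_stpcpy(char *dst, const char *src) {
    while ((*dst = *src) != '\0') {
        dst++;
        src++;
    }
    return dst;
}

/* Reference stpncpy: copy at most n bytes, zero-filling the remainder
   the way L(StrncpyFillTailWithZero) does, and return a pointer to the
   first NUL written, or dst + n if the source did not fit. */
char *ref_stpncpy(char *dst, const char *src, size_t n) {
    size_t i = 0;
    while (i < n && src[i] != '\0') {
        dst[i] = src[i];
        i++;
    }
    char *end = dst + i;
    while (i < n) {
        dst[i] = '\0';
        i++;
    }
    return end;
}

The USE_AS_STPCPY `lea` instructions compute exactly this end pointer: the destination base plus the NUL offset that `bsf` extracts from the pcmpeqb mask.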
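
Note on sse2-strlen-slm.S: the routine finds the terminating NUL sixteen bytes at a time — pcmpeqb compares a block against zero, pmovmskb turns the sixteen per-byte results into a bit mask, and bsf gives the index of the first zero byte — and the hot L(align64_loop) folds four 16-byte blocks with pminub so one compare covers 64 bytes per iteration. A rough intrinsics sketch of that inner loop (it assumes the pointer is already 64-byte aligned, which the real code arranges in its prologue; strlen_sse2_sketch is an illustrative name, not the shipped function):

#include <emmintrin.h>
#include <stddef.h>

size_t strlen_sse2_sketch(const char *s) {
    const __m128i zero = _mm_setzero_si128();
    const char *p = s;                     /* assumed 64-byte aligned */

    for (;;) {
        /* Unsigned byte-min of four blocks is zero somewhere iff one of
           the blocks contains a zero byte (the pminub trick). */
        __m128i a = _mm_load_si128((const __m128i *)(p));
        __m128i b = _mm_load_si128((const __m128i *)(p + 16));
        __m128i c = _mm_load_si128((const __m128i *)(p + 32));
        __m128i d = _mm_load_si128((const __m128i *)(p + 48));
        __m128i folded = _mm_min_epu8(_mm_min_epu8(a, b), _mm_min_epu8(c, d));

        if (_mm_movemask_epi8(_mm_cmpeq_epi8(folded, zero)) != 0) {
            /* The NUL is in this 64-byte group: locate it per block. */
            for (int off = 0; off < 64; off += 16) {
                __m128i v = _mm_load_si128((const __m128i *)(p + off));
                int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(v, zero));
                if (mask != 0)
                    return (size_t)(p - s) + (size_t)off + (size_t)__builtin_ctz(mask);
            }
        }
        p += 64;
    }
}

Because p stays 64-byte aligned, the four loads never cross a page boundary, so reading past the NUL inside the final group is safe; that is the same reason the assembly only enters L(align64_loop) once `test $0x3f, %eax` shows 64-byte alignment.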
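
Note on the signed/unsigned warning in sse4-memcmp-slm.S: memcmp must order bytes as unsigned char, while wmemcmp orders whole wchar_t elements, which are signed 32-bit values on this target (hence the `shl $2, %ecx` scaling of the length). That is why the shared comparison tail branches with `ja` in the memcmp build but with `jg` in the USE_AS_WMEMCMP build. A scalar sketch of the two contracts (memcmp_scalar and wmemcmp_scalar are illustrative names, not the optimized routines):

#include <stddef.h>
#include <wchar.h>

/* memcmp orders bytes as unsigned char, so 0x80 compares greater than
   0x7f: an unsigned branch ('ja') is required. */
int memcmp_scalar(const void *a, const void *b, size_t n) {
    const unsigned char *p = a;
    const unsigned char *q = b;
    for (size_t i = 0; i < n; i++) {
        if (p[i] != q[i])
            return p[i] < q[i] ? -1 : 1;
    }
    return 0;
}

/* wmemcmp orders wchar_t values; with a signed 32-bit wchar_t the
   comparison is signed ('jg'). */
int wmemcmp_scalar(const wchar_t *a, const wchar_t *b, size_t n) {
    for (size_t i = 0; i < n; i++) {
        if (a[i] != b[i])
            return a[i] < b[i] ? -1 : 1;
    }
    return 0;
}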