Merge "Add 32-bit Silvermont-optimized string/memory functions."
This commit is contained in:
commit
c1d26965ae
@ -448,8 +448,6 @@ libc_upstream_openbsd_src_files := \
|
|||||||
upstream-openbsd/lib/libc/stdlib/strtoull.c \
|
upstream-openbsd/lib/libc/stdlib/strtoull.c \
|
||||||
upstream-openbsd/lib/libc/stdlib/strtoumax.c \
|
upstream-openbsd/lib/libc/stdlib/strtoumax.c \
|
||||||
upstream-openbsd/lib/libc/stdlib/system.c \
|
upstream-openbsd/lib/libc/stdlib/system.c \
|
||||||
upstream-openbsd/lib/libc/string/stpcpy.c \
|
|
||||||
upstream-openbsd/lib/libc/string/stpncpy.c \
|
|
||||||
upstream-openbsd/lib/libc/string/strcasecmp.c \
|
upstream-openbsd/lib/libc/string/strcasecmp.c \
|
||||||
upstream-openbsd/lib/libc/string/strcspn.c \
|
upstream-openbsd/lib/libc/string/strcspn.c \
|
||||||
upstream-openbsd/lib/libc/string/strdup.c \
|
upstream-openbsd/lib/libc/string/strdup.c \
|
||||||
|
@ -26,6 +26,8 @@ libc_common_src_files_arm += \
|
|||||||
upstream-freebsd/lib/libc/string/wcsrchr.c \
|
upstream-freebsd/lib/libc/string/wcsrchr.c \
|
||||||
upstream-freebsd/lib/libc/string/wmemcmp.c \
|
upstream-freebsd/lib/libc/string/wmemcmp.c \
|
||||||
upstream-openbsd/lib/libc/string/bcopy.c \
|
upstream-openbsd/lib/libc/string/bcopy.c \
|
||||||
|
upstream-openbsd/lib/libc/string/stpcpy.c \
|
||||||
|
upstream-openbsd/lib/libc/string/stpncpy.c \
|
||||||
upstream-openbsd/lib/libc/string/strlcat.c \
|
upstream-openbsd/lib/libc/string/strlcat.c \
|
||||||
upstream-openbsd/lib/libc/string/strlcpy.c \
|
upstream-openbsd/lib/libc/string/strlcpy.c \
|
||||||
upstream-openbsd/lib/libc/string/strncat.c \
|
upstream-openbsd/lib/libc/string/strncat.c \
|
||||||
|
@ -15,6 +15,8 @@ libc_common_src_files_arm64 := \
|
|||||||
upstream-freebsd/lib/libc/string/wcsrchr.c \
|
upstream-freebsd/lib/libc/string/wcsrchr.c \
|
||||||
upstream-freebsd/lib/libc/string/wmemcmp.c \
|
upstream-freebsd/lib/libc/string/wmemcmp.c \
|
||||||
upstream-openbsd/lib/libc/string/bcopy.c \
|
upstream-openbsd/lib/libc/string/bcopy.c \
|
||||||
|
upstream-openbsd/lib/libc/string/stpcpy.c \
|
||||||
|
upstream-openbsd/lib/libc/string/stpncpy.c \
|
||||||
upstream-openbsd/lib/libc/string/strcat.c \
|
upstream-openbsd/lib/libc/string/strcat.c \
|
||||||
upstream-openbsd/lib/libc/string/strcpy.c \
|
upstream-openbsd/lib/libc/string/strcpy.c \
|
||||||
upstream-openbsd/lib/libc/string/strlcat.c \
|
upstream-openbsd/lib/libc/string/strlcat.c \
|
||||||
|
@ -27,6 +27,8 @@ libc_common_src_files_mips += \
|
|||||||
upstream-freebsd/lib/libc/string/wcsrchr.c \
|
upstream-freebsd/lib/libc/string/wcsrchr.c \
|
||||||
upstream-freebsd/lib/libc/string/wmemcmp.c \
|
upstream-freebsd/lib/libc/string/wmemcmp.c \
|
||||||
upstream-openbsd/lib/libc/string/bcopy.c \
|
upstream-openbsd/lib/libc/string/bcopy.c \
|
||||||
|
upstream-openbsd/lib/libc/string/stpcpy.c \
|
||||||
|
upstream-openbsd/lib/libc/string/stpncpy.c \
|
||||||
upstream-openbsd/lib/libc/string/strcat.c \
|
upstream-openbsd/lib/libc/string/strcat.c \
|
||||||
upstream-openbsd/lib/libc/string/strcmp.c \
|
upstream-openbsd/lib/libc/string/strcmp.c \
|
||||||
upstream-openbsd/lib/libc/string/strcpy.c \
|
upstream-openbsd/lib/libc/string/strcpy.c \
|
||||||
|
@ -17,6 +17,8 @@ libc_common_src_files_mips64 := \
|
|||||||
upstream-freebsd/lib/libc/string/wcsrchr.c \
|
upstream-freebsd/lib/libc/string/wcsrchr.c \
|
||||||
upstream-freebsd/lib/libc/string/wmemcmp.c \
|
upstream-freebsd/lib/libc/string/wmemcmp.c \
|
||||||
upstream-openbsd/lib/libc/string/bcopy.c \
|
upstream-openbsd/lib/libc/string/bcopy.c \
|
||||||
|
upstream-openbsd/lib/libc/string/stpcpy.c \
|
||||||
|
upstream-openbsd/lib/libc/string/stpncpy.c \
|
||||||
upstream-openbsd/lib/libc/string/strcat.c \
|
upstream-openbsd/lib/libc/string/strcat.c \
|
||||||
upstream-openbsd/lib/libc/string/strcmp.c \
|
upstream-openbsd/lib/libc/string/strcmp.c \
|
||||||
upstream-openbsd/lib/libc/string/strcpy.c \
|
upstream-openbsd/lib/libc/string/strcpy.c \
|
||||||
|
34
libc/arch-x86/atom/atom.mk
Normal file
34
libc/arch-x86/atom/atom.mk
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
libc_bionic_src_files_x86 += \
|
||||||
|
arch-x86/atom/string/sse2-bzero-atom.S \
|
||||||
|
arch-x86/atom/string/sse2-index-atom.S \
|
||||||
|
arch-x86/atom/string/sse2-memchr-atom.S \
|
||||||
|
arch-x86/atom/string/sse2-memrchr-atom.S \
|
||||||
|
arch-x86/atom/string/sse2-memset-atom.S \
|
||||||
|
arch-x86/atom/string/sse2-strchr-atom.S \
|
||||||
|
arch-x86/atom/string/sse2-strlen-atom.S \
|
||||||
|
arch-x86/atom/string/sse2-strnlen-atom.S \
|
||||||
|
arch-x86/atom/string/sse2-strrchr-atom.S \
|
||||||
|
arch-x86/atom/string/sse2-wcschr-atom.S \
|
||||||
|
arch-x86/atom/string/sse2-wcsrchr-atom.S \
|
||||||
|
arch-x86/atom/string/sse2-wcslen-atom.S \
|
||||||
|
arch-x86/atom/string/sse2-wcscmp-atom.S \
|
||||||
|
arch-x86/atom/string/ssse3-bcopy-atom.S \
|
||||||
|
arch-x86/atom/string/ssse3-memcmp-atom.S \
|
||||||
|
arch-x86/atom/string/ssse3-memcmp16-atom.S \
|
||||||
|
arch-x86/atom/string/ssse3-memcpy-atom.S \
|
||||||
|
arch-x86/atom/string/ssse3-memmove-atom.S \
|
||||||
|
arch-x86/atom/string/ssse3-strcat-atom.S \
|
||||||
|
arch-x86/atom/string/ssse3-strcmp-atom.S \
|
||||||
|
arch-x86/atom/string/ssse3-strcpy-atom.S \
|
||||||
|
arch-x86/atom/string/ssse3-strlcat-atom.S \
|
||||||
|
arch-x86/atom/string/ssse3-strlcpy-atom.S \
|
||||||
|
arch-x86/atom/string/ssse3-strncat-atom.S \
|
||||||
|
arch-x86/atom/string/ssse3-strncmp-atom.S \
|
||||||
|
arch-x86/atom/string/ssse3-strncpy-atom.S \
|
||||||
|
arch-x86/atom/string/ssse3-wcscat-atom.S \
|
||||||
|
arch-x86/atom/string/ssse3-wcscpy-atom.S \
|
||||||
|
arch-x86/atom/string/ssse3-wmemcmp-atom.S
|
||||||
|
|
||||||
|
libc_bionic_src_files_x86 += \
|
||||||
|
arch-x86/silvermont/string/sse2-stpcpy-slm.S \
|
||||||
|
arch-x86/silvermont/string/sse2-stpncpy-slm.S
|
36
libc/arch-x86/atom/string/cache.h
Normal file
36
libc/arch-x86/atom/string/cache.h
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
/*
|
||||||
|
Copyright (c) 2010, Intel Corporation
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of Intel Corporation nor the names of its contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||||
|
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||||
|
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||||
|
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||||
|
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||||
|
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Values are optimized for Atom */
|
||||||
|
#define SHARED_CACHE_SIZE (512*1024) /* Atom L2 Cache */
|
||||||
|
#define DATA_CACHE_SIZE (24*1024) /* Atom L1 Data Cache */
|
||||||
|
|
||||||
|
#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2)
|
||||||
|
#define DATA_CACHE_SIZE_HALF (DATA_CACHE_SIZE / 2)
|
@ -29,7 +29,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#include "cache.h"
|
#include "cache.h"
|
||||||
#undef __i686
|
|
||||||
|
|
||||||
#ifndef L
|
#ifndef L
|
||||||
# define L(label) .L##label
|
# define L(label) .L##label
|
||||||
@ -107,7 +106,7 @@ name: \
|
|||||||
jump table with relative offsets. */
|
jump table with relative offsets. */
|
||||||
# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \
|
# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \
|
||||||
/* We first load PC into EBX. */ \
|
/* We first load PC into EBX. */ \
|
||||||
call __i686.get_pc_thunk.bx; \
|
call __x86.get_pc_thunk.bx; \
|
||||||
/* Get the address of the jump table. */ \
|
/* Get the address of the jump table. */ \
|
||||||
add $(TABLE - .), %ebx; \
|
add $(TABLE - .), %ebx; \
|
||||||
/* Get the entry and convert the relative offset to the \
|
/* Get the entry and convert the relative offset to the \
|
||||||
@ -117,12 +116,12 @@ name: \
|
|||||||
/* We loaded the jump table and adjusted EDX. Go. */ \
|
/* We loaded the jump table and adjusted EDX. Go. */ \
|
||||||
jmp *%ebx
|
jmp *%ebx
|
||||||
|
|
||||||
.section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
|
.section .gnu.linkonce.t.__x86.get_pc_thunk.bx,"ax",@progbits
|
||||||
.globl __i686.get_pc_thunk.bx
|
.globl __x86.get_pc_thunk.bx
|
||||||
.hidden __i686.get_pc_thunk.bx
|
.hidden __x86.get_pc_thunk.bx
|
||||||
ALIGN (4)
|
ALIGN (4)
|
||||||
.type __i686.get_pc_thunk.bx,@function
|
.type __x86.get_pc_thunk.bx,@function
|
||||||
__i686.get_pc_thunk.bx:
|
__x86.get_pc_thunk.bx:
|
||||||
movl (%esp), %ebx
|
movl (%esp), %ebx
|
||||||
ret
|
ret
|
||||||
#else
|
#else
|
||||||
@ -321,7 +320,7 @@ L(128bytesormore):
|
|||||||
mov $SHARED_CACHE_SIZE, %ebx
|
mov $SHARED_CACHE_SIZE, %ebx
|
||||||
#else
|
#else
|
||||||
# if (defined SHARED || defined __PIC__)
|
# if (defined SHARED || defined __PIC__)
|
||||||
call __i686.get_pc_thunk.bx
|
call __x86.get_pc_thunk.bx
|
||||||
add $_GLOBAL_OFFSET_TABLE_, %ebx
|
add $_GLOBAL_OFFSET_TABLE_, %ebx
|
||||||
mov __x86_shared_cache_size@GOTOFF(%ebx), %ebx
|
mov __x86_shared_cache_size@GOTOFF(%ebx), %ebx
|
||||||
# else
|
# else
|
||||||
@ -340,7 +339,7 @@ L(128bytesormore):
|
|||||||
#else
|
#else
|
||||||
# if (defined SHARED || defined __PIC__)
|
# if (defined SHARED || defined __PIC__)
|
||||||
# define RESTORE_EBX_STATE
|
# define RESTORE_EBX_STATE
|
||||||
call __i686.get_pc_thunk.bx
|
call __x86.get_pc_thunk.bx
|
||||||
add $_GLOBAL_OFFSET_TABLE_, %ebx
|
add $_GLOBAL_OFFSET_TABLE_, %ebx
|
||||||
cmp __x86_data_cache_size@GOTOFF(%ebx), %ecx
|
cmp __x86_data_cache_size@GOTOFF(%ebx), %ecx
|
||||||
# else
|
# else
|
@ -29,7 +29,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#include "cache.h"
|
#include "cache.h"
|
||||||
#undef __i686
|
|
||||||
|
|
||||||
#ifndef MEMCPY
|
#ifndef MEMCPY
|
||||||
# define MEMCPY memcpy
|
# define MEMCPY memcpy
|
||||||
@ -101,9 +100,8 @@ name: \
|
|||||||
# define RETURN_END POP (%ebx); ret
|
# define RETURN_END POP (%ebx); ret
|
||||||
# define RETURN RETURN_END; CFI_PUSH (%ebx)
|
# define RETURN RETURN_END; CFI_PUSH (%ebx)
|
||||||
# define JMPTBL(I, B) I - B
|
# define JMPTBL(I, B) I - B
|
||||||
# undef __i686
|
|
||||||
|
|
||||||
# define SETUP_PIC_REG(x) call __i686.get_pc_thunk.x
|
# define SETUP_PIC_REG(x) call __x86.get_pc_thunk.x
|
||||||
|
|
||||||
/* Load an entry in a jump table into EBX and branch to it. TABLE is a
|
/* Load an entry in a jump table into EBX and branch to it. TABLE is a
|
||||||
jump table with relative offsets. INDEX is a register that contains the
|
jump table with relative offsets. INDEX is a register that contains the
|
55
libc/arch-x86/generic/generic.mk
Normal file
55
libc/arch-x86/generic/generic.mk
Normal file
@ -0,0 +1,55 @@
|
|||||||
|
libc_bionic_src_files_x86 += \
|
||||||
|
arch-x86/atom/string/sse2-index-atom.S \
|
||||||
|
arch-x86/atom/string/sse2-memchr-atom.S \
|
||||||
|
arch-x86/atom/string/sse2-memrchr-atom.S \
|
||||||
|
arch-x86/atom/string/sse2-strchr-atom.S \
|
||||||
|
arch-x86/atom/string/sse2-strnlen-atom.S \
|
||||||
|
arch-x86/atom/string/sse2-strrchr-atom.S \
|
||||||
|
arch-x86/atom/string/sse2-wcschr-atom.S \
|
||||||
|
arch-x86/atom/string/sse2-wcsrchr-atom.S \
|
||||||
|
arch-x86/atom/string/sse2-wcslen-atom.S \
|
||||||
|
arch-x86/atom/string/sse2-wcscmp-atom.S \
|
||||||
|
arch-x86/silvermont/string/sse2-bcopy-slm.S \
|
||||||
|
arch-x86/silvermont/string/sse2-bzero-slm.S \
|
||||||
|
arch-x86/silvermont/string/sse2-memcpy-slm.S \
|
||||||
|
arch-x86/silvermont/string/sse2-memmove-slm.S \
|
||||||
|
arch-x86/silvermont/string/sse2-memset-slm.S \
|
||||||
|
arch-x86/silvermont/string/sse2-stpcpy-slm.S \
|
||||||
|
arch-x86/silvermont/string/sse2-stpncpy-slm.S \
|
||||||
|
arch-x86/silvermont/string/sse2-strcpy-slm.S \
|
||||||
|
arch-x86/silvermont/string/sse2-strlen-slm.S \
|
||||||
|
arch-x86/silvermont/string/sse2-strncpy-slm.S
|
||||||
|
|
||||||
|
ifeq ($(ARCH_X86_HAVE_SSSE3),true)
|
||||||
|
libc_bionic_src_files_x86 += \
|
||||||
|
arch-x86/atom/string/ssse3-strncat-atom.S \
|
||||||
|
arch-x86/atom/string/ssse3-strlcat-atom.S \
|
||||||
|
arch-x86/atom/string/ssse3-strlcpy-atom.S \
|
||||||
|
arch-x86/atom/string/ssse3-strcmp-atom.S \
|
||||||
|
arch-x86/atom/string/ssse3-strncmp-atom.S \
|
||||||
|
arch-x86/atom/string/ssse3-strcat-atom.S \
|
||||||
|
arch-x86/atom/string/ssse3-memcmp16-atom.S \
|
||||||
|
arch-x86/atom/string/ssse3-wcscat-atom.S \
|
||||||
|
arch-x86/atom/string/ssse3-wcscpy-atom.S
|
||||||
|
else
|
||||||
|
libc_bionic_src_files_x86 += \
|
||||||
|
arch-x86/generic/string/strcmp.S \
|
||||||
|
arch-x86/generic/string/strncmp.S \
|
||||||
|
arch-x86/generic/string/strcat.S \
|
||||||
|
bionic/__memcmp16.cpp \
|
||||||
|
upstream-freebsd/lib/libc/string/wcscpy.c \
|
||||||
|
upstream-freebsd/lib/libc/string/wcscat.c \
|
||||||
|
upstream-openbsd/lib/libc/string/strlcat.c \
|
||||||
|
upstream-openbsd/lib/libc/string/strlcpy.c \
|
||||||
|
upstream-openbsd/lib/libc/string/strncat.c
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(ARCH_X86_HAVE_SSE4),true)
|
||||||
|
libc_bionic_src_files_x86 += \
|
||||||
|
arch-x86/silvermont/string/sse4-memcmp-slm.S \
|
||||||
|
arch-x86/silvermont/string/sse4-wmemcmp-slm.S
|
||||||
|
else
|
||||||
|
libc_bionic_src_files_x86 += \
|
||||||
|
arch-x86/generic/string/memcmp.S \
|
||||||
|
upstream-freebsd/lib/libc/string/wmemcmp.c
|
||||||
|
endif
|
34
libc/arch-x86/silvermont/silvermont.mk
Normal file
34
libc/arch-x86/silvermont/silvermont.mk
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
libc_bionic_src_files_x86 += \
|
||||||
|
arch-x86/silvermont/string/sse2-bcopy-slm.S \
|
||||||
|
arch-x86/silvermont/string/sse2-bzero-slm.S \
|
||||||
|
arch-x86/silvermont/string/sse2-memcpy-slm.S \
|
||||||
|
arch-x86/silvermont/string/sse2-memmove-slm.S \
|
||||||
|
arch-x86/silvermont/string/sse2-memset-slm.S \
|
||||||
|
arch-x86/silvermont/string/sse2-stpcpy-slm.S \
|
||||||
|
arch-x86/silvermont/string/sse2-stpncpy-slm.S \
|
||||||
|
arch-x86/silvermont/string/sse2-strcpy-slm.S \
|
||||||
|
arch-x86/silvermont/string/sse2-strlen-slm.S \
|
||||||
|
arch-x86/silvermont/string/sse2-strncpy-slm.S \
|
||||||
|
arch-x86/silvermont/string/sse4-memcmp-slm.S \
|
||||||
|
arch-x86/silvermont/string/sse4-wmemcmp-slm.S
|
||||||
|
|
||||||
|
libc_bionic_src_files_x86 += \
|
||||||
|
arch-x86/atom/string/sse2-memchr-atom.S \
|
||||||
|
arch-x86/atom/string/sse2-memrchr-atom.S \
|
||||||
|
arch-x86/atom/string/sse2-strchr-atom.S \
|
||||||
|
arch-x86/atom/string/sse2-strrchr-atom.S \
|
||||||
|
arch-x86/atom/string/sse2-index-atom.S \
|
||||||
|
arch-x86/atom/string/sse2-strnlen-atom.S \
|
||||||
|
arch-x86/atom/string/sse2-wcschr-atom.S \
|
||||||
|
arch-x86/atom/string/sse2-wcsrchr-atom.S \
|
||||||
|
arch-x86/atom/string/sse2-wcslen-atom.S \
|
||||||
|
arch-x86/atom/string/sse2-wcscmp-atom.S \
|
||||||
|
arch-x86/atom/string/ssse3-strncat-atom.S \
|
||||||
|
arch-x86/atom/string/ssse3-strlcat-atom.S \
|
||||||
|
arch-x86/atom/string/ssse3-strlcpy-atom.S \
|
||||||
|
arch-x86/atom/string/ssse3-strcmp-atom.S \
|
||||||
|
arch-x86/atom/string/ssse3-strncmp-atom.S \
|
||||||
|
arch-x86/atom/string/ssse3-strcat-atom.S \
|
||||||
|
arch-x86/atom/string/ssse3-memcmp16-atom.S \
|
||||||
|
arch-x86/atom/string/ssse3-wcscat-atom.S \
|
||||||
|
arch-x86/atom/string/ssse3-wcscpy-atom.S
|
@ -28,15 +28,9 @@ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|||||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#if defined(__slm__)
|
|
||||||
/* Values are optimized for Silvermont */
|
/* Values are optimized for Silvermont */
|
||||||
#define SHARED_CACHE_SIZE (1024*1024) /* Silvermont L2 Cache */
|
#define SHARED_CACHE_SIZE (1024*1024) /* Silvermont L2 Cache */
|
||||||
#define DATA_CACHE_SIZE (24*1024) /* Silvermont L1 Data Cache */
|
#define DATA_CACHE_SIZE (24*1024) /* Silvermont L1 Data Cache */
|
||||||
#else
|
|
||||||
/* Values are optimized for Atom */
|
|
||||||
#define SHARED_CACHE_SIZE (512*1024) /* Atom L2 Cache */
|
|
||||||
#define DATA_CACHE_SIZE (24*1024) /* Atom L1 Data Cache */
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2)
|
#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2)
|
||||||
#define DATA_CACHE_SIZE_HALF (DATA_CACHE_SIZE / 2)
|
#define DATA_CACHE_SIZE_HALF (DATA_CACHE_SIZE / 2)
|
34
libc/arch-x86/silvermont/string/sse2-bcopy-slm.S
Normal file
34
libc/arch-x86/silvermont/string/sse2-bcopy-slm.S
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
/*
|
||||||
|
Copyright (c) 2014, Intel Corporation
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of Intel Corporation nor the names of its contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||||
|
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||||
|
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||||
|
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||||
|
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||||
|
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
#define MEMMOVE bcopy
|
||||||
|
#define USE_AS_BCOPY
|
||||||
|
#include "sse2-memmove-slm.S"
|
33
libc/arch-x86/silvermont/string/sse2-bzero-slm.S
Normal file
33
libc/arch-x86/silvermont/string/sse2-bzero-slm.S
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
/*
|
||||||
|
Copyright (c) 2014, Intel Corporation
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of Intel Corporation nor the names of its contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||||
|
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||||
|
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||||
|
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||||
|
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||||
|
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define USE_AS_BZERO
|
||||||
|
#define MEMSET bzero
|
||||||
|
#include "sse2-memset-slm.S"
|
308
libc/arch-x86/silvermont/string/sse2-memcpy-slm.S
Normal file
308
libc/arch-x86/silvermont/string/sse2-memcpy-slm.S
Normal file
@ -0,0 +1,308 @@
|
|||||||
|
/*
|
||||||
|
Copyright (c) 2014, Intel Corporation
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of Intel Corporation nor the names of its contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||||
|
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||||
|
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||||
|
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||||
|
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||||
|
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "cache.h"
|
||||||
|
|
||||||
|
#ifndef MEMCPY
|
||||||
|
# define MEMCPY memcpy
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef L
|
||||||
|
# define L(label) .L##label
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef cfi_startproc
|
||||||
|
# define cfi_startproc .cfi_startproc
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef cfi_endproc
|
||||||
|
# define cfi_endproc .cfi_endproc
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef cfi_rel_offset
|
||||||
|
# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef cfi_restore
|
||||||
|
# define cfi_restore(reg) .cfi_restore reg
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef cfi_adjust_cfa_offset
|
||||||
|
# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef ENTRY
|
||||||
|
# define ENTRY(name) \
|
||||||
|
.type name, @function; \
|
||||||
|
.globl name; \
|
||||||
|
.p2align 4; \
|
||||||
|
name: \
|
||||||
|
cfi_startproc
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef END
|
||||||
|
# define END(name) \
|
||||||
|
cfi_endproc; \
|
||||||
|
.size name, .-name
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define DEST PARMS
|
||||||
|
#define SRC DEST+4
|
||||||
|
#define LEN SRC+4
|
||||||
|
|
||||||
|
#define CFI_PUSH(REG) \
|
||||||
|
cfi_adjust_cfa_offset (4); \
|
||||||
|
cfi_rel_offset (REG, 0)
|
||||||
|
|
||||||
|
#define CFI_POP(REG) \
|
||||||
|
cfi_adjust_cfa_offset (-4); \
|
||||||
|
cfi_restore (REG)
|
||||||
|
|
||||||
|
#define PUSH(REG) pushl REG; CFI_PUSH (REG)
|
||||||
|
#define POP(REG) popl REG; CFI_POP (REG)
|
||||||
|
|
||||||
|
#define PARMS 8 /* Preserve EBX. */
|
||||||
|
#define ENTRANCE PUSH (%ebx);
|
||||||
|
#define RETURN_END POP (%ebx); ret
|
||||||
|
#define RETURN RETURN_END; CFI_PUSH (%ebx)
|
||||||
|
|
||||||
|
.section .text.sse2,"ax",@progbits
|
||||||
|
ENTRY (MEMCPY)
|
||||||
|
ENTRANCE
|
||||||
|
movl LEN(%esp), %ecx
|
||||||
|
movl SRC(%esp), %eax
|
||||||
|
movl DEST(%esp), %edx
|
||||||
|
|
||||||
|
cmp %eax, %edx
|
||||||
|
je L(return)
|
||||||
|
|
||||||
|
cmp $16, %ecx
|
||||||
|
jbe L(len_0_16_bytes)
|
||||||
|
|
||||||
|
cmp $SHARED_CACHE_SIZE_HALF, %ecx
|
||||||
|
jae L(large_page)
|
||||||
|
|
||||||
|
movdqu (%eax), %xmm0
|
||||||
|
movdqu -16(%eax, %ecx), %xmm1
|
||||||
|
cmpl $32, %ecx
|
||||||
|
movdqu %xmm0, (%edx)
|
||||||
|
movdqu %xmm1, -16(%edx, %ecx)
|
||||||
|
jbe L(return)
|
||||||
|
|
||||||
|
movdqu 16(%eax), %xmm0
|
||||||
|
movdqu -32(%eax, %ecx), %xmm1
|
||||||
|
cmpl $64, %ecx
|
||||||
|
movdqu %xmm0, 16(%edx)
|
||||||
|
movdqu %xmm1, -32(%edx, %ecx)
|
||||||
|
jbe L(return)
|
||||||
|
|
||||||
|
movdqu 32(%eax), %xmm0
|
||||||
|
movdqu 48(%eax), %xmm1
|
||||||
|
movdqu -48(%eax, %ecx), %xmm2
|
||||||
|
movdqu -64(%eax, %ecx), %xmm3
|
||||||
|
cmpl $128, %ecx
|
||||||
|
movdqu %xmm0, 32(%edx)
|
||||||
|
movdqu %xmm1, 48(%edx)
|
||||||
|
movdqu %xmm2, -48(%edx, %ecx)
|
||||||
|
movdqu %xmm3, -64(%edx, %ecx)
|
||||||
|
jbe L(return)
|
||||||
|
|
||||||
|
/* Now the main loop: we align the address of the destination. */
|
||||||
|
leal 64(%edx), %ebx
|
||||||
|
andl $-64, %ebx
|
||||||
|
|
||||||
|
addl %edx, %ecx
|
||||||
|
andl $-64, %ecx
|
||||||
|
|
||||||
|
subl %edx, %eax
|
||||||
|
|
||||||
|
/* We should stop two iterations (128 bytes) before the termination,
|
||||||
|
because the loop prefetches 128 bytes ahead (avoids misprefetching). */
|
||||||
|
subl $64, %ecx
|
||||||
|
cmpl %ebx, %ecx
|
||||||
|
je L(main_loop_just_one_iteration)
|
||||||
|
|
||||||
|
subl $64, %ecx
|
||||||
|
cmpl %ebx, %ecx
|
||||||
|
je L(main_loop_last_two_iterations)
|
||||||
|
|
||||||
|
|
||||||
|
.p2align 4
|
||||||
|
L(main_loop_cache):
|
||||||
|
|
||||||
|
prefetcht0 128(%ebx, %eax)
|
||||||
|
|
||||||
|
movdqu (%ebx, %eax), %xmm0
|
||||||
|
movdqu 16(%ebx, %eax), %xmm1
|
||||||
|
movdqu 32(%ebx, %eax), %xmm2
|
||||||
|
movdqu 48(%ebx, %eax), %xmm3
|
||||||
|
movdqa %xmm0, (%ebx)
|
||||||
|
movdqa %xmm1, 16(%ebx)
|
||||||
|
movdqa %xmm2, 32(%ebx)
|
||||||
|
movdqa %xmm3, 48(%ebx)
|
||||||
|
lea 64(%ebx), %ebx
|
||||||
|
cmpl %ebx, %ecx
|
||||||
|
jne L(main_loop_cache)
|
||||||
|
|
||||||
|
L(main_loop_last_two_iterations):
|
||||||
|
movdqu (%ebx, %eax), %xmm0
|
||||||
|
movdqu 16(%ebx, %eax), %xmm1
|
||||||
|
movdqu 32(%ebx, %eax), %xmm2
|
||||||
|
movdqu 48(%ebx, %eax), %xmm3
|
||||||
|
movdqu 64(%ebx, %eax), %xmm4
|
||||||
|
movdqu 80(%ebx, %eax), %xmm5
|
||||||
|
movdqu 96(%ebx, %eax), %xmm6
|
||||||
|
movdqu 112(%ebx, %eax), %xmm7
|
||||||
|
movdqa %xmm0, (%ebx)
|
||||||
|
movdqa %xmm1, 16(%ebx)
|
||||||
|
movdqa %xmm2, 32(%ebx)
|
||||||
|
movdqa %xmm3, 48(%ebx)
|
||||||
|
movdqa %xmm4, 64(%ebx)
|
||||||
|
movdqa %xmm5, 80(%ebx)
|
||||||
|
movdqa %xmm6, 96(%ebx)
|
||||||
|
movdqa %xmm7, 112(%ebx)
|
||||||
|
jmp L(return)
|
||||||
|
|
||||||
|
L(main_loop_just_one_iteration):
|
||||||
|
movdqu (%ebx, %eax), %xmm0
|
||||||
|
movdqu 16(%ebx, %eax), %xmm1
|
||||||
|
movdqu 32(%ebx, %eax), %xmm2
|
||||||
|
movdqu 48(%ebx, %eax), %xmm3
|
||||||
|
movdqa %xmm0, (%ebx)
|
||||||
|
movdqa %xmm1, 16(%ebx)
|
||||||
|
movdqa %xmm2, 32(%ebx)
|
||||||
|
movdqa %xmm3, 48(%ebx)
|
||||||
|
jmp L(return)
|
||||||
|
|
||||||
|
L(large_page):
|
||||||
|
movdqu (%eax), %xmm0
|
||||||
|
movdqu 16(%eax), %xmm1
|
||||||
|
movdqu 32(%eax), %xmm2
|
||||||
|
movdqu 48(%eax), %xmm3
|
||||||
|
movdqu -64(%eax, %ecx), %xmm4
|
||||||
|
movdqu -48(%eax, %ecx), %xmm5
|
||||||
|
movdqu -32(%eax, %ecx), %xmm6
|
||||||
|
movdqu -16(%eax, %ecx), %xmm7
|
||||||
|
movdqu %xmm0, (%edx)
|
||||||
|
movdqu %xmm1, 16(%edx)
|
||||||
|
movdqu %xmm2, 32(%edx)
|
||||||
|
movdqu %xmm3, 48(%edx)
|
||||||
|
movdqu %xmm4, -64(%edx, %ecx)
|
||||||
|
movdqu %xmm5, -48(%edx, %ecx)
|
||||||
|
movdqu %xmm6, -32(%edx, %ecx)
|
||||||
|
movdqu %xmm7, -16(%edx, %ecx)
|
||||||
|
|
||||||
|
movdqu 64(%eax), %xmm0
|
||||||
|
movdqu 80(%eax), %xmm1
|
||||||
|
movdqu 96(%eax), %xmm2
|
||||||
|
movdqu 112(%eax), %xmm3
|
||||||
|
movdqu -128(%eax, %ecx), %xmm4
|
||||||
|
movdqu -112(%eax, %ecx), %xmm5
|
||||||
|
movdqu -96(%eax, %ecx), %xmm6
|
||||||
|
movdqu -80(%eax, %ecx), %xmm7
|
||||||
|
movdqu %xmm0, 64(%edx)
|
||||||
|
movdqu %xmm1, 80(%edx)
|
||||||
|
movdqu %xmm2, 96(%edx)
|
||||||
|
movdqu %xmm3, 112(%edx)
|
||||||
|
movdqu %xmm4, -128(%edx, %ecx)
|
||||||
|
movdqu %xmm5, -112(%edx, %ecx)
|
||||||
|
movdqu %xmm6, -96(%edx, %ecx)
|
||||||
|
movdqu %xmm7, -80(%edx, %ecx)
|
||||||
|
|
||||||
|
/* Now the main loop with non temporal stores. We align
|
||||||
|
the address of the destination. */
|
||||||
|
leal 128(%edx), %ebx
|
||||||
|
andl $-128, %ebx
|
||||||
|
|
||||||
|
addl %edx, %ecx
|
||||||
|
andl $-128, %ecx
|
||||||
|
|
||||||
|
subl %edx, %eax
|
||||||
|
|
||||||
|
.p2align 4
|
||||||
|
L(main_loop_large_page):
|
||||||
|
movdqu (%ebx, %eax), %xmm0
|
||||||
|
movdqu 16(%ebx, %eax), %xmm1
|
||||||
|
movdqu 32(%ebx, %eax), %xmm2
|
||||||
|
movdqu 48(%ebx, %eax), %xmm3
|
||||||
|
movdqu 64(%ebx, %eax), %xmm4
|
||||||
|
movdqu 80(%ebx, %eax), %xmm5
|
||||||
|
movdqu 96(%ebx, %eax), %xmm6
|
||||||
|
movdqu 112(%ebx, %eax), %xmm7
|
||||||
|
movntdq %xmm0, (%ebx)
|
||||||
|
movntdq %xmm1, 16(%ebx)
|
||||||
|
movntdq %xmm2, 32(%ebx)
|
||||||
|
movntdq %xmm3, 48(%ebx)
|
||||||
|
movntdq %xmm4, 64(%ebx)
|
||||||
|
movntdq %xmm5, 80(%ebx)
|
||||||
|
movntdq %xmm6, 96(%ebx)
|
||||||
|
movntdq %xmm7, 112(%ebx)
|
||||||
|
lea 128(%ebx), %ebx
|
||||||
|
cmpl %ebx, %ecx
|
||||||
|
jne L(main_loop_large_page)
|
||||||
|
sfence
|
||||||
|
jmp L(return)
|
||||||
|
|
||||||
|
L(len_0_16_bytes):
|
||||||
|
testb $24, %cl
|
||||||
|
jne L(len_9_16_bytes)
|
||||||
|
testb $4, %cl
|
||||||
|
.p2align 4,,5
|
||||||
|
jne L(len_5_8_bytes)
|
||||||
|
testl %ecx, %ecx
|
||||||
|
.p2align 4,,2
|
||||||
|
je L(return)
|
||||||
|
movzbl (%eax), %ebx
|
||||||
|
testb $2, %cl
|
||||||
|
movb %bl, (%edx)
|
||||||
|
je L(return)
|
||||||
|
movzwl -2(%eax,%ecx), %ebx
|
||||||
|
movw %bx, -2(%edx,%ecx)
|
||||||
|
jmp L(return)
|
||||||
|
|
||||||
|
L(len_9_16_bytes):
|
||||||
|
movq (%eax), %xmm0
|
||||||
|
movq -8(%eax, %ecx), %xmm1
|
||||||
|
movq %xmm0, (%edx)
|
||||||
|
movq %xmm1, -8(%edx, %ecx)
|
||||||
|
jmp L(return)
|
||||||
|
|
||||||
|
L(len_5_8_bytes):
|
||||||
|
movl (%eax), %ebx
|
||||||
|
movl %ebx, (%edx)
|
||||||
|
movl -4(%eax,%ecx), %ebx
|
||||||
|
movl %ebx, -4(%edx,%ecx)
|
||||||
|
jmp L(return)
|
||||||
|
|
||||||
|
L(return):
|
||||||
|
movl %edx, %eax
|
||||||
|
RETURN
|
||||||
|
|
||||||
|
END (MEMCPY)
|
673
libc/arch-x86/silvermont/string/sse2-memmove-slm.S
Normal file
673
libc/arch-x86/silvermont/string/sse2-memmove-slm.S
Normal file
@ -0,0 +1,673 @@
|
|||||||
|
/*
|
||||||
|
Copyright (c) 2014, Intel Corporation
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of Intel Corporation nor the names of its contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||||
|
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||||
|
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||||
|
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||||
|
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||||
|
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "cache.h"
|
||||||
|
|
||||||
|
/* Name of the symbol emitted by this file; only set here when the
   includer has not chosen one already.  */
#if !defined(MEMMOVE)
# define MEMMOVE	memmove
#endif

/* Function-local label helper.  */
#if !defined(L)
# define L(label)	.L##label
#endif

/* Thin wrappers around the assembler's CFI directives.  */
#if !defined(cfi_startproc)
# define cfi_startproc	.cfi_startproc
#endif

#if !defined(cfi_endproc)
# define cfi_endproc	.cfi_endproc
#endif

#if !defined(cfi_rel_offset)
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#if !defined(cfi_restore)
# define cfi_restore(reg)	.cfi_restore reg
#endif

#if !defined(cfi_adjust_cfa_offset)
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

/* Open a global, 16-byte aligned function and start its CFI region.  */
#if !defined(ENTRY)
# define ENTRY(name)	\
	.type name, @function;	\
	.globl name;	\
	.p2align 4;	\
name:	\
	cfi_startproc
#endif

/* Close the CFI region and record the function size.  */
#if !defined(END)
# define END(name)	\
	cfi_endproc;	\
	.size name, .-name
#endif

/* Stack offsets of the three arguments.  The bcopy flavour takes the
   source first and the destination second; memmove the other way
   round.  PARMS is defined further down and is expanded lazily.  */
#if defined(USE_AS_BCOPY)
# define SRC	PARMS
# define DEST	SRC+4
# define LEN	DEST+4
#else
# define DEST	PARMS
# define SRC	DEST+4
# define LEN	SRC+4
#endif

/* Unwind bookkeeping for a 4-byte push / pop of REG.  */
#define CFI_PUSH(REG)	\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG)	\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

/* Push / pop a register together with its unwind annotation.  */
#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)

/* ENTRANCE pushes %ebx, so the first argument sits 8 bytes above %esp
   (return address + saved %ebx).  RETURN re-asserts the "%ebx pushed"
   unwind state for the code that follows the ret.  */
#define PARMS		8	/* Preserve EBX.  */
#define ENTRANCE	PUSH (%ebx);
#define RETURN_END	POP (%ebx); ret
#define RETURN		RETURN_END; CFI_PUSH (%ebx)
|
||||||
|
|
||||||
|
/* MEMMOVE entry point and the forward (dst < src) paths for lengths
   up to 128 bytes.

   On entry the three stack arguments are loaded as
	%ecx = LEN, %eax = SRC, %edx = DEST
   (the stack slots of SRC and DEST are swapped when USE_AS_BCOPY is
   defined).  %ebx is saved by ENTRANCE and restored by RETURN.

   The length is an unsigned quantity, so every range check below uses
   an unsigned condition (ja/jbe).  A signed "jg" would treat lengths
   of 2 GiB and above as negative and silently copy only a few bytes.  */
	.section .text.sse2,"ax",@progbits
ENTRY (MEMMOVE)
	ENTRANCE
	movl	LEN(%esp), %ecx
	movl	SRC(%esp), %eax
	movl	DEST(%esp), %edx

/* Check whether we should copy backward or forward.  Identical
   pointers need no work at all.  */
	cmp	%eax, %edx
	je	L(mm_return)
	ja	L(mm_len_0_or_more_backward)

/* Now do checks for lengths.  We handle [0..16], (16..32], (32..64]
   and (64..128] separately.  */
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_forward)

	cmpl	$32, %ecx
	ja	L(mm_len_32_or_more_forward)

/* Copy (16..32] bytes and return.  The head and tail blocks may
   overlap each other; everything is loaded before anything is stored
   so overlapping source/destination buffers stay safe.  */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_32_or_more_forward):
	cmpl	$64, %ecx
	ja	L(mm_len_64_or_more_forward)

/* Copy (32..64] bytes and return: 32 bytes from the front plus
   32 bytes from the back.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_64_or_more_forward):
	cmpl	$128, %ecx
	ja	L(mm_len_128_or_more_forward)

/* Copy (64..128] bytes and return: 64 bytes from the front plus
   64 bytes from the back, all loaded before the first store.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(mm_return)
|
||||||
|
|
||||||
|
L(mm_len_128_or_more_forward):

/* Lengths of at least SHARED_CACHE_SIZE_HALF take the separate
   large-page forward path.  */
	cmpl	$SHARED_CACHE_SIZE_HALF, %ecx
	jae	L(mm_large_page_forward)

/* From here on %esi/%edi hold the original source/destination while
   %eax/%edx are reused as offset and running destination pointer.  */
	PUSH (%esi)
	PUSH (%edi)
	movl	%eax, %esi
	movl	%edx, %edi

/* Load the first 64 source bytes; they cover the unaligned head of
   the destination.  */
	movdqu	(%esi), %xmm0
	movdqu	16(%esi), %xmm1
	movdqu	32(%esi), %xmm2
	movdqu	48(%esi), %xmm3

/* %edx = first 64-byte aligned destination address above %edi.  */
	leal	64(%edi), %edx
	andl	$-64, %edx

/* %eax = src - dst, so (%edx, %eax) is the source byte that belongs
   at destination address %edx.  */
	movl	%esi, %eax
	subl	%edi, %eax

/* Load the 64 source bytes for the first aligned destination block
   before any store happens, so overlapping buffers are not
   clobbered.  */
	movdqu	(%edx, %eax), %xmm4
	movdqu	16(%edx, %eax), %xmm5
	movdqu	32(%edx, %eax), %xmm6
	movdqu	48(%edx, %eax), %xmm7

/* Store the unaligned head, then the first aligned block.  */
	movdqu	%xmm0, (%edi)
	movdqu	%xmm1, 16(%edi)
	movdqu	%xmm2, 32(%edi)
	movdqu	%xmm3, 48(%edi)
	movdqa	%xmm4, (%edx)
	movdqa	%xmm5, 16(%edx)
	movdqa	%xmm6, 32(%edx)
	movdqa	%xmm7, 48(%edx)
	addl	$64, %edx

/* %ebx = end of the destination rounded down to 64 bytes; the main
   loop stops once %edx reaches it.  */
	leal	(%edi, %ecx), %ebx
	andl	$-64, %ebx

	cmpl	%edx, %ebx
	jbe	L(mm_copy_remaining_forward)

	.p2align 4
L(mm_main_loop_forward):

/* One 64-byte block per iteration: unaligned loads from the source,
   aligned stores to the destination, prefetching two blocks ahead.  */
	prefetcht0 128(%edx, %eax)

	movdqu	(%edx, %eax), %xmm0
	movdqu	16(%edx, %eax), %xmm1
	movdqu	32(%edx, %eax), %xmm2
	movdqu	48(%edx, %eax), %xmm3
	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 16(%edx)
	movdqa	%xmm2, 32(%edx)
	movdqa	%xmm3, 48(%edx)
	leal	64(%edx), %edx
	cmpl	%edx, %ebx
	ja	L(mm_main_loop_forward)

L(mm_copy_remaining_forward):
/* Everything below destination address %edx has been written.
   %ecx becomes the number of bytes still to copy
   (dst + len - %edx) and %esi the matching source position.  */
	addl	%edi, %ecx
	subl	%edx, %ecx
	leal	(%edx, %eax), %esi
|
||||||
|
|
||||||
|
/* Forward tail copy.  In: %esi = source, %edx = destination,
   %ecx = byte count (0..64); %esi/%edi/%ebx are still saved on the
   stack.  Each case loads a head piece and a tail piece (which may
   overlap) before storing either of them.  */
L(mm_remaining_0_64_bytes_forward):
	cmpl	$32, %ecx
	ja	L(mm_remaining_33_64_bytes_forward)
	cmpl	$16, %ecx
	ja	L(mm_remaining_17_32_bytes_forward)
	testl	%ecx, %ecx
	.p2align 4,,2
	jz	L(mm_return_pop_all)

	cmpb	$8, %cl
	ja	L(mm_remaining_9_16_bytes_forward)
	cmpb	$4, %cl
	.p2align 4,,5
	ja	L(mm_remaining_5_8_bytes_forward)
	cmpb	$2, %cl
	.p2align 4,,1
	ja	L(mm_remaining_3_4_bytes_forward)
/* 1 or 2 bytes: last byte and first byte.  */
	movzbl	-1(%esi,%ecx), %eax
	movzbl	(%esi), %ebx
	movb	%al, -1(%edx,%ecx)
	movb	%bl, (%edx)
	jmp	L(mm_return_pop_all)

/* 33..64 bytes: 32 from the front, 32 from the back.  */
L(mm_remaining_33_64_bytes_forward):
	movdqu	(%esi), %xmm0
	movdqu	16(%esi), %xmm1
	movdqu	-32(%esi, %ecx), %xmm2
	movdqu	-16(%esi, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -32(%edx, %ecx)
	movdqu	%xmm3, -16(%edx, %ecx)
	jmp	L(mm_return_pop_all)

/* 17..32 bytes: 16 from the front, 16 from the back.  */
L(mm_remaining_17_32_bytes_forward):
	movdqu	(%esi), %xmm0
	movdqu	-16(%esi, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(mm_return_pop_all)

/* 3..4 bytes: two 16-bit pieces.  */
L(mm_remaining_3_4_bytes_forward):
	movzwl	-2(%esi,%ecx), %eax
	movzwl	(%esi), %ebx
	movw	%ax, -2(%edx,%ecx)
	movw	%bx, (%edx)
	jmp	L(mm_return_pop_all)

/* 5..8 bytes: two 32-bit pieces.  */
L(mm_remaining_5_8_bytes_forward):
	movl	(%esi), %eax
	movl	-4(%esi,%ecx), %ebx
	movl	%eax, (%edx)
	movl	%ebx, -4(%edx,%ecx)
	jmp	L(mm_return_pop_all)

/* 9..16 bytes: two 64-bit pieces.  */
L(mm_remaining_9_16_bytes_forward):
	movq	(%esi), %xmm0
	movq	-8(%esi, %ecx), %xmm1
	movq	%xmm0, (%edx)
	movq	%xmm1, -8(%edx, %ecx)
	jmp	L(mm_return_pop_all)
|
||||||
|
|
||||||
|
|
||||||
|
/* Forward copy of 0..16 bytes.  In: %eax = source, %edx = destination,
   %ecx = length.  The size class is picked by testing bits of the
   length: bit 3 or 4 set -> 8..16, bit 2 -> 4..7, bit 1 -> 2..3,
   otherwise 0 or 1.  Both pieces of each case are loaded before the
   first store.  */
L(mm_len_0_16_bytes_forward):
	testb	$24, %cl
	jnz	L(mm_len_9_16_bytes_forward)
	testb	$4, %cl
	.p2align 4,,5
	jnz	L(mm_len_5_8_bytes_forward)
	testl	%ecx, %ecx
	.p2align 4,,2
	jz	L(mm_return)
	testb	$2, %cl
	.p2align 4,,1
	jnz	L(mm_len_2_4_bytes_forward)
/* Exactly one byte (stored twice through the head/tail scheme).  */
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(mm_return)

/* Two 16-bit pieces, last one first.  */
L(mm_len_2_4_bytes_forward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(mm_return)

/* Two 32-bit pieces.  */
L(mm_len_5_8_bytes_forward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)
	jmp	L(mm_return)

/* Two 64-bit pieces moved through %xmm0/%xmm1.  */
L(mm_len_9_16_bytes_forward):
	movq	(%eax), %xmm0
	movq	-8(%eax, %ecx), %xmm1
	movq	%xmm0, (%edx)
	movq	%xmm1, -8(%edx, %ecx)
	jmp	L(mm_return)
|
||||||
|
|
||||||
|
/* The code for copying backwards (dst > src).
   In: %eax = source, %edx = destination, %ecx = length.  This label is
   also re-entered by the large backward paths to finish the head of
   the buffer.

   The length is unsigned, so the range checks use unsigned conditions
   (ja/jbe); a signed "jg" would misroute lengths of 2 GiB and above
   into the small-copy code.  */
L(mm_len_0_or_more_backward):

/* Now do checks for lengths.  We handle [0..16], (16..32], (32..64]
   and (64..128] separately.  */
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_backward)

	cmpl	$32, %ecx
	ja	L(mm_len_32_or_more_backward)

/* Copy (16..32] bytes and return.  All loads precede all stores, so
   the copy direction does not matter for these short cases.  */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_32_or_more_backward):
	cmpl	$64, %ecx
	ja	L(mm_len_64_or_more_backward)

/* Copy (32..64] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_64_or_more_backward):
	cmpl	$128, %ecx
	ja	L(mm_len_128_or_more_backward)

/* Copy (64..128] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(mm_return)
|
||||||
|
|
||||||
|
L(mm_len_128_or_more_backward):

/* Lengths of at least SHARED_CACHE_SIZE_HALF take the separate
   large-page backward path.  */
	cmpl	$SHARED_CACHE_SIZE_HALF, %ecx
	jae	L(mm_large_page_backward)

	PUSH (%esi)
	PUSH (%edi)

/* Align the end of the destination.  The last 64 source bytes are
   loaded up front so that the stores below cannot overwrite them
   before they have been read.  */
	movdqu	-16(%eax, %ecx), %xmm0
	movdqu	-32(%eax, %ecx), %xmm1
	movdqu	-48(%eax, %ecx), %xmm2
	movdqu	-64(%eax, %ecx), %xmm3

/* %edi = end of the destination rounded down to 64 bytes.  */
	leal	(%edx, %ecx), %edi
	andl	$-64, %edi

/* %esi = src - dst, so (%edi, %esi) is the source byte that belongs
   at destination address %edi.  */
	movl	%eax, %esi
	subl	%edx, %esi

/* Source bytes for the last aligned 64-byte destination block.  */
	movdqu	-16(%edi, %esi), %xmm4
	movdqu	-32(%edi, %esi), %xmm5
	movdqu	-48(%edi, %esi), %xmm6
	movdqu	-64(%edi, %esi), %xmm7

/* Store the unaligned tail, then the last aligned block.  */
	movdqu	%xmm0, -16(%edx, %ecx)
	movdqu	%xmm1, -32(%edx, %ecx)
	movdqu	%xmm2, -48(%edx, %ecx)
	movdqu	%xmm3, -64(%edx, %ecx)
	movdqa	%xmm4, -16(%edi)
	movdqa	%xmm5, -32(%edi)
	movdqa	%xmm6, -48(%edi)
	movdqa	%xmm7, -64(%edi)
	leal	-64(%edi), %edi

/* %ebx = first 64-byte aligned destination address above %edx; the
   main loop stops once %edi has come down to it.  */
	leal	64(%edx), %ebx
	andl	$-64, %ebx

/* %ecx = number of head bytes (%ebx - dst) that are left to copy
   after the main loop stops.  */
	movl	%ebx, %ecx
	subl	%edx, %ecx

	cmpl	%edi, %ebx
	jb	L(mm_main_loop_backward)

/* Nothing for the loop to do: restore registers and copy the head
   with the short-length code.  */
	POP (%edi)
	POP (%esi)
	jmp	L(mm_len_0_or_more_backward)

	.p2align 4
L(mm_main_loop_backward):

/* One 64-byte block per iteration, walking downwards: unaligned loads
   from the source, aligned stores to the destination.  */
	prefetcht0 -128(%edi, %esi)

	movdqu	-64(%edi, %esi), %xmm0
	movdqu	-48(%edi, %esi), %xmm1
	movdqu	-32(%edi, %esi), %xmm2
	movdqu	-16(%edi, %esi), %xmm3
	movdqa	%xmm0, -64(%edi)
	movdqa	%xmm1, -48(%edi)
	movdqa	%xmm2, -32(%edi)
	movdqa	%xmm3, -16(%edi)
	leal	-64(%edi), %edi
	cmpl	%edi, %ebx
	jb	L(mm_main_loop_backward)
/* Loop done: %eax/%edx still hold source/destination and %ecx the
   head length, so the short-length code finishes the job.  */
	POP (%edi)
	POP (%esi)
	jmp	L(mm_len_0_or_more_backward)
|
||||||
|
|
||||||
|
/* Backward copy of 0..16 bytes, followed by the two common exits.
   In: %eax = source, %edx = destination, %ecx = length.  The size
   class is picked by testing bits of the length: bit 3 or 4 set ->
   8..16, bit 2 -> 4..7, bit 1 -> 2..3, otherwise 0 or 1.  */
L(mm_len_0_16_bytes_backward):
	testb	$24, %cl
	jne	L(mm_len_9_16_bytes_backward)
	testb	$4, %cl
	.p2align 4,,5
	jne	L(mm_len_5_8_bytes_backward)
	testl	%ecx, %ecx
	.p2align 4,,2
	jz	L(mm_return)
	testb	$2, %cl
	.p2align 4,,1
	jnz	L(mm_len_3_4_bytes_backward)
/* Exactly one byte.  */
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(mm_return)

/* Two 16-bit pieces, both loaded before the first store.  */
L(mm_len_3_4_bytes_backward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(mm_return)

/* Move the last 8 bytes through %ebx/%esi, shrink the length by 8 and
   go round again for whatever is left (0..8 bytes).  */
L(mm_len_9_16_bytes_backward):
	PUSH (%esi)
	movl	-4(%eax,%ecx), %ebx
	movl	-8(%eax,%ecx), %esi
	movl	%ebx, -4(%edx,%ecx)
	movl	%esi, -8(%edx,%ecx)
	subl	$8, %ecx
	POP (%esi)
	jmp	L(mm_len_0_16_bytes_backward)

/* Two 32-bit pieces; falls through into the common return.  */
L(mm_len_5_8_bytes_backward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)

/* Exit for paths that kept the destination in %edx and only have
   %ebx saved on the stack.  */
L(mm_return):
	movl	%edx, %eax
	RETURN

/* Exit for paths that moved the destination to %edi and additionally
   pushed %esi and %edi.  */
L(mm_return_pop_all):
	movl	%edi, %eax
	POP (%edi)
	POP (%esi)
	RETURN
|
||||||
|
|
||||||
|
/* Big length copy forward part (len >= SHARED_CACHE_SIZE_HALF,
   dst < src).  In: %eax = source, %edx = destination, %ecx = length.
   The bulk is written with non-temporal stores, 128 bytes per loop
   iteration.  */

L(mm_large_page_forward):
/* Aligning the address of destination.  The first 64 source bytes and
   the 64 source bytes for the first aligned destination block are all
   loaded before anything is stored, so overlapping buffers are not
   clobbered.  */

	PUSH (%esi)
	PUSH (%edi)
	movl	%eax, %esi
	movl	%edx, %edi

	movdqu	(%esi), %xmm0
	movdqu	16(%esi), %xmm1
	movdqu	32(%esi), %xmm2
	movdqu	48(%esi), %xmm3

/* %edx = first 64-byte aligned destination address above %edi.  */
	leal	64(%edi), %edx
	andl	$-64, %edx

/* %eax = src - dst, so (%edx, %eax) is the source byte that belongs
   at destination address %edx.  */
	movl	%esi, %eax
	subl	%edi, %eax

	movdqu	(%edx, %eax), %xmm4
	movdqu	16(%edx, %eax), %xmm5
	movdqu	32(%edx, %eax), %xmm6
	movdqu	48(%edx, %eax), %xmm7

	movdqu	%xmm0, (%edi)
	movdqu	%xmm1, 16(%edi)
	movdqu	%xmm2, 32(%edi)
	movdqu	%xmm3, 48(%edi)
	movntdq	%xmm4, (%edx)
	movntdq	%xmm5, 16(%edx)
	movntdq	%xmm6, 32(%edx)
	movntdq	%xmm7, 48(%edx)
	addl	$64, %edx

/* %ebx = dst + len - 128: the highest destination address at which a
   whole 128-byte block still fits inside the buffer.  %edx is only
   64-byte aligned here, so bounding the loop by an address rounded to
   128 would let the last iteration run up to 64 bytes past the end of
   both buffers; comparing against the exact limit cannot.  */
	leal	-128(%edi, %ecx), %ebx

	cmp	%edx, %ebx
	jb	L(mm_large_page_tail_forward)

	.p2align 4
L(mm_large_page_loop_forward):
	movdqu	(%edx, %eax), %xmm0
	movdqu	16(%edx, %eax), %xmm1
	movdqu	32(%edx, %eax), %xmm2
	movdqu	48(%edx, %eax), %xmm3
	movdqu	64(%edx, %eax), %xmm4
	movdqu	80(%edx, %eax), %xmm5
	movdqu	96(%edx, %eax), %xmm6
	movdqu	112(%edx, %eax), %xmm7
	movntdq	%xmm0, (%edx)
	movntdq	%xmm1, 16(%edx)
	movntdq	%xmm2, 32(%edx)
	movntdq	%xmm3, 48(%edx)
	movntdq	%xmm4, 64(%edx)
	movntdq	%xmm5, 80(%edx)
	movntdq	%xmm6, 96(%edx)
	movntdq	%xmm7, 112(%edx)
	leal	128(%edx), %edx
	cmp	%edx, %ebx
	jae	L(mm_large_page_loop_forward)

L(mm_large_page_tail_forward):
/* Order the non-temporal stores issued above (also on the path that
   skipped the loop) before the ordinary stores that follow.  */
	sfence

	addl	%edi, %ecx
	subl	%edx, %ecx
/* We copied all up till %edx position in the dst.
   In %ecx now is how many bytes are left to copy (0..127).
   Now we need to advance %esi.  */
	leal	(%edx, %eax), %esi

	cmp	$64, %ecx
	jb	L(mm_remaining_0_64_bytes_forward)

/* 64..127 bytes left: 64 from the front plus 64 from the back, all
   loaded before the first store.  */
	movdqu	(%esi), %xmm0
	movdqu	16(%esi), %xmm1
	movdqu	32(%esi), %xmm2
	movdqu	48(%esi), %xmm3
	movdqu	-64(%esi, %ecx), %xmm4
	movdqu	-48(%esi, %ecx), %xmm5
	movdqu	-32(%esi, %ecx), %xmm6
	movdqu	-16(%esi, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(mm_return_pop_all)
|
||||||
|
|
||||||
|
|
||||||
|
/* Big length copy backward part (len >= SHARED_CACHE_SIZE_HALF,
   dst > src).  In: %eax = source, %edx = destination, %ecx = length.
   The bulk is written with non-temporal stores, 64 bytes per loop
   iteration, walking from the end of the buffer downwards.  */
L(mm_large_page_backward):
/* Aligning the end of the destination.  We need to load the last
   64 bytes of the source first in order not to overwrite them.  */

	PUSH (%esi)
	PUSH (%edi)

	movdqu	-16(%eax, %ecx), %xmm0
	movdqu	-32(%eax, %ecx), %xmm1
	movdqu	-48(%eax, %ecx), %xmm2
	movdqu	-64(%eax, %ecx), %xmm3

/* %edi = end of the destination rounded down to 64 bytes.  */
	leal	(%edx, %ecx), %edi
	andl	$-64, %edi

/* %esi = src - dst, so (%edi, %esi) is the source byte that belongs
   at destination address %edi.  */
	movl	%eax, %esi
	subl	%edx, %esi

	movdqu	-16(%edi, %esi), %xmm4
	movdqu	-32(%edi, %esi), %xmm5
	movdqu	-48(%edi, %esi), %xmm6
	movdqu	-64(%edi, %esi), %xmm7

	movdqu	%xmm0, -16(%edx, %ecx)
	movdqu	%xmm1, -32(%edx, %ecx)
	movdqu	%xmm2, -48(%edx, %ecx)
	movdqu	%xmm3, -64(%edx, %ecx)
	movntdq	%xmm4, -16(%edi)
	movntdq	%xmm5, -32(%edi)
	movntdq	%xmm6, -48(%edi)
	movntdq	%xmm7, -64(%edi)
	leal	-64(%edi), %edi

/* %ebx = 64-byte aligned address at which the loop stops.  */
	leal	128(%edx), %ebx
	andl	$-64, %ebx

/* Compute in %ecx how many bytes are left to copy after
   the main loop stops.  */
	movl	%ebx, %ecx
	subl	%edx, %ecx

/* Nothing for the loop to do: leave through the common exit below so
   that %esi/%edi are restored and the stack is balanced again.  */
	cmp	%edi, %ebx
	jae	L(mm_large_page_done_backward)

	.p2align 4
L(mm_large_page_loop_backward):
	movdqu	-64(%edi, %esi), %xmm0
	movdqu	-48(%edi, %esi), %xmm1
	movdqu	-32(%edi, %esi), %xmm2
	movdqu	-16(%edi, %esi), %xmm3
	movntdq	%xmm0, -64(%edi)
	movntdq	%xmm1, -48(%edi)
	movntdq	%xmm2, -32(%edi)
	movntdq	%xmm3, -16(%edi)
	leal	-64(%edi), %edi
	cmp	%edi, %ebx
	jb	L(mm_large_page_loop_backward)

L(mm_large_page_done_backward):
/* Order the non-temporal stores before the ordinary stores that
   finish the head of the buffer, as the forward path does.  */
	sfence
/* %eax/%edx still hold source/destination and %ecx the head length,
   so the short-length code finishes the job.  */
	POP (%edi)
	POP (%esi)
	jmp	L(mm_len_0_or_more_backward)

END (MEMMOVE)
|
841
libc/arch-x86/silvermont/string/sse2-memset-slm.S
Normal file
841
libc/arch-x86/silvermont/string/sse2-memset-slm.S
Normal file
@ -0,0 +1,841 @@
|
|||||||
|
/*
|
||||||
|
Copyright (c) 2014, Intel Corporation
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of Intel Corporation nor the names of its contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||||
|
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||||
|
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||||
|
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||||
|
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||||
|
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "cache.h"
|
||||||
|
|
||||||
|
/* Name of the symbol emitted by this file; only set here when the
   includer has not chosen one already.  */
#if !defined(MEMSET)
# define MEMSET	memset
#endif

/* Function-local label helper.  */
#if !defined(L)
# define L(label)	.L##label
#endif

/* Power-of-two alignment helper.  */
#if !defined(ALIGN)
# define ALIGN(n)	.p2align n
#endif

/* Thin wrappers around the assembler's CFI directives.  */
#if !defined(cfi_startproc)
# define cfi_startproc	.cfi_startproc
#endif

#if !defined(cfi_endproc)
# define cfi_endproc	.cfi_endproc
#endif

#if !defined(cfi_rel_offset)
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#if !defined(cfi_restore)
# define cfi_restore(reg)	.cfi_restore reg
#endif

#if !defined(cfi_adjust_cfa_offset)
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

/* Open a global, 16-byte aligned function and start its CFI region.  */
#if !defined(ENTRY)
# define ENTRY(name)	\
	.type name, @function;	\
	.globl name;	\
	.p2align 4;	\
name:	\
	cfi_startproc
#endif

/* Close the CFI region and record the function size.  */
#if !defined(END)
# define END(name)	\
	cfi_endproc;	\
	.size name, .-name
#endif

/* Unwind bookkeeping for a 4-byte push / pop of REG.  */
#define CFI_PUSH(REG)	\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG)	\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

/* Push / pop a register together with its unwind annotation.  */
#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)

/* Stack offsets of the arguments.  The bzero flavour has no fill
   character argument and sets no return value; the memset flavour
   returns the destination pointer through SETRTNVAL.  */
#if defined(USE_AS_BZERO)
# define DEST		PARMS
# define LEN		DEST+4
# define SETRTNVAL
#else
# define DEST		PARMS
# define CHR		DEST+4
# define LEN		CHR+4
# define SETRTNVAL	movl DEST(%esp), %eax
#endif

#if defined(SHARED) || defined(__PIC__)
/* Position-independent build: %ebx is used for PC-relative addressing
   and therefore saved on entry, which puts the first argument 8 bytes
   above %esp.  Jump-table entries are offsets relative to the table.  */
# define ENTRANCE	PUSH (%ebx);
# define RETURN_END	POP (%ebx); ret
# define RETURN		RETURN_END; CFI_PUSH (%ebx)
# define PARMS		8	/* Preserve EBX.  */
# define JMPTBL(I, B)	I - B

/* Load an entry of the jump table TABLE (relative offsets, indexed by
   %ecx) into %ebx and branch to it; %edx is advanced by %ecx on the
   way.  */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE)	\
	/* Load the current PC into EBX.  */	\
	call	__x86.get_pc_thunk.bx;	\
	/* Turn it into the address of the jump table.  */	\
	add	$(TABLE - .), %ebx;	\
	/* Add the selected relative entry to get an absolute address.  */	\
	add	(%ebx,%ecx,4), %ebx;	\
	add	%ecx, %edx;	\
	/* Jump table entry loaded and EDX adjusted.  Go.  */	\
	jmp	*%ebx

/* PC thunk: returns its own return address in %ebx.  */
	.section	.gnu.linkonce.t.__x86.get_pc_thunk.bx,"ax",@progbits
	.globl	__x86.get_pc_thunk.bx
	.hidden	__x86.get_pc_thunk.bx
	ALIGN (4)
	.type	__x86.get_pc_thunk.bx,@function
__x86.get_pc_thunk.bx:
	movl	(%esp), %ebx
	ret
#else
/* Non-PIC build: nothing is saved on entry, so the first argument
   sits 4 bytes above %esp.  Jump-table entries are absolute.  */
# define ENTRANCE
# define RETURN_END	ret
# define RETURN		RETURN_END
# define PARMS		4
# define JMPTBL(I, B)	I

/* Branch to an entry of the jump table TABLE (absolute addresses,
   indexed by %ecx); %edx is advanced by %ecx first.  */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE)	\
	add	%ecx, %edx;	\
	jmp	*TABLE(,%ecx,4)
#endif
|
||||||
|
|
||||||
|
.section .text.sse2,"ax",@progbits
|
||||||
|
ALIGN (4)
|
||||||
|
ENTRY (MEMSET)
|
||||||
|
ENTRANCE
|
||||||
|
|
||||||
|
movl LEN(%esp), %ecx
|
||||||
|
cmp $0, %ecx
|
||||||
|
ja L(1byteormore)
|
||||||
|
SETRTNVAL
|
||||||
|
RETURN
|
||||||
|
|
||||||
|
L(1byteormore):
|
||||||
|
#ifdef USE_AS_BZERO
|
||||||
|
xor %eax, %eax
|
||||||
|
#else
|
||||||
|
movzbl CHR(%esp), %eax
|
||||||
|
movb %al, %ah
|
||||||
|
/* Fill the whole EAX with pattern. */
|
||||||
|
movl %eax, %edx
|
||||||
|
shl $16, %eax
|
||||||
|
or %edx, %eax
|
||||||
|
#endif
|
||||||
|
movl DEST(%esp), %edx
|
||||||
|
cmp $1, %ecx
|
||||||
|
je L(1byte)
|
||||||
|
cmp $16, %ecx
|
||||||
|
jae L(16bytesormore)
|
||||||
|
|
||||||
|
cmp $4, %ecx
|
||||||
|
jb L(4bytesless)
|
||||||
|
movl %eax, (%edx)
|
||||||
|
movl %eax, -4(%edx, %ecx)
|
||||||
|
cmp $8, %ecx
|
||||||
|
jb L(8bytesless)
|
||||||
|
movl %eax, 4(%edx)
|
||||||
|
movl %eax, -8(%edx, %ecx)
|
||||||
|
L(8bytesless):
|
||||||
|
SETRTNVAL
|
||||||
|
RETURN
|
||||||
|
|
||||||
|
L(4bytesless):
|
||||||
|
movw %ax, (%edx)
|
||||||
|
movw %ax, -2(%edx, %ecx)
|
||||||
|
SETRTNVAL
|
||||||
|
RETURN
|
||||||
|
|
||||||
|
L(1byte):
|
||||||
|
movb %al, (%edx)
|
||||||
|
SETRTNVAL
|
||||||
|
RETURN
|
||||||
|
|
||||||
|
ALIGN (4)
|
||||||
|
L(16bytesormore):
|
||||||
|
#ifdef USE_AS_BZERO
|
||||||
|
pxor %xmm0, %xmm0
|
||||||
|
#else
|
||||||
|
movd %eax, %xmm0
|
||||||
|
pshufd $0, %xmm0, %xmm0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
cmp $64, %ecx
|
||||||
|
ja L(64bytesmore)
|
||||||
|
movdqu %xmm0, (%edx)
|
||||||
|
movdqu %xmm0, -16(%edx, %ecx)
|
||||||
|
cmp $32, %ecx
|
||||||
|
jbe L(32bytesless)
|
||||||
|
movdqu %xmm0, 16(%edx)
|
||||||
|
movdqu %xmm0, -32(%edx, %ecx)
|
||||||
|
L(32bytesless):
|
||||||
|
SETRTNVAL
|
||||||
|
RETURN
|
||||||
|
|
||||||
|
L(64bytesmore):
|
||||||
|
testl $0xf, %edx
|
||||||
|
jz L(aligned_16)
|
||||||
|
L(not_aligned_16):
|
||||||
|
movdqu %xmm0, (%edx)
|
||||||
|
movl %edx, %eax
|
||||||
|
and $-16, %edx
|
||||||
|
add $16, %edx
|
||||||
|
sub %edx, %eax
|
||||||
|
add %eax, %ecx
|
||||||
|
movd %xmm0, %eax
|
||||||
|
|
||||||
|
ALIGN (4)
|
||||||
|
L(aligned_16):
|
||||||
|
cmp $128, %ecx
|
||||||
|
jae L(128bytesormore)
|
||||||
|
|
||||||
|
L(aligned_16_less128bytes):
|
||||||
|
BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
|
||||||
|
|
||||||
|
ALIGN (4)
|
||||||
|
L(128bytesormore):
|
||||||
|
#ifdef SHARED_CACHE_SIZE
|
||||||
|
PUSH (%ebx)
|
||||||
|
mov $SHARED_CACHE_SIZE, %ebx
|
||||||
|
#else
|
||||||
|
# if (defined SHARED || defined __PIC__)
|
||||||
|
call __x86.get_pc_thunk.bx
|
||||||
|
add $_GLOBAL_OFFSET_TABLE_, %ebx
|
||||||
|
mov $__x86_shared_cache_size@GOTOFF(%ebx), %ebx
|
||||||
|
# else
|
||||||
|
PUSH (%ebx)
|
||||||
|
mov $__x86_shared_cache_size, %ebx
|
||||||
|
# endif
|
||||||
|
#endif
|
||||||
|
cmp %ebx, %ecx
|
||||||
|
jae L(128bytesormore_nt_start)
|
||||||
|
|
||||||
|
POP (%ebx)
|
||||||
|
|
||||||
|
#ifdef DATA_CACHE_SIZE
|
||||||
|
PUSH (%ebx)
|
||||||
|
mov $DATA_CACHE_SIZE, %ebx
|
||||||
|
#else
|
||||||
|
# if (defined SHARED || defined __PIC__)
|
||||||
|
call __x86.get_pc_thunk.bx
|
||||||
|
add $_GLOBAL_OFFSET_TABLE_, %ebx
|
||||||
|
mov $__x86_data_cache_size@GOTOFF(%ebx), %ebx
|
||||||
|
# else
|
||||||
|
PUSH (%ebx)
|
||||||
|
mov $__x86_data_cache_size, %ebx
|
||||||
|
# endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
cmp %ebx, %ecx
|
||||||
|
jae L(128bytes_L2_normal)
|
||||||
|
subl $128, %ecx
|
||||||
|
L(128bytesormore_normal):
|
||||||
|
sub $128, %ecx
|
||||||
|
movdqa %xmm0, (%edx)
|
||||||
|
movaps %xmm0, 0x10(%edx)
|
||||||
|
movaps %xmm0, 0x20(%edx)
|
||||||
|
movaps %xmm0, 0x30(%edx)
|
||||||
|
movaps %xmm0, 0x40(%edx)
|
||||||
|
movaps %xmm0, 0x50(%edx)
|
||||||
|
movaps %xmm0, 0x60(%edx)
|
||||||
|
movaps %xmm0, 0x70(%edx)
|
||||||
|
lea 128(%edx), %edx
|
||||||
|
jb L(128bytesless_normal)
|
||||||
|
|
||||||
|
|
||||||
|
sub $128, %ecx
|
||||||
|
movdqa %xmm0, (%edx)
|
||||||
|
movaps %xmm0, 0x10(%edx)
|
||||||
|
movaps %xmm0, 0x20(%edx)
|
||||||
|
movaps %xmm0, 0x30(%edx)
|
||||||
|
movaps %xmm0, 0x40(%edx)
|
||||||
|
movaps %xmm0, 0x50(%edx)
|
||||||
|
movaps %xmm0, 0x60(%edx)
|
||||||
|
movaps %xmm0, 0x70(%edx)
|
||||||
|
lea 128(%edx), %edx
|
||||||
|
jae L(128bytesormore_normal)
|
||||||
|
|
||||||
|
L(128bytesless_normal):
|
||||||
|
lea 128(%ecx), %ecx
|
||||||
|
#if defined DATA_CACHE_SIZE || !(defined SHARED || defined __PIC__)
|
||||||
|
POP (%ebx)
|
||||||
|
#endif
|
||||||
|
BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
|
||||||
|
|
||||||
|
ALIGN (4)
|
||||||
|
L(128bytes_L2_normal):
|
||||||
|
prefetchnta 0x380(%edx)
|
||||||
|
prefetchnta 0x3c0(%edx)
|
||||||
|
sub $128, %ecx
|
||||||
|
movdqa %xmm0, (%edx)
|
||||||
|
movaps %xmm0, 0x10(%edx)
|
||||||
|
movaps %xmm0, 0x20(%edx)
|
||||||
|
movaps %xmm0, 0x30(%edx)
|
||||||
|
movaps %xmm0, 0x40(%edx)
|
||||||
|
movaps %xmm0, 0x50(%edx)
|
||||||
|
movaps %xmm0, 0x60(%edx)
|
||||||
|
movaps %xmm0, 0x70(%edx)
|
||||||
|
add $128, %edx
|
||||||
|
cmp $128, %ecx
|
||||||
|
jae L(128bytes_L2_normal)
|
||||||
|
|
||||||
|
L(128bytesless_L2_normal):
|
||||||
|
#if defined DATA_CACHE_SIZE || !(defined SHARED || defined __PIC__)
|
||||||
|
POP (%ebx)
|
||||||
|
#endif
|
||||||
|
BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
|
||||||
|
|
||||||
|
L(128bytesormore_nt_start):
|
||||||
|
sub %ebx, %ecx
|
||||||
|
ALIGN (4)
|
||||||
|
L(128bytesormore_shared_cache_loop):
|
||||||
|
prefetchnta 0x3c0(%edx)
|
||||||
|
prefetchnta 0x380(%edx)
|
||||||
|
sub $0x80, %ebx
|
||||||
|
movdqa %xmm0, (%edx)
|
||||||
|
movaps %xmm0, 0x10(%edx)
|
||||||
|
movaps %xmm0, 0x20(%edx)
|
||||||
|
movaps %xmm0, 0x30(%edx)
|
||||||
|
movaps %xmm0, 0x40(%edx)
|
||||||
|
movaps %xmm0, 0x50(%edx)
|
||||||
|
movaps %xmm0, 0x60(%edx)
|
||||||
|
movaps %xmm0, 0x70(%edx)
|
||||||
|
add $0x80, %edx
|
||||||
|
cmp $0x80, %ebx
|
||||||
|
jae L(128bytesormore_shared_cache_loop)
|
||||||
|
cmp $0x80, %ecx
|
||||||
|
jb L(shared_cache_loop_end)
|
||||||
|
ALIGN (4)
|
||||||
|
L(128bytesormore_nt):
|
||||||
|
sub $0x80, %ecx
|
||||||
|
movntdq %xmm0, (%edx)
|
||||||
|
movntdq %xmm0, 0x10(%edx)
|
||||||
|
movntdq %xmm0, 0x20(%edx)
|
||||||
|
movntdq %xmm0, 0x30(%edx)
|
||||||
|
movntdq %xmm0, 0x40(%edx)
|
||||||
|
movntdq %xmm0, 0x50(%edx)
|
||||||
|
movntdq %xmm0, 0x60(%edx)
|
||||||
|
movntdq %xmm0, 0x70(%edx)
|
||||||
|
add $0x80, %edx
|
||||||
|
cmp $0x80, %ecx
|
||||||
|
jae L(128bytesormore_nt)
|
||||||
|
sfence
|
||||||
|
L(shared_cache_loop_end):
|
||||||
|
#if defined SHARED_CACHE_SIZE || !(defined SHARED || defined __PIC__)
|
||||||
|
POP (%ebx)
|
||||||
|
#endif
|
||||||
|
BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
|
||||||
|
|
||||||
|
|
||||||
|
.pushsection .rodata.sse2,"a",@progbits
|
||||||
|
ALIGN (2)
|
||||||
|
L(table_16_128bytes):
|
||||||
|
.int JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes))
|
||||||
|
.int JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes))
|
||||||
|
.popsection
|
||||||
|
|
||||||
|
ALIGN (4)
|
||||||
|
L(aligned_16_112bytes):
|
||||||
|
movdqa %xmm0, -112(%edx)
|
||||||
|
L(aligned_16_96bytes):
|
||||||
|
movdqa %xmm0, -96(%edx)
|
||||||
|
L(aligned_16_80bytes):
|
||||||
|
movdqa %xmm0, -80(%edx)
|
||||||
|
L(aligned_16_64bytes):
|
||||||
|
movdqa %xmm0, -64(%edx)
|
||||||
|
L(aligned_16_48bytes):
|
||||||
|
movdqa %xmm0, -48(%edx)
|
||||||
|
L(aligned_16_32bytes):
|
||||||
|
movdqa %xmm0, -32(%edx)
|
||||||
|
L(aligned_16_16bytes):
|
||||||
|
movdqa %xmm0, -16(%edx)
|
||||||
|
L(aligned_16_0bytes):
|
||||||
|
SETRTNVAL
|
||||||
|
RETURN
|
||||||
|
|
||||||
|
ALIGN (4)
|
||||||
|
L(aligned_16_113bytes):
|
||||||
|
movdqa %xmm0, -113(%edx)
|
||||||
|
L(aligned_16_97bytes):
|
||||||
|
movdqa %xmm0, -97(%edx)
|
||||||
|
L(aligned_16_81bytes):
|
||||||
|
movdqa %xmm0, -81(%edx)
|
||||||
|
L(aligned_16_65bytes):
|
||||||
|
movdqa %xmm0, -65(%edx)
|
||||||
|
L(aligned_16_49bytes):
|
||||||
|
movdqa %xmm0, -49(%edx)
|
||||||
|
L(aligned_16_33bytes):
|
||||||
|
movdqa %xmm0, -33(%edx)
|
||||||
|
L(aligned_16_17bytes):
|
||||||
|
movdqa %xmm0, -17(%edx)
|
||||||
|
L(aligned_16_1bytes):
|
||||||
|
movb %al, -1(%edx)
|
||||||
|
SETRTNVAL
|
||||||
|
RETURN
|
||||||
|
|
||||||
|
ALIGN (4)
|
||||||
|
L(aligned_16_114bytes):
|
||||||
|
movdqa %xmm0, -114(%edx)
|
||||||
|
L(aligned_16_98bytes):
|
||||||
|
movdqa %xmm0, -98(%edx)
|
||||||
|
L(aligned_16_82bytes):
|
||||||
|
movdqa %xmm0, -82(%edx)
|
||||||
|
L(aligned_16_66bytes):
|
||||||
|
movdqa %xmm0, -66(%edx)
|
||||||
|
L(aligned_16_50bytes):
|
||||||
|
movdqa %xmm0, -50(%edx)
|
||||||
|
L(aligned_16_34bytes):
|
||||||
|
movdqa %xmm0, -34(%edx)
|
||||||
|
L(aligned_16_18bytes):
|
||||||
|
movdqa %xmm0, -18(%edx)
|
||||||
|
L(aligned_16_2bytes):
|
||||||
|
movw %ax, -2(%edx)
|
||||||
|
SETRTNVAL
|
||||||
|
RETURN
|
||||||
|
|
||||||
|
ALIGN (4)
|
||||||
|
L(aligned_16_115bytes):
|
||||||
|
movdqa %xmm0, -115(%edx)
|
||||||
|
L(aligned_16_99bytes):
|
||||||
|
movdqa %xmm0, -99(%edx)
|
||||||
|
L(aligned_16_83bytes):
|
||||||
|
movdqa %xmm0, -83(%edx)
|
||||||
|
L(aligned_16_67bytes):
|
||||||
|
movdqa %xmm0, -67(%edx)
|
||||||
|
L(aligned_16_51bytes):
|
||||||
|
movdqa %xmm0, -51(%edx)
|
||||||
|
L(aligned_16_35bytes):
|
||||||
|
movdqa %xmm0, -35(%edx)
|
||||||
|
L(aligned_16_19bytes):
|
||||||
|
movdqa %xmm0, -19(%edx)
|
||||||
|
L(aligned_16_3bytes):
|
||||||
|
movw %ax, -3(%edx)
|
||||||
|
movb %al, -1(%edx)
|
||||||
|
SETRTNVAL
|
||||||
|
RETURN
|
||||||
|
|
||||||
|
ALIGN (4)
|
||||||
|
L(aligned_16_116bytes):
|
||||||
|
movdqa %xmm0, -116(%edx)
|
||||||
|
L(aligned_16_100bytes):
|
||||||
|
movdqa %xmm0, -100(%edx)
|
||||||
|
L(aligned_16_84bytes):
|
||||||
|
movdqa %xmm0, -84(%edx)
|
||||||
|
L(aligned_16_68bytes):
|
||||||
|
movdqa %xmm0, -68(%edx)
|
||||||
|
L(aligned_16_52bytes):
|
||||||
|
movdqa %xmm0, -52(%edx)
|
||||||
|
L(aligned_16_36bytes):
|
||||||
|
movdqa %xmm0, -36(%edx)
|
||||||
|
L(aligned_16_20bytes):
|
||||||
|
movdqa %xmm0, -20(%edx)
|
||||||
|
L(aligned_16_4bytes):
|
||||||
|
movl %eax, -4(%edx)
|
||||||
|
SETRTNVAL
|
||||||
|
RETURN
|
||||||
|
|
||||||
|
ALIGN (4)
|
||||||
|
L(aligned_16_117bytes):
|
||||||
|
movdqa %xmm0, -117(%edx)
|
||||||
|
L(aligned_16_101bytes):
|
||||||
|
movdqa %xmm0, -101(%edx)
|
||||||
|
L(aligned_16_85bytes):
|
||||||
|
movdqa %xmm0, -85(%edx)
|
||||||
|
L(aligned_16_69bytes):
|
||||||
|
movdqa %xmm0, -69(%edx)
|
||||||
|
L(aligned_16_53bytes):
|
||||||
|
movdqa %xmm0, -53(%edx)
|
||||||
|
L(aligned_16_37bytes):
|
||||||
|
movdqa %xmm0, -37(%edx)
|
||||||
|
L(aligned_16_21bytes):
|
||||||
|
movdqa %xmm0, -21(%edx)
|
||||||
|
L(aligned_16_5bytes):
|
||||||
|
movl %eax, -5(%edx)
|
||||||
|
movb %al, -1(%edx)
|
||||||
|
SETRTNVAL
|
||||||
|
RETURN
|
||||||
|
|
||||||
|
ALIGN (4)
|
||||||
|
L(aligned_16_118bytes):
|
||||||
|
movdqa %xmm0, -118(%edx)
|
||||||
|
L(aligned_16_102bytes):
|
||||||
|
movdqa %xmm0, -102(%edx)
|
||||||
|
L(aligned_16_86bytes):
|
||||||
|
movdqa %xmm0, -86(%edx)
|
||||||
|
L(aligned_16_70bytes):
|
||||||
|
movdqa %xmm0, -70(%edx)
|
||||||
|
L(aligned_16_54bytes):
|
||||||
|
movdqa %xmm0, -54(%edx)
|
||||||
|
L(aligned_16_38bytes):
|
||||||
|
movdqa %xmm0, -38(%edx)
|
||||||
|
L(aligned_16_22bytes):
|
||||||
|
movdqa %xmm0, -22(%edx)
|
||||||
|
L(aligned_16_6bytes):
|
||||||
|
movl %eax, -6(%edx)
|
||||||
|
movw %ax, -2(%edx)
|
||||||
|
SETRTNVAL
|
||||||
|
RETURN
|
||||||
|
|
||||||
|
ALIGN (4)
|
||||||
|
L(aligned_16_119bytes):
|
||||||
|
movdqa %xmm0, -119(%edx)
|
||||||
|
L(aligned_16_103bytes):
|
||||||
|
movdqa %xmm0, -103(%edx)
|
||||||
|
L(aligned_16_87bytes):
|
||||||
|
movdqa %xmm0, -87(%edx)
|
||||||
|
L(aligned_16_71bytes):
|
||||||
|
movdqa %xmm0, -71(%edx)
|
||||||
|
L(aligned_16_55bytes):
|
||||||
|
movdqa %xmm0, -55(%edx)
|
||||||
|
L(aligned_16_39bytes):
|
||||||
|
movdqa %xmm0, -39(%edx)
|
||||||
|
L(aligned_16_23bytes):
|
||||||
|
movdqa %xmm0, -23(%edx)
|
||||||
|
L(aligned_16_7bytes):
|
||||||
|
movl %eax, -7(%edx)
|
||||||
|
movw %ax, -3(%edx)
|
||||||
|
movb %al, -1(%edx)
|
||||||
|
SETRTNVAL
|
||||||
|
RETURN
|
||||||
|
|
||||||
|
ALIGN (4)
|
||||||
|
L(aligned_16_120bytes):
|
||||||
|
movdqa %xmm0, -120(%edx)
|
||||||
|
L(aligned_16_104bytes):
|
||||||
|
movdqa %xmm0, -104(%edx)
|
||||||
|
L(aligned_16_88bytes):
|
||||||
|
movdqa %xmm0, -88(%edx)
|
||||||
|
L(aligned_16_72bytes):
|
||||||
|
movdqa %xmm0, -72(%edx)
|
||||||
|
L(aligned_16_56bytes):
|
||||||
|
movdqa %xmm0, -56(%edx)
|
||||||
|
L(aligned_16_40bytes):
|
||||||
|
movdqa %xmm0, -40(%edx)
|
||||||
|
L(aligned_16_24bytes):
|
||||||
|
movdqa %xmm0, -24(%edx)
|
||||||
|
L(aligned_16_8bytes):
|
||||||
|
movq %xmm0, -8(%edx)
|
||||||
|
SETRTNVAL
|
||||||
|
RETURN
|
||||||
|
|
||||||
|
ALIGN (4)
|
||||||
|
L(aligned_16_121bytes):
|
||||||
|
movdqa %xmm0, -121(%edx)
|
||||||
|
L(aligned_16_105bytes):
|
||||||
|
movdqa %xmm0, -105(%edx)
|
||||||
|
L(aligned_16_89bytes):
|
||||||
|
movdqa %xmm0, -89(%edx)
|
||||||
|
L(aligned_16_73bytes):
|
||||||
|
movdqa %xmm0, -73(%edx)
|
||||||
|
L(aligned_16_57bytes):
|
||||||
|
movdqa %xmm0, -57(%edx)
|
||||||
|
L(aligned_16_41bytes):
|
||||||
|
movdqa %xmm0, -41(%edx)
|
||||||
|
L(aligned_16_25bytes):
|
||||||
|
movdqa %xmm0, -25(%edx)
|
||||||
|
L(aligned_16_9bytes):
|
||||||
|
movq %xmm0, -9(%edx)
|
||||||
|
movb %al, -1(%edx)
|
||||||
|
SETRTNVAL
|
||||||
|
RETURN
|
||||||
|
|
||||||
|
ALIGN (4)
|
||||||
|
L(aligned_16_122bytes):
|
||||||
|
movdqa %xmm0, -122(%edx)
|
||||||
|
L(aligned_16_106bytes):
|
||||||
|
movdqa %xmm0, -106(%edx)
|
||||||
|
L(aligned_16_90bytes):
|
||||||
|
movdqa %xmm0, -90(%edx)
|
||||||
|
L(aligned_16_74bytes):
|
||||||
|
movdqa %xmm0, -74(%edx)
|
||||||
|
L(aligned_16_58bytes):
|
||||||
|
movdqa %xmm0, -58(%edx)
|
||||||
|
L(aligned_16_42bytes):
|
||||||
|
movdqa %xmm0, -42(%edx)
|
||||||
|
L(aligned_16_26bytes):
|
||||||
|
movdqa %xmm0, -26(%edx)
|
||||||
|
L(aligned_16_10bytes):
|
||||||
|
movq %xmm0, -10(%edx)
|
||||||
|
movw %ax, -2(%edx)
|
||||||
|
SETRTNVAL
|
||||||
|
RETURN
|
||||||
|
|
||||||
|
ALIGN (4)
|
||||||
|
L(aligned_16_123bytes):
|
||||||
|
movdqa %xmm0, -123(%edx)
|
||||||
|
L(aligned_16_107bytes):
|
||||||
|
movdqa %xmm0, -107(%edx)
|
||||||
|
L(aligned_16_91bytes):
|
||||||
|
movdqa %xmm0, -91(%edx)
|
||||||
|
L(aligned_16_75bytes):
|
||||||
|
movdqa %xmm0, -75(%edx)
|
||||||
|
L(aligned_16_59bytes):
|
||||||
|
movdqa %xmm0, -59(%edx)
|
||||||
|
L(aligned_16_43bytes):
|
||||||
|
movdqa %xmm0, -43(%edx)
|
||||||
|
L(aligned_16_27bytes):
|
||||||
|
movdqa %xmm0, -27(%edx)
|
||||||
|
L(aligned_16_11bytes):
|
||||||
|
movq %xmm0, -11(%edx)
|
||||||
|
movw %ax, -3(%edx)
|
||||||
|
movb %al, -1(%edx)
|
||||||
|
SETRTNVAL
|
||||||
|
RETURN
|
||||||
|
|
||||||
|
ALIGN (4)
|
||||||
|
L(aligned_16_124bytes):
|
||||||
|
movdqa %xmm0, -124(%edx)
|
||||||
|
L(aligned_16_108bytes):
|
||||||
|
movdqa %xmm0, -108(%edx)
|
||||||
|
L(aligned_16_92bytes):
|
||||||
|
movdqa %xmm0, -92(%edx)
|
||||||
|
L(aligned_16_76bytes):
|
||||||
|
movdqa %xmm0, -76(%edx)
|
||||||
|
L(aligned_16_60bytes):
|
||||||
|
movdqa %xmm0, -60(%edx)
|
||||||
|
L(aligned_16_44bytes):
|
||||||
|
movdqa %xmm0, -44(%edx)
|
||||||
|
L(aligned_16_28bytes):
|
||||||
|
movdqa %xmm0, -28(%edx)
|
||||||
|
L(aligned_16_12bytes):
|
||||||
|
movq %xmm0, -12(%edx)
|
||||||
|
movl %eax, -4(%edx)
|
||||||
|
SETRTNVAL
|
||||||
|
RETURN
|
||||||
|
|
||||||
|
ALIGN (4)
|
||||||
|
L(aligned_16_125bytes):
|
||||||
|
movdqa %xmm0, -125(%edx)
|
||||||
|
L(aligned_16_109bytes):
|
||||||
|
movdqa %xmm0, -109(%edx)
|
||||||
|
L(aligned_16_93bytes):
|
||||||
|
movdqa %xmm0, -93(%edx)
|
||||||
|
L(aligned_16_77bytes):
|
||||||
|
movdqa %xmm0, -77(%edx)
|
||||||
|
L(aligned_16_61bytes):
|
||||||
|
movdqa %xmm0, -61(%edx)
|
||||||
|
L(aligned_16_45bytes):
|
||||||
|
movdqa %xmm0, -45(%edx)
|
||||||
|
L(aligned_16_29bytes):
|
||||||
|
movdqa %xmm0, -29(%edx)
|
||||||
|
L(aligned_16_13bytes):
|
||||||
|
movq %xmm0, -13(%edx)
|
||||||
|
movl %eax, -5(%edx)
|
||||||
|
movb %al, -1(%edx)
|
||||||
|
SETRTNVAL
|
||||||
|
RETURN
|
||||||
|
|
||||||
|
ALIGN (4)
|
||||||
|
L(aligned_16_126bytes):
|
||||||
|
movdqa %xmm0, -126(%edx)
|
||||||
|
L(aligned_16_110bytes):
|
||||||
|
movdqa %xmm0, -110(%edx)
|
||||||
|
L(aligned_16_94bytes):
|
||||||
|
movdqa %xmm0, -94(%edx)
|
||||||
|
L(aligned_16_78bytes):
|
||||||
|
movdqa %xmm0, -78(%edx)
|
||||||
|
L(aligned_16_62bytes):
|
||||||
|
movdqa %xmm0, -62(%edx)
|
||||||
|
L(aligned_16_46bytes):
|
||||||
|
movdqa %xmm0, -46(%edx)
|
||||||
|
L(aligned_16_30bytes):
|
||||||
|
movdqa %xmm0, -30(%edx)
|
||||||
|
L(aligned_16_14bytes):
|
||||||
|
movq %xmm0, -14(%edx)
|
||||||
|
movl %eax, -6(%edx)
|
||||||
|
movw %ax, -2(%edx)
|
||||||
|
SETRTNVAL
|
||||||
|
RETURN
|
||||||
|
|
||||||
|
ALIGN (4)
|
||||||
|
L(aligned_16_127bytes):
|
||||||
|
movdqa %xmm0, -127(%edx)
|
||||||
|
L(aligned_16_111bytes):
|
||||||
|
movdqa %xmm0, -111(%edx)
|
||||||
|
L(aligned_16_95bytes):
|
||||||
|
movdqa %xmm0, -95(%edx)
|
||||||
|
L(aligned_16_79bytes):
|
||||||
|
movdqa %xmm0, -79(%edx)
|
||||||
|
L(aligned_16_63bytes):
|
||||||
|
movdqa %xmm0, -63(%edx)
|
||||||
|
L(aligned_16_47bytes):
|
||||||
|
movdqa %xmm0, -47(%edx)
|
||||||
|
L(aligned_16_31bytes):
|
||||||
|
movdqa %xmm0, -31(%edx)
|
||||||
|
L(aligned_16_15bytes):
|
||||||
|
movq %xmm0, -15(%edx)
|
||||||
|
movl %eax, -7(%edx)
|
||||||
|
movw %ax, -3(%edx)
|
||||||
|
movb %al, -1(%edx)
|
||||||
|
SETRTNVAL
|
||||||
|
RETURN_END
|
||||||
|
|
||||||
|
END (MEMSET)
|
33
libc/arch-x86/silvermont/string/sse2-stpcpy-slm.S
Executable file
33
libc/arch-x86/silvermont/string/sse2-stpcpy-slm.S
Executable file
@ -0,0 +1,33 @@
|
|||||||
|
/*
|
||||||
|
Copyright (c) 2014, Intel Corporation
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of Intel Corporation nor the names of its contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||||
|
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||||
|
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||||
|
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||||
|
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||||
|
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Configuration-only wrapper: define two macros, then include the shared
   strcpy source so that it is assembled with them in effect.
   NOTE(review): sse2-strcpy-slm.S is not visible from here -- presumably
   USE_AS_STPCPY selects the stpcpy return convention and STRCPY names the
   emitted symbol; confirm against that file.  */
#define USE_AS_STPCPY
|
||||||
|
#define STRCPY stpcpy
|
||||||
|
/* Both macros above must be defined before this include to take effect.  */
#include "sse2-strcpy-slm.S"
|
34
libc/arch-x86/silvermont/string/sse2-stpncpy-slm.S
Normal file
34
libc/arch-x86/silvermont/string/sse2-stpncpy-slm.S
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
/*
|
||||||
|
Copyright (c) 2014, Intel Corporation
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of Intel Corporation nor the names of its contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||||
|
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||||
|
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||||
|
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||||
|
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||||
|
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Configuration-only wrapper: define three macros, then include the shared
   strcpy source so that it is assembled with them in effect.
   NOTE(review): sse2-strcpy-slm.S is not visible from here -- presumably
   USE_AS_STRNCPY enables the length-bounded variant, USE_AS_STPCPY selects
   the stpcpy return convention and STRCPY names the emitted symbol; confirm
   against that file.  */
#define USE_AS_STRNCPY
|
||||||
|
#define USE_AS_STPCPY
|
||||||
|
#define STRCPY stpncpy
|
||||||
|
/* All macros above must be defined before this include to take effect.  */
#include "sse2-strcpy-slm.S"
|
2157
libc/arch-x86/silvermont/string/sse2-strcpy-slm.S
Executable file
2157
libc/arch-x86/silvermont/string/sse2-strcpy-slm.S
Executable file
File diff suppressed because it is too large
Load Diff
328
libc/arch-x86/silvermont/string/sse2-strlen-slm.S
Executable file
328
libc/arch-x86/silvermont/string/sse2-strlen-slm.S
Executable file
@ -0,0 +1,328 @@
|
|||||||
|
/*
|
||||||
|
Copyright (c) 2014, Intel Corporation
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of Intel Corporation nor the names of its contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||||
|
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||||
|
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||||
|
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||||
|
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||||
|
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Name of the function this file defines.  If STRLEN already has a
   definition when this point is reached, that definition is kept;
   otherwise the name defaults to strlen.  */
#if !defined (STRLEN)
#define STRLEN strlen
#endif
|
||||||
|
|
||||||
|
/* L (x) pastes ".L" in front of x, giving the label .Lx.  Only defined
   when no earlier definition of L exists.  (GNU as treats .L-prefixed
   symbols as local on ELF targets -- see the assembler manual.)  */
#if !defined (L)
#define L(tag) .L##tag
#endif
|
||||||
|
|
||||||
|
/* Preprocessor spellings of the GNU as call-frame-information directives,
   so the code below can write them in macro-call form.  Each macro is
   defined only if nothing defined it earlier, and each expands to the
   directive of the same name followed by its arguments.  */

/* Directives that take no argument.  */
#if !defined (cfi_startproc)
#define cfi_startproc .cfi_startproc
#endif

#if !defined (cfi_endproc)
#define cfi_endproc .cfi_endproc
#endif

/* Directives that take arguments.  */
#if !defined (cfi_adjust_cfa_offset)
#define cfi_adjust_cfa_offset(delta) .cfi_adjust_cfa_offset delta
#endif

#if !defined (cfi_rel_offset)
#define cfi_rel_offset(r, displacement) .cfi_rel_offset r, displacement
#endif

#if !defined (cfi_restore)
#define cfi_restore(r) .cfi_restore r
#endif
|
||||||
|
|
||||||
|
/* ENTRY (sym) opens a function named sym: it marks sym as a function
   symbol, makes it global, aligns to 2^4 = 16 bytes, places the label and
   starts the CFI region.  Only defined when ENTRY has no earlier
   definition.  */
#if !defined (ENTRY)
#define ENTRY(sym) \
	.type sym, @function; .globl sym; .p2align 4; \
sym: cfi_startproc
#endif
|
||||||
|
|
||||||
|
/* END (sym) closes the function opened by ENTRY: it ends the CFI region and
   sets the symbol's size to the distance from sym to the current location.
   Only defined when END has no earlier definition.  */
#if !defined (END)
#define END(sym) cfi_endproc; .size sym, .-sym
#endif
|
||||||
|
|
||||||
|
/* Stack helpers that keep the unwind information in step with the stack.

   CFI_PUSH (reg) describes a 4-byte push: the CFA offset grows by 4 and reg
   is recorded as saved at offset 0 (relative to the current CFA register,
   per the .cfi_rel_offset directive).  PUSH (reg) emits the pushl itself
   followed by that description.  */
#define CFI_PUSH(reg) cfi_adjust_cfa_offset (4); cfi_rel_offset (reg, 0)
#define PUSH(reg) pushl reg; CFI_PUSH (reg)

/* CFI_POP (reg) is the inverse: the CFA offset shrinks by 4 and reg is marked
   as restored.  POP (reg) emits the popl followed by that description.  */
#define CFI_POP(reg) cfi_adjust_cfa_offset (-4); cfi_restore (reg)
#define POP(reg) popl reg; CFI_POP (reg)
|
||||||
|
|
||||||
|
.section .text.sse2,"ax",@progbits
|
||||||
|
ENTRY (STRLEN)
|
||||||
|
mov 4(%esp), %edx
|
||||||
|
mov %edx, %ecx
|
||||||
|
and $0x3f, %ecx
|
||||||
|
pxor %xmm0, %xmm0
|
||||||
|
cmp $0x30, %ecx
|
||||||
|
ja L(next)
|
||||||
|
movdqu (%edx), %xmm1
|
||||||
|
pcmpeqb %xmm1, %xmm0
|
||||||
|
pmovmskb %xmm0, %ecx
|
||||||
|
test %ecx, %ecx
|
||||||
|
jnz L(exit_less16)
|
||||||
|
mov %edx, %eax
|
||||||
|
and $-16, %eax
|
||||||
|
jmp L(align16_start)
|
||||||
|
L(next):
|
||||||
|
mov %edx, %eax
|
||||||
|
and $-16, %eax
|
||||||
|
PUSH (%edi)
|
||||||
|
pcmpeqb (%eax), %xmm0
|
||||||
|
mov $-1, %edi
|
||||||
|
sub %eax, %ecx
|
||||||
|
shl %cl, %edi
|
||||||
|
pmovmskb %xmm0, %ecx
|
||||||
|
and %edi, %ecx
|
||||||
|
POP (%edi)
|
||||||
|
jnz L(exit_unaligned)
|
||||||
|
pxor %xmm0, %xmm0
|
||||||
|
L(align16_start):
|
||||||
|
pxor %xmm1, %xmm1
|
||||||
|
pxor %xmm2, %xmm2
|
||||||
|
pxor %xmm3, %xmm3
|
||||||
|
pcmpeqb 16(%eax), %xmm0
|
||||||
|
pmovmskb %xmm0, %ecx
|
||||||
|
test %ecx, %ecx
|
||||||
|
jnz L(exit16)
|
||||||
|
|
||||||
|
pcmpeqb 32(%eax), %xmm1
|
||||||
|
pmovmskb %xmm1, %ecx
|
||||||
|
test %ecx, %ecx
|
||||||
|
jnz L(exit32)
|
||||||
|
|
||||||
|
pcmpeqb 48(%eax), %xmm2
|
||||||
|
pmovmskb %xmm2, %ecx
|
||||||
|
test %ecx, %ecx
|
||||||
|
jnz L(exit48)
|
||||||
|
|
||||||
|
pcmpeqb 64(%eax), %xmm3
|
||||||
|
pmovmskb %xmm3, %ecx
|
||||||
|
test %ecx, %ecx
|
||||||
|
jnz L(exit64)
|
||||||
|
|
||||||
|
pcmpeqb 80(%eax), %xmm0
|
||||||
|
add $64, %eax
|
||||||
|
pmovmskb %xmm0, %ecx
|
||||||
|
test %ecx, %ecx
|
||||||
|
jnz L(exit16)
|
||||||
|
|
||||||
|
pcmpeqb 32(%eax), %xmm1
|
||||||
|
pmovmskb %xmm1, %ecx
|
||||||
|
test %ecx, %ecx
|
||||||
|
jnz L(exit32)
|
||||||
|
|
||||||
|
pcmpeqb 48(%eax), %xmm2
|
||||||
|
pmovmskb %xmm2, %ecx
|
||||||
|
test %ecx, %ecx
|
||||||
|
jnz L(exit48)
|
||||||
|
|
||||||
|
pcmpeqb 64(%eax), %xmm3
|
||||||
|
pmovmskb %xmm3, %ecx
|
||||||
|
test %ecx, %ecx
|
||||||
|
jnz L(exit64)
|
||||||
|
|
||||||
|
pcmpeqb 80(%eax), %xmm0
|
||||||
|
add $64, %eax
|
||||||
|
pmovmskb %xmm0, %ecx
|
||||||
|
test %ecx, %ecx
|
||||||
|
jnz L(exit16)
|
||||||
|
|
||||||
|
pcmpeqb 32(%eax), %xmm1
|
||||||
|
pmovmskb %xmm1, %ecx
|
||||||
|
test %ecx, %ecx
|
||||||
|
jnz L(exit32)
|
||||||
|
|
||||||
|
pcmpeqb 48(%eax), %xmm2
|
||||||
|
pmovmskb %xmm2, %ecx
|
||||||
|
test %ecx, %ecx
|
||||||
|
jnz L(exit48)
|
||||||
|
|
||||||
|
pcmpeqb 64(%eax), %xmm3
|
||||||
|
pmovmskb %xmm3, %ecx
|
||||||
|
test %ecx, %ecx
|
||||||
|
jnz L(exit64)
|
||||||
|
|
||||||
|
pcmpeqb 80(%eax), %xmm0
|
||||||
|
add $64, %eax
|
||||||
|
pmovmskb %xmm0, %ecx
|
||||||
|
test %ecx, %ecx
|
||||||
|
jnz L(exit16)
|
||||||
|
|
||||||
|
pcmpeqb 32(%eax), %xmm1
|
||||||
|
pmovmskb %xmm1, %ecx
|
||||||
|
test %ecx, %ecx
|
||||||
|
jnz L(exit32)
|
||||||
|
|
||||||
|
pcmpeqb 48(%eax), %xmm2
|
||||||
|
pmovmskb %xmm2, %ecx
|
||||||
|
test %ecx, %ecx
|
||||||
|
jnz L(exit48)
|
||||||
|
|
||||||
|
pcmpeqb 64(%eax), %xmm3
|
||||||
|
pmovmskb %xmm3, %ecx
|
||||||
|
test %ecx, %ecx
|
||||||
|
jnz L(exit64)
|
||||||
|
|
||||||
|
|
||||||
|
test $0x3f, %eax
|
||||||
|
jz L(align64_loop)
|
||||||
|
|
||||||
|
pcmpeqb 80(%eax), %xmm0
|
||||||
|
add $80, %eax
|
||||||
|
pmovmskb %xmm0, %ecx
|
||||||
|
test %ecx, %ecx
|
||||||
|
jnz L(exit)
|
||||||
|
|
||||||
|
test $0x3f, %eax
|
||||||
|
jz L(align64_loop)
|
||||||
|
|
||||||
|
pcmpeqb 16(%eax), %xmm1
|
||||||
|
add $16, %eax
|
||||||
|
pmovmskb %xmm1, %ecx
|
||||||
|
test %ecx, %ecx
|
||||||
|
jnz L(exit)
|
||||||
|
|
||||||
|
test $0x3f, %eax
|
||||||
|
jz L(align64_loop)
|
||||||
|
|
||||||
|
pcmpeqb 16(%eax), %xmm2
|
||||||
|
add $16, %eax
|
||||||
|
pmovmskb %xmm2, %ecx
|
||||||
|
test %ecx, %ecx
|
||||||
|
jnz L(exit)
|
||||||
|
|
||||||
|
test $0x3f, %eax
|
||||||
|
jz L(align64_loop)
|
||||||
|
|
||||||
|
pcmpeqb 16(%eax), %xmm3
|
||||||
|
add $16, %eax
|
||||||
|
pmovmskb %xmm3, %ecx
|
||||||
|
test %ecx, %ecx
|
||||||
|
jnz L(exit)
|
||||||
|
|
||||||
|
add $16, %eax
|
||||||
|
.p2align 4
|
||||||
|
L(align64_loop):
|
||||||
|
movaps (%eax), %xmm4
|
||||||
|
pminub 16(%eax), %xmm4
|
||||||
|
movaps 32(%eax), %xmm5
|
||||||
|
pminub 48(%eax), %xmm5
|
||||||
|
add $64, %eax
|
||||||
|
pminub %xmm4, %xmm5
|
||||||
|
pcmpeqb %xmm0, %xmm5
|
||||||
|
pmovmskb %xmm5, %ecx
|
||||||
|
test %ecx, %ecx
|
||||||
|
jz L(align64_loop)
|
||||||
|
|
||||||
|
|
||||||
|
pcmpeqb -64(%eax), %xmm0
|
||||||
|
sub $80, %eax
|
||||||
|
pmovmskb %xmm0, %ecx
|
||||||
|
test %ecx, %ecx
|
||||||
|
jnz L(exit16)
|
||||||
|
|
||||||
|
pcmpeqb 32(%eax), %xmm1
|
||||||
|
pmovmskb %xmm1, %ecx
|
||||||
|
test %ecx, %ecx
|
||||||
|
jnz L(exit32)
|
||||||
|
|
||||||
|
pcmpeqb 48(%eax), %xmm2
|
||||||
|
pmovmskb %xmm2, %ecx
|
||||||
|
test %ecx, %ecx
|
||||||
|
jnz L(exit48)
|
||||||
|
|
||||||
|
pcmpeqb 64(%eax), %xmm3
|
||||||
|
pmovmskb %xmm3, %ecx
|
||||||
|
sub %edx, %eax
|
||||||
|
bsf %ecx, %ecx
|
||||||
|
add %ecx, %eax
|
||||||
|
add $64, %eax
|
||||||
|
ret
|
||||||
|
|
||||||
|
.p2align 4
|
||||||
|
L(exit):
|
||||||
|
sub %edx, %eax
|
||||||
|
bsf %ecx, %ecx
|
||||||
|
add %ecx, %eax
|
||||||
|
ret
|
||||||
|
|
||||||
|
L(exit_less16):
|
||||||
|
bsf %ecx, %eax
|
||||||
|
ret
|
||||||
|
|
||||||
|
.p2align 4
|
||||||
|
L(exit_unaligned):
|
||||||
|
sub %edx, %eax
|
||||||
|
bsf %ecx, %ecx
|
||||||
|
add %ecx, %eax
|
||||||
|
ret
|
||||||
|
|
||||||
|
.p2align 4
|
||||||
|
L(exit16):
|
||||||
|
sub %edx, %eax
|
||||||
|
bsf %ecx, %ecx
|
||||||
|
add %ecx, %eax
|
||||||
|
add $16, %eax
|
||||||
|
ret
|
||||||
|
|
||||||
|
.p2align 4
|
||||||
|
L(exit32):
|
||||||
|
sub %edx, %eax
|
||||||
|
bsf %ecx, %ecx
|
||||||
|
add %ecx, %eax
|
||||||
|
add $32, %eax
|
||||||
|
ret
|
||||||
|
|
||||||
|
.p2align 4
|
||||||
|
L(exit48):
|
||||||
|
sub %edx, %eax
|
||||||
|
bsf %ecx, %ecx
|
||||||
|
add %ecx, %eax
|
||||||
|
add $48, %eax
|
||||||
|
ret
|
||||||
|
|
||||||
|
.p2align 4
|
||||||
|
L(exit64):
|
||||||
|
sub %edx, %eax
|
||||||
|
bsf %ecx, %ecx
|
||||||
|
add %ecx, %eax
|
||||||
|
add $64, %eax
|
||||||
|
ret
|
||||||
|
|
||||||
|
END (STRLEN)
|
||||||
|
|
33
libc/arch-x86/silvermont/string/sse2-strncpy-slm.S
Executable file
33
libc/arch-x86/silvermont/string/sse2-strncpy-slm.S
Executable file
@ -0,0 +1,33 @@
|
|||||||
|
/*
|
||||||
|
Copyright (c) 2014, Intel Corporation
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of Intel Corporation nor the names of its contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||||
|
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||||
|
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||||
|
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||||
|
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||||
|
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define USE_AS_STRNCPY
|
||||||
|
#define STRCPY strncpy
|
||||||
|
#include "sse2-strcpy-slm.S"
|
1277
libc/arch-x86/silvermont/string/sse4-memcmp-slm.S
Executable file
1277
libc/arch-x86/silvermont/string/sse4-memcmp-slm.S
Executable file
File diff suppressed because it is too large
Load Diff
33
libc/arch-x86/silvermont/string/sse4-wmemcmp-slm.S
Executable file
33
libc/arch-x86/silvermont/string/sse4-wmemcmp-slm.S
Executable file
@ -0,0 +1,33 @@
|
|||||||
|
/*
|
||||||
|
Copyright (c) 2014, Intel Corporation
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of Intel Corporation nor the names of its contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||||
|
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||||
|
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||||
|
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||||
|
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||||
|
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define USE_AS_WMEMCMP
|
||||||
|
#define MEMCMP wmemcmp
|
||||||
|
#include "sse4-memcmp-slm.S"
|
@ -32,60 +32,15 @@ libc_bionic_src_files_x86 += \
|
|||||||
arch-x86/bionic/syscall.S \
|
arch-x86/bionic/syscall.S \
|
||||||
arch-x86/bionic/vfork.S \
|
arch-x86/bionic/vfork.S \
|
||||||
|
|
||||||
ifeq ($(ARCH_X86_HAVE_SSSE3),true)
|
## ARCH variant specific source files
|
||||||
libc_bionic_src_files_x86 += \
|
arch_variant_mk := $(LOCAL_PATH)/arch-x86/$(TARGET_ARCH_VARIANT)/$(TARGET_ARCH_VARIANT).mk
|
||||||
arch-x86/string/ssse3-memcpy-atom.S \
|
ifeq ($(wildcard $(arch_variant_mk)),)
|
||||||
arch-x86/string/ssse3-memmove-atom.S \
|
arch_variant_mk := $(LOCAL_PATH)/arch-x86/generic/generic.mk
|
||||||
arch-x86/string/ssse3-bcopy-atom.S \
|
|
||||||
arch-x86/string/ssse3-strncat-atom.S \
|
|
||||||
arch-x86/string/ssse3-strncpy-atom.S \
|
|
||||||
arch-x86/string/ssse3-strlcat-atom.S \
|
|
||||||
arch-x86/string/ssse3-strlcpy-atom.S \
|
|
||||||
arch-x86/string/ssse3-strcmp-atom.S \
|
|
||||||
arch-x86/string/ssse3-strncmp-atom.S \
|
|
||||||
arch-x86/string/ssse3-strcat-atom.S \
|
|
||||||
arch-x86/string/ssse3-strcpy-atom.S \
|
|
||||||
arch-x86/string/ssse3-memcmp-atom.S \
|
|
||||||
arch-x86/string/ssse3-wmemcmp-atom.S \
|
|
||||||
arch-x86/string/ssse3-memcmp16-atom.S \
|
|
||||||
arch-x86/string/ssse3-wcscat-atom.S \
|
|
||||||
arch-x86/string/ssse3-wcscpy-atom.S
|
|
||||||
else
|
|
||||||
libc_bionic_src_files_x86 += \
|
|
||||||
arch-x86/string/memcpy.S \
|
|
||||||
arch-x86/string/memmove.S \
|
|
||||||
arch-x86/string/bcopy.S \
|
|
||||||
arch-x86/string/strcmp.S \
|
|
||||||
arch-x86/string/strncmp.S \
|
|
||||||
arch-x86/string/strcat.S \
|
|
||||||
arch-x86/string/memcmp.S \
|
|
||||||
bionic/__memcmp16.cpp \
|
|
||||||
upstream-freebsd/lib/libc/string/wcscpy.c \
|
|
||||||
upstream-freebsd/lib/libc/string/wcscat.c \
|
|
||||||
upstream-freebsd/lib/libc/string/wmemcmp.c \
|
|
||||||
upstream-openbsd/lib/libc/string/strcpy.c \
|
|
||||||
upstream-openbsd/lib/libc/string/strlcat.c \
|
|
||||||
upstream-openbsd/lib/libc/string/strlcpy.c \
|
|
||||||
upstream-openbsd/lib/libc/string/strncat.c \
|
|
||||||
upstream-openbsd/lib/libc/string/strncpy.c \
|
|
||||||
|
|
||||||
endif
|
endif
|
||||||
|
include $(arch_variant_mk)
|
||||||
|
libc_common_additional_dependencies += $(arch_variant_mk)
|
||||||
|
|
||||||
libc_bionic_src_files_x86 += \
|
arch_variant_mk :=
|
||||||
arch-x86/string/sse2-memset-atom.S \
|
|
||||||
arch-x86/string/sse2-bzero-atom.S \
|
|
||||||
arch-x86/string/sse2-memchr-atom.S \
|
|
||||||
arch-x86/string/sse2-memrchr-atom.S \
|
|
||||||
arch-x86/string/sse2-strchr-atom.S \
|
|
||||||
arch-x86/string/sse2-strrchr-atom.S \
|
|
||||||
arch-x86/string/sse2-index-atom.S \
|
|
||||||
arch-x86/string/sse2-strlen-atom.S \
|
|
||||||
arch-x86/string/sse2-strnlen-atom.S \
|
|
||||||
arch-x86/string/sse2-wcschr-atom.S \
|
|
||||||
arch-x86/string/sse2-wcsrchr-atom.S \
|
|
||||||
arch-x86/string/sse2-wcslen-atom.S \
|
|
||||||
arch-x86/string/sse2-wcscmp-atom.S \
|
|
||||||
|
|
||||||
|
|
||||||
libc_crt_target_cflags_x86 := \
|
libc_crt_target_cflags_x86 := \
|
||||||
-m32 \
|
-m32 \
|
||||||
|
@ -19,6 +19,8 @@ libc_common_src_files_x86_64 := \
|
|||||||
upstream-freebsd/lib/libc/string/wcsrchr.c \
|
upstream-freebsd/lib/libc/string/wcsrchr.c \
|
||||||
upstream-freebsd/lib/libc/string/wmemcmp.c \
|
upstream-freebsd/lib/libc/string/wmemcmp.c \
|
||||||
upstream-openbsd/lib/libc/string/bcopy.c \
|
upstream-openbsd/lib/libc/string/bcopy.c \
|
||||||
|
upstream-openbsd/lib/libc/string/stpcpy.c \
|
||||||
|
upstream-openbsd/lib/libc/string/stpncpy.c \
|
||||||
upstream-openbsd/lib/libc/string/strcat.c \
|
upstream-openbsd/lib/libc/string/strcat.c \
|
||||||
upstream-openbsd/lib/libc/string/strcmp.c \
|
upstream-openbsd/lib/libc/string/strcmp.c \
|
||||||
upstream-openbsd/lib/libc/string/strcpy.c \
|
upstream-openbsd/lib/libc/string/strcpy.c \
|
||||||
|
Loading…
Reference in New Issue
Block a user