This commit converts all sad ptrs to uint32_t

The sse4_1 code used uint16_t for returning SADs, but that
won't work for 32x32 or 64x64 blocks. This commit fixes the
assembly for those and also re-enables sse4_1 on Linux.

Change-Id: I5ce7288d581db870a148e5f7c5092826f59edd81
This commit is contained in:
Jim Bankoski 2013-02-28 08:32:14 -08:00
parent b715e371c0
commit 714aa9f3c0
6 changed files with 258 additions and 252 deletions

View File

@ -997,17 +997,6 @@ process_common_toolchain() {
#error "not x32"
#endif
EOF
soft_enable runtime_cpu_detect
soft_enable mmx
soft_enable sse
soft_enable sse2
soft_enable sse3
soft_enable ssse3
if enabled gcc && ! disabled sse4_1 && ! check_cflags -msse4; then
RTCD_OPTIONS="${RTCD_OPTIONS}--disable-sse4_1 "
else
soft_enable sse4_1
fi
case ${tgt_os} in
win*)
@ -1061,6 +1050,18 @@ EOF
;;
esac
soft_enable runtime_cpu_detect
soft_enable mmx
soft_enable sse
soft_enable sse2
soft_enable sse3
soft_enable ssse3
if enabled gcc && ! disabled sse4_1 && ! check_cflags -msse4; then
RTCD_OPTIONS="${RTCD_OPTIONS}--disable-sse4_1 "
else
soft_enable sse4_1
fi
case "${AS}" in
auto|"")
which nasm >/dev/null 2>&1 && AS=nasm

View File

@ -449,25 +449,25 @@ specialize vp9_sad8x8x3 sse3
prototype void vp9_sad4x4x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"
specialize vp9_sad4x4x3 sse3
prototype void vp9_sad64x64x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array"
prototype void vp9_sad64x64x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"
specialize vp9_sad64x64x8
prototype void vp9_sad32x32x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array"
prototype void vp9_sad32x32x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"
specialize vp9_sad32x32x8
prototype void vp9_sad16x16x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array"
prototype void vp9_sad16x16x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"
specialize vp9_sad16x16x8 sse4
prototype void vp9_sad16x8x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array"
prototype void vp9_sad16x8x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"
specialize vp9_sad16x8x8 sse4
prototype void vp9_sad8x16x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array"
prototype void vp9_sad8x16x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"
specialize vp9_sad8x16x8 sse4
prototype void vp9_sad8x8x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array"
prototype void vp9_sad8x8x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"
specialize vp9_sad8x8x8 sse4
prototype void vp9_sad4x4x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array"
prototype void vp9_sad4x4x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"
specialize vp9_sad4x4x8 sse4
prototype void vp9_sad64x64x4d "const uint8_t *src_ptr, int src_stride, const uint8_t **ref_ptr, int ref_stride, unsigned int *sad_array"
@ -490,7 +490,6 @@ specialize vp9_sad8x8x4d sse2
prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t **ref_ptr, int ref_stride, unsigned int *sad_array"
specialize vp9_sad4x4x4d sse
prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
specialize vp9_sub_pixel_mse16x16 sse2 mmx

View File

@ -1782,7 +1782,7 @@ int vp9_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
int col_min = ref_col - distance;
int col_max = ref_col + distance;
DECLARE_ALIGNED_ARRAY(16, uint16_t, sad_array8, 8);
DECLARE_ALIGNED_ARRAY(16, uint32_t, sad_array8, 8);
unsigned int sad_array[3];
int_mv fcenter_mv;

View File

@ -103,62 +103,62 @@ void vp9_sad64x64x8_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride,
uint16_t *sad_array) {
sad_array[0] = (uint16_t)vp9_sad64x64(src_ptr, src_stride,
ref_ptr, ref_stride,
0x7fffffff);
sad_array[1] = (uint16_t)vp9_sad64x64(src_ptr, src_stride,
ref_ptr + 1, ref_stride,
0x7fffffff);
sad_array[2] = (uint16_t)vp9_sad64x64(src_ptr, src_stride,
ref_ptr + 2, ref_stride,
0x7fffffff);
sad_array[3] = (uint16_t)vp9_sad64x64(src_ptr, src_stride,
ref_ptr + 3, ref_stride,
0x7fffffff);
sad_array[4] = (uint16_t)vp9_sad64x64(src_ptr, src_stride,
ref_ptr + 4, ref_stride,
0x7fffffff);
sad_array[5] = (uint16_t)vp9_sad64x64(src_ptr, src_stride,
ref_ptr + 5, ref_stride,
0x7fffffff);
sad_array[6] = (uint16_t)vp9_sad64x64(src_ptr, src_stride,
ref_ptr + 6, ref_stride,
0x7fffffff);
sad_array[7] = (uint16_t)vp9_sad64x64(src_ptr, src_stride,
ref_ptr + 7, ref_stride,
0x7fffffff);
unsigned int *sad_array) {
sad_array[0] = vp9_sad64x64(src_ptr, src_stride,
ref_ptr, ref_stride,
0x7fffffff);
sad_array[1] = vp9_sad64x64(src_ptr, src_stride,
ref_ptr + 1, ref_stride,
0x7fffffff);
sad_array[2] = vp9_sad64x64(src_ptr, src_stride,
ref_ptr + 2, ref_stride,
0x7fffffff);
sad_array[3] = vp9_sad64x64(src_ptr, src_stride,
ref_ptr + 3, ref_stride,
0x7fffffff);
sad_array[4] = vp9_sad64x64(src_ptr, src_stride,
ref_ptr + 4, ref_stride,
0x7fffffff);
sad_array[5] = vp9_sad64x64(src_ptr, src_stride,
ref_ptr + 5, ref_stride,
0x7fffffff);
sad_array[6] = vp9_sad64x64(src_ptr, src_stride,
ref_ptr + 6, ref_stride,
0x7fffffff);
sad_array[7] = vp9_sad64x64(src_ptr, src_stride,
ref_ptr + 7, ref_stride,
0x7fffffff);
}
void vp9_sad32x32x8_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride,
uint16_t *sad_array) {
sad_array[0] = (uint16_t)vp9_sad32x32(src_ptr, src_stride,
ref_ptr, ref_stride,
0x7fffffff);
sad_array[1] = (uint16_t)vp9_sad32x32(src_ptr, src_stride,
ref_ptr + 1, ref_stride,
0x7fffffff);
sad_array[2] = (uint16_t)vp9_sad32x32(src_ptr, src_stride,
ref_ptr + 2, ref_stride,
0x7fffffff);
sad_array[3] = (uint16_t)vp9_sad32x32(src_ptr, src_stride,
ref_ptr + 3, ref_stride,
0x7fffffff);
sad_array[4] = (uint16_t)vp9_sad32x32(src_ptr, src_stride,
ref_ptr + 4, ref_stride,
0x7fffffff);
sad_array[5] = (uint16_t)vp9_sad32x32(src_ptr, src_stride,
ref_ptr + 5, ref_stride,
0x7fffffff);
sad_array[6] = (uint16_t)vp9_sad32x32(src_ptr, src_stride,
ref_ptr + 6, ref_stride,
0x7fffffff);
sad_array[7] = (uint16_t)vp9_sad32x32(src_ptr, src_stride,
ref_ptr + 7, ref_stride,
0x7fffffff);
unsigned int *sad_array) {
sad_array[0] = vp9_sad32x32(src_ptr, src_stride,
ref_ptr, ref_stride,
0x7fffffff);
sad_array[1] = vp9_sad32x32(src_ptr, src_stride,
ref_ptr + 1, ref_stride,
0x7fffffff);
sad_array[2] = vp9_sad32x32(src_ptr, src_stride,
ref_ptr + 2, ref_stride,
0x7fffffff);
sad_array[3] = vp9_sad32x32(src_ptr, src_stride,
ref_ptr + 3, ref_stride,
0x7fffffff);
sad_array[4] = vp9_sad32x32(src_ptr, src_stride,
ref_ptr + 4, ref_stride,
0x7fffffff);
sad_array[5] = vp9_sad32x32(src_ptr, src_stride,
ref_ptr + 5, ref_stride,
0x7fffffff);
sad_array[6] = vp9_sad32x32(src_ptr, src_stride,
ref_ptr + 6, ref_stride,
0x7fffffff);
sad_array[7] = vp9_sad32x32(src_ptr, src_stride,
ref_ptr + 7, ref_stride,
0x7fffffff);
}
void vp9_sad16x16x3_c(const uint8_t *src_ptr,
@ -178,31 +178,31 @@ void vp9_sad16x16x8_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride,
uint16_t *sad_array) {
sad_array[0] = (uint16_t)vp9_sad16x16(src_ptr, src_stride,
ref_ptr, ref_stride,
0x7fffffff);
sad_array[1] = (uint16_t)vp9_sad16x16(src_ptr, src_stride,
ref_ptr + 1, ref_stride,
0x7fffffff);
sad_array[2] = (uint16_t)vp9_sad16x16(src_ptr, src_stride,
ref_ptr + 2, ref_stride,
0x7fffffff);
sad_array[3] = (uint16_t)vp9_sad16x16(src_ptr, src_stride,
ref_ptr + 3, ref_stride,
0x7fffffff);
sad_array[4] = (uint16_t)vp9_sad16x16(src_ptr, src_stride,
ref_ptr + 4, ref_stride,
0x7fffffff);
sad_array[5] = (uint16_t)vp9_sad16x16(src_ptr, src_stride,
ref_ptr + 5, ref_stride,
0x7fffffff);
sad_array[6] = (uint16_t)vp9_sad16x16(src_ptr, src_stride,
ref_ptr + 6, ref_stride,
0x7fffffff);
sad_array[7] = (uint16_t)vp9_sad16x16(src_ptr, src_stride,
ref_ptr + 7, ref_stride,
0x7fffffff);
uint32_t *sad_array) {
sad_array[0] = vp9_sad16x16(src_ptr, src_stride,
ref_ptr, ref_stride,
0x7fffffff);
sad_array[1] = vp9_sad16x16(src_ptr, src_stride,
ref_ptr + 1, ref_stride,
0x7fffffff);
sad_array[2] = vp9_sad16x16(src_ptr, src_stride,
ref_ptr + 2, ref_stride,
0x7fffffff);
sad_array[3] = vp9_sad16x16(src_ptr, src_stride,
ref_ptr + 3, ref_stride,
0x7fffffff);
sad_array[4] = vp9_sad16x16(src_ptr, src_stride,
ref_ptr + 4, ref_stride,
0x7fffffff);
sad_array[5] = vp9_sad16x16(src_ptr, src_stride,
ref_ptr + 5, ref_stride,
0x7fffffff);
sad_array[6] = vp9_sad16x16(src_ptr, src_stride,
ref_ptr + 6, ref_stride,
0x7fffffff);
sad_array[7] = vp9_sad16x16(src_ptr, src_stride,
ref_ptr + 7, ref_stride,
0x7fffffff);
}
void vp9_sad16x8x3_c(const uint8_t *src_ptr,
@ -222,31 +222,31 @@ void vp9_sad16x8x8_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride,
uint16_t *sad_array) {
sad_array[0] = (uint16_t)vp9_sad16x8(src_ptr, src_stride,
ref_ptr, ref_stride,
0x7fffffff);
sad_array[1] = (uint16_t)vp9_sad16x8(src_ptr, src_stride,
ref_ptr + 1, ref_stride,
0x7fffffff);
sad_array[2] = (uint16_t)vp9_sad16x8(src_ptr, src_stride,
ref_ptr + 2, ref_stride,
0x7fffffff);
sad_array[3] = (uint16_t)vp9_sad16x8(src_ptr, src_stride,
ref_ptr + 3, ref_stride,
0x7fffffff);
sad_array[4] = (uint16_t)vp9_sad16x8(src_ptr, src_stride,
ref_ptr + 4, ref_stride,
0x7fffffff);
sad_array[5] = (uint16_t)vp9_sad16x8(src_ptr, src_stride,
ref_ptr + 5, ref_stride,
0x7fffffff);
sad_array[6] = (uint16_t)vp9_sad16x8(src_ptr, src_stride,
ref_ptr + 6, ref_stride,
0x7fffffff);
sad_array[7] = (uint16_t)vp9_sad16x8(src_ptr, src_stride,
ref_ptr + 7, ref_stride,
0x7fffffff);
uint32_t *sad_array) {
sad_array[0] = vp9_sad16x8(src_ptr, src_stride,
ref_ptr, ref_stride,
0x7fffffff);
sad_array[1] = vp9_sad16x8(src_ptr, src_stride,
ref_ptr + 1, ref_stride,
0x7fffffff);
sad_array[2] = vp9_sad16x8(src_ptr, src_stride,
ref_ptr + 2, ref_stride,
0x7fffffff);
sad_array[3] = vp9_sad16x8(src_ptr, src_stride,
ref_ptr + 3, ref_stride,
0x7fffffff);
sad_array[4] = vp9_sad16x8(src_ptr, src_stride,
ref_ptr + 4, ref_stride,
0x7fffffff);
sad_array[5] = vp9_sad16x8(src_ptr, src_stride,
ref_ptr + 5, ref_stride,
0x7fffffff);
sad_array[6] = vp9_sad16x8(src_ptr, src_stride,
ref_ptr + 6, ref_stride,
0x7fffffff);
sad_array[7] = vp9_sad16x8(src_ptr, src_stride,
ref_ptr + 7, ref_stride,
0x7fffffff);
}
void vp9_sad8x8x3_c(const uint8_t *src_ptr,
@ -266,31 +266,31 @@ void vp9_sad8x8x8_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride,
uint16_t *sad_array) {
sad_array[0] = (uint16_t)vp9_sad8x8(src_ptr, src_stride,
ref_ptr, ref_stride,
0x7fffffff);
sad_array[1] = (uint16_t)vp9_sad8x8(src_ptr, src_stride,
ref_ptr + 1, ref_stride,
0x7fffffff);
sad_array[2] = (uint16_t)vp9_sad8x8(src_ptr, src_stride,
ref_ptr + 2, ref_stride,
0x7fffffff);
sad_array[3] = (uint16_t)vp9_sad8x8(src_ptr, src_stride,
ref_ptr + 3, ref_stride,
0x7fffffff);
sad_array[4] = (uint16_t)vp9_sad8x8(src_ptr, src_stride,
ref_ptr + 4, ref_stride,
0x7fffffff);
sad_array[5] = (uint16_t)vp9_sad8x8(src_ptr, src_stride,
ref_ptr + 5, ref_stride,
0x7fffffff);
sad_array[6] = (uint16_t)vp9_sad8x8(src_ptr, src_stride,
ref_ptr + 6, ref_stride,
0x7fffffff);
sad_array[7] = (uint16_t)vp9_sad8x8(src_ptr, src_stride,
ref_ptr + 7, ref_stride,
0x7fffffff);
uint32_t *sad_array) {
sad_array[0] = vp9_sad8x8(src_ptr, src_stride,
ref_ptr, ref_stride,
0x7fffffff);
sad_array[1] = vp9_sad8x8(src_ptr, src_stride,
ref_ptr + 1, ref_stride,
0x7fffffff);
sad_array[2] = vp9_sad8x8(src_ptr, src_stride,
ref_ptr + 2, ref_stride,
0x7fffffff);
sad_array[3] = vp9_sad8x8(src_ptr, src_stride,
ref_ptr + 3, ref_stride,
0x7fffffff);
sad_array[4] = vp9_sad8x8(src_ptr, src_stride,
ref_ptr + 4, ref_stride,
0x7fffffff);
sad_array[5] = vp9_sad8x8(src_ptr, src_stride,
ref_ptr + 5, ref_stride,
0x7fffffff);
sad_array[6] = vp9_sad8x8(src_ptr, src_stride,
ref_ptr + 6, ref_stride,
0x7fffffff);
sad_array[7] = vp9_sad8x8(src_ptr, src_stride,
ref_ptr + 7, ref_stride,
0x7fffffff);
}
void vp9_sad8x16x3_c(const uint8_t *src_ptr,
@ -310,31 +310,31 @@ void vp9_sad8x16x8_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride,
uint16_t *sad_array) {
sad_array[0] = (uint16_t)vp9_sad8x16(src_ptr, src_stride,
ref_ptr, ref_stride,
0x7fffffff);
sad_array[1] = (uint16_t)vp9_sad8x16(src_ptr, src_stride,
ref_ptr + 1, ref_stride,
0x7fffffff);
sad_array[2] = (uint16_t)vp9_sad8x16(src_ptr, src_stride,
ref_ptr + 2, ref_stride,
0x7fffffff);
sad_array[3] = (uint16_t)vp9_sad8x16(src_ptr, src_stride,
ref_ptr + 3, ref_stride,
0x7fffffff);
sad_array[4] = (uint16_t)vp9_sad8x16(src_ptr, src_stride,
ref_ptr + 4, ref_stride,
0x7fffffff);
sad_array[5] = (uint16_t)vp9_sad8x16(src_ptr, src_stride,
ref_ptr + 5, ref_stride,
0x7fffffff);
sad_array[6] = (uint16_t)vp9_sad8x16(src_ptr, src_stride,
ref_ptr + 6, ref_stride,
0x7fffffff);
sad_array[7] = (uint16_t)vp9_sad8x16(src_ptr, src_stride,
ref_ptr + 7, ref_stride,
0x7fffffff);
uint32_t *sad_array) {
sad_array[0] = vp9_sad8x16(src_ptr, src_stride,
ref_ptr, ref_stride,
0x7fffffff);
sad_array[1] = vp9_sad8x16(src_ptr, src_stride,
ref_ptr + 1, ref_stride,
0x7fffffff);
sad_array[2] = vp9_sad8x16(src_ptr, src_stride,
ref_ptr + 2, ref_stride,
0x7fffffff);
sad_array[3] = vp9_sad8x16(src_ptr, src_stride,
ref_ptr + 3, ref_stride,
0x7fffffff);
sad_array[4] = vp9_sad8x16(src_ptr, src_stride,
ref_ptr + 4, ref_stride,
0x7fffffff);
sad_array[5] = vp9_sad8x16(src_ptr, src_stride,
ref_ptr + 5, ref_stride,
0x7fffffff);
sad_array[6] = vp9_sad8x16(src_ptr, src_stride,
ref_ptr + 6, ref_stride,
0x7fffffff);
sad_array[7] = vp9_sad8x16(src_ptr, src_stride,
ref_ptr + 7, ref_stride,
0x7fffffff);
}
void vp9_sad4x4x3_c(const uint8_t *src_ptr,
@ -354,31 +354,31 @@ void vp9_sad4x4x8_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride,
uint16_t *sad_array) {
sad_array[0] = (uint16_t)vp9_sad4x4(src_ptr, src_stride,
ref_ptr, ref_stride,
0x7fffffff);
sad_array[1] = (uint16_t)vp9_sad4x4(src_ptr, src_stride,
ref_ptr + 1, ref_stride,
0x7fffffff);
sad_array[2] = (uint16_t)vp9_sad4x4(src_ptr, src_stride,
ref_ptr + 2, ref_stride,
0x7fffffff);
sad_array[3] = (uint16_t)vp9_sad4x4(src_ptr, src_stride,
ref_ptr + 3, ref_stride,
0x7fffffff);
sad_array[4] = (uint16_t)vp9_sad4x4(src_ptr, src_stride,
ref_ptr + 4, ref_stride,
0x7fffffff);
sad_array[5] = (uint16_t)vp9_sad4x4(src_ptr, src_stride,
ref_ptr + 5, ref_stride,
0x7fffffff);
sad_array[6] = (uint16_t)vp9_sad4x4(src_ptr, src_stride,
ref_ptr + 6, ref_stride,
0x7fffffff);
sad_array[7] = (uint16_t)vp9_sad4x4(src_ptr, src_stride,
ref_ptr + 7, ref_stride,
0x7fffffff);
uint32_t *sad_array) {
sad_array[0] = vp9_sad4x4(src_ptr, src_stride,
ref_ptr, ref_stride,
0x7fffffff);
sad_array[1] = vp9_sad4x4(src_ptr, src_stride,
ref_ptr + 1, ref_stride,
0x7fffffff);
sad_array[2] = vp9_sad4x4(src_ptr, src_stride,
ref_ptr + 2, ref_stride,
0x7fffffff);
sad_array[3] = vp9_sad4x4(src_ptr, src_stride,
ref_ptr + 3, ref_stride,
0x7fffffff);
sad_array[4] = vp9_sad4x4(src_ptr, src_stride,
ref_ptr + 4, ref_stride,
0x7fffffff);
sad_array[5] = vp9_sad4x4(src_ptr, src_stride,
ref_ptr + 5, ref_stride,
0x7fffffff);
sad_array[6] = vp9_sad4x4(src_ptr, src_stride,
ref_ptr + 6, ref_stride,
0x7fffffff);
sad_array[7] = vp9_sad4x4(src_ptr, src_stride,
ref_ptr + 7, ref_stride,
0x7fffffff);
}
void vp9_sad64x64x4d_c(const uint8_t *src_ptr,

View File

@ -29,7 +29,7 @@ typedef void (*vp9_sad_multi1_fn_t)(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
int ref_stride,
unsigned short *sad_array);
unsigned int *sad_array);
typedef void (*vp9_sad_multi_d_fn_t)(const uint8_t *src_ptr,
int source_stride,

View File

@ -154,6 +154,16 @@
paddw xmm1, xmm5
%endmacro
;-----------------------------------------------------------------------
; WRITE_AS_INTS
;   Widens the eight 16-bit words in xmm1 to eight 32-bit values
;   (zero-extended) and stores them as 32 contiguous bytes at the
;   address taken from arg(4).
;
; In:    xmm1   = eight packed 16-bit values
;                 (presumably the per-offset SAD sums accumulated by the
;                 PROCESS_* macros -- confirm against those macros)
;        arg(4) = destination pointer; written with movdqa, so it has
;                 to be 16-byte aligned
; Out:   [rdi + 0 .. 15]  = words 0-3 of xmm1 as dwords
;        [rdi + 16 .. 31] = words 4-7 of xmm1 as dwords
; Clobb: rdi, xmm0, xmm1, xmm2
;-----------------------------------------------------------------------
%macro WRITE_AS_INTS 0
mov rdi, arg(4) ;Results
; xmm0 = 0: supplies the upper 16 bits of every widened element
pxor xmm0, xmm0
; keep a second copy, the low-half unpack below overwrites xmm1
movdqa xmm2, xmm1
; xmm1 = words 0-3 interleaved with zero words -> four 32-bit values
punpcklwd xmm1, xmm0
; xmm2 = words 4-7 interleaved with zero words -> four 32-bit values
punpckhwd xmm2, xmm0
; aligned stores: results 0-3, then results 4-7
movdqa [rdi], xmm1
movdqa [rdi + 16], xmm2
%endmacro
;void vp9_sad16x16x8_sse4(
; const unsigned char *src_ptr,
@ -170,23 +180,22 @@ sym(vp9_sad16x16x8_sse4):
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
PROCESS_16X2X8 1
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 1
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 0
mov rdi, arg(4) ;Results
movdqa XMMWORD PTR [rdi], xmm1
WRITE_AS_INTS
; begin epilog
pop rdi
@ -212,19 +221,18 @@ sym(vp9_sad16x8x8_sse4):
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
PROCESS_16X2X8 1
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 1
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 0
mov rdi, arg(4) ;Results
movdqa XMMWORD PTR [rdi], xmm1
WRITE_AS_INTS
; begin epilog
pop rdi
@ -250,19 +258,18 @@ sym(vp9_sad8x8x8_sse4):
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
PROCESS_8X2X8 1
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 1
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 0
mov rdi, arg(4) ;Results
movdqa XMMWORD PTR [rdi], xmm1
WRITE_AS_INTS
; begin epilog
pop rdi
@ -288,22 +295,22 @@ sym(vp9_sad8x16x8_sse4):
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
PROCESS_8X2X8 1
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 0
mov rdi, arg(4) ;Results
movdqa XMMWORD PTR [rdi], xmm1
PROCESS_8X2X8 1
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 0
WRITE_AS_INTS
; begin epilog
pop rdi
@ -329,17 +336,16 @@ sym(vp9_sad4x4x8_sse4):
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
PROCESS_4X2X8 1
PROCESS_4X2X8 0
PROCESS_4X2X8 1
PROCESS_4X2X8 0
mov rdi, arg(4) ;Results
movdqa XMMWORD PTR [rdi], xmm1
WRITE_AS_INTS
; begin epilog
pop rdi