Merge "Remove halfpix specialization"

Johann Koenig
2016-08-26 21:28:01 +00:00
committed by Gerrit Code Review
8 changed files with 34 additions and 497 deletions
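The deleted "halfpix" helpers were thin wrappers around the generic sub-pixel variance function with the half-pixel offsets hard-coded, as the removed shims in vpx_dsp/variance.c below show, so every caller can pass the offsets explicitly instead. A minimal sketch of that mapping (the wrapper name here is illustrative; vpx_sub_pixel_variance16x16 is the real libvpx entry point):

#include <stdint.h>

/* Real libvpx prototype (vpx_dsp); xoffset/yoffset are eighth-pel
 * steps 0..7, so 4 selects the half-pixel position on that axis. */
uint32_t vpx_sub_pixel_variance16x16(const uint8_t *a, int a_stride,
                                     int xoffset, int yoffset,
                                     const uint8_t *b, int b_stride,
                                     uint32_t *sse);

/* What the removed vpx_variance_halfpixvar16x16_h() computed; the _v
 * and _hv variants are the same call with offsets (0, 4) and (4, 4). */
static uint32_t halfpixvar16x16_h(const uint8_t *a, int a_stride,
                                  const uint8_t *b, int b_stride,
                                  uint32_t *sse) {
  return vpx_sub_pixel_variance16x16(a, a_stride, 4, 0, b, b_stride, sse);
}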

vp8/encoder/mcomp.c

@@ -409,7 +409,8 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
/* go left then right and check error */
this_mv.as_mv.row = startmv.as_mv.row;
this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4);
-thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, b->src_stride, &sse);
+/* "halfpix" horizontal variance */
+thismse = vfp->svf(y - 1, y_stride, 4, 0, z, b->src_stride, &sse);
left = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (left < bestmse) {
@@ -420,7 +421,8 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
}
this_mv.as_mv.col += 8;
-thismse = vfp->svf_halfpix_h(y, y_stride, z, b->src_stride, &sse);
+/* "halfpix" horizontal variance */
+thismse = vfp->svf(y, y_stride, 4, 0, z, b->src_stride, &sse);
right = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (right < bestmse) {
@@ -433,7 +435,8 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
/* go up then down and check error */
this_mv.as_mv.col = startmv.as_mv.col;
this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4);
-thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, b->src_stride, &sse);
+/* "halfpix" vertical variance */
+thismse = vfp->svf(y - y_stride, y_stride, 0, 4, z, b->src_stride, &sse);
up = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (up < bestmse) {
@@ -444,7 +447,8 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
}
this_mv.as_mv.row += 8;
-thismse = vfp->svf_halfpix_v(y, y_stride, z, b->src_stride, &sse);
+/* "halfpix" vertical variance */
+thismse = vfp->svf(y, y_stride, 0, 4, z, b->src_stride, &sse);
down = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (down < bestmse) {
@@ -462,25 +466,28 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
case 0:
this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
-thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, z,
-                              b->src_stride, &sse);
+/* "halfpix" horizontal/vertical variance */
+thismse =
+    vfp->svf(y - 1 - y_stride, y_stride, 4, 4, z, b->src_stride, &sse);
break;
case 1:
this_mv.as_mv.col += 4;
this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
-thismse =
-    vfp->svf_halfpix_hv(y - y_stride, y_stride, z, b->src_stride, &sse);
+/* "halfpix" horizontal/vertical variance */
+thismse = vfp->svf(y - y_stride, y_stride, 4, 4, z, b->src_stride, &sse);
break;
case 2:
this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
this_mv.as_mv.row += 4;
-thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, b->src_stride, &sse);
+/* "halfpix" horizontal/vertical variance */
+thismse = vfp->svf(y - 1, y_stride, 4, 4, z, b->src_stride, &sse);
break;
case 3:
default:
this_mv.as_mv.col += 4;
this_mv.as_mv.row += 4;
-thismse = vfp->svf_halfpix_hv(y, y_stride, z, b->src_stride, &sse);
+/* "halfpix" horizontal/vertical variance */
+thismse = vfp->svf(y, y_stride, 4, 4, z, b->src_stride, &sse);
break;
}
@@ -698,7 +705,8 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
/* go left then right and check error */
this_mv.as_mv.row = startmv.as_mv.row;
this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4);
-thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, b->src_stride, &sse);
+/* "halfpix" horizontal variance */
+thismse = vfp->svf(y - 1, y_stride, 4, 0, z, b->src_stride, &sse);
left = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (left < bestmse) {
@@ -709,7 +717,8 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
}
this_mv.as_mv.col += 8;
-thismse = vfp->svf_halfpix_h(y, y_stride, z, b->src_stride, &sse);
+/* "halfpix" horizontal variance */
+thismse = vfp->svf(y, y_stride, 4, 0, z, b->src_stride, &sse);
right = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (right < bestmse) {
@@ -722,7 +731,8 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
/* go up then down and check error */
this_mv.as_mv.col = startmv.as_mv.col;
this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4);
-thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, b->src_stride, &sse);
+/* "halfpix" vertical variance */
+thismse = vfp->svf(y - y_stride, y_stride, 0, 4, z, b->src_stride, &sse);
up = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (up < bestmse) {
@@ -733,7 +743,8 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
}
this_mv.as_mv.row += 8;
-thismse = vfp->svf_halfpix_v(y, y_stride, z, b->src_stride, &sse);
+/* "halfpix" vertical variance */
+thismse = vfp->svf(y, y_stride, 0, 4, z, b->src_stride, &sse);
down = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (down < bestmse) {
@@ -751,25 +762,28 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
case 0:
this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
-thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, z,
-                              b->src_stride, &sse);
+/* "halfpix" horizontal/vertical variance */
+thismse =
+    vfp->svf(y - 1 - y_stride, y_stride, 4, 4, z, b->src_stride, &sse);
break;
case 1:
this_mv.as_mv.col += 4;
this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
-thismse =
-    vfp->svf_halfpix_hv(y - y_stride, y_stride, z, b->src_stride, &sse);
+/* "halfpix" horizontal/vertical variance */
+thismse = vfp->svf(y - y_stride, y_stride, 4, 4, z, b->src_stride, &sse);
break;
case 2:
this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
this_mv.as_mv.row += 4;
-thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, b->src_stride, &sse);
+/* "halfpix" horizontal/vertical variance */
+thismse = vfp->svf(y - 1, y_stride, 4, 4, z, b->src_stride, &sse);
break;
case 3:
default:
this_mv.as_mv.col += 4;
this_mv.as_mv.row += 4;
-thismse = vfp->svf_halfpix_hv(y, y_stride, z, b->src_stride, &sse);
+/* "halfpix" horizontal/vertical variance */
+thismse = vfp->svf(y, y_stride, 4, 4, z, b->src_stride, &sse);
break;
}
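For reference when reading the motion-vector arithmetic above: VP8 stores motion vectors in eighth-pel units, so the low three bits of each component are the sub-pixel phase, and `(startmv.as_mv.col - 8) | 4` steps one full pixel left while forcing the half-pel phase — which is exactly why the replacement `svf` calls pass an offset of 4. A throwaway check of that arithmetic, assuming eighth-pel semantics (not code from the commit):

#include <assert.h>

int main(void) {
  int col = 8 * 3;          /* start at full-pel column 3 */
  int left = (col - 8) | 4; /* one full pel left, half-pel phase set */
  assert(left >> 3 == 2);   /* integer part: column 2 */
  assert((left & 7) == 4);  /* the xoffset handed to vfp->svf */
  return 0;
}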

vp8/encoder/onyx_if.c

@@ -1914,9 +1914,6 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) {
cpi->fn_ptr[BLOCK_16X16].sdf = vpx_sad16x16;
cpi->fn_ptr[BLOCK_16X16].vf = vpx_variance16x16;
cpi->fn_ptr[BLOCK_16X16].svf = vpx_sub_pixel_variance16x16;
-cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h = vpx_variance_halfpixvar16x16_h;
-cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v = vpx_variance_halfpixvar16x16_v;
-cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = vpx_variance_halfpixvar16x16_hv;
cpi->fn_ptr[BLOCK_16X16].sdx3f = vpx_sad16x16x3;
cpi->fn_ptr[BLOCK_16X16].sdx8f = vpx_sad16x16x8;
cpi->fn_ptr[BLOCK_16X16].sdx4df = vpx_sad16x16x4d;
@@ -1924,9 +1921,6 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) {
cpi->fn_ptr[BLOCK_16X8].sdf = vpx_sad16x8;
cpi->fn_ptr[BLOCK_16X8].vf = vpx_variance16x8;
cpi->fn_ptr[BLOCK_16X8].svf = vpx_sub_pixel_variance16x8;
-cpi->fn_ptr[BLOCK_16X8].svf_halfpix_h = NULL;
-cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v = NULL;
-cpi->fn_ptr[BLOCK_16X8].svf_halfpix_hv = NULL;
cpi->fn_ptr[BLOCK_16X8].sdx3f = vpx_sad16x8x3;
cpi->fn_ptr[BLOCK_16X8].sdx8f = vpx_sad16x8x8;
cpi->fn_ptr[BLOCK_16X8].sdx4df = vpx_sad16x8x4d;
@@ -1934,9 +1928,6 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) {
cpi->fn_ptr[BLOCK_8X16].sdf = vpx_sad8x16;
cpi->fn_ptr[BLOCK_8X16].vf = vpx_variance8x16;
cpi->fn_ptr[BLOCK_8X16].svf = vpx_sub_pixel_variance8x16;
-cpi->fn_ptr[BLOCK_8X16].svf_halfpix_h = NULL;
-cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v = NULL;
-cpi->fn_ptr[BLOCK_8X16].svf_halfpix_hv = NULL;
cpi->fn_ptr[BLOCK_8X16].sdx3f = vpx_sad8x16x3;
cpi->fn_ptr[BLOCK_8X16].sdx8f = vpx_sad8x16x8;
cpi->fn_ptr[BLOCK_8X16].sdx4df = vpx_sad8x16x4d;
@@ -1944,9 +1935,6 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) {
cpi->fn_ptr[BLOCK_8X8].sdf = vpx_sad8x8;
cpi->fn_ptr[BLOCK_8X8].vf = vpx_variance8x8;
cpi->fn_ptr[BLOCK_8X8].svf = vpx_sub_pixel_variance8x8;
-cpi->fn_ptr[BLOCK_8X8].svf_halfpix_h = NULL;
-cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v = NULL;
-cpi->fn_ptr[BLOCK_8X8].svf_halfpix_hv = NULL;
cpi->fn_ptr[BLOCK_8X8].sdx3f = vpx_sad8x8x3;
cpi->fn_ptr[BLOCK_8X8].sdx8f = vpx_sad8x8x8;
cpi->fn_ptr[BLOCK_8X8].sdx4df = vpx_sad8x8x4d;
@@ -1954,9 +1942,6 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) {
cpi->fn_ptr[BLOCK_4X4].sdf = vpx_sad4x4;
cpi->fn_ptr[BLOCK_4X4].vf = vpx_variance4x4;
cpi->fn_ptr[BLOCK_4X4].svf = vpx_sub_pixel_variance4x4;
-cpi->fn_ptr[BLOCK_4X4].svf_halfpix_h = NULL;
-cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v = NULL;
-cpi->fn_ptr[BLOCK_4X4].svf_halfpix_hv = NULL;
cpi->fn_ptr[BLOCK_4X4].sdx3f = vpx_sad4x4x3;
cpi->fn_ptr[BLOCK_4X4].sdx8f = vpx_sad4x4x8;
cpi->fn_ptr[BLOCK_4X4].sdx4df = vpx_sad4x4x4d;

vpx_dsp/variance.c

@@ -49,24 +49,6 @@ uint32_t vpx_get_mb_ss_c(const int16_t *a) {
return sum;
}
-uint32_t vpx_variance_halfpixvar16x16_h_c(const uint8_t *a, int a_stride,
-                                          const uint8_t *b, int b_stride,
-                                          uint32_t *sse) {
-  return vpx_sub_pixel_variance16x16_c(a, a_stride, 4, 0, b, b_stride, sse);
-}
-
-uint32_t vpx_variance_halfpixvar16x16_v_c(const uint8_t *a, int a_stride,
-                                          const uint8_t *b, int b_stride,
-                                          uint32_t *sse) {
-  return vpx_sub_pixel_variance16x16_c(a, a_stride, 0, 4, b, b_stride, sse);
-}
-
-uint32_t vpx_variance_halfpixvar16x16_hv_c(const uint8_t *a, int a_stride,
-                                           const uint8_t *b, int b_stride,
-                                           uint32_t *sse) {
-  return vpx_sub_pixel_variance16x16_c(a, a_stride, 4, 4, b, b_stride, sse);
-}
static void variance(const uint8_t *a, int a_stride, const uint8_t *b,
int b_stride, int w, int h, uint32_t *sse, int *sum) {
int i, j;

vpx_dsp/variance.h

@@ -58,9 +58,6 @@ typedef struct variance_vtable {
vpx_sad_fn_t sdf;
vpx_variance_fn_t vf;
vpx_subpixvariance_fn_t svf;
-vpx_variance_fn_t svf_halfpix_h;
-vpx_variance_fn_t svf_halfpix_v;
-vpx_variance_fn_t svf_halfpix_hv;
vpx_sad_multi_fn_t sdx3f;
vpx_sad_multi_fn_t sdx8f;
vpx_sad_multi_d_fn_t sdx4df;

vpx_dsp/vpx_dsp.mk

@@ -310,8 +310,6 @@ DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c
DSP_SRCS-$(HAVE_SSE) += x86/variance_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c # Contains SSE2 and SSSE3
-DSP_SRCS-$(HAVE_SSE2) += x86/halfpix_variance_sse2.c
-DSP_SRCS-$(HAVE_SSE2) += x86/halfpix_variance_impl_sse2.asm
DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/variance_impl_avx2.c

vpx_dsp/vpx_dsp_rtcd_defs.pl

@@ -1511,23 +1511,6 @@ add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, i
add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_sub_pixel_avg_variance4x4 msa sse2 ssse3/;
-#
-# Specialty Subpixel
-#
-# TODO(johannkoenig): Add neon implementations of
-# vpx_variance_halfpixvar16x16_h
-# vpx_variance_halfpixvar16x16_v
-# vpx_variance_halfpixvar16x16_hv
-# https://bugs.chromium.org/p/webm/issues/detail?id=1273
-add_proto qw/uint32_t vpx_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
-specialize qw/vpx_variance_halfpixvar16x16_h sse2/;
-add_proto qw/uint32_t vpx_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
-specialize qw/vpx_variance_halfpixvar16x16_v sse2/;
-add_proto qw/uint32_t vpx_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
-specialize qw/vpx_variance_halfpixvar16x16_hv sse2/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_variance64x64 sse2/;
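Each `add_proto`/`specialize` pair removed above made the RTCD scripts emit one prototype per variant plus a run-time dispatch pointer in the generated vpx_dsp_rtcd.h, which is what disappears here. A sketch of what the first removed entry expanded to — the signature is copied from the `add_proto` string, but the generated-header form is reconstructed from the generator's usual conventions, not taken from the diff:

/* Reconstructed generated code, not part of this commit: */
uint32_t vpx_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr,
                                          int source_stride,
                                          const unsigned char *ref_ptr,
                                          int ref_stride, uint32_t *sse);
uint32_t vpx_variance_halfpixvar16x16_h_sse2(const unsigned char *src_ptr,
                                             int source_stride,
                                             const unsigned char *ref_ptr,
                                             int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*vpx_variance_halfpixvar16x16_h)(
    const unsigned char *src_ptr, int source_stride,
    const unsigned char *ref_ptr, int ref_stride, uint32_t *sse);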

vpx_dsp/x86/halfpix_variance_impl_sse2.asm

@@ -1,346 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;void vpx_half_horiz_vert_variance16x_h_sse2(unsigned char *ref,
; int ref_stride,
; unsigned char *src,
; int src_stride,
; unsigned int height,
; int *sum,
; unsigned int *sumsquared)
global sym(vpx_half_horiz_vert_variance16x_h_sse2) PRIVATE
sym(vpx_half_horiz_vert_variance16x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
pxor xmm6, xmm6 ; error accumulator
pxor xmm7, xmm7 ; sse eaccumulator
mov rsi, arg(0) ;ref
mov rdi, arg(2) ;src
movsxd rcx, dword ptr arg(4) ;height
movsxd rax, dword ptr arg(1) ;ref_stride
movsxd rdx, dword ptr arg(3) ;src_stride
pxor xmm0, xmm0 ;
movdqu xmm5, XMMWORD PTR [rsi]
movdqu xmm3, XMMWORD PTR [rsi+1]
pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1
lea rsi, [rsi + rax]
vpx_half_horiz_vert_variance16x_h_1:
movdqu xmm1, XMMWORD PTR [rsi] ;
movdqu xmm2, XMMWORD PTR [rsi+1] ;
pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1
pavgb xmm5, xmm1 ; xmm = vertical average of the above
movdqa xmm4, xmm5
punpcklbw xmm5, xmm0 ; xmm5 = words of above
punpckhbw xmm4, xmm0
movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
punpcklbw xmm3, xmm0 ; xmm3 = words of above
psubw xmm5, xmm3 ; xmm5 -= xmm3
movq xmm3, QWORD PTR [rdi+8]
punpcklbw xmm3, xmm0
psubw xmm4, xmm3
paddw xmm6, xmm5 ; xmm6 += accumulated column differences
paddw xmm6, xmm4
pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
pmaddwd xmm4, xmm4
paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
paddd xmm7, xmm4
movdqa xmm5, xmm1 ; save xmm1 for use on the next row
lea rsi, [rsi + rax]
lea rdi, [rdi + rdx]
sub rcx, 1 ;
jnz vpx_half_horiz_vert_variance16x_h_1 ;
pxor xmm1, xmm1
pxor xmm5, xmm5
punpcklwd xmm0, xmm6
punpckhwd xmm1, xmm6
psrad xmm0, 16
psrad xmm1, 16
paddd xmm0, xmm1
movdqa xmm1, xmm0
movdqa xmm6, xmm7
punpckldq xmm6, xmm5
punpckhdq xmm7, xmm5
paddd xmm6, xmm7
punpckldq xmm0, xmm5
punpckhdq xmm1, xmm5
paddd xmm0, xmm1
movdqa xmm7, xmm6
movdqa xmm1, xmm0
psrldq xmm7, 8
psrldq xmm1, 8
paddd xmm6, xmm7
paddd xmm0, xmm1
mov rsi, arg(5) ;[Sum]
mov rdi, arg(6) ;[SSE]
movd [rsi], xmm0
movd [rdi], xmm6
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vpx_half_vert_variance16x_h_sse2(unsigned char *ref,
; int ref_stride,
; unsigned char *src,
; int src_stride,
; unsigned int height,
; int *sum,
; unsigned int *sumsquared)
global sym(vpx_half_vert_variance16x_h_sse2) PRIVATE
sym(vpx_half_vert_variance16x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
pxor xmm6, xmm6 ; error accumulator
pxor xmm7, xmm7 ; sse eaccumulator
mov rsi, arg(0) ;ref
mov rdi, arg(2) ;src
movsxd rcx, dword ptr arg(4) ;height
movsxd rax, dword ptr arg(1) ;ref_stride
movsxd rdx, dword ptr arg(3) ;src_stride
movdqu xmm5, XMMWORD PTR [rsi]
lea rsi, [rsi + rax ]
pxor xmm0, xmm0
vpx_half_vert_variance16x_h_1:
movdqu xmm3, XMMWORD PTR [rsi]
pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
movdqa xmm4, xmm5
punpcklbw xmm5, xmm0
punpckhbw xmm4, xmm0
movq xmm2, QWORD PTR [rdi]
punpcklbw xmm2, xmm0
psubw xmm5, xmm2
movq xmm2, QWORD PTR [rdi+8]
punpcklbw xmm2, xmm0
psubw xmm4, xmm2
paddw xmm6, xmm5 ; xmm6 += accumulated column differences
paddw xmm6, xmm4
pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
pmaddwd xmm4, xmm4
paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
paddd xmm7, xmm4
movdqa xmm5, xmm3
lea rsi, [rsi + rax]
lea rdi, [rdi + rdx]
sub rcx, 1
jnz vpx_half_vert_variance16x_h_1
pxor xmm1, xmm1
pxor xmm5, xmm5
punpcklwd xmm0, xmm6
punpckhwd xmm1, xmm6
psrad xmm0, 16
psrad xmm1, 16
paddd xmm0, xmm1
movdqa xmm1, xmm0
movdqa xmm6, xmm7
punpckldq xmm6, xmm5
punpckhdq xmm7, xmm5
paddd xmm6, xmm7
punpckldq xmm0, xmm5
punpckhdq xmm1, xmm5
paddd xmm0, xmm1
movdqa xmm7, xmm6
movdqa xmm1, xmm0
psrldq xmm7, 8
psrldq xmm1, 8
paddd xmm6, xmm7
paddd xmm0, xmm1
mov rsi, arg(5) ;[Sum]
mov rdi, arg(6) ;[SSE]
movd [rsi], xmm0
movd [rdi], xmm6
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vpx_half_horiz_variance16x_h_sse2(unsigned char *ref,
; int ref_stride
; unsigned char *src,
; int src_stride,
; unsigned int height,
; int *sum,
; unsigned int *sumsquared)
global sym(vpx_half_horiz_variance16x_h_sse2) PRIVATE
sym(vpx_half_horiz_variance16x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
pxor xmm6, xmm6 ; error accumulator
pxor xmm7, xmm7 ; sse eaccumulator
mov rsi, arg(0) ;ref
mov rdi, arg(2) ;src
movsxd rcx, dword ptr arg(4) ;height
movsxd rax, dword ptr arg(1) ;ref_stride
movsxd rdx, dword ptr arg(3) ;src_stride
pxor xmm0, xmm0 ;
vpx_half_horiz_variance16x_h_1:
movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15
movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16
pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
movdqa xmm1, xmm5
punpcklbw xmm5, xmm0 ; xmm5 = words of above
punpckhbw xmm1, xmm0
movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
punpcklbw xmm3, xmm0 ; xmm3 = words of above
movq xmm2, QWORD PTR [rdi+8]
punpcklbw xmm2, xmm0
psubw xmm5, xmm3 ; xmm5 -= xmm3
psubw xmm1, xmm2
paddw xmm6, xmm5 ; xmm6 += accumulated column differences
paddw xmm6, xmm1
pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
pmaddwd xmm1, xmm1
paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
paddd xmm7, xmm1
lea rsi, [rsi + rax]
lea rdi, [rdi + rdx]
sub rcx, 1 ;
jnz vpx_half_horiz_variance16x_h_1 ;
pxor xmm1, xmm1
pxor xmm5, xmm5
punpcklwd xmm0, xmm6
punpckhwd xmm1, xmm6
psrad xmm0, 16
psrad xmm1, 16
paddd xmm0, xmm1
movdqa xmm1, xmm0
movdqa xmm6, xmm7
punpckldq xmm6, xmm5
punpckhdq xmm7, xmm5
paddd xmm6, xmm7
punpckldq xmm0, xmm5
punpckhdq xmm1, xmm5
paddd xmm0, xmm1
movdqa xmm7, xmm6
movdqa xmm1, xmm0
psrldq xmm7, 8
psrldq xmm1, 8
paddd xmm6, xmm7
paddd xmm0, xmm1
mov rsi, arg(5) ;[Sum]
mov rdi, arg(6) ;[SSE]
movd [rsi], xmm0
movd [rdi], xmm6
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
align 16
xmm_bi_rd:
times 8 dw 64
align 16
vpx_bilinear_filters_sse2:
dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0
dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
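The three deleted routines above share one structure: build the half-pel prediction with pavgb (a rounding byte average of neighboring reference pixels), then accumulate the sum and sum of squares of the prediction error. A plain-C model of the horizontal variant — a sketch only, assuming 16-pixel-wide rows as the 16x_h name suggests, while the real routine does each row in one SSE2 pass:

static void half_horiz_variance16(const unsigned char *ref, int ref_stride,
                                  const unsigned char *src, int src_stride,
                                  unsigned int height, int *sum,
                                  unsigned int *sumsquared) {
  int s = 0;
  unsigned int ss = 0;
  for (unsigned int i = 0; i < height; ++i) {
    for (int j = 0; j < 16; ++j) {
      int pred = (ref[j] + ref[j + 1] + 1) >> 1; /* pavgb */
      int diff = pred - src[j];
      s += diff;                     /* accumulated differences (xmm6) */
      ss += (unsigned int)(diff * diff); /* accumulated squares (xmm7) */
    }
    ref += ref_stride;
    src += src_stride;
  }
  *sum = s;
  *sumsquared = ss;
}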

vpx_dsp/x86/halfpix_variance_sse2.c

@@ -1,76 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <assert.h>
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
void vpx_half_horiz_vert_variance16x_h_sse2(const unsigned char *ref,
int ref_stride,
const unsigned char *src,
int src_stride, unsigned int height,
int *sum, unsigned int *sumsquared);
void vpx_half_horiz_variance16x_h_sse2(const unsigned char *ref, int ref_stride,
const unsigned char *src, int src_stride,
unsigned int height, int *sum,
unsigned int *sumsquared);
void vpx_half_vert_variance16x_h_sse2(const unsigned char *ref, int ref_stride,
const unsigned char *src, int src_stride,
unsigned int height, int *sum,
unsigned int *sumsquared);
uint32_t vpx_variance_halfpixvar16x16_h_sse2(const unsigned char *src,
int src_stride,
const unsigned char *dst,
int dst_stride, uint32_t *sse) {
int xsum0;
unsigned int xxsum0;
vpx_half_horiz_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16,
&xsum0, &xxsum0);
*sse = xxsum0;
assert(xsum0 <= 255 * 16 * 16);
assert(xsum0 >= -255 * 16 * 16);
return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8));
}
uint32_t vpx_variance_halfpixvar16x16_v_sse2(const unsigned char *src,
int src_stride,
const unsigned char *dst,
int dst_stride, uint32_t *sse) {
int xsum0;
unsigned int xxsum0;
vpx_half_vert_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16, &xsum0,
&xxsum0);
*sse = xxsum0;
assert(xsum0 <= 255 * 16 * 16);
assert(xsum0 >= -255 * 16 * 16);
return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8));
}
uint32_t vpx_variance_halfpixvar16x16_hv_sse2(const unsigned char *src,
int src_stride,
const unsigned char *dst,
int dst_stride, uint32_t *sse) {
int xsum0;
unsigned int xxsum0;
vpx_half_horiz_vert_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16,
&xsum0, &xxsum0);
*sse = xxsum0;
assert(xsum0 <= 255 * 16 * 16);
assert(xsum0 >= -255 * 16 * 16);
return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8));
}
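The return expression in each deleted wrapper is the usual variance identity var = sse - sum^2 / N with N = 16 * 16 = 256, hence the shift by 8; the asserts bound |sum| by 255 * 256, so the product fits comfortably in an int64_t. Factored out for clarity (illustrative helper, not from the commit):

/* var = sse - sum^2 / N for a 16x16 block (N = 256, so >> 8). */
static uint32_t variance_16x16(unsigned int sse, int sum) {
  return sse - (uint32_t)(((int64_t)sum * sum) >> 8);
}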