Merge changes I158f631a,I0555f639

* changes:
  vp8: remove mmx functions
  Rename _xmm functions to _sse2
This commit is contained in:
Johann Koenig 2016-09-30 01:47:40 +00:00 committed by Gerrit Code Review
commit cb4aa6d589
15 changed files with 56 additions and 3021 deletions

View File

@ -298,10 +298,7 @@ INSTANTIATE_TEST_CASE_P(
#if HAVE_MMX
INSTANTIATE_TEST_CASE_P(
MMX, SixtapPredictTest,
::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_mmx),
make_tuple(8, 8, &vp8_sixtap_predict8x8_mmx),
make_tuple(8, 4, &vp8_sixtap_predict8x4_mmx),
make_tuple(4, 4, &vp8_sixtap_predict4x4_mmx)));
::testing::Values(make_tuple(4, 4, &vp8_sixtap_predict4x4_mmx)));
#endif
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
@ -353,9 +350,7 @@ INSTANTIATE_TEST_CASE_P(
#if HAVE_MMX
INSTANTIATE_TEST_CASE_P(
MMX, BilinearPredictTest,
::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_mmx),
make_tuple(8, 8, &vp8_bilinear_predict8x8_mmx),
make_tuple(8, 4, &vp8_bilinear_predict8x4_mmx),
::testing::Values(make_tuple(8, 4, &vp8_bilinear_predict8x4_mmx),
make_tuple(4, 4, &vp8_bilinear_predict4x4_mmx)));
#endif
#if HAVE_SSE2

View File

@ -28,55 +28,51 @@ add_proto qw/void vp8_dequant_idct_add/, "short *input, short *dq, unsigned char
specialize qw/vp8_dequant_idct_add mmx neon dspr2 msa/;
add_proto qw/void vp8_dequant_idct_add_y_block/, "short *q, short *dq, unsigned char *dst, int stride, char *eobs";
specialize qw/vp8_dequant_idct_add_y_block mmx sse2 neon dspr2 msa/;
specialize qw/vp8_dequant_idct_add_y_block sse2 neon dspr2 msa/;
add_proto qw/void vp8_dequant_idct_add_uv_block/, "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs";
specialize qw/vp8_dequant_idct_add_uv_block mmx sse2 neon dspr2 msa/;
specialize qw/vp8_dequant_idct_add_uv_block sse2 neon dspr2 msa/;
#
# Loopfilter
#
add_proto qw/void vp8_loop_filter_mbv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
specialize qw/vp8_loop_filter_mbv mmx sse2 neon dspr2 msa/;
specialize qw/vp8_loop_filter_mbv sse2 neon dspr2 msa/;
add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
specialize qw/vp8_loop_filter_bv mmx sse2 neon dspr2 msa/;
specialize qw/vp8_loop_filter_bv sse2 neon dspr2 msa/;
add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
specialize qw/vp8_loop_filter_mbh mmx sse2 neon dspr2 msa/;
specialize qw/vp8_loop_filter_mbh sse2 neon dspr2 msa/;
add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
specialize qw/vp8_loop_filter_bh mmx sse2 neon dspr2 msa/;
specialize qw/vp8_loop_filter_bh sse2 neon dspr2 msa/;
add_proto qw/void vp8_loop_filter_simple_mbv/, "unsigned char *y, int ystride, const unsigned char *blimit";
specialize qw/vp8_loop_filter_simple_mbv mmx sse2 neon msa/;
specialize qw/vp8_loop_filter_simple_mbv sse2 neon msa/;
$vp8_loop_filter_simple_mbv_c=vp8_loop_filter_simple_vertical_edge_c;
$vp8_loop_filter_simple_mbv_mmx=vp8_loop_filter_simple_vertical_edge_mmx;
$vp8_loop_filter_simple_mbv_sse2=vp8_loop_filter_simple_vertical_edge_sse2;
$vp8_loop_filter_simple_mbv_neon=vp8_loop_filter_mbvs_neon;
$vp8_loop_filter_simple_mbv_msa=vp8_loop_filter_simple_vertical_edge_msa;
add_proto qw/void vp8_loop_filter_simple_mbh/, "unsigned char *y, int ystride, const unsigned char *blimit";
specialize qw/vp8_loop_filter_simple_mbh mmx sse2 neon msa/;
specialize qw/vp8_loop_filter_simple_mbh sse2 neon msa/;
$vp8_loop_filter_simple_mbh_c=vp8_loop_filter_simple_horizontal_edge_c;
$vp8_loop_filter_simple_mbh_mmx=vp8_loop_filter_simple_horizontal_edge_mmx;
$vp8_loop_filter_simple_mbh_sse2=vp8_loop_filter_simple_horizontal_edge_sse2;
$vp8_loop_filter_simple_mbh_neon=vp8_loop_filter_mbhs_neon;
$vp8_loop_filter_simple_mbh_msa=vp8_loop_filter_simple_horizontal_edge_msa;
add_proto qw/void vp8_loop_filter_simple_bv/, "unsigned char *y, int ystride, const unsigned char *blimit";
specialize qw/vp8_loop_filter_simple_bv mmx sse2 neon msa/;
specialize qw/vp8_loop_filter_simple_bv sse2 neon msa/;
$vp8_loop_filter_simple_bv_c=vp8_loop_filter_bvs_c;
$vp8_loop_filter_simple_bv_mmx=vp8_loop_filter_bvs_mmx;
$vp8_loop_filter_simple_bv_sse2=vp8_loop_filter_bvs_sse2;
$vp8_loop_filter_simple_bv_neon=vp8_loop_filter_bvs_neon;
$vp8_loop_filter_simple_bv_msa=vp8_loop_filter_bvs_msa;
add_proto qw/void vp8_loop_filter_simple_bh/, "unsigned char *y, int ystride, const unsigned char *blimit";
specialize qw/vp8_loop_filter_simple_bh mmx sse2 neon msa/;
specialize qw/vp8_loop_filter_simple_bh sse2 neon msa/;
$vp8_loop_filter_simple_bh_c=vp8_loop_filter_bhs_c;
$vp8_loop_filter_simple_bh_mmx=vp8_loop_filter_bhs_mmx;
$vp8_loop_filter_simple_bh_sse2=vp8_loop_filter_bhs_sse2;
$vp8_loop_filter_simple_bh_neon=vp8_loop_filter_bhs_neon;
$vp8_loop_filter_simple_bh_msa=vp8_loop_filter_bhs_msa;
@ -94,7 +90,7 @@ specialize qw/vp8_short_inv_walsh4x4_1 dspr2/;
#iwalsh16
add_proto qw/void vp8_short_inv_walsh4x4/, "short *input, short *output";
specialize qw/vp8_short_inv_walsh4x4 mmx sse2 neon dspr2 msa/;
specialize qw/vp8_short_inv_walsh4x4 sse2 neon dspr2 msa/;
#idct1_scalar_add
add_proto qw/void vp8_dc_only_idct_add/, "short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride";
@ -104,7 +100,7 @@ specialize qw/vp8_dc_only_idct_add mmx neon dspr2 msa/;
# RECON
#
add_proto qw/void vp8_copy_mem16x16/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch";
specialize qw/vp8_copy_mem16x16 mmx sse2 neon dspr2 msa/;
specialize qw/vp8_copy_mem16x16 sse2 neon dspr2 msa/;
add_proto qw/void vp8_copy_mem8x8/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch";
specialize qw/vp8_copy_mem8x8 mmx neon dspr2 msa/;
@ -136,22 +132,22 @@ if (vpx_config("CONFIG_POSTPROC") eq "yes") {
# Subpixel
#
add_proto qw/void vp8_sixtap_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
specialize qw/vp8_sixtap_predict16x16 mmx sse2 ssse3 neon dspr2 msa/;
specialize qw/vp8_sixtap_predict16x16 sse2 ssse3 neon dspr2 msa/;
add_proto qw/void vp8_sixtap_predict8x8/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
specialize qw/vp8_sixtap_predict8x8 mmx sse2 ssse3 neon dspr2 msa/;
specialize qw/vp8_sixtap_predict8x8 sse2 ssse3 neon dspr2 msa/;
add_proto qw/void vp8_sixtap_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
specialize qw/vp8_sixtap_predict8x4 mmx sse2 ssse3 neon dspr2 msa/;
specialize qw/vp8_sixtap_predict8x4 sse2 ssse3 neon dspr2 msa/;
add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
specialize qw/vp8_sixtap_predict4x4 mmx ssse3 neon dspr2 msa/;
add_proto qw/void vp8_bilinear_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
specialize qw/vp8_bilinear_predict16x16 mmx sse2 ssse3 neon msa/;
specialize qw/vp8_bilinear_predict16x16 sse2 ssse3 neon msa/;
add_proto qw/void vp8_bilinear_predict8x8/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
specialize qw/vp8_bilinear_predict8x8 mmx sse2 ssse3 neon msa/;
specialize qw/vp8_bilinear_predict8x8 sse2 ssse3 neon msa/;
add_proto qw/void vp8_bilinear_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
specialize qw/vp8_bilinear_predict8x4 mmx neon msa/;
@ -176,10 +172,10 @@ if ($opts{arch} =~ /x86/) {
# Forward DCT
#
add_proto qw/void vp8_short_fdct4x4/, "short *input, short *output, int pitch";
specialize qw/vp8_short_fdct4x4 mmx sse2 neon msa/;
specialize qw/vp8_short_fdct4x4 sse2 neon msa/;
add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch";
specialize qw/vp8_short_fdct8x4 mmx sse2 neon msa/;
specialize qw/vp8_short_fdct8x4 sse2 neon msa/;
add_proto qw/void vp8_short_walsh4x4/, "short *input, short *output, int pitch";
specialize qw/vp8_short_walsh4x4 sse2 neon msa/;
@ -197,16 +193,13 @@ specialize qw/vp8_fast_quantize_b sse2 ssse3 neon msa/;
# Block subtraction
#
add_proto qw/int vp8_block_error/, "short *coeff, short *dqcoeff";
specialize qw/vp8_block_error mmx sse2 msa/;
$vp8_block_error_sse2=vp8_block_error_xmm;
specialize qw/vp8_block_error sse2 msa/;
add_proto qw/int vp8_mbblock_error/, "struct macroblock *mb, int dc";
specialize qw/vp8_mbblock_error mmx sse2 msa/;
$vp8_mbblock_error_sse2=vp8_mbblock_error_xmm;
specialize qw/vp8_mbblock_error sse2 msa/;
add_proto qw/int vp8_mbuverror/, "struct macroblock *mb";
specialize qw/vp8_mbuverror mmx sse2 msa/;
$vp8_mbuverror_sse2=vp8_mbuverror_xmm;
specialize qw/vp8_mbuverror sse2 msa/;
#
# Motion search

View File

@ -21,91 +21,3 @@ void vp8_dequantize_b_mmx(BLOCKD *d, short *DQC) {
vp8_dequantize_b_impl_mmx(sq, dq, DQC);
}
/*
 * Dequantize and inverse-transform a 4x4 grid of 4x4 blocks, adding the
 * result onto dst in place.
 *
 *   q      - coefficients, 16 shorts per block, blocks stored consecutively
 *   dq     - dequantization factors (dq[0] is used for the DC-only path)
 *   dst    - destination pixels; each block is 4 pixels wide, each block row
 *            is 4 * stride bytes further down
 *   stride - destination stride
 *   eobs   - one entry per block, consulted in the same order as q
 *
 * For each block: eob > 1 runs the full dequant + IDCT; eob == 1 runs the
 * DC-only add and then clears the first two coefficients; any other value
 * leaves the block untouched.
 */
void vp8_dequant_idct_add_y_block_mmx(short *q, short *dq, unsigned char *dst,
                                      int stride, char *eobs) {
  int row, col;

  for (row = 0; row < 4; ++row) {
    for (col = 0; col < 4; ++col) {
      short *const coeffs = q + 16 * col;
      unsigned char *const out = dst + 4 * col;
      const int eob = eobs[col];

      if (eob > 1) {
        vp8_dequant_idct_add_mmx(coeffs, dq, out, stride);
      } else if (eob == 1) {
        vp8_dc_only_idct_add_mmx(coeffs[0] * dq[0], out, stride, out, stride);
        memset(coeffs, 0, 2 * sizeof(coeffs[0]));
      }
    }

    /* Advance to the next row of four blocks. */
    q += 64;
    dst += 4 * stride;
    eobs += 4;
  }
}
/*
 * Dequantize and inverse-transform two 2x2 grids of 4x4 blocks, first onto
 * dstu and then onto dstv, adding the result in place.
 *
 * q and eobs are consumed sequentially: the four blocks written to dstu come
 * first, followed by the four blocks written to dstv. Per block: eob > 1 runs
 * the full dequant + IDCT; eob == 1 runs the DC-only add and then clears the
 * first two coefficients; any other value leaves the block untouched.
 */
void vp8_dequant_idct_add_uv_block_mmx(short *q, short *dq, unsigned char *dstu,
                                       unsigned char *dstv, int stride,
                                       char *eobs) {
  unsigned char *planes[2];
  int plane, row, col;

  planes[0] = dstu;
  planes[1] = dstv;

  for (plane = 0; plane < 2; ++plane) {
    unsigned char *dst = planes[plane];

    for (row = 0; row < 2; ++row) {
      for (col = 0; col < 2; ++col) {
        short *const coeffs = q + 16 * col;
        unsigned char *const out = dst + 4 * col;
        const int eob = eobs[col];

        if (eob > 1) {
          vp8_dequant_idct_add_mmx(coeffs, dq, out, stride);
        } else if (eob == 1) {
          vp8_dc_only_idct_add_mmx(coeffs[0] * dq[0], out, stride, out,
                                   stride);
          memset(coeffs, 0, 2 * sizeof(coeffs[0]));
        }
      }

      /* Advance to the next row of two blocks. */
      q += 32;
      dst += 4 * stride;
      eobs += 2;
    }
  }
}

View File

@ -1,140 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;void vp8_short_inv_walsh4x4_mmx(short *input, short *output)
;
; 4x4 inverse Walsh-Hadamard transform in MMX registers.
; Reads 16 shorts from input as four 8-byte rows, applies the add/subtract
; butterfly once, transposes the 4x4 result, applies the butterfly again,
; rounds every value as (x + 3) >> 3 and stores the 16 results as 16-bit
; words at byte offsets 32*k (k = 0..15) from output, i.e. one word every
; 16 shorts.
global sym(vp8_short_inv_walsh4x4_mmx) PRIVATE
sym(vp8_short_inv_walsh4x4_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 2
; end prolog
mov rdx, arg(0) ;input
mov rax, 30003h ;rounding constant, replicated into mm7 below
movq mm0, [rdx + 0] ;ip[0]
movq mm1, [rdx + 8] ;ip[4]
movq mm7, rax
movq mm2, [rdx + 16] ;ip[8]
movq mm3, [rdx + 24] ;ip[12]
punpcklwd mm7, mm7 ;0003000300030003h
mov rdx, arg(1) ;output
; first pass
movq mm4, mm0
movq mm5, mm1
paddw mm4, mm3 ;ip[0] + ip[12] aka a1
paddw mm5, mm2 ;ip[4] + ip[8] aka b1
movq mm6, mm4 ;temp a1
paddw mm4, mm5 ;a1 + b1
psubw mm6, mm5 ;a1 - b1
psubw mm0, mm3 ;ip[0] - ip[12] aka d1
psubw mm1, mm2 ;ip[4] - ip[8] aka c1
movq mm5, mm0 ;temp d1
paddw mm0, mm1 ;d1 + c1
psubw mm5, mm1 ;d1 - c1
; transpose the 4x4 block of words held in mm4, mm0, mm6, mm5:
; 03 02 01 00
; 13 12 11 10
; 23 22 21 20
; 33 32 31 30
movq mm3, mm4 ; 03 02 01 00
punpcklwd mm4, mm0 ; 11 01 10 00
punpckhwd mm3, mm0 ; 13 03 12 02
movq mm1, mm6 ; 23 22 21 20
punpcklwd mm6, mm5 ; 31 21 30 20
punpckhwd mm1, mm5 ; 33 23 32 22
movq mm0, mm4 ; 11 01 10 00
movq mm2, mm3 ; 13 03 12 02
punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0]
punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4]
punpckldq mm2, mm1 ; 32 22 12 02 aka ip[8]
punpckhdq mm3, mm1 ; 33 23 13 03 aka ip[12]
;~~~~~~~~~~~~~~~~~~~~~
; second pass, followed by rounding: (x + 3) >> 3
movq mm1, mm0
movq mm5, mm4
paddw mm1, mm3 ;ip[0] + ip[12] aka a1
paddw mm5, mm2 ;ip[4] + ip[8] aka b1
movq mm6, mm1 ;temp a1
paddw mm1, mm5 ;a1 + b1
psubw mm6, mm5 ;a1 - b1
paddw mm1, mm7 ;+3
paddw mm6, mm7 ;+3
psraw mm1, 3
psraw mm6, 3
psubw mm0, mm3 ;ip[0] - ip[12] aka d1
psubw mm4, mm2 ;ip[4] - ip[8] aka c1
movq mm5, mm0 ;temp d1
paddw mm0, mm4 ;d1 + c1
psubw mm5, mm4 ;d1 - c1
paddw mm0, mm7 ;+3
paddw mm5, mm7 ;+3
psraw mm0, 3
psraw mm5, 3
;~~~~~~~~~~~~~~~~~~~~~
; scatter the results: words of mm1 go to slots 0,4,8,12, mm0 to 1,5,9,13,
; mm6 to 2,6,10,14 and mm5 to 3,7,11,15 (slot k is at byte offset 32*k)
movd eax, mm1
movd ecx, mm0
psrlq mm0, 32
psrlq mm1, 32
mov word ptr[rdx+32*0], ax
mov word ptr[rdx+32*1], cx
shr eax, 16
shr ecx, 16
mov word ptr[rdx+32*4], ax
mov word ptr[rdx+32*5], cx
movd eax, mm1
movd ecx, mm0
mov word ptr[rdx+32*8], ax
mov word ptr[rdx+32*9], cx
shr eax, 16
shr ecx, 16
mov word ptr[rdx+32*12], ax
mov word ptr[rdx+32*13], cx
movd eax, mm6
movd ecx, mm5
psrlq mm5, 32
psrlq mm6, 32
mov word ptr[rdx+32*2], ax
mov word ptr[rdx+32*3], cx
shr eax, 16
shr ecx, 16
mov word ptr[rdx+32*6], ax
mov word ptr[rdx+32*7], cx
movd eax, mm6
movd ecx, mm5
mov word ptr[rdx+32*10], ax
mov word ptr[rdx+32*11], cx
shr eax, 16
shr ecx, 16
mov word ptr[rdx+32*14], ax
mov word ptr[rdx+32*15], cx
; begin epilog
UNSHADOW_ARGS
pop rbp
ret

View File

@ -22,13 +22,6 @@
#define prototype_simple_loopfilter(sym) \
void sym(unsigned char *y, int ystride, const unsigned char *blimit)
prototype_loopfilter(vp8_mbloop_filter_vertical_edge_mmx);
prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_mmx);
prototype_loopfilter(vp8_loop_filter_vertical_edge_mmx);
prototype_loopfilter(vp8_loop_filter_horizontal_edge_mmx);
prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_mmx);
prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_mmx);
#if HAVE_SSE2 && ARCH_X86_64
prototype_loopfilter(vp8_loop_filter_bv_y_sse2);
prototype_loopfilter(vp8_loop_filter_bh_y_sse2);
@ -44,105 +37,6 @@ extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_sse2;
extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_sse2;
extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_sse2;
#if HAVE_MMX
/*
 * Horizontal MB filtering.
 *
 * Runs the macroblock horizontal-edge filter on the luma pointer (last
 * argument 2) and then on each chroma pointer that is non-NULL (last
 * argument 1), using the mblim/lim/hev_thr values from lfi.
 */
void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr,
                             unsigned char *v_ptr, int y_stride, int uv_stride,
                             loop_filter_info *lfi) {
  unsigned char *chroma[2];
  int plane;

  chroma[0] = u_ptr;
  chroma[1] = v_ptr;

  vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim,
                                        lfi->hev_thr, 2);

  for (plane = 0; plane < 2; ++plane) {
    if (!chroma[plane]) continue;

    vp8_mbloop_filter_horizontal_edge_mmx(chroma[plane], uv_stride, lfi->mblim,
                                          lfi->lim, lfi->hev_thr, 1);
  }
}
/*
 * Vertical MB filtering.
 *
 * Runs the macroblock vertical-edge filter on the luma pointer (last
 * argument 2) and then on each chroma pointer that is non-NULL (last
 * argument 1), using the mblim/lim/hev_thr values from lfi.
 */
void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr,
                             unsigned char *v_ptr, int y_stride, int uv_stride,
                             loop_filter_info *lfi) {
  unsigned char *chroma[2];
  int plane;

  chroma[0] = u_ptr;
  chroma[1] = v_ptr;

  vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim,
                                      lfi->hev_thr, 2);

  for (plane = 0; plane < 2; ++plane) {
    if (!chroma[plane]) continue;

    vp8_mbloop_filter_vertical_edge_mmx(chroma[plane], uv_stride, lfi->mblim,
                                        lfi->lim, lfi->hev_thr, 1);
  }
}
/*
 * Horizontal B filtering.
 *
 * Filters the three horizontal edges 4, 8 and 12 rows below y_ptr, then the
 * single edge 4 rows below each chroma pointer that is non-NULL. Uses the
 * blim/lim/hev_thr values from lfi; the last filter argument is 2 for luma
 * and 1 for chroma.
 */
void vp8_loop_filter_bh_mmx(unsigned char *y_ptr, unsigned char *u_ptr,
                            unsigned char *v_ptr, int y_stride, int uv_stride,
                            loop_filter_info *lfi) {
  unsigned char *chroma[2];
  int row, plane;

  chroma[0] = u_ptr;
  chroma[1] = v_ptr;

  for (row = 4; row <= 12; row += 4) {
    vp8_loop_filter_horizontal_edge_mmx(y_ptr + row * y_stride, y_stride,
                                        lfi->blim, lfi->lim, lfi->hev_thr, 2);
  }

  for (plane = 0; plane < 2; ++plane) {
    if (!chroma[plane]) continue;

    vp8_loop_filter_horizontal_edge_mmx(chroma[plane] + 4 * uv_stride,
                                        uv_stride, lfi->blim, lfi->lim,
                                        lfi->hev_thr, 1);
  }
}
/*
 * Simple horizontal B filtering: applies the simple horizontal-edge filter
 * to the edges 4, 8 and 12 rows below y_ptr, in that order.
 */
void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride,
                             const unsigned char *blimit) {
  int row;

  for (row = 4; row <= 12; row += 4) {
    vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + row * y_stride,
                                               y_stride, blimit);
  }
}
/*
 * Vertical B filtering.
 *
 * Filters the three vertical edges 4, 8 and 12 bytes to the right of y_ptr,
 * then the single edge 4 bytes to the right of each chroma pointer that is
 * non-NULL. Uses the blim/lim/hev_thr values from lfi; the last filter
 * argument is 2 for luma and 1 for chroma.
 */
void vp8_loop_filter_bv_mmx(unsigned char *y_ptr, unsigned char *u_ptr,
                            unsigned char *v_ptr, int y_stride, int uv_stride,
                            loop_filter_info *lfi) {
  unsigned char *chroma[2];
  int col, plane;

  chroma[0] = u_ptr;
  chroma[1] = v_ptr;

  for (col = 4; col <= 12; col += 4) {
    vp8_loop_filter_vertical_edge_mmx(y_ptr + col, y_stride, lfi->blim,
                                      lfi->lim, lfi->hev_thr, 2);
  }

  for (plane = 0; plane < 2; ++plane) {
    if (!chroma[plane]) continue;

    vp8_loop_filter_vertical_edge_mmx(chroma[plane] + 4, uv_stride, lfi->blim,
                                      lfi->lim, lfi->hev_thr, 1);
  }
}
/*
 * Simple vertical B filtering: applies the simple vertical-edge filter to
 * the edges 4, 8 and 12 bytes to the right of y_ptr, in that order.
 */
void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride,
                             const unsigned char *blimit) {
  int col;

  for (col = 4; col <= 12; col += 4) {
    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + col, y_stride, blimit);
  }
}
#endif
/* Horizontal MB filtering */
#if HAVE_SSE2
void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr,

View File

@ -117,158 +117,3 @@ sym(vp8_copy_mem8x4_mmx):
UNSHADOW_ARGS
pop rbp
ret
;void vp8_copy_mem16x16_mmx(
; unsigned char *src,
; int src_stride,
; unsigned char *dst,
; int dst_stride
; )
;
; Copies 16 rows of 16 bytes from src to dst. Each row is moved as two
; 8-byte MMX loads/stores. The body is fully unrolled: five groups of three
; rows (each group advances src and dst by three strides) followed by one
; final row.
global sym(vp8_copy_mem16x16_mmx) PRIVATE
sym(vp8_copy_mem16x16_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;src;
movsxd rax, dword ptr arg(1) ;src_stride;
mov rdi, arg(2) ;dst;
movsxd rcx, dword ptr arg(3) ;dst_stride
; rows 0-2
movq mm0, [rsi]
movq mm3, [rsi+8];
movq mm1, [rsi+rax]
movq mm4, [rsi+rax+8]
movq mm2, [rsi+rax*2]
movq mm5, [rsi+rax*2+8]
lea rsi, [rsi+rax*2]
add rsi, rax ; src += 3 * src_stride
movq [rdi], mm0
movq [rdi+8], mm3
movq [rdi+rcx], mm1
movq [rdi+rcx+8], mm4
movq [rdi+rcx*2], mm2
movq [rdi+rcx*2+8], mm5
lea rdi, [rdi+rcx*2]
add rdi, rcx ; dst += 3 * dst_stride
; rows 3-5
movq mm0, [rsi]
movq mm3, [rsi+8];
movq mm1, [rsi+rax]
movq mm4, [rsi+rax+8]
movq mm2, [rsi+rax*2]
movq mm5, [rsi+rax*2+8]
lea rsi, [rsi+rax*2]
add rsi, rax
movq [rdi], mm0
movq [rdi+8], mm3
movq [rdi+rcx], mm1
movq [rdi+rcx+8], mm4
movq [rdi+rcx*2], mm2
movq [rdi+rcx*2+8], mm5
lea rdi, [rdi+rcx*2]
add rdi, rcx
; rows 6-8
movq mm0, [rsi]
movq mm3, [rsi+8];
movq mm1, [rsi+rax]
movq mm4, [rsi+rax+8]
movq mm2, [rsi+rax*2]
movq mm5, [rsi+rax*2+8]
lea rsi, [rsi+rax*2]
add rsi, rax
movq [rdi], mm0
movq [rdi+8], mm3
movq [rdi+rcx], mm1
movq [rdi+rcx+8], mm4
movq [rdi+rcx*2], mm2
movq [rdi+rcx*2+8], mm5
lea rdi, [rdi+rcx*2]
add rdi, rcx
; rows 9-11
movq mm0, [rsi]
movq mm3, [rsi+8];
movq mm1, [rsi+rax]
movq mm4, [rsi+rax+8]
movq mm2, [rsi+rax*2]
movq mm5, [rsi+rax*2+8]
lea rsi, [rsi+rax*2]
add rsi, rax
movq [rdi], mm0
movq [rdi+8], mm3
movq [rdi+rcx], mm1
movq [rdi+rcx+8], mm4
movq [rdi+rcx*2], mm2
movq [rdi+rcx*2+8], mm5
lea rdi, [rdi+rcx*2]
add rdi, rcx
; rows 12-14
movq mm0, [rsi]
movq mm3, [rsi+8];
movq mm1, [rsi+rax]
movq mm4, [rsi+rax+8]
movq mm2, [rsi+rax*2]
movq mm5, [rsi+rax*2+8]
lea rsi, [rsi+rax*2]
add rsi, rax
movq [rdi], mm0
movq [rdi+8], mm3
movq [rdi+rcx], mm1
movq [rdi+rcx+8], mm4
movq [rdi+rcx*2], mm2
movq [rdi+rcx*2+8], mm5
lea rdi, [rdi+rcx*2]
add rdi, rcx
; row 15
movq mm0, [rsi]
movq mm3, [rsi+8];
movq [rdi], mm0
movq [rdi+8], mm3
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret

View File

@ -204,163 +204,6 @@ sym(vp8_filter_block1dc_v6_mmx):
ret
;void vp8_bilinear_predict8x8_mmx
;(
; unsigned char *src_ptr,
; int src_pixels_per_line,
; int xoffset,
; int yoffset,
; unsigned char *dst_ptr,
; int dst_pitch
;)
;
; Two-tap horizontal + two-tap vertical filter producing 8 output rows of
; 8 bytes each. Filter taps come from vp8_bilinear_filters_x86_8, indexed by
; xoffset (horizontal) and yoffset (vertical) at 32 bytes per entry.
; The first source row is filtered horizontally before the loop; each loop
; iteration filters the next source row horizontally, blends it vertically
; with the previous filtered row (kept packed in mm7) and stores one
; destination row.
;
; Register use inside the loop:
;   mm0 = zero (for byte -> word unpacking)
;   mm1 = words at [HFilter], mm2 = words at [HFilter+16]
;   rax = VFilter, rdx = src_pixels_per_line
;   rcx = dst_ptr + 8 * dst_pitch (loop end sentinel)
global sym(vp8_bilinear_predict8x8_mmx) PRIVATE
sym(vp8_bilinear_predict8x8_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
GET_GOT rbx
push rsi
push rdi
; end prolog
;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
movsxd rax, dword ptr arg(2) ;xoffset
mov rdi, arg(4) ;dst_ptr ;
shl rax, 5 ; offset * 32
lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
add rax, rcx ; HFilter
mov rsi, arg(0) ;src_ptr ;
movsxd rdx, dword ptr arg(5) ;dst_pitch
movq mm1, [rax] ; first horizontal tap
movq mm2, [rax+16] ; second horizontal tap
movsxd rax, dword ptr arg(3) ;yoffset
pxor mm0, mm0 ; mm0 = 0
shl rax, 5 ; offset*32
add rax, rcx ; VFilter
lea rcx, [rdi+rdx*8] ; end of destination: dst_ptr + 8 * dst_pitch
movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
; get the first horizontal line done ;
movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
movq mm4, mm3 ; make a copy of current line
punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
punpckhbw mm4, mm0 ;
pmullw mm3, mm1 ;
pmullw mm4, mm1 ;
movq mm5, [rsi+1] ; same line, one byte to the right
movq mm6, mm5 ;
punpcklbw mm5, mm0 ;
punpckhbw mm6, mm0 ;
pmullw mm5, mm2 ;
pmullw mm6, mm2 ;
paddw mm3, mm5 ;
paddw mm4, mm6 ;
paddw mm3, [GLOBAL(rd)] ; mm3 += round value
psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= VP8_FILTER_SHIFT
paddw mm4, [GLOBAL(rd)] ;
psraw mm4, VP8_FILTER_SHIFT ;
movq mm7, mm3 ;
packuswb mm7, mm4 ; mm7 = first filtered row, packed to bytes
add rsi, rdx ; next line
.next_row_8x8:
; horizontal pass on the current source line
movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
movq mm4, mm3 ; make a copy of current line
punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
punpckhbw mm4, mm0 ;
pmullw mm3, mm1 ;
pmullw mm4, mm1 ;
movq mm5, [rsi+1] ; same line, one byte to the right
movq mm6, mm5 ;
punpcklbw mm5, mm0 ;
punpckhbw mm6, mm0 ;
pmullw mm5, mm2 ;
pmullw mm6, mm2 ;
paddw mm3, mm5 ;
paddw mm4, mm6 ;
; previous filtered row (mm7) times the first vertical tap
movq mm5, mm7 ;
movq mm6, mm7 ;
punpcklbw mm5, mm0 ;
punpckhbw mm6, mm0
pmullw mm5, [rax] ;
pmullw mm6, [rax] ;
paddw mm3, [GLOBAL(rd)] ; mm3 += round value
psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= VP8_FILTER_SHIFT
paddw mm4, [GLOBAL(rd)] ;
psraw mm4, VP8_FILTER_SHIFT ;
movq mm7, mm3 ;
packuswb mm7, mm4 ; keep this filtered row for the next iteration
; current filtered row times the second vertical tap, then combine
pmullw mm3, [rax+16] ;
pmullw mm4, [rax+16] ;
paddw mm3, mm5 ;
paddw mm4, mm6 ;
paddw mm3, [GLOBAL(rd)] ; mm3 += round value
psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= VP8_FILTER_SHIFT
paddw mm4, [GLOBAL(rd)] ;
psraw mm4, VP8_FILTER_SHIFT ;
packuswb mm3, mm4
movq [rdi], mm3 ; store the results in the destination
%if ABI_IS_32BIT
add rsi, rdx ; next line
add rdi, dword ptr arg(5) ;dst_pitch ;
%else
movsxd r8, dword ptr arg(5) ;dst_pitch
add rsi, rdx ; next line
add rdi, r8 ;dst_pitch
%endif
cmp rdi, rcx ; stop once dst reaches the end sentinel
jne .next_row_8x8
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
;void bilinear_predict8x4_mmx
;(
; unsigned char *src_ptr,
@ -641,8 +484,8 @@ rd:
times 4 dw 0x40
align 16
global HIDDEN_DATA(sym(vp8_six_tap_mmx))
sym(vp8_six_tap_mmx):
global HIDDEN_DATA(sym(vp8_six_tap_x86))
sym(vp8_six_tap_x86):
times 8 dw 0
times 8 dw 0
times 8 dw 128

View File

@ -13,7 +13,7 @@
#include "vpx_ports/mem.h"
#include "filter_x86.h"
extern const short vp8_six_tap_mmx[8][6 * 8];
extern const short vp8_six_tap_x86[8][6 * 8];
extern void vp8_filter_block1d_h6_mmx(unsigned char *src_ptr,
unsigned short *output_ptr,
@ -82,103 +82,13 @@ void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line,
DECLARE_ALIGNED(16, unsigned short,
FData2[16 * 16]); /* Temp data bufffer used in filtering */
const short *HFilter, *VFilter;
HFilter = vp8_six_tap_mmx[xoffset];
HFilter = vp8_six_tap_x86[xoffset];
vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2,
src_pixels_per_line, 1, 9, 8, HFilter);
VFilter = vp8_six_tap_mmx[yoffset];
VFilter = vp8_six_tap_x86[yoffset];
vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4, 4, 4,
VFilter);
}
/*
 * 16x16 six-tap prediction built from the 4-column-wide MMX filter kernels.
 *
 * The horizontal pass starts two source rows above src_ptr and writes into
 * FData2; the vertical pass reads FData2 starting at offset 32 and writes to
 * dst_ptr. Both passes are run once per 4-column strip (offsets 0, 4, 8, 12),
 * with all horizontal strips completed before the first vertical strip.
 * Filter taps are selected from vp8_six_tap_mmx by xoffset and yoffset.
 */
void vp8_sixtap_predict16x16_mmx(unsigned char *src_ptr,
                                 int src_pixels_per_line, int xoffset,
                                 int yoffset, unsigned char *dst_ptr,
                                 int dst_pitch) {
  DECLARE_ALIGNED(16, unsigned short,
                  FData2[24 * 24]); /* Temp data buffer used in filtering */
  const short *const h_taps = vp8_six_tap_mmx[xoffset];
  const short *const v_taps = vp8_six_tap_mmx[yoffset];
  unsigned char *const first_row = src_ptr - (2 * src_pixels_per_line);
  int col;

  for (col = 0; col < 16; col += 4) {
    vp8_filter_block1d_h6_mmx(first_row + col, FData2 + col,
                              src_pixels_per_line, 1, 21, 32, h_taps);
  }

  for (col = 0; col < 16; col += 4) {
    vp8_filter_block1dc_v6_mmx(FData2 + 32 + col, dst_ptr + col, dst_pitch, 32,
                               16, 16, 16, v_taps);
  }
}
/*
 * 8x8 six-tap prediction built from the 4-column-wide MMX filter kernels.
 *
 * The horizontal pass starts two source rows above src_ptr and writes into
 * FData2; the vertical pass reads FData2 starting at offset 16 and writes to
 * dst_ptr. Both passes are run once per 4-column strip (offsets 0 and 4),
 * with both horizontal strips completed before the first vertical strip.
 * Filter taps are selected from vp8_six_tap_mmx by xoffset and yoffset.
 */
void vp8_sixtap_predict8x8_mmx(unsigned char *src_ptr, int src_pixels_per_line,
                               int xoffset, int yoffset, unsigned char *dst_ptr,
                               int dst_pitch) {
  DECLARE_ALIGNED(16, unsigned short,
                  FData2[256]); /* Temp data buffer used in filtering */
  const short *const h_taps = vp8_six_tap_mmx[xoffset];
  const short *const v_taps = vp8_six_tap_mmx[yoffset];
  unsigned char *const first_row = src_ptr - (2 * src_pixels_per_line);
  int col;

  for (col = 0; col < 8; col += 4) {
    vp8_filter_block1d_h6_mmx(first_row + col, FData2 + col,
                              src_pixels_per_line, 1, 13, 16, h_taps);
  }

  for (col = 0; col < 8; col += 4) {
    vp8_filter_block1dc_v6_mmx(FData2 + 16 + col, dst_ptr + col, dst_pitch, 16,
                               8, 8, 8, v_taps);
  }
}
/*
 * 8x4 six-tap prediction built from the 4-column-wide MMX filter kernels.
 *
 * The horizontal pass starts two source rows above src_ptr and writes into
 * FData2; the vertical pass reads FData2 starting at offset 16 and writes to
 * dst_ptr. Both passes are run once per 4-column strip (offsets 0 and 4),
 * with both horizontal strips completed before the first vertical strip.
 * Filter taps are selected from vp8_six_tap_mmx by xoffset and yoffset.
 */
void vp8_sixtap_predict8x4_mmx(unsigned char *src_ptr, int src_pixels_per_line,
                               int xoffset, int yoffset, unsigned char *dst_ptr,
                               int dst_pitch) {
  DECLARE_ALIGNED(16, unsigned short,
                  FData2[256]); /* Temp data buffer used in filtering */
  const short *const h_taps = vp8_six_tap_mmx[xoffset];
  const short *const v_taps = vp8_six_tap_mmx[yoffset];
  unsigned char *const first_row = src_ptr - (2 * src_pixels_per_line);
  int col;

  for (col = 0; col < 8; col += 4) {
    vp8_filter_block1d_h6_mmx(first_row + col, FData2 + col,
                              src_pixels_per_line, 1, 9, 16, h_taps);
  }

  for (col = 0; col < 8; col += 4) {
    vp8_filter_block1dc_v6_mmx(FData2 + 16 + col, dst_ptr + col, dst_pitch, 16,
                               8, 4, 8, v_taps);
  }
}
/*
 * 16x16 bilinear prediction composed of four 8x8 MMX predictions.
 *
 * The quadrants are processed in the order top-left, top-right, bottom-left,
 * bottom-right; each call receives the same xoffset/yoffset and the source
 * and destination pointers offset by the quadrant's row and column.
 */
void vp8_bilinear_predict16x16_mmx(unsigned char *src_ptr,
                                   int src_pixels_per_line, int xoffset,
                                   int yoffset, unsigned char *dst_ptr,
                                   int dst_pitch) {
  int row, col;

  for (row = 0; row < 16; row += 8) {
    for (col = 0; col < 16; col += 8) {
      vp8_bilinear_predict8x8_mmx(src_ptr + row * src_pixels_per_line + col,
                                  src_pixels_per_line, xoffset, yoffset,
                                  dst_ptr + dst_pitch * row + col, dst_pitch);
    }
  }
}
#endif
#if HAVE_SSE2
@ -195,21 +105,21 @@ void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr,
if (xoffset) {
if (yoffset) {
HFilter = vp8_six_tap_mmx[xoffset];
HFilter = vp8_six_tap_x86[xoffset];
vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
src_pixels_per_line, 1, 21, 32, HFilter);
VFilter = vp8_six_tap_mmx[yoffset];
VFilter = vp8_six_tap_x86[yoffset];
vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16,
dst_pitch, VFilter);
} else {
/* First-pass only */
HFilter = vp8_six_tap_mmx[xoffset];
HFilter = vp8_six_tap_x86[xoffset];
vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
dst_pitch, 16, HFilter);
}
} else {
/* Second-pass only */
VFilter = vp8_six_tap_mmx[yoffset];
VFilter = vp8_six_tap_x86[yoffset];
vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
src_pixels_per_line, 21, 32);
vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16,
@ -226,21 +136,21 @@ void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line,
if (xoffset) {
if (yoffset) {
HFilter = vp8_six_tap_mmx[xoffset];
HFilter = vp8_six_tap_x86[xoffset];
vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
src_pixels_per_line, 1, 13, 16, HFilter);
VFilter = vp8_six_tap_mmx[yoffset];
VFilter = vp8_six_tap_x86[yoffset];
vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 8,
dst_pitch, VFilter);
} else {
/* First-pass only */
HFilter = vp8_six_tap_mmx[xoffset];
HFilter = vp8_six_tap_x86[xoffset];
vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
dst_pitch, 8, HFilter);
}
} else {
/* Second-pass only */
VFilter = vp8_six_tap_mmx[yoffset];
VFilter = vp8_six_tap_x86[yoffset];
vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
src_pixels_per_line, dst_ptr, dst_pitch, 8,
VFilter);
@ -256,21 +166,21 @@ void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line,
if (xoffset) {
if (yoffset) {
HFilter = vp8_six_tap_mmx[xoffset];
HFilter = vp8_six_tap_x86[xoffset];
vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
src_pixels_per_line, 1, 9, 16, HFilter);
VFilter = vp8_six_tap_mmx[yoffset];
VFilter = vp8_six_tap_x86[yoffset];
vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 4,
dst_pitch, VFilter);
} else {
/* First-pass only */
HFilter = vp8_six_tap_mmx[xoffset];
HFilter = vp8_six_tap_x86[xoffset];
vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
dst_pitch, 4, HFilter);
}
} else {
/* Second-pass only */
VFilter = vp8_six_tap_mmx[yoffset];
VFilter = vp8_six_tap_x86[yoffset];
vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
src_pixels_per_line, dst_ptr, dst_pitch, 4,
VFilter);

File diff suppressed because it is too large Load Diff

View File

@ -1,241 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
;
; Two-pass 4x4 transform of 16-bit samples using MMX registers.
;   input  - first of four rows of four shorts; consecutive rows are read
;            `pitch` BYTES apart (rax is added to the byte address unscaled)
;   output - receives 16 shorts (32 bytes) written as four 8-byte stores
;   pitch  - row stride of `input`, in bytes
; Each pass transposes the 4x4 block, forms the a1/b1/c1/d1 butterflies and
; combines c1/d1 with the 2217/5352 multipliers via pmaddwd.
global sym(vp8_short_fdct4x4_mmx) PRIVATE
sym(vp8_short_fdct4x4_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 3
GET_GOT rbx
push rsi
push rdi
; end prolog
mov rsi, arg(0) ; input
mov rdi, arg(1) ; output
movsxd rax, dword ptr arg(2) ;pitch
lea rcx, [rsi + rax*2] ; rcx = address of row 2
; read the input data
movq mm0, [rsi] ; row 0
movq mm1, [rsi + rax] ; row 1
movq mm2, [rcx] ; row 2
movq mm4, [rcx + rax] ; row 3
; transpose for the first stage
movq mm3, mm0 ; 00 01 02 03
movq mm5, mm2 ; 20 21 22 23
punpcklwd mm0, mm1 ; 00 10 01 11
punpckhwd mm3, mm1 ; 02 12 03 13
punpcklwd mm2, mm4 ; 20 30 21 31
punpckhwd mm5, mm4 ; 22 32 23 33
movq mm1, mm0 ; 00 10 01 11
punpckldq mm0, mm2 ; 00 10 20 30
punpckhdq mm1, mm2 ; 01 11 21 31
movq mm2, mm3 ; 02 12 03 13
punpckldq mm2, mm5 ; 02 12 22 32
punpckhdq mm3, mm5 ; 03 13 23 33
; mm0 0
; mm1 1
; mm2 2
; mm3 3
; first stage
movq mm5, mm0
movq mm4, mm1
paddw mm0, mm3 ; a1 = 0 + 3
paddw mm1, mm2 ; b1 = 1 + 2
psubw mm4, mm2 ; c1 = 1 - 2
psubw mm5, mm3 ; d1 = 0 - 3
; scale all four terms by 8 (left shift by 3)
psllw mm5, 3
psllw mm4, 3
psllw mm0, 3
psllw mm1, 3
; output 0 and 2
movq mm2, mm0 ; a1
paddw mm0, mm1 ; op[0] = a1 + b1
psubw mm2, mm1 ; op[2] = a1 - b1
; output 1 and 3
; interleave c1, d1
movq mm1, mm5 ; d1
punpcklwd mm1, mm4 ; d1 c1 d1 c1 (from the low halves)
punpckhwd mm5, mm4 ; d1 c1 d1 c1 (from the high halves)
movq mm3, mm1
movq mm4, mm5
pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
; add the first-pass rounding terms before the shift by 12
paddd mm1, MMWORD PTR[GLOBAL(_14500)]
paddd mm4, MMWORD PTR[GLOBAL(_14500)]
paddd mm3, MMWORD PTR[GLOBAL(_7500)]
paddd mm5, MMWORD PTR[GLOBAL(_7500)]
psrad mm1, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
psrad mm4, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
psrad mm3, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
psrad mm5, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
packssdw mm1, mm4 ; op[1]
packssdw mm3, mm5 ; op[3]
; done with vertical
; transpose for the second stage
movq mm4, mm0 ; 00 10 20 30
movq mm5, mm2 ; 02 12 22 32
punpcklwd mm0, mm1 ; 00 01 10 11
punpckhwd mm4, mm1 ; 20 21 30 31
punpcklwd mm2, mm3 ; 02 03 12 13
punpckhwd mm5, mm3 ; 22 23 32 33
movq mm1, mm0 ; 00 01 10 11
punpckldq mm0, mm2 ; 00 01 02 03
punpckhdq mm1, mm2 ; 10 11 12 13
movq mm2, mm4 ; 20 21 30 31
punpckldq mm2, mm5 ; 20 21 22 23
punpckhdq mm4, mm5 ; 30 31 32 33
; mm0 0
; mm1 1
; mm2 2
; mm4 3
movq mm5, mm0
movq mm3, mm1
paddw mm0, mm4 ; a1 = 0 + 3
paddw mm1, mm2 ; b1 = 1 + 2
psubw mm3, mm2 ; c1 = 1 - 2
psubw mm5, mm4 ; d1 = 0 - 3
pxor mm6, mm6 ; zero out for compare
pcmpeqw mm6, mm5 ; 0xFFFF in each word where d1 == 0
pandn mm6, MMWORD PTR[GLOBAL(_cmp_mask)] ; (~mm6) & 1:
; each word becomes 1 where d1 != 0, else 0
; output 0 and 2
movq mm2, mm0 ; a1
paddw mm0, mm1 ; a1 + b1
psubw mm2, mm1 ; a1 - b1
paddw mm0, MMWORD PTR[GLOBAL(_7w)]
paddw mm2, MMWORD PTR[GLOBAL(_7w)]
psraw mm0, 4 ; op[0] = (a1 + b1 + 7)>>4
psraw mm2, 4 ; op[8] = (a1 - b1 + 7)>>4
movq MMWORD PTR[rdi + 0 ], mm0
movq MMWORD PTR[rdi + 16], mm2
; output 1 and 3
; interleave c1, d1
movq mm1, mm5 ; d1
punpcklwd mm1, mm3 ; d1 c1 d1 c1 (from the low halves)
punpckhwd mm5, mm3 ; d1 c1 d1 c1 (from the high halves)
movq mm3, mm1
movq mm4, mm5
pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
; add the second-pass rounding terms before the shift by 16
paddd mm1, MMWORD PTR[GLOBAL(_12000)]
paddd mm4, MMWORD PTR[GLOBAL(_12000)]
paddd mm3, MMWORD PTR[GLOBAL(_51000)]
paddd mm5, MMWORD PTR[GLOBAL(_51000)]
psrad mm1, 16 ; (c1 * 2217 + d1 * 5352 + 12000)>>16
psrad mm4, 16 ; (c1 * 2217 + d1 * 5352 + 12000)>>16
psrad mm3, 16 ; (d1 * 2217 - c1 * 5352 + 51000)>>16
psrad mm5, 16 ; (d1 * 2217 - c1 * 5352 + 51000)>>16
packssdw mm1, mm4 ; op[4]
packssdw mm3, mm5 ; op[12]
paddw mm1, mm6 ; op[4] += (d1!=0)
movq MMWORD PTR[rdi + 8 ], mm1
movq MMWORD PTR[rdi + 24], mm3
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
; Read-only constants used by vp8_short_fdct4x4_mmx. Each is 8 bytes wide so
; it can be used directly as an MMX memory operand.
SECTION_RODATA
; pmaddwd multiplier pairs (5352, 2217), repeated twice
align 8
_5352_2217:
dw 5352
dw 2217
dw 5352
dw 2217
; pmaddwd multiplier pairs (2217, -5352), repeated twice
align 8
_2217_neg5352:
dw 2217
dw -5352
dw 2217
dw -5352
; a 1 in every word; combined with pandn to build the (d1 != 0) increment
align 8
_cmp_mask:
times 4 dw 1
; 7 in every word; added before the arithmetic shift right by 4
align 8
_7w:
times 4 dw 7
; dword rounding terms added before the first-pass shift right by 12
align 8
_14500:
times 2 dd 14500
align 8
_7500:
times 2 dd 7500
; dword rounding terms added before the second-pass shift right by 16
align 8
_12000:
times 2 dd 12000
align 8
_51000:
times 2 dd 51000

View File

@ -11,9 +11,9 @@
%include "vpx_ports/x86_abi_support.asm"
;int vp8_block_error_xmm(short *coeff_ptr, short *dcoef_ptr)
global sym(vp8_block_error_xmm) PRIVATE
sym(vp8_block_error_xmm):
;int vp8_block_error_sse2(short *coeff_ptr, short *dcoef_ptr)
global sym(vp8_block_error_sse2) PRIVATE
sym(vp8_block_error_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 2
@ -59,152 +59,9 @@ sym(vp8_block_error_xmm):
pop rbp
ret
;int vp8_block_error_mmx(short *coeff_ptr, short *dcoef_ptr)
;
; Returns the sum of squared differences between the 16 shorts at coeff_ptr
; and the 16 shorts at dcoef_ptr (four 8-byte loads from each pointer).
global sym(vp8_block_error_mmx) PRIVATE
sym(vp8_block_error_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 2
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;coeff_ptr
pxor mm7, mm7 ; mm7 = 0
mov rdi, arg(1) ;dcoef_ptr
movq mm3, [rsi] ; coeff[0..3]
movq mm4, [rdi] ; dcoef[0..3]
movq mm5, [rsi+8] ; coeff[4..7]
movq mm6, [rdi+8] ; dcoef[4..7]
pxor mm1, mm1 ; from movd mm1, dc ; dc =0
movq mm2, mm7
psubw mm5, mm6 ; diff[4..7]
por mm1, mm2 ; still zero
pmaddwd mm5, mm5 ; pairwise sums of diff[4..7]^2
pcmpeqw mm1, mm7 ; 0 == 0 in every word, so the mask is all ones
psubw mm3, mm4 ; diff[0..3]
pand mm1, mm3 ; all-ones mask keeps diff[0..3] unchanged
pmaddwd mm1, mm1 ; pairwise sums of diff[0..3]^2
paddd mm1, mm5
movq mm3, [rsi+16] ; coeff[8..11]
movq mm4, [rdi+16] ; dcoef[8..11]
movq mm5, [rsi+24] ; coeff[12..15]
movq mm6, [rdi+24] ; dcoef[12..15]
psubw mm5, mm6
pmaddwd mm5, mm5
psubw mm3, mm4
pmaddwd mm3, mm3
paddd mm3, mm5
paddd mm1, mm3 ; mm1 = two dword partial sums
; horizontal add: the low dword of mm0 becomes the total
movq mm0, mm1
psrlq mm1, 32
paddd mm0, mm1
; NOTE(review): the upper dword of mm0 still holds a partial sum; the
; prototype returns int, so presumably only the low 32 bits are consumed.
movq rax, mm0
pop rdi
pop rsi
; begin epilog
UNSHADOW_ARGS
pop rbp
ret
;int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
;
; Accumulates the sum of squared differences over 16 iterations of 16 shorts
; each (256 shorts from each pointer). `dc` builds a word mask that is ANDed
; with the first four differences of every 16-short group: a mask word is
; 0xFFFF where the matching 16-bit word of `dc` is zero, and 0 otherwise.
; With dc == 0 nothing is masked; if the low 16 bits of dc are non-zero, the
; first difference of each group is dropped from the sum.
global sym(vp8_mbblock_error_mmx_impl) PRIVATE
sym(vp8_mbblock_error_mmx_impl):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 3
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;coeff_ptr
pxor mm7, mm7 ; mm7 = 0
mov rdi, arg(1) ;dcoef_ptr
pxor mm2, mm2 ; mm2 = running sum (two dwords)
movd mm1, dword ptr arg(2) ;dc
por mm1, mm2 ; mm2 is zero here, so mm1 is unchanged
pcmpeqw mm1, mm7 ; per-word mask: 0xFFFF where the dc word is 0
mov rcx, 16 ; loop counter: 16 groups of 16 shorts
.mberror_loop_mmx:
movq mm3, [rsi]
movq mm4, [rdi]
movq mm5, [rsi+8]
movq mm6, [rdi+8]
psubw mm5, mm6
pmaddwd mm5, mm5
psubw mm3, mm4
pand mm3, mm1 ; apply the dc mask to differences 0..3 of the group
pmaddwd mm3, mm3
paddd mm2, mm5
paddd mm2, mm3
movq mm3, [rsi+16]
movq mm4, [rdi+16]
movq mm5, [rsi+24]
movq mm6, [rdi+24]
psubw mm5, mm6
pmaddwd mm5, mm5
psubw mm3, mm4
pmaddwd mm3, mm3
paddd mm2, mm5
paddd mm2, mm3
add rsi, 32 ; advance both pointers by 16 shorts
add rdi, 32
sub rcx, 1
jnz .mberror_loop_mmx
; horizontal add: the low dword of mm0 becomes the total
movq mm0, mm2
psrlq mm2, 32
paddd mm0, mm2
movq rax, mm0
pop rdi
pop rsi
; begin epilog
UNSHADOW_ARGS
pop rbp
ret
;int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
global sym(vp8_mbblock_error_xmm_impl) PRIVATE
sym(vp8_mbblock_error_xmm_impl):
;int vp8_mbblock_error_sse2_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
global sym(vp8_mbblock_error_sse2_impl) PRIVATE
sym(vp8_mbblock_error_sse2_impl):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 3
@ -272,66 +129,9 @@ sym(vp8_mbblock_error_xmm_impl):
ret
;int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
;
; Returns the sum of squared differences between s_ptr and d_ptr over
; 16 iterations of 8 shorts each (128 shorts from each pointer).
global sym(vp8_mbuverror_mmx_impl) PRIVATE
sym(vp8_mbuverror_mmx_impl):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 2
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;s_ptr
mov rdi, arg(1) ;d_ptr
mov rcx, 16 ; loop counter
pxor mm7, mm7 ; mm7 = running sum (two dwords)
.mbuverror_loop_mmx:
movq mm1, [rsi]
movq mm2, [rdi]
psubw mm1, mm2
pmaddwd mm1, mm1 ; pairwise sums of squared differences 0..3
movq mm3, [rsi+8]
movq mm4, [rdi+8]
psubw mm3, mm4
pmaddwd mm3, mm3 ; pairwise sums of squared differences 4..7
paddd mm7, mm1
paddd mm7, mm3
add rsi, 16 ; advance both pointers by 8 shorts
add rdi, 16
dec rcx
jnz .mbuverror_loop_mmx
; horizontal add: the low dword of mm0 becomes the total
movq mm0, mm7
psrlq mm7, 32
paddd mm0, mm7
movq rax, mm0
pop rdi
pop rsi
; begin epilog
UNSHADOW_ARGS
pop rbp
ret
;int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
global sym(vp8_mbuverror_xmm_impl) PRIVATE
sym(vp8_mbuverror_xmm_impl):
;int vp8_mbuverror_sse2_impl(short *s_ptr, short *d_ptr);
global sym(vp8_mbuverror_sse2_impl) PRIVATE
sym(vp8_mbuverror_sse2_impl):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 2

View File

@ -13,12 +13,6 @@
#include "vpx_ports/x86.h"
#include "vp8/encoder/block.h"
/* 4x4 transform implemented in assembly; declared here for the wrapper. */
void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch);

/* Transforms an 8x4 area as two side-by-side 4x4 blocks: the second block
 * starts 4 samples into the input row and 16 coefficients into the output. */
void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch) {
  int half;
  for (half = 0; half < 2; ++half) {
    vp8_short_fdct4x4_mmx(input + 4 * half, output + 16 * half, pitch);
  }
}
int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
short *qcoeff_ptr, short *dequant_ptr,
const short *scan_mask, short *round_ptr,
@ -38,17 +32,3 @@ void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d) {
round_ptr, quant_ptr, dqcoeff_ptr);
}
/* Assembly routine that does the actual accumulation. */
int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);

/* Forwards the first block's coefficients and dequantized coefficients of
 * the macroblock, together with dc, to the assembly implementation. */
int vp8_mbblock_error_mmx(MACROBLOCK *mb, int dc) {
  return vp8_mbblock_error_mmx_impl(mb->block[0].coeff,
                                    mb->e_mbd.block[0].dqcoeff, dc);
}
/* Assembly routine that does the actual accumulation. */
int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);

/* Forwards the macroblock's coefficient and dequantized-coefficient buffers,
 * both starting at element 256, to the assembly implementation. */
int vp8_mbuverror_mmx(MACROBLOCK *mb) {
  return vp8_mbuverror_mmx_impl(mb->coeff + 256, mb->e_mbd.dqcoeff + 256);
}

View File

@ -13,16 +13,16 @@
#include "vpx_ports/x86.h"
#include "vp8/encoder/block.h"
int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
int vp8_mbblock_error_xmm(MACROBLOCK *mb, int dc) {
int vp8_mbblock_error_sse2_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
int vp8_mbblock_error_sse2(MACROBLOCK *mb, int dc) {
short *coeff_ptr = mb->block[0].coeff;
short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff;
return vp8_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr, dc);
return vp8_mbblock_error_sse2_impl(coeff_ptr, dcoef_ptr, dc);
}
int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
int vp8_mbuverror_xmm(MACROBLOCK *mb) {
int vp8_mbuverror_sse2_impl(short *s_ptr, short *d_ptr);
int vp8_mbuverror_sse2(MACROBLOCK *mb) {
short *s_ptr = &mb->coeff[256];
short *d_ptr = &mb->e_mbd.dqcoeff[256];
return vp8_mbuverror_xmm_impl(s_ptr, d_ptr);
return vp8_mbuverror_sse2_impl(s_ptr, d_ptr);
}

View File

@ -80,8 +80,6 @@ VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.c
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/dequantize_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idct_blk_mmx.c
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idctllm_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/iwalsh_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp8_loopfilter_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/copy_sse2.asm

View File

@ -76,7 +76,6 @@ VP8_CX_SRCS_REMOVE-yes += encoder/firstpass.c
VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.c
endif
VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/dct_mmx.asm
VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm