Merge changes I158f631a,I0555f639
* changes: vp8: remove mmx functions Rename _xmm functions to _sse2
This commit is contained in:
commit
cb4aa6d589
@ -298,10 +298,7 @@ INSTANTIATE_TEST_CASE_P(
|
||||
#if HAVE_MMX
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
MMX, SixtapPredictTest,
|
||||
::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_mmx),
|
||||
make_tuple(8, 8, &vp8_sixtap_predict8x8_mmx),
|
||||
make_tuple(8, 4, &vp8_sixtap_predict8x4_mmx),
|
||||
make_tuple(4, 4, &vp8_sixtap_predict4x4_mmx)));
|
||||
::testing::Values(make_tuple(4, 4, &vp8_sixtap_predict4x4_mmx)));
|
||||
#endif
|
||||
#if HAVE_SSE2
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
@ -353,9 +350,7 @@ INSTANTIATE_TEST_CASE_P(
|
||||
#if HAVE_MMX
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
MMX, BilinearPredictTest,
|
||||
::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_mmx),
|
||||
make_tuple(8, 8, &vp8_bilinear_predict8x8_mmx),
|
||||
make_tuple(8, 4, &vp8_bilinear_predict8x4_mmx),
|
||||
::testing::Values(make_tuple(8, 4, &vp8_bilinear_predict8x4_mmx),
|
||||
make_tuple(4, 4, &vp8_bilinear_predict4x4_mmx)));
|
||||
#endif
|
||||
#if HAVE_SSE2
|
||||
|
@ -28,55 +28,51 @@ add_proto qw/void vp8_dequant_idct_add/, "short *input, short *dq, unsigned char
|
||||
specialize qw/vp8_dequant_idct_add mmx neon dspr2 msa/;
|
||||
|
||||
add_proto qw/void vp8_dequant_idct_add_y_block/, "short *q, short *dq, unsigned char *dst, int stride, char *eobs";
|
||||
specialize qw/vp8_dequant_idct_add_y_block mmx sse2 neon dspr2 msa/;
|
||||
specialize qw/vp8_dequant_idct_add_y_block sse2 neon dspr2 msa/;
|
||||
|
||||
add_proto qw/void vp8_dequant_idct_add_uv_block/, "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs";
|
||||
specialize qw/vp8_dequant_idct_add_uv_block mmx sse2 neon dspr2 msa/;
|
||||
specialize qw/vp8_dequant_idct_add_uv_block sse2 neon dspr2 msa/;
|
||||
|
||||
#
|
||||
# Loopfilter
|
||||
#
|
||||
add_proto qw/void vp8_loop_filter_mbv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
|
||||
specialize qw/vp8_loop_filter_mbv mmx sse2 neon dspr2 msa/;
|
||||
specialize qw/vp8_loop_filter_mbv sse2 neon dspr2 msa/;
|
||||
|
||||
add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
|
||||
specialize qw/vp8_loop_filter_bv mmx sse2 neon dspr2 msa/;
|
||||
specialize qw/vp8_loop_filter_bv sse2 neon dspr2 msa/;
|
||||
|
||||
add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
|
||||
specialize qw/vp8_loop_filter_mbh mmx sse2 neon dspr2 msa/;
|
||||
specialize qw/vp8_loop_filter_mbh sse2 neon dspr2 msa/;
|
||||
|
||||
add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
|
||||
specialize qw/vp8_loop_filter_bh mmx sse2 neon dspr2 msa/;
|
||||
specialize qw/vp8_loop_filter_bh sse2 neon dspr2 msa/;
|
||||
|
||||
|
||||
add_proto qw/void vp8_loop_filter_simple_mbv/, "unsigned char *y, int ystride, const unsigned char *blimit";
|
||||
specialize qw/vp8_loop_filter_simple_mbv mmx sse2 neon msa/;
|
||||
specialize qw/vp8_loop_filter_simple_mbv sse2 neon msa/;
|
||||
$vp8_loop_filter_simple_mbv_c=vp8_loop_filter_simple_vertical_edge_c;
|
||||
$vp8_loop_filter_simple_mbv_mmx=vp8_loop_filter_simple_vertical_edge_mmx;
|
||||
$vp8_loop_filter_simple_mbv_sse2=vp8_loop_filter_simple_vertical_edge_sse2;
|
||||
$vp8_loop_filter_simple_mbv_neon=vp8_loop_filter_mbvs_neon;
|
||||
$vp8_loop_filter_simple_mbv_msa=vp8_loop_filter_simple_vertical_edge_msa;
|
||||
|
||||
add_proto qw/void vp8_loop_filter_simple_mbh/, "unsigned char *y, int ystride, const unsigned char *blimit";
|
||||
specialize qw/vp8_loop_filter_simple_mbh mmx sse2 neon msa/;
|
||||
specialize qw/vp8_loop_filter_simple_mbh sse2 neon msa/;
|
||||
$vp8_loop_filter_simple_mbh_c=vp8_loop_filter_simple_horizontal_edge_c;
|
||||
$vp8_loop_filter_simple_mbh_mmx=vp8_loop_filter_simple_horizontal_edge_mmx;
|
||||
$vp8_loop_filter_simple_mbh_sse2=vp8_loop_filter_simple_horizontal_edge_sse2;
|
||||
$vp8_loop_filter_simple_mbh_neon=vp8_loop_filter_mbhs_neon;
|
||||
$vp8_loop_filter_simple_mbh_msa=vp8_loop_filter_simple_horizontal_edge_msa;
|
||||
|
||||
add_proto qw/void vp8_loop_filter_simple_bv/, "unsigned char *y, int ystride, const unsigned char *blimit";
|
||||
specialize qw/vp8_loop_filter_simple_bv mmx sse2 neon msa/;
|
||||
specialize qw/vp8_loop_filter_simple_bv sse2 neon msa/;
|
||||
$vp8_loop_filter_simple_bv_c=vp8_loop_filter_bvs_c;
|
||||
$vp8_loop_filter_simple_bv_mmx=vp8_loop_filter_bvs_mmx;
|
||||
$vp8_loop_filter_simple_bv_sse2=vp8_loop_filter_bvs_sse2;
|
||||
$vp8_loop_filter_simple_bv_neon=vp8_loop_filter_bvs_neon;
|
||||
$vp8_loop_filter_simple_bv_msa=vp8_loop_filter_bvs_msa;
|
||||
|
||||
add_proto qw/void vp8_loop_filter_simple_bh/, "unsigned char *y, int ystride, const unsigned char *blimit";
|
||||
specialize qw/vp8_loop_filter_simple_bh mmx sse2 neon msa/;
|
||||
specialize qw/vp8_loop_filter_simple_bh sse2 neon msa/;
|
||||
$vp8_loop_filter_simple_bh_c=vp8_loop_filter_bhs_c;
|
||||
$vp8_loop_filter_simple_bh_mmx=vp8_loop_filter_bhs_mmx;
|
||||
$vp8_loop_filter_simple_bh_sse2=vp8_loop_filter_bhs_sse2;
|
||||
$vp8_loop_filter_simple_bh_neon=vp8_loop_filter_bhs_neon;
|
||||
$vp8_loop_filter_simple_bh_msa=vp8_loop_filter_bhs_msa;
|
||||
@ -94,7 +90,7 @@ specialize qw/vp8_short_inv_walsh4x4_1 dspr2/;
|
||||
|
||||
#iwalsh16
|
||||
add_proto qw/void vp8_short_inv_walsh4x4/, "short *input, short *output";
|
||||
specialize qw/vp8_short_inv_walsh4x4 mmx sse2 neon dspr2 msa/;
|
||||
specialize qw/vp8_short_inv_walsh4x4 sse2 neon dspr2 msa/;
|
||||
|
||||
#idct1_scalar_add
|
||||
add_proto qw/void vp8_dc_only_idct_add/, "short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride";
|
||||
@ -104,7 +100,7 @@ specialize qw/vp8_dc_only_idct_add mmx neon dspr2 msa/;
|
||||
# RECON
|
||||
#
|
||||
add_proto qw/void vp8_copy_mem16x16/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch";
|
||||
specialize qw/vp8_copy_mem16x16 mmx sse2 neon dspr2 msa/;
|
||||
specialize qw/vp8_copy_mem16x16 sse2 neon dspr2 msa/;
|
||||
|
||||
add_proto qw/void vp8_copy_mem8x8/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch";
|
||||
specialize qw/vp8_copy_mem8x8 mmx neon dspr2 msa/;
|
||||
@ -136,22 +132,22 @@ if (vpx_config("CONFIG_POSTPROC") eq "yes") {
|
||||
# Subpixel
|
||||
#
|
||||
add_proto qw/void vp8_sixtap_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
|
||||
specialize qw/vp8_sixtap_predict16x16 mmx sse2 ssse3 neon dspr2 msa/;
|
||||
specialize qw/vp8_sixtap_predict16x16 sse2 ssse3 neon dspr2 msa/;
|
||||
|
||||
add_proto qw/void vp8_sixtap_predict8x8/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
|
||||
specialize qw/vp8_sixtap_predict8x8 mmx sse2 ssse3 neon dspr2 msa/;
|
||||
specialize qw/vp8_sixtap_predict8x8 sse2 ssse3 neon dspr2 msa/;
|
||||
|
||||
add_proto qw/void vp8_sixtap_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
|
||||
specialize qw/vp8_sixtap_predict8x4 mmx sse2 ssse3 neon dspr2 msa/;
|
||||
specialize qw/vp8_sixtap_predict8x4 sse2 ssse3 neon dspr2 msa/;
|
||||
|
||||
add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
|
||||
specialize qw/vp8_sixtap_predict4x4 mmx ssse3 neon dspr2 msa/;
|
||||
|
||||
add_proto qw/void vp8_bilinear_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
|
||||
specialize qw/vp8_bilinear_predict16x16 mmx sse2 ssse3 neon msa/;
|
||||
specialize qw/vp8_bilinear_predict16x16 sse2 ssse3 neon msa/;
|
||||
|
||||
add_proto qw/void vp8_bilinear_predict8x8/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
|
||||
specialize qw/vp8_bilinear_predict8x8 mmx sse2 ssse3 neon msa/;
|
||||
specialize qw/vp8_bilinear_predict8x8 sse2 ssse3 neon msa/;
|
||||
|
||||
add_proto qw/void vp8_bilinear_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
|
||||
specialize qw/vp8_bilinear_predict8x4 mmx neon msa/;
|
||||
@ -176,10 +172,10 @@ if ($opts{arch} =~ /x86/) {
|
||||
# Forward DCT
|
||||
#
|
||||
add_proto qw/void vp8_short_fdct4x4/, "short *input, short *output, int pitch";
|
||||
specialize qw/vp8_short_fdct4x4 mmx sse2 neon msa/;
|
||||
specialize qw/vp8_short_fdct4x4 sse2 neon msa/;
|
||||
|
||||
add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch";
|
||||
specialize qw/vp8_short_fdct8x4 mmx sse2 neon msa/;
|
||||
specialize qw/vp8_short_fdct8x4 sse2 neon msa/;
|
||||
|
||||
add_proto qw/void vp8_short_walsh4x4/, "short *input, short *output, int pitch";
|
||||
specialize qw/vp8_short_walsh4x4 sse2 neon msa/;
|
||||
@ -197,16 +193,13 @@ specialize qw/vp8_fast_quantize_b sse2 ssse3 neon msa/;
|
||||
# Block subtraction
|
||||
#
|
||||
add_proto qw/int vp8_block_error/, "short *coeff, short *dqcoeff";
|
||||
specialize qw/vp8_block_error mmx sse2 msa/;
|
||||
$vp8_block_error_sse2=vp8_block_error_xmm;
|
||||
specialize qw/vp8_block_error sse2 msa/;
|
||||
|
||||
add_proto qw/int vp8_mbblock_error/, "struct macroblock *mb, int dc";
|
||||
specialize qw/vp8_mbblock_error mmx sse2 msa/;
|
||||
$vp8_mbblock_error_sse2=vp8_mbblock_error_xmm;
|
||||
specialize qw/vp8_mbblock_error sse2 msa/;
|
||||
|
||||
add_proto qw/int vp8_mbuverror/, "struct macroblock *mb";
|
||||
specialize qw/vp8_mbuverror mmx sse2 msa/;
|
||||
$vp8_mbuverror_sse2=vp8_mbuverror_xmm;
|
||||
specialize qw/vp8_mbuverror sse2 msa/;
|
||||
|
||||
#
|
||||
# Motion search
|
||||
|
@ -21,91 +21,3 @@ void vp8_dequantize_b_mmx(BLOCKD *d, short *DQC) {
|
||||
|
||||
vp8_dequantize_b_impl_mmx(sq, dq, DQC);
|
||||
}
|
||||
|
||||
void vp8_dequant_idct_add_y_block_mmx(short *q, short *dq, unsigned char *dst,
|
||||
int stride, char *eobs) {
|
||||
int i;
|
||||
|
||||
for (i = 0; i < 4; ++i) {
|
||||
if (eobs[0] > 1) {
|
||||
vp8_dequant_idct_add_mmx(q, dq, dst, stride);
|
||||
} else if (eobs[0] == 1) {
|
||||
vp8_dc_only_idct_add_mmx(q[0] * dq[0], dst, stride, dst, stride);
|
||||
memset(q, 0, 2 * sizeof(q[0]));
|
||||
}
|
||||
|
||||
if (eobs[1] > 1) {
|
||||
vp8_dequant_idct_add_mmx(q + 16, dq, dst + 4, stride);
|
||||
} else if (eobs[1] == 1) {
|
||||
vp8_dc_only_idct_add_mmx(q[16] * dq[0], dst + 4, stride, dst + 4, stride);
|
||||
memset(q + 16, 0, 2 * sizeof(q[0]));
|
||||
}
|
||||
|
||||
if (eobs[2] > 1) {
|
||||
vp8_dequant_idct_add_mmx(q + 32, dq, dst + 8, stride);
|
||||
} else if (eobs[2] == 1) {
|
||||
vp8_dc_only_idct_add_mmx(q[32] * dq[0], dst + 8, stride, dst + 8, stride);
|
||||
memset(q + 32, 0, 2 * sizeof(q[0]));
|
||||
}
|
||||
|
||||
if (eobs[3] > 1) {
|
||||
vp8_dequant_idct_add_mmx(q + 48, dq, dst + 12, stride);
|
||||
} else if (eobs[3] == 1) {
|
||||
vp8_dc_only_idct_add_mmx(q[48] * dq[0], dst + 12, stride, dst + 12,
|
||||
stride);
|
||||
memset(q + 48, 0, 2 * sizeof(q[0]));
|
||||
}
|
||||
|
||||
q += 64;
|
||||
dst += 4 * stride;
|
||||
eobs += 4;
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_dequant_idct_add_uv_block_mmx(short *q, short *dq, unsigned char *dstu,
|
||||
unsigned char *dstv, int stride,
|
||||
char *eobs) {
|
||||
int i;
|
||||
|
||||
for (i = 0; i < 2; ++i) {
|
||||
if (eobs[0] > 1) {
|
||||
vp8_dequant_idct_add_mmx(q, dq, dstu, stride);
|
||||
} else if (eobs[0] == 1) {
|
||||
vp8_dc_only_idct_add_mmx(q[0] * dq[0], dstu, stride, dstu, stride);
|
||||
memset(q, 0, 2 * sizeof(q[0]));
|
||||
}
|
||||
|
||||
if (eobs[1] > 1) {
|
||||
vp8_dequant_idct_add_mmx(q + 16, dq, dstu + 4, stride);
|
||||
} else if (eobs[1] == 1) {
|
||||
vp8_dc_only_idct_add_mmx(q[16] * dq[0], dstu + 4, stride, dstu + 4,
|
||||
stride);
|
||||
memset(q + 16, 0, 2 * sizeof(q[0]));
|
||||
}
|
||||
|
||||
q += 32;
|
||||
dstu += 4 * stride;
|
||||
eobs += 2;
|
||||
}
|
||||
|
||||
for (i = 0; i < 2; ++i) {
|
||||
if (eobs[0] > 1) {
|
||||
vp8_dequant_idct_add_mmx(q, dq, dstv, stride);
|
||||
} else if (eobs[0] == 1) {
|
||||
vp8_dc_only_idct_add_mmx(q[0] * dq[0], dstv, stride, dstv, stride);
|
||||
memset(q, 0, 2 * sizeof(q[0]));
|
||||
}
|
||||
|
||||
if (eobs[1] > 1) {
|
||||
vp8_dequant_idct_add_mmx(q + 16, dq, dstv + 4, stride);
|
||||
} else if (eobs[1] == 1) {
|
||||
vp8_dc_only_idct_add_mmx(q[16] * dq[0], dstv + 4, stride, dstv + 4,
|
||||
stride);
|
||||
memset(q + 16, 0, 2 * sizeof(q[0]));
|
||||
}
|
||||
|
||||
q += 32;
|
||||
dstv += 4 * stride;
|
||||
eobs += 2;
|
||||
}
|
||||
}
|
||||
|
@ -1,140 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
%include "vpx_ports/x86_abi_support.asm"
|
||||
|
||||
;void vp8_short_inv_walsh4x4_mmx(short *input, short *output)
|
||||
global sym(vp8_short_inv_walsh4x4_mmx) PRIVATE
|
||||
sym(vp8_short_inv_walsh4x4_mmx):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 2
|
||||
; end prolog
|
||||
|
||||
mov rdx, arg(0)
|
||||
mov rax, 30003h
|
||||
|
||||
movq mm0, [rdx + 0] ;ip[0]
|
||||
movq mm1, [rdx + 8] ;ip[4]
|
||||
movq mm7, rax
|
||||
|
||||
movq mm2, [rdx + 16] ;ip[8]
|
||||
movq mm3, [rdx + 24] ;ip[12]
|
||||
punpcklwd mm7, mm7 ;0003000300030003h
|
||||
mov rdx, arg(1)
|
||||
|
||||
movq mm4, mm0
|
||||
movq mm5, mm1
|
||||
|
||||
paddw mm4, mm3 ;ip[0] + ip[12] aka al
|
||||
paddw mm5, mm2 ;ip[4] + ip[8] aka bl
|
||||
|
||||
movq mm6, mm4 ;temp al
|
||||
paddw mm4, mm5 ;al + bl
|
||||
psubw mm6, mm5 ;al - bl
|
||||
|
||||
psubw mm0, mm3 ;ip[0] - ip[12] aka d1
|
||||
psubw mm1, mm2 ;ip[4] - ip[8] aka c1
|
||||
|
||||
movq mm5, mm0 ;temp dl
|
||||
paddw mm0, mm1 ;dl + cl
|
||||
psubw mm5, mm1 ;dl - cl
|
||||
|
||||
; 03 02 01 00
|
||||
; 13 12 11 10
|
||||
; 23 22 21 20
|
||||
; 33 32 31 30
|
||||
|
||||
movq mm3, mm4 ; 03 02 01 00
|
||||
punpcklwd mm4, mm0 ; 11 01 10 00
|
||||
punpckhwd mm3, mm0 ; 13 03 12 02
|
||||
|
||||
movq mm1, mm6 ; 23 22 21 20
|
||||
punpcklwd mm6, mm5 ; 31 21 30 20
|
||||
punpckhwd mm1, mm5 ; 33 23 32 22
|
||||
|
||||
movq mm0, mm4 ; 11 01 10 00
|
||||
movq mm2, mm3 ; 13 03 12 02
|
||||
|
||||
punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0]
|
||||
punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4]
|
||||
|
||||
punpckldq mm2, mm1 ; 32 22 12 02 aka ip[8]
|
||||
punpckhdq mm3, mm1 ; 33 23 13 03 aka ip[12]
|
||||
;~~~~~~~~~~~~~~~~~~~~~
|
||||
movq mm1, mm0
|
||||
movq mm5, mm4
|
||||
paddw mm1, mm3 ;ip[0] + ip[12] aka al
|
||||
paddw mm5, mm2 ;ip[4] + ip[8] aka bl
|
||||
|
||||
movq mm6, mm1 ;temp al
|
||||
paddw mm1, mm5 ;al + bl
|
||||
psubw mm6, mm5 ;al - bl
|
||||
paddw mm1, mm7
|
||||
paddw mm6, mm7
|
||||
psraw mm1, 3
|
||||
psraw mm6, 3
|
||||
|
||||
psubw mm0, mm3 ;ip[0] - ip[12] aka d1
|
||||
psubw mm4, mm2 ;ip[4] - ip[8] aka c1
|
||||
|
||||
movq mm5, mm0 ;temp dl
|
||||
paddw mm0, mm4 ;dl + cl
|
||||
psubw mm5, mm4 ;dl - cl
|
||||
paddw mm0, mm7
|
||||
paddw mm5, mm7
|
||||
psraw mm0, 3
|
||||
psraw mm5, 3
|
||||
;~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
movd eax, mm1
|
||||
movd ecx, mm0
|
||||
psrlq mm0, 32
|
||||
psrlq mm1, 32
|
||||
mov word ptr[rdx+32*0], ax
|
||||
mov word ptr[rdx+32*1], cx
|
||||
shr eax, 16
|
||||
shr ecx, 16
|
||||
mov word ptr[rdx+32*4], ax
|
||||
mov word ptr[rdx+32*5], cx
|
||||
movd eax, mm1
|
||||
movd ecx, mm0
|
||||
mov word ptr[rdx+32*8], ax
|
||||
mov word ptr[rdx+32*9], cx
|
||||
shr eax, 16
|
||||
shr ecx, 16
|
||||
mov word ptr[rdx+32*12], ax
|
||||
mov word ptr[rdx+32*13], cx
|
||||
|
||||
movd eax, mm6
|
||||
movd ecx, mm5
|
||||
psrlq mm5, 32
|
||||
psrlq mm6, 32
|
||||
mov word ptr[rdx+32*2], ax
|
||||
mov word ptr[rdx+32*3], cx
|
||||
shr eax, 16
|
||||
shr ecx, 16
|
||||
mov word ptr[rdx+32*6], ax
|
||||
mov word ptr[rdx+32*7], cx
|
||||
movd eax, mm6
|
||||
movd ecx, mm5
|
||||
mov word ptr[rdx+32*10], ax
|
||||
mov word ptr[rdx+32*11], cx
|
||||
shr eax, 16
|
||||
shr ecx, 16
|
||||
mov word ptr[rdx+32*14], ax
|
||||
mov word ptr[rdx+32*15], cx
|
||||
|
||||
; begin epilog
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
@ -22,13 +22,6 @@
|
||||
#define prototype_simple_loopfilter(sym) \
|
||||
void sym(unsigned char *y, int ystride, const unsigned char *blimit)
|
||||
|
||||
prototype_loopfilter(vp8_mbloop_filter_vertical_edge_mmx);
|
||||
prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_mmx);
|
||||
prototype_loopfilter(vp8_loop_filter_vertical_edge_mmx);
|
||||
prototype_loopfilter(vp8_loop_filter_horizontal_edge_mmx);
|
||||
prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_mmx);
|
||||
prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_mmx);
|
||||
|
||||
#if HAVE_SSE2 && ARCH_X86_64
|
||||
prototype_loopfilter(vp8_loop_filter_bv_y_sse2);
|
||||
prototype_loopfilter(vp8_loop_filter_bh_y_sse2);
|
||||
@ -44,105 +37,6 @@ extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_sse2;
|
||||
extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_sse2;
|
||||
extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_sse2;
|
||||
|
||||
#if HAVE_MMX
|
||||
/* Horizontal MB filtering */
|
||||
void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr,
|
||||
unsigned char *v_ptr, int y_stride, int uv_stride,
|
||||
loop_filter_info *lfi) {
|
||||
vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim,
|
||||
lfi->hev_thr, 2);
|
||||
|
||||
if (u_ptr) {
|
||||
vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->mblim,
|
||||
lfi->lim, lfi->hev_thr, 1);
|
||||
}
|
||||
|
||||
if (v_ptr) {
|
||||
vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->mblim,
|
||||
lfi->lim, lfi->hev_thr, 1);
|
||||
}
|
||||
}
|
||||
|
||||
/* Vertical MB Filtering */
|
||||
void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr,
|
||||
unsigned char *v_ptr, int y_stride, int uv_stride,
|
||||
loop_filter_info *lfi) {
|
||||
vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim,
|
||||
lfi->hev_thr, 2);
|
||||
|
||||
if (u_ptr) {
|
||||
vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim,
|
||||
lfi->hev_thr, 1);
|
||||
}
|
||||
|
||||
if (v_ptr) {
|
||||
vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim,
|
||||
lfi->hev_thr, 1);
|
||||
}
|
||||
}
|
||||
|
||||
/* Horizontal B Filtering */
|
||||
void vp8_loop_filter_bh_mmx(unsigned char *y_ptr, unsigned char *u_ptr,
|
||||
unsigned char *v_ptr, int y_stride, int uv_stride,
|
||||
loop_filter_info *lfi) {
|
||||
vp8_loop_filter_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->blim,
|
||||
lfi->lim, lfi->hev_thr, 2);
|
||||
vp8_loop_filter_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->blim,
|
||||
lfi->lim, lfi->hev_thr, 2);
|
||||
vp8_loop_filter_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride,
|
||||
lfi->blim, lfi->lim, lfi->hev_thr, 2);
|
||||
|
||||
if (u_ptr) {
|
||||
vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride,
|
||||
lfi->blim, lfi->lim, lfi->hev_thr, 1);
|
||||
}
|
||||
|
||||
if (v_ptr) {
|
||||
vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride,
|
||||
lfi->blim, lfi->lim, lfi->hev_thr, 1);
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride,
|
||||
const unsigned char *blimit) {
|
||||
vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride,
|
||||
blimit);
|
||||
vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride,
|
||||
blimit);
|
||||
vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride,
|
||||
blimit);
|
||||
}
|
||||
|
||||
/* Vertical B Filtering */
|
||||
void vp8_loop_filter_bv_mmx(unsigned char *y_ptr, unsigned char *u_ptr,
|
||||
unsigned char *v_ptr, int y_stride, int uv_stride,
|
||||
loop_filter_info *lfi) {
|
||||
vp8_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->blim, lfi->lim,
|
||||
lfi->hev_thr, 2);
|
||||
vp8_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->blim, lfi->lim,
|
||||
lfi->hev_thr, 2);
|
||||
vp8_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->blim, lfi->lim,
|
||||
lfi->hev_thr, 2);
|
||||
|
||||
if (u_ptr) {
|
||||
vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->blim, lfi->lim,
|
||||
lfi->hev_thr, 1);
|
||||
}
|
||||
|
||||
if (v_ptr) {
|
||||
vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->blim, lfi->lim,
|
||||
lfi->hev_thr, 1);
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride,
|
||||
const unsigned char *blimit) {
|
||||
vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit);
|
||||
vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit);
|
||||
vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Horizontal MB filtering */
|
||||
#if HAVE_SSE2
|
||||
void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
|
||||
|
@ -117,158 +117,3 @@ sym(vp8_copy_mem8x4_mmx):
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
;void copy_mem16x16_mmx(
|
||||
; unsigned char *src,
|
||||
; int src_stride,
|
||||
; unsigned char *dst,
|
||||
; int dst_stride
|
||||
; )
|
||||
global sym(vp8_copy_mem16x16_mmx) PRIVATE
|
||||
sym(vp8_copy_mem16x16_mmx):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 4
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
mov rsi, arg(0) ;src;
|
||||
movsxd rax, dword ptr arg(1) ;src_stride;
|
||||
|
||||
mov rdi, arg(2) ;dst;
|
||||
movsxd rcx, dword ptr arg(3) ;dst_stride
|
||||
|
||||
movq mm0, [rsi]
|
||||
movq mm3, [rsi+8];
|
||||
|
||||
movq mm1, [rsi+rax]
|
||||
movq mm4, [rsi+rax+8]
|
||||
|
||||
movq mm2, [rsi+rax*2]
|
||||
movq mm5, [rsi+rax*2+8]
|
||||
|
||||
lea rsi, [rsi+rax*2]
|
||||
add rsi, rax
|
||||
|
||||
movq [rdi], mm0
|
||||
movq [rdi+8], mm3
|
||||
|
||||
movq [rdi+rcx], mm1
|
||||
movq [rdi+rcx+8], mm4
|
||||
|
||||
movq [rdi+rcx*2], mm2
|
||||
movq [rdi+rcx*2+8], mm5
|
||||
|
||||
lea rdi, [rdi+rcx*2]
|
||||
add rdi, rcx
|
||||
|
||||
movq mm0, [rsi]
|
||||
movq mm3, [rsi+8];
|
||||
|
||||
movq mm1, [rsi+rax]
|
||||
movq mm4, [rsi+rax+8]
|
||||
|
||||
movq mm2, [rsi+rax*2]
|
||||
movq mm5, [rsi+rax*2+8]
|
||||
|
||||
lea rsi, [rsi+rax*2]
|
||||
add rsi, rax
|
||||
|
||||
movq [rdi], mm0
|
||||
movq [rdi+8], mm3
|
||||
|
||||
movq [rdi+rcx], mm1
|
||||
movq [rdi+rcx+8], mm4
|
||||
|
||||
movq [rdi+rcx*2], mm2
|
||||
movq [rdi+rcx*2+8], mm5
|
||||
|
||||
lea rdi, [rdi+rcx*2]
|
||||
add rdi, rcx
|
||||
|
||||
movq mm0, [rsi]
|
||||
movq mm3, [rsi+8];
|
||||
|
||||
movq mm1, [rsi+rax]
|
||||
movq mm4, [rsi+rax+8]
|
||||
|
||||
movq mm2, [rsi+rax*2]
|
||||
movq mm5, [rsi+rax*2+8]
|
||||
|
||||
lea rsi, [rsi+rax*2]
|
||||
add rsi, rax
|
||||
|
||||
movq [rdi], mm0
|
||||
movq [rdi+8], mm3
|
||||
|
||||
movq [rdi+rcx], mm1
|
||||
movq [rdi+rcx+8], mm4
|
||||
|
||||
movq [rdi+rcx*2], mm2
|
||||
movq [rdi+rcx*2+8], mm5
|
||||
|
||||
lea rdi, [rdi+rcx*2]
|
||||
add rdi, rcx
|
||||
|
||||
movq mm0, [rsi]
|
||||
movq mm3, [rsi+8];
|
||||
|
||||
movq mm1, [rsi+rax]
|
||||
movq mm4, [rsi+rax+8]
|
||||
|
||||
movq mm2, [rsi+rax*2]
|
||||
movq mm5, [rsi+rax*2+8]
|
||||
|
||||
lea rsi, [rsi+rax*2]
|
||||
add rsi, rax
|
||||
|
||||
movq [rdi], mm0
|
||||
movq [rdi+8], mm3
|
||||
|
||||
movq [rdi+rcx], mm1
|
||||
movq [rdi+rcx+8], mm4
|
||||
|
||||
movq [rdi+rcx*2], mm2
|
||||
movq [rdi+rcx*2+8], mm5
|
||||
|
||||
lea rdi, [rdi+rcx*2]
|
||||
add rdi, rcx
|
||||
|
||||
movq mm0, [rsi]
|
||||
movq mm3, [rsi+8];
|
||||
|
||||
movq mm1, [rsi+rax]
|
||||
movq mm4, [rsi+rax+8]
|
||||
|
||||
movq mm2, [rsi+rax*2]
|
||||
movq mm5, [rsi+rax*2+8]
|
||||
|
||||
lea rsi, [rsi+rax*2]
|
||||
add rsi, rax
|
||||
|
||||
movq [rdi], mm0
|
||||
movq [rdi+8], mm3
|
||||
|
||||
movq [rdi+rcx], mm1
|
||||
movq [rdi+rcx+8], mm4
|
||||
|
||||
movq [rdi+rcx*2], mm2
|
||||
movq [rdi+rcx*2+8], mm5
|
||||
|
||||
lea rdi, [rdi+rcx*2]
|
||||
add rdi, rcx
|
||||
|
||||
movq mm0, [rsi]
|
||||
movq mm3, [rsi+8];
|
||||
|
||||
movq [rdi], mm0
|
||||
movq [rdi+8], mm3
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
@ -204,163 +204,6 @@ sym(vp8_filter_block1dc_v6_mmx):
|
||||
ret
|
||||
|
||||
|
||||
;void bilinear_predict8x8_mmx
|
||||
;(
|
||||
; unsigned char *src_ptr,
|
||||
; int src_pixels_per_line,
|
||||
; int xoffset,
|
||||
; int yoffset,
|
||||
; unsigned char *dst_ptr,
|
||||
; int dst_pitch
|
||||
;)
|
||||
global sym(vp8_bilinear_predict8x8_mmx) PRIVATE
|
||||
sym(vp8_bilinear_predict8x8_mmx):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 6
|
||||
GET_GOT rbx
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
|
||||
;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
|
||||
|
||||
movsxd rax, dword ptr arg(2) ;xoffset
|
||||
mov rdi, arg(4) ;dst_ptr ;
|
||||
|
||||
shl rax, 5 ; offset * 32
|
||||
lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
|
||||
|
||||
add rax, rcx ; HFilter
|
||||
mov rsi, arg(0) ;src_ptr ;
|
||||
|
||||
movsxd rdx, dword ptr arg(5) ;dst_pitch
|
||||
movq mm1, [rax] ;
|
||||
|
||||
movq mm2, [rax+16] ;
|
||||
movsxd rax, dword ptr arg(3) ;yoffset
|
||||
|
||||
pxor mm0, mm0 ;
|
||||
|
||||
shl rax, 5 ; offset*32
|
||||
add rax, rcx ; VFilter
|
||||
|
||||
lea rcx, [rdi+rdx*8] ;
|
||||
movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
|
||||
|
||||
|
||||
|
||||
; get the first horizontal line done ;
|
||||
movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
|
||||
movq mm4, mm3 ; make a copy of current line
|
||||
|
||||
punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
|
||||
punpckhbw mm4, mm0 ;
|
||||
|
||||
pmullw mm3, mm1 ;
|
||||
pmullw mm4, mm1 ;
|
||||
|
||||
movq mm5, [rsi+1] ;
|
||||
movq mm6, mm5 ;
|
||||
|
||||
punpcklbw mm5, mm0 ;
|
||||
punpckhbw mm6, mm0 ;
|
||||
|
||||
pmullw mm5, mm2 ;
|
||||
pmullw mm6, mm2 ;
|
||||
|
||||
paddw mm3, mm5 ;
|
||||
paddw mm4, mm6 ;
|
||||
|
||||
paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
|
||||
psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128
|
||||
|
||||
paddw mm4, [GLOBAL(rd)] ;
|
||||
psraw mm4, VP8_FILTER_SHIFT ;
|
||||
|
||||
movq mm7, mm3 ;
|
||||
packuswb mm7, mm4 ;
|
||||
|
||||
add rsi, rdx ; next line
|
||||
.next_row_8x8:
|
||||
movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
|
||||
movq mm4, mm3 ; make a copy of current line
|
||||
|
||||
punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
|
||||
punpckhbw mm4, mm0 ;
|
||||
|
||||
pmullw mm3, mm1 ;
|
||||
pmullw mm4, mm1 ;
|
||||
|
||||
movq mm5, [rsi+1] ;
|
||||
movq mm6, mm5 ;
|
||||
|
||||
punpcklbw mm5, mm0 ;
|
||||
punpckhbw mm6, mm0 ;
|
||||
|
||||
pmullw mm5, mm2 ;
|
||||
pmullw mm6, mm2 ;
|
||||
|
||||
paddw mm3, mm5 ;
|
||||
paddw mm4, mm6 ;
|
||||
|
||||
movq mm5, mm7 ;
|
||||
movq mm6, mm7 ;
|
||||
|
||||
punpcklbw mm5, mm0 ;
|
||||
punpckhbw mm6, mm0
|
||||
|
||||
pmullw mm5, [rax] ;
|
||||
pmullw mm6, [rax] ;
|
||||
|
||||
paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
|
||||
psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128
|
||||
|
||||
paddw mm4, [GLOBAL(rd)] ;
|
||||
psraw mm4, VP8_FILTER_SHIFT ;
|
||||
|
||||
movq mm7, mm3 ;
|
||||
packuswb mm7, mm4 ;
|
||||
|
||||
|
||||
pmullw mm3, [rax+16] ;
|
||||
pmullw mm4, [rax+16] ;
|
||||
|
||||
paddw mm3, mm5 ;
|
||||
paddw mm4, mm6 ;
|
||||
|
||||
|
||||
paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
|
||||
psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128
|
||||
|
||||
paddw mm4, [GLOBAL(rd)] ;
|
||||
psraw mm4, VP8_FILTER_SHIFT ;
|
||||
|
||||
packuswb mm3, mm4
|
||||
|
||||
movq [rdi], mm3 ; store the results in the destination
|
||||
|
||||
%if ABI_IS_32BIT
|
||||
add rsi, rdx ; next line
|
||||
add rdi, dword ptr arg(5) ;dst_pitch ;
|
||||
%else
|
||||
movsxd r8, dword ptr arg(5) ;dst_pitch
|
||||
add rsi, rdx ; next line
|
||||
add rdi, r8 ;dst_pitch
|
||||
%endif
|
||||
cmp rdi, rcx ;
|
||||
jne .next_row_8x8
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
RESTORE_GOT
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
;void bilinear_predict8x4_mmx
|
||||
;(
|
||||
; unsigned char *src_ptr,
|
||||
@ -641,8 +484,8 @@ rd:
|
||||
times 4 dw 0x40
|
||||
|
||||
align 16
|
||||
global HIDDEN_DATA(sym(vp8_six_tap_mmx))
|
||||
sym(vp8_six_tap_mmx):
|
||||
global HIDDEN_DATA(sym(vp8_six_tap_x86))
|
||||
sym(vp8_six_tap_x86):
|
||||
times 8 dw 0
|
||||
times 8 dw 0
|
||||
times 8 dw 128
|
||||
|
@ -13,7 +13,7 @@
|
||||
#include "vpx_ports/mem.h"
|
||||
#include "filter_x86.h"
|
||||
|
||||
extern const short vp8_six_tap_mmx[8][6 * 8];
|
||||
extern const short vp8_six_tap_x86[8][6 * 8];
|
||||
|
||||
extern void vp8_filter_block1d_h6_mmx(unsigned char *src_ptr,
|
||||
unsigned short *output_ptr,
|
||||
@ -82,103 +82,13 @@ void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line,
|
||||
DECLARE_ALIGNED(16, unsigned short,
|
||||
FData2[16 * 16]); /* Temp data bufffer used in filtering */
|
||||
const short *HFilter, *VFilter;
|
||||
HFilter = vp8_six_tap_mmx[xoffset];
|
||||
HFilter = vp8_six_tap_x86[xoffset];
|
||||
vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2,
|
||||
src_pixels_per_line, 1, 9, 8, HFilter);
|
||||
VFilter = vp8_six_tap_mmx[yoffset];
|
||||
VFilter = vp8_six_tap_x86[yoffset];
|
||||
vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4, 4, 4,
|
||||
VFilter);
|
||||
}
|
||||
|
||||
void vp8_sixtap_predict16x16_mmx(unsigned char *src_ptr,
|
||||
int src_pixels_per_line, int xoffset,
|
||||
int yoffset, unsigned char *dst_ptr,
|
||||
int dst_pitch) {
|
||||
DECLARE_ALIGNED(16, unsigned short,
|
||||
FData2[24 * 24]); /* Temp data bufffer used in filtering */
|
||||
|
||||
const short *HFilter, *VFilter;
|
||||
|
||||
HFilter = vp8_six_tap_mmx[xoffset];
|
||||
|
||||
vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2,
|
||||
src_pixels_per_line, 1, 21, 32, HFilter);
|
||||
vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4,
|
||||
src_pixels_per_line, 1, 21, 32, HFilter);
|
||||
vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8, FData2 + 8,
|
||||
src_pixels_per_line, 1, 21, 32, HFilter);
|
||||
vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12,
|
||||
FData2 + 12, src_pixels_per_line, 1, 21, 32,
|
||||
HFilter);
|
||||
|
||||
VFilter = vp8_six_tap_mmx[yoffset];
|
||||
vp8_filter_block1dc_v6_mmx(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16, 16,
|
||||
VFilter);
|
||||
vp8_filter_block1dc_v6_mmx(FData2 + 36, dst_ptr + 4, dst_pitch, 32, 16, 16,
|
||||
16, VFilter);
|
||||
vp8_filter_block1dc_v6_mmx(FData2 + 40, dst_ptr + 8, dst_pitch, 32, 16, 16,
|
||||
16, VFilter);
|
||||
vp8_filter_block1dc_v6_mmx(FData2 + 44, dst_ptr + 12, dst_pitch, 32, 16, 16,
|
||||
16, VFilter);
|
||||
}
|
||||
|
||||
void vp8_sixtap_predict8x8_mmx(unsigned char *src_ptr, int src_pixels_per_line,
|
||||
int xoffset, int yoffset, unsigned char *dst_ptr,
|
||||
int dst_pitch) {
|
||||
DECLARE_ALIGNED(16, unsigned short,
|
||||
FData2[256]); /* Temp data bufffer used in filtering */
|
||||
|
||||
const short *HFilter, *VFilter;
|
||||
|
||||
HFilter = vp8_six_tap_mmx[xoffset];
|
||||
vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2,
|
||||
src_pixels_per_line, 1, 13, 16, HFilter);
|
||||
vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4,
|
||||
src_pixels_per_line, 1, 13, 16, HFilter);
|
||||
|
||||
VFilter = vp8_six_tap_mmx[yoffset];
|
||||
vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 8, 8,
|
||||
VFilter);
|
||||
vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8, 8, 8,
|
||||
VFilter);
|
||||
}
|
||||
|
||||
void vp8_sixtap_predict8x4_mmx(unsigned char *src_ptr, int src_pixels_per_line,
|
||||
int xoffset, int yoffset, unsigned char *dst_ptr,
|
||||
int dst_pitch) {
|
||||
DECLARE_ALIGNED(16, unsigned short,
|
||||
FData2[256]); /* Temp data bufffer used in filtering */
|
||||
|
||||
const short *HFilter, *VFilter;
|
||||
|
||||
HFilter = vp8_six_tap_mmx[xoffset];
|
||||
vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2,
|
||||
src_pixels_per_line, 1, 9, 16, HFilter);
|
||||
vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4,
|
||||
src_pixels_per_line, 1, 9, 16, HFilter);
|
||||
|
||||
VFilter = vp8_six_tap_mmx[yoffset];
|
||||
vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 4, 8,
|
||||
VFilter);
|
||||
vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8, 4, 8,
|
||||
VFilter);
|
||||
}
|
||||
|
||||
void vp8_bilinear_predict16x16_mmx(unsigned char *src_ptr,
|
||||
int src_pixels_per_line, int xoffset,
|
||||
int yoffset, unsigned char *dst_ptr,
|
||||
int dst_pitch) {
|
||||
vp8_bilinear_predict8x8_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset,
|
||||
dst_ptr, dst_pitch);
|
||||
vp8_bilinear_predict8x8_mmx(src_ptr + 8, src_pixels_per_line, xoffset,
|
||||
yoffset, dst_ptr + 8, dst_pitch);
|
||||
vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line,
|
||||
src_pixels_per_line, xoffset, yoffset,
|
||||
dst_ptr + dst_pitch * 8, dst_pitch);
|
||||
vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line + 8,
|
||||
src_pixels_per_line, xoffset, yoffset,
|
||||
dst_ptr + dst_pitch * 8 + 8, dst_pitch);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if HAVE_SSE2
|
||||
@ -195,21 +105,21 @@ void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr,
|
||||
|
||||
if (xoffset) {
|
||||
if (yoffset) {
|
||||
HFilter = vp8_six_tap_mmx[xoffset];
|
||||
HFilter = vp8_six_tap_x86[xoffset];
|
||||
vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
|
||||
src_pixels_per_line, 1, 21, 32, HFilter);
|
||||
VFilter = vp8_six_tap_mmx[yoffset];
|
||||
VFilter = vp8_six_tap_x86[yoffset];
|
||||
vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16,
|
||||
dst_pitch, VFilter);
|
||||
} else {
|
||||
/* First-pass only */
|
||||
HFilter = vp8_six_tap_mmx[xoffset];
|
||||
HFilter = vp8_six_tap_x86[xoffset];
|
||||
vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
|
||||
dst_pitch, 16, HFilter);
|
||||
}
|
||||
} else {
|
||||
/* Second-pass only */
|
||||
VFilter = vp8_six_tap_mmx[yoffset];
|
||||
VFilter = vp8_six_tap_x86[yoffset];
|
||||
vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
|
||||
src_pixels_per_line, 21, 32);
|
||||
vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16,
|
||||
@ -226,21 +136,21 @@ void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line,
|
||||
|
||||
if (xoffset) {
|
||||
if (yoffset) {
|
||||
HFilter = vp8_six_tap_mmx[xoffset];
|
||||
HFilter = vp8_six_tap_x86[xoffset];
|
||||
vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
|
||||
src_pixels_per_line, 1, 13, 16, HFilter);
|
||||
VFilter = vp8_six_tap_mmx[yoffset];
|
||||
VFilter = vp8_six_tap_x86[yoffset];
|
||||
vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 8,
|
||||
dst_pitch, VFilter);
|
||||
} else {
|
||||
/* First-pass only */
|
||||
HFilter = vp8_six_tap_mmx[xoffset];
|
||||
HFilter = vp8_six_tap_x86[xoffset];
|
||||
vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
|
||||
dst_pitch, 8, HFilter);
|
||||
}
|
||||
} else {
|
||||
/* Second-pass only */
|
||||
VFilter = vp8_six_tap_mmx[yoffset];
|
||||
VFilter = vp8_six_tap_x86[yoffset];
|
||||
vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
|
||||
src_pixels_per_line, dst_ptr, dst_pitch, 8,
|
||||
VFilter);
|
||||
@ -256,21 +166,21 @@ void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line,
|
||||
|
||||
if (xoffset) {
|
||||
if (yoffset) {
|
||||
HFilter = vp8_six_tap_mmx[xoffset];
|
||||
HFilter = vp8_six_tap_x86[xoffset];
|
||||
vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
|
||||
src_pixels_per_line, 1, 9, 16, HFilter);
|
||||
VFilter = vp8_six_tap_mmx[yoffset];
|
||||
VFilter = vp8_six_tap_x86[yoffset];
|
||||
vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 4,
|
||||
dst_pitch, VFilter);
|
||||
} else {
|
||||
/* First-pass only */
|
||||
HFilter = vp8_six_tap_mmx[xoffset];
|
||||
HFilter = vp8_six_tap_x86[xoffset];
|
||||
vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
|
||||
dst_pitch, 4, HFilter);
|
||||
}
|
||||
} else {
|
||||
/* Second-pass only */
|
||||
VFilter = vp8_six_tap_mmx[yoffset];
|
||||
VFilter = vp8_six_tap_x86[yoffset];
|
||||
vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
|
||||
src_pixels_per_line, dst_ptr, dst_pitch, 4,
|
||||
VFilter);
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,241 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
%include "vpx_ports/x86_abi_support.asm"
|
||||
|
||||
;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
|
||||
global sym(vp8_short_fdct4x4_mmx) PRIVATE
|
||||
sym(vp8_short_fdct4x4_mmx):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 3
|
||||
GET_GOT rbx
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
mov rsi, arg(0) ; input
|
||||
mov rdi, arg(1) ; output
|
||||
|
||||
movsxd rax, dword ptr arg(2) ;pitch
|
||||
|
||||
lea rcx, [rsi + rax*2]
|
||||
; read the input data
|
||||
movq mm0, [rsi]
|
||||
movq mm1, [rsi + rax]
|
||||
|
||||
movq mm2, [rcx]
|
||||
movq mm4, [rcx + rax]
|
||||
|
||||
; transpose for the first stage
|
||||
movq mm3, mm0 ; 00 01 02 03
|
||||
movq mm5, mm2 ; 20 21 22 23
|
||||
|
||||
punpcklwd mm0, mm1 ; 00 10 01 11
|
||||
punpckhwd mm3, mm1 ; 02 12 03 13
|
||||
|
||||
punpcklwd mm2, mm4 ; 20 30 21 31
|
||||
punpckhwd mm5, mm4 ; 22 32 23 33
|
||||
|
||||
movq mm1, mm0 ; 00 10 01 11
|
||||
punpckldq mm0, mm2 ; 00 10 20 30
|
||||
|
||||
punpckhdq mm1, mm2 ; 01 11 21 31
|
||||
|
||||
movq mm2, mm3 ; 02 12 03 13
|
||||
punpckldq mm2, mm5 ; 02 12 22 32
|
||||
|
||||
punpckhdq mm3, mm5 ; 03 13 23 33
|
||||
|
||||
; mm0 0
|
||||
; mm1 1
|
||||
; mm2 2
|
||||
; mm3 3
|
||||
|
||||
; first stage
|
||||
movq mm5, mm0
|
||||
movq mm4, mm1
|
||||
|
||||
paddw mm0, mm3 ; a1 = 0 + 3
|
||||
paddw mm1, mm2 ; b1 = 1 + 2
|
||||
|
||||
psubw mm4, mm2 ; c1 = 1 - 2
|
||||
psubw mm5, mm3 ; d1 = 0 - 3
|
||||
|
||||
psllw mm5, 3
|
||||
psllw mm4, 3
|
||||
|
||||
psllw mm0, 3
|
||||
psllw mm1, 3
|
||||
|
||||
; output 0 and 2
|
||||
movq mm2, mm0 ; a1
|
||||
|
||||
paddw mm0, mm1 ; op[0] = a1 + b1
|
||||
psubw mm2, mm1 ; op[2] = a1 - b1
|
||||
|
||||
; output 1 and 3
|
||||
; interleave c1, d1
|
||||
movq mm1, mm5 ; d1
|
||||
punpcklwd mm1, mm4 ; c1 d1
|
||||
punpckhwd mm5, mm4 ; c1 d1
|
||||
|
||||
movq mm3, mm1
|
||||
movq mm4, mm5
|
||||
|
||||
pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
|
||||
pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
|
||||
|
||||
pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
|
||||
pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
|
||||
|
||||
paddd mm1, MMWORD PTR[GLOBAL(_14500)]
|
||||
paddd mm4, MMWORD PTR[GLOBAL(_14500)]
|
||||
paddd mm3, MMWORD PTR[GLOBAL(_7500)]
|
||||
paddd mm5, MMWORD PTR[GLOBAL(_7500)]
|
||||
|
||||
psrad mm1, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
|
||||
psrad mm4, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
|
||||
psrad mm3, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
|
||||
psrad mm5, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
|
||||
|
||||
packssdw mm1, mm4 ; op[1]
|
||||
packssdw mm3, mm5 ; op[3]
|
||||
|
||||
; done with vertical
|
||||
; transpose for the second stage
|
||||
movq mm4, mm0 ; 00 10 20 30
|
||||
movq mm5, mm2 ; 02 12 22 32
|
||||
|
||||
punpcklwd mm0, mm1 ; 00 01 10 11
|
||||
punpckhwd mm4, mm1 ; 20 21 30 31
|
||||
|
||||
punpcklwd mm2, mm3 ; 02 03 12 13
|
||||
punpckhwd mm5, mm3 ; 22 23 32 33
|
||||
|
||||
movq mm1, mm0 ; 00 01 10 11
|
||||
punpckldq mm0, mm2 ; 00 01 02 03
|
||||
|
||||
punpckhdq mm1, mm2 ; 01 22 12 13
|
||||
|
||||
movq mm2, mm4 ; 20 31 30 31
|
||||
punpckldq mm2, mm5 ; 20 21 22 23
|
||||
|
||||
punpckhdq mm4, mm5 ; 30 31 32 33
|
||||
|
||||
; mm0 0
|
||||
; mm1 1
|
||||
; mm2 2
|
||||
; mm3 4
|
||||
|
||||
movq mm5, mm0
|
||||
movq mm3, mm1
|
||||
|
||||
paddw mm0, mm4 ; a1 = 0 + 3
|
||||
paddw mm1, mm2 ; b1 = 1 + 2
|
||||
|
||||
psubw mm3, mm2 ; c1 = 1 - 2
|
||||
psubw mm5, mm4 ; d1 = 0 - 3
|
||||
|
||||
pxor mm6, mm6 ; zero out for compare
|
||||
|
||||
pcmpeqw mm6, mm5 ; d1 != 0
|
||||
|
||||
pandn mm6, MMWORD PTR[GLOBAL(_cmp_mask)] ; clear upper,
|
||||
; and keep bit 0 of lower
|
||||
|
||||
; output 0 and 2
|
||||
movq mm2, mm0 ; a1
|
||||
|
||||
paddw mm0, mm1 ; a1 + b1
|
||||
psubw mm2, mm1 ; a1 - b1
|
||||
|
||||
paddw mm0, MMWORD PTR[GLOBAL(_7w)]
|
||||
paddw mm2, MMWORD PTR[GLOBAL(_7w)]
|
||||
|
||||
psraw mm0, 4 ; op[0] = (a1 + b1 + 7)>>4
|
||||
psraw mm2, 4 ; op[8] = (a1 - b1 + 7)>>4
|
||||
|
||||
movq MMWORD PTR[rdi + 0 ], mm0
|
||||
movq MMWORD PTR[rdi + 16], mm2
|
||||
|
||||
; output 1 and 3
|
||||
; interleave c1, d1
|
||||
movq mm1, mm5 ; d1
|
||||
punpcklwd mm1, mm3 ; c1 d1
|
||||
punpckhwd mm5, mm3 ; c1 d1
|
||||
|
||||
movq mm3, mm1
|
||||
movq mm4, mm5
|
||||
|
||||
pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
|
||||
pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
|
||||
|
||||
pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
|
||||
pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
|
||||
|
||||
paddd mm1, MMWORD PTR[GLOBAL(_12000)]
|
||||
paddd mm4, MMWORD PTR[GLOBAL(_12000)]
|
||||
paddd mm3, MMWORD PTR[GLOBAL(_51000)]
|
||||
paddd mm5, MMWORD PTR[GLOBAL(_51000)]
|
||||
|
||||
psrad mm1, 16 ; (c1 * 2217 + d1 * 5352 + 14500)>>16
|
||||
psrad mm4, 16 ; (c1 * 2217 + d1 * 5352 + 14500)>>16
|
||||
psrad mm3, 16 ; (d1 * 2217 - c1 * 5352 + 7500)>>16
|
||||
psrad mm5, 16 ; (d1 * 2217 - c1 * 5352 + 7500)>>16
|
||||
|
||||
packssdw mm1, mm4 ; op[4]
|
||||
packssdw mm3, mm5 ; op[12]
|
||||
|
||||
paddw mm1, mm6 ; op[4] += (d1!=0)
|
||||
|
||||
movq MMWORD PTR[rdi + 8 ], mm1
|
||||
movq MMWORD PTR[rdi + 24], mm3
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
RESTORE_GOT
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
SECTION_RODATA
|
||||
align 8
|
||||
_5352_2217:
|
||||
dw 5352
|
||||
dw 2217
|
||||
dw 5352
|
||||
dw 2217
|
||||
align 8
|
||||
_2217_neg5352:
|
||||
dw 2217
|
||||
dw -5352
|
||||
dw 2217
|
||||
dw -5352
|
||||
align 8
|
||||
_cmp_mask:
|
||||
times 4 dw 1
|
||||
align 8
|
||||
_7w:
|
||||
times 4 dw 7
|
||||
align 8
|
||||
_14500:
|
||||
times 2 dd 14500
|
||||
align 8
|
||||
_7500:
|
||||
times 2 dd 7500
|
||||
align 8
|
||||
_12000:
|
||||
times 2 dd 12000
|
||||
align 8
|
||||
_51000:
|
||||
times 2 dd 51000
|
@ -11,9 +11,9 @@
|
||||
|
||||
%include "vpx_ports/x86_abi_support.asm"
|
||||
|
||||
;int vp8_block_error_xmm(short *coeff_ptr, short *dcoef_ptr)
|
||||
global sym(vp8_block_error_xmm) PRIVATE
|
||||
sym(vp8_block_error_xmm):
|
||||
;int vp8_block_error_sse2(short *coeff_ptr, short *dcoef_ptr)
|
||||
global sym(vp8_block_error_sse2) PRIVATE
|
||||
sym(vp8_block_error_sse2):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 2
|
||||
@ -59,152 +59,9 @@ sym(vp8_block_error_xmm):
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
;int vp8_block_error_mmx(short *coeff_ptr, short *dcoef_ptr)
|
||||
global sym(vp8_block_error_mmx) PRIVATE
|
||||
sym(vp8_block_error_mmx):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 2
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
|
||||
mov rsi, arg(0) ;coeff_ptr
|
||||
pxor mm7, mm7
|
||||
|
||||
mov rdi, arg(1) ;dcoef_ptr
|
||||
movq mm3, [rsi]
|
||||
|
||||
movq mm4, [rdi]
|
||||
movq mm5, [rsi+8]
|
||||
|
||||
movq mm6, [rdi+8]
|
||||
pxor mm1, mm1 ; from movd mm1, dc ; dc =0
|
||||
|
||||
movq mm2, mm7
|
||||
psubw mm5, mm6
|
||||
|
||||
por mm1, mm2
|
||||
pmaddwd mm5, mm5
|
||||
|
||||
pcmpeqw mm1, mm7
|
||||
psubw mm3, mm4
|
||||
|
||||
pand mm1, mm3
|
||||
pmaddwd mm1, mm1
|
||||
|
||||
paddd mm1, mm5
|
||||
movq mm3, [rsi+16]
|
||||
|
||||
movq mm4, [rdi+16]
|
||||
movq mm5, [rsi+24]
|
||||
|
||||
movq mm6, [rdi+24]
|
||||
psubw mm5, mm6
|
||||
|
||||
pmaddwd mm5, mm5
|
||||
psubw mm3, mm4
|
||||
|
||||
pmaddwd mm3, mm3
|
||||
paddd mm3, mm5
|
||||
|
||||
paddd mm1, mm3
|
||||
movq mm0, mm1
|
||||
|
||||
psrlq mm1, 32
|
||||
paddd mm0, mm1
|
||||
|
||||
movq rax, mm0
|
||||
|
||||
pop rdi
|
||||
pop rsi
|
||||
; begin epilog
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
;int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
|
||||
global sym(vp8_mbblock_error_mmx_impl) PRIVATE
|
||||
sym(vp8_mbblock_error_mmx_impl):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 3
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
|
||||
mov rsi, arg(0) ;coeff_ptr
|
||||
pxor mm7, mm7
|
||||
|
||||
mov rdi, arg(1) ;dcoef_ptr
|
||||
pxor mm2, mm2
|
||||
|
||||
movd mm1, dword ptr arg(2) ;dc
|
||||
por mm1, mm2
|
||||
|
||||
pcmpeqw mm1, mm7
|
||||
mov rcx, 16
|
||||
|
||||
.mberror_loop_mmx:
|
||||
movq mm3, [rsi]
|
||||
movq mm4, [rdi]
|
||||
|
||||
movq mm5, [rsi+8]
|
||||
movq mm6, [rdi+8]
|
||||
|
||||
|
||||
psubw mm5, mm6
|
||||
pmaddwd mm5, mm5
|
||||
|
||||
psubw mm3, mm4
|
||||
pand mm3, mm1
|
||||
|
||||
pmaddwd mm3, mm3
|
||||
paddd mm2, mm5
|
||||
|
||||
paddd mm2, mm3
|
||||
movq mm3, [rsi+16]
|
||||
|
||||
movq mm4, [rdi+16]
|
||||
movq mm5, [rsi+24]
|
||||
|
||||
movq mm6, [rdi+24]
|
||||
psubw mm5, mm6
|
||||
|
||||
pmaddwd mm5, mm5
|
||||
psubw mm3, mm4
|
||||
|
||||
pmaddwd mm3, mm3
|
||||
paddd mm2, mm5
|
||||
|
||||
paddd mm2, mm3
|
||||
add rsi, 32
|
||||
|
||||
add rdi, 32
|
||||
sub rcx, 1
|
||||
|
||||
jnz .mberror_loop_mmx
|
||||
|
||||
movq mm0, mm2
|
||||
psrlq mm2, 32
|
||||
|
||||
paddd mm0, mm2
|
||||
movq rax, mm0
|
||||
|
||||
pop rdi
|
||||
pop rsi
|
||||
; begin epilog
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
;int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
|
||||
global sym(vp8_mbblock_error_xmm_impl) PRIVATE
|
||||
sym(vp8_mbblock_error_xmm_impl):
|
||||
;int vp8_mbblock_error_sse2_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
|
||||
global sym(vp8_mbblock_error_sse2_impl) PRIVATE
|
||||
sym(vp8_mbblock_error_sse2_impl):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 3
|
||||
@ -272,66 +129,9 @@ sym(vp8_mbblock_error_xmm_impl):
|
||||
ret
|
||||
|
||||
|
||||
;int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
|
||||
global sym(vp8_mbuverror_mmx_impl) PRIVATE
|
||||
sym(vp8_mbuverror_mmx_impl):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 2
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
|
||||
mov rsi, arg(0) ;s_ptr
|
||||
mov rdi, arg(1) ;d_ptr
|
||||
|
||||
mov rcx, 16
|
||||
pxor mm7, mm7
|
||||
|
||||
.mbuverror_loop_mmx:
|
||||
|
||||
movq mm1, [rsi]
|
||||
movq mm2, [rdi]
|
||||
|
||||
psubw mm1, mm2
|
||||
pmaddwd mm1, mm1
|
||||
|
||||
|
||||
movq mm3, [rsi+8]
|
||||
movq mm4, [rdi+8]
|
||||
|
||||
psubw mm3, mm4
|
||||
pmaddwd mm3, mm3
|
||||
|
||||
|
||||
paddd mm7, mm1
|
||||
paddd mm7, mm3
|
||||
|
||||
|
||||
add rsi, 16
|
||||
add rdi, 16
|
||||
|
||||
dec rcx
|
||||
jnz .mbuverror_loop_mmx
|
||||
|
||||
movq mm0, mm7
|
||||
psrlq mm7, 32
|
||||
|
||||
paddd mm0, mm7
|
||||
movq rax, mm0
|
||||
|
||||
pop rdi
|
||||
pop rsi
|
||||
; begin epilog
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
;int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
|
||||
global sym(vp8_mbuverror_xmm_impl) PRIVATE
|
||||
sym(vp8_mbuverror_xmm_impl):
|
||||
;int vp8_mbuverror_sse2_impl(short *s_ptr, short *d_ptr);
|
||||
global sym(vp8_mbuverror_sse2_impl) PRIVATE
|
||||
sym(vp8_mbuverror_sse2_impl):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 2
|
||||
|
@ -13,12 +13,6 @@
|
||||
#include "vpx_ports/x86.h"
|
||||
#include "vp8/encoder/block.h"
|
||||
|
||||
void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch);
|
||||
void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch) {
|
||||
vp8_short_fdct4x4_mmx(input, output, pitch);
|
||||
vp8_short_fdct4x4_mmx(input + 4, output + 16, pitch);
|
||||
}
|
||||
|
||||
int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
|
||||
short *qcoeff_ptr, short *dequant_ptr,
|
||||
const short *scan_mask, short *round_ptr,
|
||||
@ -38,17 +32,3 @@ void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d) {
|
||||
|
||||
round_ptr, quant_ptr, dqcoeff_ptr);
|
||||
}
|
||||
|
||||
int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
|
||||
int vp8_mbblock_error_mmx(MACROBLOCK *mb, int dc) {
|
||||
short *coeff_ptr = mb->block[0].coeff;
|
||||
short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff;
|
||||
return vp8_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr, dc);
|
||||
}
|
||||
|
||||
int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
|
||||
int vp8_mbuverror_mmx(MACROBLOCK *mb) {
|
||||
short *s_ptr = &mb->coeff[256];
|
||||
short *d_ptr = &mb->e_mbd.dqcoeff[256];
|
||||
return vp8_mbuverror_mmx_impl(s_ptr, d_ptr);
|
||||
}
|
||||
|
@ -13,16 +13,16 @@
|
||||
#include "vpx_ports/x86.h"
|
||||
#include "vp8/encoder/block.h"
|
||||
|
||||
int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
|
||||
int vp8_mbblock_error_xmm(MACROBLOCK *mb, int dc) {
|
||||
int vp8_mbblock_error_sse2_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
|
||||
int vp8_mbblock_error_sse2(MACROBLOCK *mb, int dc) {
|
||||
short *coeff_ptr = mb->block[0].coeff;
|
||||
short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff;
|
||||
return vp8_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr, dc);
|
||||
return vp8_mbblock_error_sse2_impl(coeff_ptr, dcoef_ptr, dc);
|
||||
}
|
||||
|
||||
int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
|
||||
int vp8_mbuverror_xmm(MACROBLOCK *mb) {
|
||||
int vp8_mbuverror_sse2_impl(short *s_ptr, short *d_ptr);
|
||||
int vp8_mbuverror_sse2(MACROBLOCK *mb) {
|
||||
short *s_ptr = &mb->coeff[256];
|
||||
short *d_ptr = &mb->e_mbd.dqcoeff[256];
|
||||
return vp8_mbuverror_xmm_impl(s_ptr, d_ptr);
|
||||
return vp8_mbuverror_sse2_impl(s_ptr, d_ptr);
|
||||
}
|
||||
|
@ -80,8 +80,6 @@ VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.c
|
||||
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/dequantize_mmx.asm
|
||||
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idct_blk_mmx.c
|
||||
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idctllm_mmx.asm
|
||||
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/iwalsh_mmx.asm
|
||||
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp8_loopfilter_mmx.asm
|
||||
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm
|
||||
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm
|
||||
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/copy_sse2.asm
|
||||
|
@ -76,7 +76,6 @@ VP8_CX_SRCS_REMOVE-yes += encoder/firstpass.c
|
||||
VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.c
|
||||
endif
|
||||
|
||||
VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/dct_mmx.asm
|
||||
VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c
|
||||
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
|
||||
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
|
||||
|
Loading…
x
Reference in New Issue
Block a user