diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c index 80af49e84..f61230c80 100644 --- a/vp9/common/vp9_idct.c +++ b/vp9/common/vp9_idct.c @@ -86,13 +86,6 @@ void vp9_short_iwalsh4x4_1_add_c(int16_t *in, uint8_t *dest, int dest_stride) { } } -void vp9_dc_only_inv_walsh_add_c(int input_dc, uint8_t *pred_ptr, - uint8_t *dst_ptr, - int pitch, int stride) { - int16_t dc = input_dc; - vp9_short_iwalsh4x4_1_add_c(&dc, dst_ptr, stride); -} - void vp9_idct4_1d_c(int16_t *input, int16_t *output) { int16_t step[4]; int temp1, temp2; diff --git a/vp9/common/x86/vp9_idct_sse2.asm b/vp9/common/x86/vp9_idct_sse2.asm deleted file mode 100644 index 8f3c6dfc3..000000000 --- a/vp9/common/x86/vp9_idct_sse2.asm +++ /dev/null @@ -1,712 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -;void vp9_idct_dequant_0_2x_sse2 -; ( -; short *qcoeff - 0 -; short *dequant - 1 -; unsigned char *pre - 2 -; unsigned char *dst - 3 -; int dst_stride - 4 -; int blk_stride - 5 -; ) - -global sym(vp9_idct_dequant_0_2x_sse2) PRIVATE -sym(vp9_idct_dequant_0_2x_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - ; end prolog - - mov rdx, arg(1) ; dequant - mov rax, arg(0) ; qcoeff - - movd xmm4, [rax] - movd xmm5, [rdx] - - pinsrw xmm4, [rax+32], 4 - pinsrw xmm5, [rdx], 4 - - pmullw xmm4, xmm5 - - ; Zero out xmm5, for use unpacking - pxor xmm5, xmm5 - - ; clear coeffs - movd [rax], xmm5 - movd [rax+32], xmm5 -;pshufb - pshuflw xmm4, xmm4, 00000000b - pshufhw xmm4, xmm4, 00000000b - - mov rax, arg(2) ; pre - paddw xmm4, [GLOBAL(fours)] - - movsxd rcx, dword ptr arg(5) ; blk_stride - psraw xmm4, 3 - - movq xmm0, [rax] - movq xmm1, [rax+rcx] - movq xmm2, [rax+2*rcx] - lea rcx, [3*rcx] - movq xmm3, [rax+rcx] - - punpcklbw xmm0, xmm5 - punpcklbw xmm1, xmm5 - punpcklbw xmm2, xmm5 - punpcklbw xmm3, xmm5 - - mov rax, arg(3) ; dst - movsxd rdx, dword ptr arg(4) ; dst_stride - - ; Add to predict buffer - paddw xmm0, xmm4 - paddw xmm1, xmm4 - paddw xmm2, xmm4 - paddw xmm3, xmm4 - - ; pack up before storing - packuswb xmm0, xmm5 - packuswb xmm1, xmm5 - packuswb xmm2, xmm5 - packuswb xmm3, xmm5 - - ; store blocks back out - movq [rax], xmm0 - movq [rax + rdx], xmm1 - - lea rax, [rax + 2*rdx] - - movq [rax], xmm2 - movq [rax + rdx], xmm3 - - ; begin epilog - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -global sym(vp9_idct_dequant_full_2x_sse2) PRIVATE -sym(vp9_idct_dequant_full_2x_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ; special case when 2 blocks have 0 or 1 coeffs - ; dc is set as first coeff, so no need to load qcoeff - mov rax, arg(0) ; qcoeff - mov rsi, arg(2) ; pre - mov rdi, arg(3) ; dst - movsxd rcx, dword ptr arg(5) ; blk_stride - - ; Zero out xmm7, for use unpacking - pxor xmm7, xmm7 - - mov rdx, arg(1) ; dequant - - ; note the transpose of xmm1 and xmm2, necessary for shuffle - ; to spit out sensicle data - movdqa xmm0, [rax] - movdqa xmm2, [rax+16] - movdqa xmm1, [rax+32] - movdqa xmm3, [rax+48] - - ; Clear out coeffs - movdqa [rax], xmm7 - movdqa [rax+16], xmm7 - movdqa [rax+32], xmm7 - movdqa [rax+48], xmm7 - - ; dequantize qcoeff buffer - pmullw xmm0, [rdx] - pmullw xmm2, [rdx+16] - pmullw xmm1, [rdx] - pmullw xmm3, [rdx+16] - - ; repack so block 0 row x and block 1 row x are together - movdqa xmm4, xmm0 - punpckldq xmm0, xmm1 - punpckhdq xmm4, xmm1 - - pshufd xmm0, xmm0, 11011000b - pshufd xmm1, xmm4, 11011000b - - movdqa xmm4, xmm2 - punpckldq xmm2, xmm3 - punpckhdq xmm4, xmm3 - - pshufd xmm2, xmm2, 11011000b - pshufd xmm3, xmm4, 11011000b - - ; first pass - psubw xmm0, xmm2 ; b1 = 0-2 - paddw xmm2, xmm2 ; - - movdqa xmm5, xmm1 - paddw xmm2, xmm0 ; a1 = 0+2 - - pmulhw xmm5, [GLOBAL(x_s1sqr2)] - paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) - - movdqa xmm7, xmm3 - pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] - - paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) - psubw xmm7, xmm5 ; c1 - - movdqa xmm5, xmm1 - movdqa xmm4, xmm3 - - pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] - paddw xmm5, xmm1 - - pmulhw xmm3, [GLOBAL(x_s1sqr2)] - paddw xmm3, xmm4 - - paddw xmm3, xmm5 ; d1 - movdqa xmm6, xmm2 ; a1 - - movdqa xmm4, xmm0 ; b1 - paddw xmm2, xmm3 ;0 - - paddw xmm4, xmm7 ;1 - psubw xmm0, xmm7 ;2 - - psubw xmm6, xmm3 ;3 - - ; transpose for the second pass - movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 - punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 - punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 - - movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 - punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 - punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 - - - movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 - punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 - punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 - - movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 - punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 - punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 - - - movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 - punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 - punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 - - movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 - punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 - punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 - - pshufd xmm0, xmm2, 11011000b - pshufd xmm2, xmm1, 11011000b - - pshufd xmm1, xmm5, 11011000b - pshufd xmm3, xmm7, 11011000b - - ; second pass - psubw xmm0, xmm2 ; b1 = 0-2 - paddw xmm2, xmm2 - - movdqa xmm5, xmm1 - paddw xmm2, xmm0 ; a1 = 0+2 - - pmulhw xmm5, [GLOBAL(x_s1sqr2)] - paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) - - movdqa xmm7, xmm3 - pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] - - paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) - psubw xmm7, xmm5 ; c1 - - movdqa xmm5, xmm1 - movdqa xmm4, xmm3 - - pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] - paddw xmm5, xmm1 - - pmulhw xmm3, [GLOBAL(x_s1sqr2)] - paddw xmm3, xmm4 - - paddw xmm3, xmm5 ; d1 - paddw xmm0, [GLOBAL(fours)] - - paddw xmm2, [GLOBAL(fours)] - movdqa xmm6, xmm2 ; a1 - - movdqa xmm4, xmm0 ; b1 - paddw xmm2, xmm3 ;0 - - paddw xmm4, xmm7 ;1 - psubw xmm0, xmm7 ;2 - - psubw xmm6, xmm3 ;3 - psraw xmm2, 3 - - psraw xmm0, 3 - psraw xmm4, 3 - - psraw xmm6, 3 - - ; transpose to save - movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 - punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 - punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 - - movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 - punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 - punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 - - - movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 - punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 - punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 - - movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 - punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 - punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 - - - movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 - punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 - punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 - - movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 - punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 - punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 - - pshufd xmm0, xmm2, 11011000b - pshufd xmm2, xmm1, 11011000b - - pshufd xmm1, xmm5, 11011000b - pshufd xmm3, xmm7, 11011000b - - pxor xmm7, xmm7 - - ; Load up predict blocks - movq xmm4, [rsi] - movq xmm5, [rsi+rcx] - - punpcklbw xmm4, xmm7 - punpcklbw xmm5, xmm7 - - paddw xmm0, xmm4 - paddw xmm1, xmm5 - - movq xmm4, [rsi+2*rcx] - lea rcx, [3*rcx] - movq xmm5, [rsi+rcx] - - punpcklbw xmm4, xmm7 - punpcklbw xmm5, xmm7 - - paddw xmm2, xmm4 - paddw xmm3, xmm5 - -.finish: - - ; pack up before storing - packuswb xmm0, xmm7 - packuswb xmm1, xmm7 - packuswb xmm2, xmm7 - packuswb xmm3, xmm7 - - ; Load destination stride before writing out, - ; doesn't need to persist - movsxd rdx, dword ptr arg(4) ; dst_stride - - ; store blocks back out - movq [rdi], xmm0 - movq [rdi + rdx], xmm1 - - lea rdi, [rdi + 2*rdx] - - movq [rdi], xmm2 - movq [rdi + rdx], xmm3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_idct_dequant_dc_0_2x_sse2 -; ( -; short *qcoeff - 0 -; short *dequant - 1 -; unsigned char *pre - 2 -; unsigned char *dst - 3 -; int dst_stride - 4 -; short *dc - 5 -; ) -global sym(vp9_idct_dequant_dc_0_2x_sse2) PRIVATE -sym(vp9_idct_dequant_dc_0_2x_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ; special case when 2 blocks have 0 or 1 coeffs - ; dc is set as first coeff, so no need to load qcoeff - mov rax, arg(0) ; qcoeff - mov rsi, arg(2) ; pre - mov rdi, arg(3) ; dst - mov rdx, arg(5) ; dc - - ; Zero out xmm5, for use unpacking - pxor xmm5, xmm5 - - ; load up 2 dc words here == 2*16 = doubleword - movd xmm4, [rdx] - - ; Load up predict blocks - movq xmm0, [rsi] - movq xmm1, [rsi+16] - movq xmm2, [rsi+32] - movq xmm3, [rsi+48] - - ; Duplicate and expand dc across - punpcklwd xmm4, xmm4 - punpckldq xmm4, xmm4 - - ; Rounding to dequant and downshift - paddw xmm4, [GLOBAL(fours)] - psraw xmm4, 3 - - ; Predict buffer needs to be expanded from bytes to words - punpcklbw xmm0, xmm5 - punpcklbw xmm1, xmm5 - punpcklbw xmm2, xmm5 - punpcklbw xmm3, xmm5 - - ; Add to predict buffer - paddw xmm0, xmm4 - paddw xmm1, xmm4 - paddw xmm2, xmm4 - paddw xmm3, xmm4 - - ; pack up before storing - packuswb xmm0, xmm5 - packuswb xmm1, xmm5 - packuswb xmm2, xmm5 - packuswb xmm3, xmm5 - - ; Load destination stride before writing out, - ; doesn't need to persist - movsxd rdx, dword ptr arg(4) ; dst_stride - - ; store blocks back out - movq [rdi], xmm0 - movq [rdi + rdx], xmm1 - - lea rdi, [rdi + 2*rdx] - - movq [rdi], xmm2 - movq [rdi + rdx], xmm3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -global sym(vp9_idct_dequant_dc_full_2x_sse2) PRIVATE -sym(vp9_idct_dequant_dc_full_2x_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ; special case when 2 blocks have 0 or 1 coeffs - ; dc is set as first coeff, so no need to load qcoeff - mov rax, arg(0) ; qcoeff - mov rsi, arg(2) ; pre - mov rdi, arg(3) ; dst - - ; Zero out xmm7, for use unpacking - pxor xmm7, xmm7 - - mov rdx, arg(1) ; dequant - - ; note the transpose of xmm1 and xmm2, necessary for shuffle - ; to spit out sensicle data - movdqa xmm0, [rax] - movdqa xmm2, [rax+16] - movdqa xmm1, [rax+32] - movdqa xmm3, [rax+48] - - ; Clear out coeffs - movdqa [rax], xmm7 - movdqa [rax+16], xmm7 - movdqa [rax+32], xmm7 - movdqa [rax+48], xmm7 - - ; dequantize qcoeff buffer - pmullw xmm0, [rdx] - pmullw xmm2, [rdx+16] - pmullw xmm1, [rdx] - pmullw xmm3, [rdx+16] - - ; DC component - mov rdx, arg(5) - - ; repack so block 0 row x and block 1 row x are together - movdqa xmm4, xmm0 - punpckldq xmm0, xmm1 - punpckhdq xmm4, xmm1 - - pshufd xmm0, xmm0, 11011000b - pshufd xmm1, xmm4, 11011000b - - movdqa xmm4, xmm2 - punpckldq xmm2, xmm3 - punpckhdq xmm4, xmm3 - - pshufd xmm2, xmm2, 11011000b - pshufd xmm3, xmm4, 11011000b - - ; insert DC component - pinsrw xmm0, [rdx], 0 - pinsrw xmm0, [rdx+2], 4 - - ; first pass - psubw xmm0, xmm2 ; b1 = 0-2 - paddw xmm2, xmm2 ; - - movdqa xmm5, xmm1 - paddw xmm2, xmm0 ; a1 = 0+2 - - pmulhw xmm5, [GLOBAL(x_s1sqr2)] - paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) - - movdqa xmm7, xmm3 - pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] - - paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) - psubw xmm7, xmm5 ; c1 - - movdqa xmm5, xmm1 - movdqa xmm4, xmm3 - - pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] - paddw xmm5, xmm1 - - pmulhw xmm3, [GLOBAL(x_s1sqr2)] - paddw xmm3, xmm4 - - paddw xmm3, xmm5 ; d1 - movdqa xmm6, xmm2 ; a1 - - movdqa xmm4, xmm0 ; b1 - paddw xmm2, xmm3 ;0 - - paddw xmm4, xmm7 ;1 - psubw xmm0, xmm7 ;2 - - psubw xmm6, xmm3 ;3 - - ; transpose for the second pass - movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 - punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 - punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 - - movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 - punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 - punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 - - - movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 - punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 - punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 - - movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 - punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 - punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 - - - movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 - punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 - punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 - - movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 - punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 - punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 - - pshufd xmm0, xmm2, 11011000b - pshufd xmm2, xmm1, 11011000b - - pshufd xmm1, xmm5, 11011000b - pshufd xmm3, xmm7, 11011000b - - ; second pass - psubw xmm0, xmm2 ; b1 = 0-2 - paddw xmm2, xmm2 - - movdqa xmm5, xmm1 - paddw xmm2, xmm0 ; a1 = 0+2 - - pmulhw xmm5, [GLOBAL(x_s1sqr2)] - paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) - - movdqa xmm7, xmm3 - pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] - - paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) - psubw xmm7, xmm5 ; c1 - - movdqa xmm5, xmm1 - movdqa xmm4, xmm3 - - pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] - paddw xmm5, xmm1 - - pmulhw xmm3, [GLOBAL(x_s1sqr2)] - paddw xmm3, xmm4 - - paddw xmm3, xmm5 ; d1 - paddw xmm0, [GLOBAL(fours)] - - paddw xmm2, [GLOBAL(fours)] - movdqa xmm6, xmm2 ; a1 - - movdqa xmm4, xmm0 ; b1 - paddw xmm2, xmm3 ;0 - - paddw xmm4, xmm7 ;1 - psubw xmm0, xmm7 ;2 - - psubw xmm6, xmm3 ;3 - psraw xmm2, 3 - - psraw xmm0, 3 - psraw xmm4, 3 - - psraw xmm6, 3 - - ; transpose to save - movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 - punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 - punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 - - movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 - punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 - punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 - - - movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 - punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 - punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 - - movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 - punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 - punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 - - - movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 - punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 - punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 - - movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 - punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 - punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 - - pshufd xmm0, xmm2, 11011000b - pshufd xmm2, xmm1, 11011000b - - pshufd xmm1, xmm5, 11011000b - pshufd xmm3, xmm7, 11011000b - - pxor xmm7, xmm7 - - ; Load up predict blocks - movq xmm4, [rsi] - movq xmm5, [rsi+16] - - punpcklbw xmm4, xmm7 - punpcklbw xmm5, xmm7 - - paddw xmm0, xmm4 - paddw xmm1, xmm5 - - movq xmm4, [rsi+32] - movq xmm5, [rsi+48] - - punpcklbw xmm4, xmm7 - punpcklbw xmm5, xmm7 - - paddw xmm2, xmm4 - paddw xmm3, xmm5 - -.finish: - - ; pack up before storing - packuswb xmm0, xmm7 - packuswb xmm1, xmm7 - packuswb xmm2, xmm7 - packuswb xmm3, xmm7 - - ; Load destination stride before writing out, - ; doesn't need to persist - movsxd rdx, dword ptr arg(4) ; dst_stride - - ; store blocks back out - movq [rdi], xmm0 - movq [rdi + rdx], xmm1 - - lea rdi, [rdi + 2*rdx] - - movq [rdi], xmm2 - movq [rdi + rdx], xmm3 - - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -fours: - times 8 dw 0x0004 -align 16 -x_s1sqr2: - times 8 dw 0x8A8C -align 16 -x_c1sqr2less1: - times 8 dw 0x4E7B diff --git a/vp9/decoder/vp9_idct_blk.c b/vp9/decoder/vp9_idct_blk.c index 7726598bc..f26f87c64 100644 --- a/vp9/decoder/vp9_idct_blk.c +++ b/vp9/decoder/vp9_idct_blk.c @@ -143,12 +143,6 @@ void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob) { } } -void vp9_dc_idct_add_c(int16_t *input, uint8_t *dest, int stride, int dc) { - input[0] = dc; - vp9_short_idct4x4_add(input, dest, stride); - vpx_memset(input, 0, 32); -} - void vp9_idct_add_lossless_c(int16_t *input, uint8_t *dest, int stride, int eob) { if (eob > 1) { @@ -160,13 +154,6 @@ void vp9_idct_add_lossless_c(int16_t *input, uint8_t *dest, int stride, } } -void vp9_dc_idct_add_lossless_c(int16_t *input, uint8_t *dest, - int stride, int dc) { - input[0] = dc; - vp9_short_iwalsh4x4_add(input, dest, stride); - vpx_memset(input, 0, 32); -} - void vp9_idct_add_8x8_c(int16_t *input, uint8_t *dest, int stride, int eob) { // If dc is 1, then input[0] is the reconstructed value, do not need // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1. diff --git a/vp9/decoder/vp9_idct_blk.h b/vp9/decoder/vp9_idct_blk.h index ab4c04205..42dcc8170 100644 --- a/vp9/decoder/vp9_idct_blk.h +++ b/vp9/decoder/vp9_idct_blk.h @@ -18,13 +18,6 @@ void vp9_idct_add_lossless_c(int16_t *input, unsigned char *dest, int stride, int eob); -void vp9_dc_idct_add_lossless_c(int16_t *input, unsigned char *output, - int stride, int dc); - -void vp9_dc_idct_add_y_block_lossless_c(int16_t *q, unsigned char *pre, - unsigned char *dst, int stride, - const int16_t *dc); - void vp9_idct_add_y_block_lossless_c(int16_t *q, unsigned char *dst, int stride, struct macroblockd *xd); diff --git a/vp9/decoder/x86/vp9_idct_blk_sse2.c b/vp9/decoder/x86/vp9_idct_blk_sse2.c deleted file mode 100644 index ece5803e1..000000000 --- a/vp9/decoder/x86/vp9_idct_blk_sse2.c +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "./vpx_config.h" -#include "vp9/common/vp9_blockd.h" - -void vp9_idct_dequant_dc_0_2x_sse2(short *q, const short *dq, - unsigned char *pre, unsigned char *dst, - int dst_stride, const short *dc); - -void vp9_idct_dequant_dc_full_2x_sse2(short *q, const short *dq, - unsigned char *pre, unsigned char *dst, - int dst_stride, const short *dc); - -void vp9_idct_dequant_0_2x_sse2(short *q, const short *dq, - unsigned char *pre, unsigned char *dst, - int dst_stride, int blk_stride); - -void vp9_idct_dequant_full_2x_sse2(short *q, const short *dq, - unsigned char *pre, unsigned char *dst, - int dst_stride, int blk_stride); - -void vp9_dequant_dc_idct_add_y_block_sse2(short *q, const short *dq, - unsigned char *pre, - unsigned char *dst, - int stride, unsigned short *eobs, - const short *dc) { - int i; - - for (i = 0; i < 4; i++) { - if (((short *)(eobs))[0] & 0xfefe) - vp9_idct_dequant_dc_full_2x_sse2(q, dq, pre, dst, stride, dc); - else - vp9_idct_dequant_dc_0_2x_sse2(q, dq, pre, dst, stride, dc); - - if (((short *)(eobs))[1] & 0xfefe) - vp9_idct_dequant_dc_full_2x_sse2(q + 32, dq, pre + 8, dst + 8, - stride, dc + 2); - else - vp9_idct_dequant_dc_0_2x_sse2(q + 32, dq, pre + 8, dst + 8, - stride, dc + 2); - - q += 64; - dc += 4; - pre += 64; - dst += stride * 4; - eobs += 4; - } -} - -void vp9_dequant_idct_add_y_block_sse2(short *q, const short *dq, - unsigned char *pre, unsigned char *dst, - int stride, unsigned short *eobs) { - int i; - - for (i = 0; i < 4; i++) { - if (((short *)(eobs))[0] & 0xfefe) - vp9_idct_dequant_full_2x_sse2(q, dq, pre, dst, stride, 16); - else - vp9_idct_dequant_0_2x_sse2(q, dq, pre, dst, stride, 16); - - if (((short *)(eobs))[1] & 0xfefe) - vp9_idct_dequant_full_2x_sse2(q + 32, dq, pre + 8, dst + 8, stride, 16); - else - vp9_idct_dequant_0_2x_sse2(q + 32, dq, pre + 8, dst + 8, stride, 16); - - q += 64; - pre += 64; - dst += stride * 4; - eobs += 4; - } -} - -void vp9_dequant_idct_add_uv_block_sse2(short *q, const short *dq, - unsigned char *pre, - unsigned char *dstu, - unsigned char *dstv, - int stride, unsigned short *eobs) { - if (((short *)(eobs))[0] & 0xfefe) - vp9_idct_dequant_full_2x_sse2(q, dq, pre, dstu, stride, 8); - else - vp9_idct_dequant_0_2x_sse2(q, dq, pre, dstu, stride, 8); - - q += 32; - pre += 32; - dstu += stride * 4; - - if (((short *)(eobs))[1] & 0xfefe) - vp9_idct_dequant_full_2x_sse2(q, dq, pre, dstu, stride, 8); - else - vp9_idct_dequant_0_2x_sse2(q, dq, pre, dstu, stride, 8); - - q += 32; - pre += 32; - - if (((short *)(eobs))[2] & 0xfefe) - vp9_idct_dequant_full_2x_sse2(q, dq, pre, dstv, stride, 8); - else - vp9_idct_dequant_0_2x_sse2(q, dq, pre, dstv, stride, 8); - - q += 32; - pre += 32; - dstv += stride * 4; - - if (((short *)(eobs))[3] & 0xfefe) - vp9_idct_dequant_full_2x_sse2(q, dq, pre, dstv, stride, 8); - else - vp9_idct_dequant_0_2x_sse2(q, dq, pre, dstv, stride, 8); -} diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index 0c5ce1dfe..147804d03 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -86,7 +86,6 @@ VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.c VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_iwalsh_mmx.asm VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_recon_mmx.asm VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_iwalsh_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_loopfilter_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_sse2.asm diff --git a/vp9/vp9dx.mk b/vp9/vp9dx.mk index 72cdfebf4..3be0b6dde 100644 --- a/vp9/vp9dx.mk +++ b/vp9/vp9dx.mk @@ -35,8 +35,6 @@ VP9_DX_SRCS-yes += decoder/vp9_idct_blk.h VP9_DX_SRCS-yes := $(filter-out $(VP9_DX_SRCS_REMOVE-yes),$(VP9_DX_SRCS-yes)) -VP9_DX_SRCS-$(HAVE_SSE2) += decoder/x86/vp9_idct_blk_sse2.c - VP9_DX_SRCS-$(HAVE_SSE2) += decoder/x86/vp9_dequantize_sse2.c $(eval $(call asm_offsets_template,\