; ed9c66f584
; Instead of using the predict buffer, the decoder now writes the predictor into the recon buffer. For blocks with eob=0, unnecessary idcts can be eliminated. This gave a performance boost of ~1.8% for the HD clips used. Tero: Added needed changes to ARM side and scheduled some assembly code to prevent interlocks. Patch Set 6: Merged (I1bcdca7a95aacc3a181b9faa6b10e3a71ee24df3) into this commit because of similarities in the idct functions. Patch Set 7: EC bug fix. Change-Id: Ie31d90b5d3522e1108163f2ac491e455e3f955e6
; 709 lines
; 24 KiB
; NASM
; 709 lines
; 24 KiB
; NASM
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

;void vp8_idct_dequant_0_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *dst  - 2
;   int dst_stride      - 3
; )
;
; DC-only dequantise + reconstruct for two blocks that sit side by side in
; dst (each row touched is 8 bytes wide: 4 pixels per block).
;
;   * Reads qcoeff[0] (block 0) and the short at byte offset 32 (block 1).
;   * Multiplies both by dequant[0].
;   * Zeroes 4 bytes at qcoeff and 4 bytes at qcoeff+32.
;   * Adds (dc + 4) >> 3 to rows 0..3 of dst, saturating to unsigned bytes.
;
; Register roles after the loads:
;   rax = dst, rdx = dst_stride, rcx = 3 * dst_stride
;   xmm4 = per-pixel DC delta (words 0-3: block 0, words 4-7: block 1)
;   xmm5 = zero (used for byte<->word unpack/pack)
global sym(vp8_idct_dequant_0_2x_sse2)
sym(vp8_idct_dequant_0_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    GET_GOT     rbx
    ; end prolog

    mov         rdx, arg(1)                     ; dequant
    mov         rax, arg(0)                     ; qcoeff

    ; word 0 <- block 0 DC, word 4 <- block 1 DC (word 1 is don't-care)
    movd        xmm4, [rax]
    movd        xmm5, [rdx]

    pinsrw      xmm4, [rax+32], 4
    pinsrw      xmm5, [rdx], 4                  ; dequant[0] for block 1 as well

    pmullw      xmm4, xmm5                      ; dequantised DCs in words 0 and 4

    ; xmm5 = 0 from here on
    pxor        xmm5, xmm5

    ; clear the coefficients that were consumed (4-byte stores)
    movd        [rax], xmm5
    movd        [rax+32], xmm5

    mov         rax, arg(2)                     ; dst
    movsxd      rdx, dword ptr arg(3)           ; dst_stride

    ; broadcast word 0 across words 0-3 and word 4 across words 4-7
    pshuflw     xmm4, xmm4, 00000000b
    pshufhw     xmm4, xmm4, 00000000b

    lea         rcx, [rdx + rdx*2]              ; rcx = dst_stride * 3
    paddw       xmm4, [GLOBAL(fours)]           ; rounding

    psraw       xmm4, 3                         ; (dc + 4) >> 3

    ; load the four destination rows (8 pixels each)
    movq        xmm0, [rax]
    movq        xmm1, [rax+rdx]
    movq        xmm2, [rax+2*rdx]
    movq        xmm3, [rax+rcx]

    ; widen bytes to words
    punpcklbw   xmm0, xmm5
    punpcklbw   xmm1, xmm5
    punpcklbw   xmm2, xmm5
    punpcklbw   xmm3, xmm5

    ; add the DC delta to the pixels already in dst
    paddw       xmm0, xmm4
    paddw       xmm1, xmm4
    paddw       xmm2, xmm4
    paddw       xmm3, xmm4

    ; narrow back to bytes with unsigned saturation
    packuswb    xmm0, xmm5
    packuswb    xmm1, xmm5
    packuswb    xmm2, xmm5
    packuswb    xmm3, xmm5

    ; write the rows back; rcx already holds 3 * dst_stride
    movq        [rax], xmm0
    movq        [rax + rdx], xmm1
    movq        [rax + 2*rdx], xmm2
    movq        [rax + rcx], xmm3

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_idct_dequant_full_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *dst  - 2
;   int dst_stride      - 3
; )
;
; Dequantise and inverse-transform two 4x4 blocks at once and add the result
; to the pixels already in dst (two blocks side by side: 8 bytes per row,
; 4 rows).
;
;   * qcoeff holds 32 shorts: block 0 at byte offset 0, block 1 at offset 32.
;     All 64 bytes are zeroed after being loaded.
;   * dequant holds 16 shorts, applied to both blocks.
;   * qcoeff and dequant are accessed with movdqa / aligned memory operands,
;     so both must be 16-byte aligned.
;
; Register roles:
;   rax = qcoeff, rdi = dst
;   rdx = dequant, later dst_stride; rcx = 3 * dst_stride
;   xmm0..xmm3 = the four vectors of the current pass, each holding block 0
;                in words 0-3 and block 1 in words 4-7
;   xmm7 = zero at load/store time, scratch in between
;
; Lane comments in the transposes read "bNN" = block b, element NN, listed
; from the highest word to the lowest.
global sym(vp8_idct_dequant_full_2x_sse2)
sym(vp8_idct_dequant_full_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    SAVE_XMM 7
    GET_GOT     rbx
    push        rdi
    ; end prolog

    mov         rax, arg(0)                     ; qcoeff
    mov         rdx, arg(1)                     ; dequant
    mov         rdi, arg(2)                     ; dst

    ; xmm7 = 0, used to clear the coefficient buffer
    pxor        xmm7, xmm7

    ; note: the second and third 16-byte chunks land in xmm2 and xmm1
    ; (swapped) so that the repack below produces sensible data
    movdqa      xmm0, [rax]                     ; block 0, coeffs 0-7
    movdqa      xmm2, [rax+16]                  ; block 0, coeffs 8-15
    movdqa      xmm1, [rax+32]                  ; block 1, coeffs 0-7
    movdqa      xmm3, [rax+48]                  ; block 1, coeffs 8-15

    ; clear out coeffs
    movdqa      [rax], xmm7
    movdqa      [rax+16], xmm7
    movdqa      [rax+32], xmm7
    movdqa      [rax+48], xmm7

    ; dequantize qcoeff buffer
    pmullw      xmm0, [rdx]
    pmullw      xmm2, [rdx+16]
    pmullw      xmm1, [rdx]
    pmullw      xmm3, [rdx+16]
    movsxd      rdx, dword ptr arg(3)           ; dst_stride (dequant no longer needed)

    ; repack so block 0 row x and block 1 row x share a register:
    ; xmm0..xmm3 = row 0..3, block 0 in the low half, block 1 in the high half
    movdqa      xmm4, xmm0
    punpckldq   xmm0, xmm1
    punpckhdq   xmm4, xmm1

    pshufd      xmm0, xmm0, 11011000b
    pshufd      xmm1, xmm4, 11011000b

    movdqa      xmm4, xmm2
    punpckldq   xmm2, xmm3
    punpckhdq   xmm4, xmm3

    pshufd      xmm2, xmm2, 11011000b
    pshufd      xmm3, xmm4, 11011000b

    ; ---- first pass (ip0..ip3 = xmm0..xmm3) ----
    psubw       xmm0, xmm2                      ; b1 = ip0 - ip2
    paddw       xmm2, xmm2

    movdqa      xmm5, xmm1
    paddw       xmm2, xmm0                      ; a1 = ip0 + ip2 (2*ip2 + b1)

    ; x_s1sqr2 is negative as a signed word, so pmulhw yields
    ; ip*k - ip; adding ip back gives ip*k
    pmulhw      xmm5, [GLOBAL(x_s1sqr2)]
    lea         rcx, [rdx + rdx*2]              ; rcx = dst_stride * 3
    paddw       xmm5, xmm1                      ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7, xmm3
    pmulhw      xmm7, [GLOBAL(x_c1sqr2less1)]

    paddw       xmm7, xmm3                      ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7, xmm5                      ; c1 = ip3*cos - ip1*sin

    movdqa      xmm5, xmm1
    movdqa      xmm4, xmm3

    pmulhw      xmm5, [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5, xmm1                      ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      xmm3, [GLOBAL(x_s1sqr2)]
    paddw       xmm3, xmm4                      ; ip3 * sin(pi/8) * sqrt(2)

    paddw       xmm3, xmm5                      ; d1
    movdqa      xmm6, xmm2                      ; a1

    movdqa      xmm4, xmm0                      ; b1
    paddw       xmm2, xmm3                      ; a1 + d1

    ; the transpose below pairs xmm2 with xmm0 and xmm4 with xmm6, i.e. it
    ; treats the vectors in the order xmm2, xmm0, xmm4, xmm6
    paddw       xmm4, xmm7                      ; b1 + c1
    psubw       xmm0, xmm7                      ; b1 - c1

    psubw       xmm6, xmm3                      ; a1 - d1

    ; ---- transpose for the second pass ----
    movdqa      xmm7, xmm2                      ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2, xmm0                      ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7, xmm0                      ; 107 103 106 102 105 101 104 100

    movdqa      xmm5, xmm4                      ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4, xmm6                      ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5, xmm6                      ; 115 111 114 110 113 109 112 108

    movdqa      xmm1, xmm2                      ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2, xmm4                      ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1, xmm4                      ; 015 011 007 003 014 010 006 002

    movdqa      xmm6, xmm7                      ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7, xmm5                      ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6, xmm5                      ; 115 111 107 103 114 110 106 102

    movdqa      xmm5, xmm2                      ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2, xmm7                      ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5, xmm7                      ; 113 109 013 009 105 101 005 001

    movdqa      xmm7, xmm1                      ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1, xmm6                      ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7, xmm6                      ; 115 111 015 011 107 103 007 003

    ; regroup so each register is block 0 | block 1 again (order matters:
    ; xmm2 and xmm1 are read before they are overwritten)
    pshufd      xmm0, xmm2, 11011000b
    pshufd      xmm2, xmm1, 11011000b

    pshufd      xmm1, xmm5, 11011000b
    pshufd      xmm3, xmm7, 11011000b

    ; ---- second pass (same butterfly, plus rounding) ----
    psubw       xmm0, xmm2                      ; b1 = ip0 - ip2
    paddw       xmm2, xmm2

    movdqa      xmm5, xmm1
    paddw       xmm2, xmm0                      ; a1 = ip0 + ip2

    pmulhw      xmm5, [GLOBAL(x_s1sqr2)]
    paddw       xmm5, xmm1                      ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7, xmm3
    pmulhw      xmm7, [GLOBAL(x_c1sqr2less1)]

    paddw       xmm7, xmm3                      ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7, xmm5                      ; c1 = ip3*cos - ip1*sin

    movdqa      xmm5, xmm1
    movdqa      xmm4, xmm3

    pmulhw      xmm5, [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5, xmm1                      ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      xmm3, [GLOBAL(x_s1sqr2)]
    paddw       xmm3, xmm4                      ; ip3 * sin(pi/8) * sqrt(2)

    paddw       xmm3, xmm5                      ; d1

    ; rounding goes into a1 and b1 so every output picks up +4 exactly once
    paddw       xmm0, [GLOBAL(fours)]

    paddw       xmm2, [GLOBAL(fours)]
    movdqa      xmm6, xmm2                      ; a1

    movdqa      xmm4, xmm0                      ; b1
    paddw       xmm2, xmm3                      ; a1 + d1

    paddw       xmm4, xmm7                      ; b1 + c1
    psubw       xmm0, xmm7                      ; b1 - c1

    psubw       xmm6, xmm3                      ; a1 - d1
    psraw       xmm2, 3

    psraw       xmm0, 3
    psraw       xmm4, 3

    psraw       xmm6, 3

    ; ---- transpose to save (same shuffle network as above) ----
    movdqa      xmm7, xmm2                      ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2, xmm0                      ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7, xmm0                      ; 107 103 106 102 105 101 104 100

    movdqa      xmm5, xmm4                      ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4, xmm6                      ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5, xmm6                      ; 115 111 114 110 113 109 112 108

    movdqa      xmm1, xmm2                      ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2, xmm4                      ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1, xmm4                      ; 015 011 007 003 014 010 006 002

    movdqa      xmm6, xmm7                      ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7, xmm5                      ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6, xmm5                      ; 115 111 107 103 114 110 106 102

    movdqa      xmm5, xmm2                      ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2, xmm7                      ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5, xmm7                      ; 113 109 013 009 105 101 005 001

    movdqa      xmm7, xmm1                      ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1, xmm6                      ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7, xmm6                      ; 115 111 015 011 107 103 007 003

    pshufd      xmm0, xmm2, 11011000b
    pshufd      xmm2, xmm1, 11011000b

    pshufd      xmm1, xmm5, 11011000b
    pshufd      xmm3, xmm7, 11011000b

    ; xmm7 = 0 again for the byte<->word conversions
    pxor        xmm7, xmm7

    ; add the residual to the pixels already in dst, two rows at a time
    movq        xmm4, [rdi]
    movq        xmm5, [rdi+rdx]

    punpcklbw   xmm4, xmm7
    punpcklbw   xmm5, xmm7

    paddw       xmm0, xmm4
    paddw       xmm1, xmm5

    movq        xmm4, [rdi+2*rdx]
    movq        xmm5, [rdi+rcx]

    punpcklbw   xmm4, xmm7
    punpcklbw   xmm5, xmm7

    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    ; narrow to bytes with unsigned saturation
    packuswb    xmm0, xmm7
    packuswb    xmm1, xmm7
    packuswb    xmm2, xmm7
    packuswb    xmm3, xmm7

    ; store blocks back out
    movq        [rdi], xmm0
    movq        [rdi + rdx], xmm1
    movq        [rdi + rdx*2], xmm2
    movq        [rdi + rcx], xmm3

    ; begin epilog
    pop         rdi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_idct_dequant_dc_0_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *dst  - 2
;   int dst_stride      - 3
;   short *dc           - 4
; )
;
; DC-only reconstruct for two side-by-side blocks where the DC values are
; supplied separately: dc[0] belongs to block 0, dc[1] to block 1.
; Adds (dc + 4) >> 3 to rows 0..3 of dst (8 bytes per row), saturating to
; unsigned bytes.
;
; qcoeff and dequant are part of the signature but are not read here.
;
; Register roles:
;   rdi = dst, rdx = dc then dst_stride, rcx = 3 * dst_stride
;   xmm4 = per-pixel DC delta (words 0-3: block 0, words 4-7: block 1)
;   xmm5 = zero (used for byte<->word unpack/pack)
global sym(vp8_idct_dequant_dc_0_2x_sse2)
sym(vp8_idct_dequant_dc_0_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rdi
    ; end prolog

    mov         rdi, arg(2)                     ; dst
    mov         rdx, arg(4)                     ; dc

    ; xmm5 = 0
    pxor        xmm5, xmm5

    ; one 32-bit load fetches both 16-bit DC values
    movd        xmm4, [rdx]

    movsxd      rdx, dword ptr arg(3)           ; dst_stride
    lea         rcx, [rdx + rdx*2]              ; rcx = dst_stride * 3

    ; load the four destination rows (8 pixels each)
    movq        xmm0, [rdi]
    movq        xmm1, [rdi+rdx*1]
    movq        xmm2, [rdi+rdx*2]
    movq        xmm3, [rdi+rcx]

    ; replicate: dc0 dc1 -> dc0 dc0 dc1 dc1 -> dc0 x4, dc1 x4
    punpcklwd   xmm4, xmm4
    punpckldq   xmm4, xmm4

    ; round and downshift: (dc + 4) >> 3
    paddw       xmm4, [GLOBAL(fours)]
    psraw       xmm4, 3

    ; widen destination pixels from bytes to words
    punpcklbw   xmm0, xmm5
    punpcklbw   xmm1, xmm5
    punpcklbw   xmm2, xmm5
    punpcklbw   xmm3, xmm5

    ; add the DC delta
    paddw       xmm0, xmm4
    paddw       xmm1, xmm4
    paddw       xmm2, xmm4
    paddw       xmm3, xmm4

    ; narrow back to bytes with unsigned saturation
    packuswb    xmm0, xmm5
    packuswb    xmm1, xmm5
    packuswb    xmm2, xmm5
    packuswb    xmm3, xmm5

    ; store blocks back out
    movq        [rdi], xmm0
    movq        [rdi + rdx], xmm1
    movq        [rdi + rdx*2], xmm2
    movq        [rdi + rcx], xmm3

    ; begin epilog
    pop         rdi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_idct_dequant_dc_full_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *dst  - 2
;   int dst_stride      - 3
;   short *dc           - 4
; )
;
; Same as vp8_idct_dequant_full_2x_sse2, except that after dequantisation
; coefficient 0 of each block is replaced by an externally supplied DC:
; dc[0] for block 0 and dc[1] for block 1.
;
;   * qcoeff holds 32 shorts: block 0 at byte offset 0, block 1 at offset 32.
;     All 64 bytes are zeroed after being loaded.
;   * dequant holds 16 shorts, applied to both blocks.
;   * qcoeff and dequant are accessed with movdqa / aligned memory operands,
;     so both must be 16-byte aligned.
;
; Register roles:
;   rax = qcoeff, rdi = dst
;   rdx = dequant, then dc, then dst_stride; rcx = 3 * dst_stride
;   xmm0..xmm3 = the four vectors of the current pass, each holding block 0
;                in words 0-3 and block 1 in words 4-7
;   xmm7 = zero at load/store time, scratch in between
;
; Lane comments in the transposes read "bNN" = block b, element NN, listed
; from the highest word to the lowest.
global sym(vp8_idct_dequant_dc_full_2x_sse2)
sym(vp8_idct_dequant_dc_full_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rdi
    ; end prolog

    mov         rax, arg(0)                     ; qcoeff
    mov         rdx, arg(1)                     ; dequant

    mov         rdi, arg(2)                     ; dst

    ; xmm7 = 0, used to clear the coefficient buffer
    pxor        xmm7, xmm7

    ; note: the second and third 16-byte chunks land in xmm2 and xmm1
    ; (swapped) so that the repack below produces sensible data
    movdqa      xmm0, [rax]                     ; block 0, coeffs 0-7
    movdqa      xmm2, [rax+16]                  ; block 0, coeffs 8-15
    movdqa      xmm1, [rax+32]                  ; block 1, coeffs 0-7
    movdqa      xmm3, [rax+48]                  ; block 1, coeffs 8-15

    ; clear out coeffs
    movdqa      [rax], xmm7
    movdqa      [rax+16], xmm7
    movdqa      [rax+32], xmm7
    movdqa      [rax+48], xmm7

    ; dequantize qcoeff buffer
    pmullw      xmm0, [rdx]
    pmullw      xmm2, [rdx+16]
    pmullw      xmm1, [rdx]
    pmullw      xmm3, [rdx+16]

    ; DC component pointer (dequant no longer needed)
    mov         rdx, arg(4)

    ; repack so block 0 row x and block 1 row x share a register:
    ; xmm0..xmm3 = row 0..3, block 0 in the low half, block 1 in the high half
    movdqa      xmm4, xmm0
    punpckldq   xmm0, xmm1
    punpckhdq   xmm4, xmm1

    pshufd      xmm0, xmm0, 11011000b
    pshufd      xmm1, xmm4, 11011000b

    movdqa      xmm4, xmm2
    punpckldq   xmm2, xmm3
    punpckhdq   xmm4, xmm3

    pshufd      xmm2, xmm2, 11011000b
    pshufd      xmm3, xmm4, 11011000b

    ; overwrite coefficient 0 of each block with the supplied DC
    pinsrw      xmm0, [rdx], 0                  ; block 0 <- dc[0]
    pinsrw      xmm0, [rdx+2], 4                ; block 1 <- dc[1]

    ; ---- first pass (ip0..ip3 = xmm0..xmm3) ----
    psubw       xmm0, xmm2                      ; b1 = ip0 - ip2
    paddw       xmm2, xmm2

    movdqa      xmm5, xmm1
    paddw       xmm2, xmm0                      ; a1 = ip0 + ip2 (2*ip2 + b1)

    ; x_s1sqr2 is negative as a signed word, so pmulhw yields
    ; ip*k - ip; adding ip back gives ip*k
    pmulhw      xmm5, [GLOBAL(x_s1sqr2)]
    paddw       xmm5, xmm1                      ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7, xmm3
    pmulhw      xmm7, [GLOBAL(x_c1sqr2less1)]

    paddw       xmm7, xmm3                      ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7, xmm5                      ; c1 = ip3*cos - ip1*sin

    movdqa      xmm5, xmm1
    movdqa      xmm4, xmm3

    pmulhw      xmm5, [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5, xmm1                      ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      xmm3, [GLOBAL(x_s1sqr2)]
    paddw       xmm3, xmm4                      ; ip3 * sin(pi/8) * sqrt(2)

    paddw       xmm3, xmm5                      ; d1
    movdqa      xmm6, xmm2                      ; a1

    movdqa      xmm4, xmm0                      ; b1
    paddw       xmm2, xmm3                      ; a1 + d1

    ; the transpose below pairs xmm2 with xmm0 and xmm4 with xmm6, i.e. it
    ; treats the vectors in the order xmm2, xmm0, xmm4, xmm6
    paddw       xmm4, xmm7                      ; b1 + c1
    psubw       xmm0, xmm7                      ; b1 - c1

    psubw       xmm6, xmm3                      ; a1 - d1

    ; ---- transpose for the second pass ----
    movdqa      xmm7, xmm2                      ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2, xmm0                      ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7, xmm0                      ; 107 103 106 102 105 101 104 100

    movdqa      xmm5, xmm4                      ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4, xmm6                      ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5, xmm6                      ; 115 111 114 110 113 109 112 108

    movdqa      xmm1, xmm2                      ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2, xmm4                      ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1, xmm4                      ; 015 011 007 003 014 010 006 002

    movdqa      xmm6, xmm7                      ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7, xmm5                      ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6, xmm5                      ; 115 111 107 103 114 110 106 102

    movdqa      xmm5, xmm2                      ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2, xmm7                      ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5, xmm7                      ; 113 109 013 009 105 101 005 001

    movdqa      xmm7, xmm1                      ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1, xmm6                      ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7, xmm6                      ; 115 111 015 011 107 103 007 003

    ; regroup so each register is block 0 | block 1 again (order matters:
    ; xmm2 and xmm1 are read before they are overwritten)
    pshufd      xmm0, xmm2, 11011000b
    pshufd      xmm2, xmm1, 11011000b

    pshufd      xmm1, xmm5, 11011000b
    pshufd      xmm3, xmm7, 11011000b

    ; ---- second pass (same butterfly, plus rounding) ----
    psubw       xmm0, xmm2                      ; b1 = ip0 - ip2
    paddw       xmm2, xmm2

    movdqa      xmm5, xmm1
    paddw       xmm2, xmm0                      ; a1 = ip0 + ip2

    pmulhw      xmm5, [GLOBAL(x_s1sqr2)]
    paddw       xmm5, xmm1                      ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7, xmm3
    pmulhw      xmm7, [GLOBAL(x_c1sqr2less1)]

    paddw       xmm7, xmm3                      ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7, xmm5                      ; c1 = ip3*cos - ip1*sin

    movdqa      xmm5, xmm1
    movdqa      xmm4, xmm3

    pmulhw      xmm5, [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5, xmm1                      ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      xmm3, [GLOBAL(x_s1sqr2)]
    paddw       xmm3, xmm4                      ; ip3 * sin(pi/8) * sqrt(2)

    paddw       xmm3, xmm5                      ; d1

    ; rounding goes into a1 and b1 so every output picks up +4 exactly once
    paddw       xmm0, [GLOBAL(fours)]

    paddw       xmm2, [GLOBAL(fours)]
    movdqa      xmm6, xmm2                      ; a1

    movdqa      xmm4, xmm0                      ; b1
    paddw       xmm2, xmm3                      ; a1 + d1

    paddw       xmm4, xmm7                      ; b1 + c1
    psubw       xmm0, xmm7                      ; b1 - c1

    psubw       xmm6, xmm3                      ; a1 - d1
    psraw       xmm2, 3

    psraw       xmm0, 3
    psraw       xmm4, 3

    psraw       xmm6, 3

    ; ---- transpose to save (same shuffle network as above) ----
    movdqa      xmm7, xmm2                      ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2, xmm0                      ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7, xmm0                      ; 107 103 106 102 105 101 104 100

    movdqa      xmm5, xmm4                      ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4, xmm6                      ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5, xmm6                      ; 115 111 114 110 113 109 112 108

    movdqa      xmm1, xmm2                      ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2, xmm4                      ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1, xmm4                      ; 015 011 007 003 014 010 006 002

    movdqa      xmm6, xmm7                      ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7, xmm5                      ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6, xmm5                      ; 115 111 107 103 114 110 106 102

    movdqa      xmm5, xmm2                      ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2, xmm7                      ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5, xmm7                      ; 113 109 013 009 105 101 005 001

    movdqa      xmm7, xmm1                      ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1, xmm6                      ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7, xmm6                      ; 115 111 015 011 107 103 007 003

    pshufd      xmm0, xmm2, 11011000b
    pshufd      xmm2, xmm1, 11011000b

    pshufd      xmm1, xmm5, 11011000b
    pshufd      xmm3, xmm7, 11011000b

    ; xmm7 = 0 again for the byte<->word conversions
    pxor        xmm7, xmm7

    ; add the residual to the pixels already in dst, two rows at a time
    movsxd      rdx, dword ptr arg(3)           ; dst_stride (dc no longer needed)
    movq        xmm4, [rdi]
    movq        xmm5, [rdi+rdx]
    lea         rcx, [rdx + rdx*2]              ; rcx = dst_stride * 3

    punpcklbw   xmm4, xmm7
    punpcklbw   xmm5, xmm7

    paddw       xmm0, xmm4
    paddw       xmm1, xmm5

    movq        xmm4, [rdi+rdx*2]
    movq        xmm5, [rdi+rcx]

    punpcklbw   xmm4, xmm7
    punpcklbw   xmm5, xmm7

    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    ; narrow to bytes with unsigned saturation
    packuswb    xmm0, xmm7
    packuswb    xmm1, xmm7
    packuswb    xmm2, xmm7
    packuswb    xmm3, xmm7

    ; store blocks back out; rdx and rcx still hold stride and 3 * stride
    movq        [rdi], xmm0
    movq        [rdi + rdx], xmm1
    movq        [rdi + rdx*2], xmm2
    movq        [rdi + rcx], xmm3

    ; begin epilog
    pop         rdi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA

; Rounding term added before the final ">> 3" (eight packed words of 4).
align 16
fours:
    times 8 dw 0x0004

; 0x8A8C = 35468, i.e. 35468/65536 ~= 0.5412; the code above labels the
; product "sin(pi/8) * sqrt(2)". As a signed word this value is negative,
; so every pmulhw against it is followed by a paddw of the original operand.
align 16
x_s1sqr2:
    times 8 dw 0x8A8C

; 0x4E7B = 20091, i.e. 20091/65536 ~= 0.3066; pmulhw against it followed by
; a paddw of the original operand is labelled "cos(pi/8) * sqrt(2)" above.
align 16
x_c1sqr2less1:
    times 8 dw 0x4E7B