vpx/vp8/common/x86/mfqe_sse2.asm

;
;  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;void vp8_filter_by_weight16x16_sse2
;(
;    unsigned char *src,
;    int            src_stride,
;    unsigned char *dst,
;    int            dst_stride,
;    int            src_weight
;)
;
; Blends a 16x16 block of src into dst, in place, one 16-byte row per
; loop iteration:
;   dst[i] = (src[i] * src_weight + dst[i] * (16 - src_weight) + 8) >> 4
; Rows of both src and dst are accessed with movdqa and therefore must be
; 16-byte aligned.
; NOTE(review): src_weight is presumably in [0, 16] so that the 16-bit
; products cannot overflow -- confirm against the callers.
;
; Register use inside the loop:
;   rax = src row, rsi = src_stride, rdx = dst row, rdi = dst_stride,
;   rcx = rows remaining,
;   xmm0 = src_weight (8 words), xmm1 = dst_weight (8 words), xmm6 = zero
global sym(vp8_filter_by_weight16x16_sse2) PRIVATE
sym(vp8_filter_by_weight16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 6                              ; xmm6 is used as the zero register
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words

    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight = 16 - src_weight

    ; The strides are C ints. Only the low 32 bits of their argument slots
    ; are meaningful, so sign-extend them instead of loading the full slot.
    mov         rax, arg(0)                 ; src
    movsxd      rsi, dword arg(1)           ; src_stride
    mov         rdx, arg(2)                 ; dst
    movsxd      rdi, dword arg(3)           ; dst_stride

    mov         rcx, 16                     ; loop count
    pxor        xmm6, xmm6                  ; zero, for byte -> word unpacking

.combine:
    movdqa      xmm2, [rax]                 ; 16 src pixels
    movdqa      xmm4, [rdx]                 ; 16 dst pixels
    add         rax, rsi

    ; src * src_weight
    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm6                  ; pixels 0-7  as words
    punpckhbw   xmm3, xmm6                  ; pixels 8-15 as words
    pmullw      xmm2, xmm0
    pmullw      xmm3, xmm0

    ; dst * dst_weight
    movdqa      xmm5, xmm4
    punpcklbw   xmm4, xmm6
    punpckhbw   xmm5, xmm6
    pmullw      xmm4, xmm1
    pmullw      xmm5, xmm1

    ; sum, round and shift
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    paddw       xmm3, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4
    psrlw       xmm3, 4

    packuswb    xmm2, xmm3                  ; back to 16 unsigned bytes
    movdqa      [rdx], xmm2
    add         rdx, rdi

    dec         rcx
    jnz         .combine

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp

    ret

;void vp8_filter_by_weight8x8_sse2
;(
;    unsigned char *src,
;    int            src_stride,
;    unsigned char *dst,
;    int            dst_stride,
;    int            src_weight
;)
;
; Blends an 8x8 block of src into dst, in place, one 8-byte row per loop
; iteration:
;   dst[i] = (src[i] * src_weight + dst[i] * (16 - src_weight) + 8) >> 4
; Rows are read and written with movq, so no alignment is required.
; NOTE(review): src_weight is presumably in [0, 16] so that the 16-bit
; products cannot overflow -- confirm against the callers.
;
; Register use inside the loop:
;   rax = src row, rsi = src_stride, rdx = dst row, rdi = dst_stride,
;   rcx = rows remaining,
;   xmm0 = src_weight (8 words), xmm1 = dst_weight (8 words), xmm4 = zero
global sym(vp8_filter_by_weight8x8_sse2) PRIVATE
sym(vp8_filter_by_weight8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words

    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight = 16 - src_weight

    ; The strides are C ints. Only the low 32 bits of their argument slots
    ; are meaningful, so sign-extend them instead of loading the full slot.
    mov         rax, arg(0)                 ; src
    movsxd      rsi, dword arg(1)           ; src_stride
    mov         rdx, arg(2)                 ; dst
    movsxd      rdi, dword arg(3)           ; dst_stride

    mov         rcx, 8                      ; loop count
    pxor        xmm4, xmm4                  ; zero, for byte -> word unpacking

.combine:
    movq        xmm2, [rax]                 ; 8 src pixels
    movq        xmm3, [rdx]                 ; 8 dst pixels
    add         rax, rsi

    ; src * src_weight
    punpcklbw   xmm2, xmm4
    pmullw      xmm2, xmm0

    ; dst * dst_weight
    punpcklbw   xmm3, xmm4
    pmullw      xmm3, xmm1

    ; sum, round and shift
    paddw       xmm2, xmm3
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4

    packuswb    xmm2, xmm4                  ; low 8 bytes = result, high = 0
    movq        [rdx], xmm2
    add         rdx, rdi

    dec         rcx
    jnz         .combine

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp

    ret

;void vp8_variance_and_sad_16x16_sse2 | arg
;(
;    unsigned char *src1,          0
;    int            stride1,       1
;    unsigned char *src2,          2
;    int            stride2,       3
;    unsigned int  *variance,      4
;    unsigned int  *sad,           5
;)
;
; Walks two 16x16 blocks once and stores two per-pixel averages:
;   *sad      = (SUM |src1 - src2| + 128) >> 8
;   *variance = (SUM src2^2 - ((SUM src2)^2 >> 8) + 128) >> 8
; Rows are loaded with movdqu, so neither block needs to be aligned.
;
; Register use inside the loop:
;   rax = src1 row, rcx = stride1, rdx = src2 row, rdi = stride2,
;   rsi = rows remaining,
;   xmm3 = SAD accumulator, xmm4 = sum of src2, xmm5 = sum of src2^2
global sym(vp8_variance_and_sad_16x16_sse2) PRIVATE
sym(vp8_variance_and_sad_16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; The strides are C ints. Only the low 32 bits of their argument slots
    ; are meaningful, so sign-extend them instead of loading the full slot.
    mov         rax,        arg(0)          ; src1
    movsxd      rcx,        dword arg(1)    ; stride1
    mov         rdx,        arg(2)          ; src2
    movsxd      rdi,        dword arg(3)    ; stride2

    mov         rsi,        16              ; block height

    ; Prep accumulator registers
    pxor        xmm3, xmm3                  ; SAD
    pxor        xmm4, xmm4                  ; sum of src2
    pxor        xmm5, xmm5                  ; sum of src2^2

    ; Because we're working with the actual output frames
    ; we can't depend on any kind of data alignment, so use
    ; unaligned loads.
.accumulate:
    movdqu      xmm0, [rax]                 ; src1
    movdqu      xmm1, [rdx]                 ; src2
    add         rax, rcx                    ; src1 + stride1
    add         rdx, rdi                    ; src2 + stride2

    ; SAD(src1, src2)
    psadbw      xmm0, xmm1
    paddusw     xmm3, xmm0

    ; SUM(src2)
    pxor        xmm2, xmm2
    psadbw      xmm2, xmm1                  ; sum src2 by misusing SAD against 0
    paddusw     xmm4, xmm2

    ; pmaddubsw would be ideal if it took two unsigned values. instead,
    ; it expects a signed and an unsigned value. so instead we zero extend
    ; and operate on words.
    pxor        xmm2, xmm2
    movdqa      xmm0, xmm1
    punpcklbw   xmm0, xmm2
    punpckhbw   xmm1, xmm2
    pmaddwd     xmm0, xmm0                  ; adjacent squares summed to dwords
    pmaddwd     xmm1, xmm1
    paddd       xmm5, xmm0
    paddd       xmm5, xmm1

    sub         rsi,        1
    jnz         .accumulate

    ; phaddd only operates on adjacent double words.
    ; Finalize SAD and store: fold the two 64-bit halves, then
    ; (sad + 128) >> 8
    movdqa      xmm0, xmm3
    psrldq      xmm0, 8
    paddusw     xmm0, xmm3
    paddd       xmm0, [GLOBAL(t128)]
    psrld       xmm0, 8

    mov         rax,  arg(5)
    movd        [rax], xmm0

    ; Accumulate sum of src2
    movdqa      xmm0, xmm4
    psrldq      xmm0, 8
    paddusw     xmm0, xmm4
    ; Square src2. Ignore high value
    pmuludq     xmm0, xmm0
    psrld       xmm0, 8                     ; (sum of src2)^2 >> 8

    ; phaddw could be used to sum adjacent values but we want
    ; all the values summed. promote to doubles, accumulate,
    ; shift and sum
    pxor        xmm2, xmm2
    movdqa      xmm1, xmm5
    punpckldq   xmm1, xmm2
    punpckhdq   xmm5, xmm2
    paddd       xmm1, xmm5
    movdqa      xmm2, xmm1
    psrldq      xmm1, 8
    paddd       xmm1, xmm2                  ; low dword = sum of src2^2

    psubd       xmm1, xmm0

    ; (variance + 128) >> 8
    paddd       xmm1, [GLOBAL(t128)]
    psrld       xmm1, 8
    mov         rax,  arg(4)

    movd        [rax], xmm1


    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

; Read-only constants shared by the routines above. Each one is 16-byte
; aligned so it can be used as an aligned 128-bit memory operand.
SECTION_RODATA
align 16
; 128-bit constant with the value 128. It is added with paddd before the
; "psrld ..., 8" steps in vp8_variance_and_sad_16x16_sse2 so that the
; shift rounds to nearest.
t128:
; ddq (one 128-bit datum) is only emitted when __NASM_VER__ is undefined;
; otherwise the same constant is spelled as two qwords.
%ifndef __NASM_VER__
    ddq 128
%elif CONFIG_BIG_ENDIAN
    dq  0, 128
%else
    dq  128, 0
%endif
align 16
; Eight words of 16: the total blend weight. The filter routines subtract
; src_weight from it to obtain dst_weight.
tMFQE: ; 1 << MFQE_PRECISION
    times 8 dw 0x10
align 16
; Eight words of 8: rounding term added before the "psrlw ..., 4" in the
; filter routines.
tMFQE_round: ; 1 << (MFQE_PRECISION - 1)
    times 8 dw 0x08
RFC: Reorganize MFQE loops Break MFQE code into its own file. It is currently only valid for 16x16 and 8x8 Y blocks. It also filters 4x4 U/V blocks. Refactor filtering and add associated assembly. Limited test cases show --mfqe introduces a penalty of ~20% with HD content. The assembly reduces the penalty to ~15%. Change-Id: I4b8de6b5cdff5413037de5b6c42f437033ee55bf 2012-01-27 19:23:52 +01:00			`;`
			`; Copyright (c) 2012 The WebM project authors. All Rights Reserved.`
			`;`
			`; Use of this source code is governed by a BSD-style license`
			`; that can be found in the LICENSE file in the root of the source`
			`; tree. An additional intellectual property rights grant can be found`
			`; in the file PATENTS. All contributing project authors may`
			`; be found in the AUTHORS file in the root of the source tree.`
			`;`


			`%include "vpx_ports/x86_abi_support.asm"`

			`;void vp8_filter_by_weight16x16_sse2`
			`;(`
			`; unsigned char *src,`
			`; int src_stride,`
			`; unsigned char *dst,`
			`; int dst_stride,`
			`; int src_weight`
			`;)`
Make libvpx Chromium build friendly Add PRIVATE macro for adding private_extern directive for yasm to hide global symbols. This is only enabled if -DCHROMIUM is used with YASM. Also fixed a small problem with rtcd_defs.sh to guard TEMPORAL_DENOISING. Change-Id: I9027fce3ebddcf20078293e4b86b396f21da7857 2012-05-24 00:52:34 +02:00			`global sym(vp8_filter_by_weight16x16_sse2) PRIVATE`
RFC: Reorganize MFQE loops Break MFQE code into its own file. It is currently only valid for 16x16 and 8x8 Y blocks. It also filters 4x4 U/V blocks. Refactor filtering and add associated assembly. Limited test cases show --mfqe introduces a penalty of ~20% with HD content. The assembly reduces the penalty to ~15%. Change-Id: I4b8de6b5cdff5413037de5b6c42f437033ee55bf 2012-01-27 19:23:52 +01:00			`sym(vp8_filter_by_weight16x16_sse2):`
			`push rbp`
			`mov rbp, rsp`
			`SHADOW_ARGS_TO_STACK 5`
			`SAVE_XMM 6`
			`GET_GOT rbx`
			`push rsi`
			`push rdi`
			`; end prolog`

			`movd xmm0, arg(4) ; src_weight`
			`pshuflw xmm0, xmm0, 0x0 ; replicate to all low words`
			`punpcklqdq xmm0, xmm0 ; replicate to all hi words`

			`movdqa xmm1, [GLOBAL(tMFQE)]`
			`psubw xmm1, xmm0 ; dst_weight`

			`mov rax, arg(0) ; src`
			`mov rsi, arg(1) ; src_stride`
			`mov rdx, arg(2) ; dst`
			`mov rdi, arg(3) ; dst_stride`

			`mov rcx, 16 ; loop count`
			`pxor xmm6, xmm6`

			`.combine`
			`movdqa xmm2, [rax]`
			`movdqa xmm4, [rdx]`
			`add rax, rsi`

			`; src * src_weight`
			`movdqa xmm3, xmm2`
			`punpcklbw xmm2, xmm6`
			`punpckhbw xmm3, xmm6`
			`pmullw xmm2, xmm0`
			`pmullw xmm3, xmm0`

			`; dst * dst_weight`
			`movdqa xmm5, xmm4`
			`punpcklbw xmm4, xmm6`
			`punpckhbw xmm5, xmm6`
			`pmullw xmm4, xmm1`
			`pmullw xmm5, xmm1`

			`; sum, round and shift`
			`paddw xmm2, xmm4`
			`paddw xmm3, xmm5`
			`paddw xmm2, [GLOBAL(tMFQE_round)]`
			`paddw xmm3, [GLOBAL(tMFQE_round)]`
			`psrlw xmm2, 4`
			`psrlw xmm3, 4`

			`packuswb xmm2, xmm3`
			`movdqa [rdx], xmm2`
			`add rdx, rdi`

			`dec rcx`
			`jnz .combine`

			`; begin epilog`
			`pop rdi`
			`pop rsi`
			`RESTORE_GOT`
			`RESTORE_XMM`
			`UNSHADOW_ARGS`
			`pop rbp`

			`ret`

			`;void vp8_filter_by_weight8x8_sse2`
			`;(`
			`; unsigned char *src,`
			`; int src_stride,`
			`; unsigned char *dst,`
			`; int dst_stride,`
			`; int src_weight`
			`;)`
Make libvpx Chromium build friendly Add PRIVATE macro for adding private_extern directive for yasm to hide global symbols. This is only enabled if -DCHROMIUM is used with YASM. Also fixed a small problem with rtcd_defs.sh to guard TEMPORAL_DENOISING. Change-Id: I9027fce3ebddcf20078293e4b86b396f21da7857 2012-05-24 00:52:34 +02:00			`global sym(vp8_filter_by_weight8x8_sse2) PRIVATE`
RFC: Reorganize MFQE loops Break MFQE code into its own file. It is currently only valid for 16x16 and 8x8 Y blocks. It also filters 4x4 U/V blocks. Refactor filtering and add associated assembly. Limited test cases show --mfqe introduces a penalty of ~20% with HD content. The assembly reduces the penalty to ~15%. Change-Id: I4b8de6b5cdff5413037de5b6c42f437033ee55bf 2012-01-27 19:23:52 +01:00			`sym(vp8_filter_by_weight8x8_sse2):`
			`push rbp`
			`mov rbp, rsp`
			`SHADOW_ARGS_TO_STACK 5`
			`GET_GOT rbx`
			`push rsi`
			`push rdi`
			`; end prolog`

			`movd xmm0, arg(4) ; src_weight`
			`pshuflw xmm0, xmm0, 0x0 ; replicate to all low words`
			`punpcklqdq xmm0, xmm0 ; replicate to all hi words`

			`movdqa xmm1, [GLOBAL(tMFQE)]`
			`psubw xmm1, xmm0 ; dst_weight`

			`mov rax, arg(0) ; src`
			`mov rsi, arg(1) ; src_stride`
			`mov rdx, arg(2) ; dst`
			`mov rdi, arg(3) ; dst_stride`

			`mov rcx, 8 ; loop count`
			`pxor xmm4, xmm4`

			`.combine`
			`movq xmm2, [rax]`
			`movq xmm3, [rdx]`
			`add rax, rsi`

			`; src * src_weight`
			`punpcklbw xmm2, xmm4`
			`pmullw xmm2, xmm0`

			`; dst * dst_weight`
			`punpcklbw xmm3, xmm4`
			`pmullw xmm3, xmm1`

			`; sum, round and shift`
			`paddw xmm2, xmm3`
			`paddw xmm2, [GLOBAL(tMFQE_round)]`
			`psrlw xmm2, 4`

			`packuswb xmm2, xmm4`
			`movq [rdx], xmm2`
			`add rdx, rdi`

			`dec rcx`
			`jnz .combine`

			`; begin epilog`
			`pop rdi`
			`pop rsi`
			`RESTORE_GOT`
			`UNSHADOW_ARGS`
			`pop rbp`

			`ret`

			`;void vp8_variance_and_sad_16x16_sse2 \| arg`
			`;(`
			`; unsigned char *src1, 0`
			`; int stride1, 1`
			`; unsigned char *src2, 2`
			`; int stride2, 3`
			`; unsigned int *variance, 4`
			`; unsigned int *sad, 5`
			`;)`
Make libvpx Chromium build friendly Add PRIVATE macro for adding private_extern directive for yasm to hide global symbols. This is only enabled if -DCHROMIUM is used with YASM. Also fixed a small problem with rtcd_defs.sh to guard TEMPORAL_DENOISING. Change-Id: I9027fce3ebddcf20078293e4b86b396f21da7857 2012-05-24 00:52:34 +02:00			`global sym(vp8_variance_and_sad_16x16_sse2) PRIVATE`
RFC: Reorganize MFQE loops Break MFQE code into its own file. It is currently only valid for 16x16 and 8x8 Y blocks. It also filters 4x4 U/V blocks. Refactor filtering and add associated assembly. Limited test cases show --mfqe introduces a penalty of ~20% with HD content. The assembly reduces the penalty to ~15%. Change-Id: I4b8de6b5cdff5413037de5b6c42f437033ee55bf 2012-01-27 19:23:52 +01:00			`sym(vp8_variance_and_sad_16x16_sse2):`
			`push rbp`
			`mov rbp, rsp`
			`SHADOW_ARGS_TO_STACK 6`
			`GET_GOT rbx`
			`push rsi`
			`push rdi`
			`; end prolog`

			`mov rax, arg(0) ; src1`
			`mov rcx, arg(1) ; stride1`
			`mov rdx, arg(2) ; src2`
			`mov rdi, arg(3) ; stride2`

			`mov rsi, 16 ; block height`

			`; Prep accumulator registers`
			`pxor xmm3, xmm3 ; SAD`
			`pxor xmm4, xmm4 ; sum of src2`
			`pxor xmm5, xmm5 ; sum of src2^2`

			`; Because we're working with the actual output frames`
			`; we can't depend on any kind of data alignment.`
			`.accumulate`
			`movdqa xmm0, [rax] ; src1`
			`movdqa xmm1, [rdx] ; src2`
			`add rax, rcx ; src1 + stride1`
			`add rdx, rdi ; src2 + stride2`

			`; SAD(src1, src2)`
			`psadbw xmm0, xmm1`
			`paddusw xmm3, xmm0`

			`; SUM(src2)`
			`pxor xmm2, xmm2`
			`psadbw xmm2, xmm1 ; sum src2 by misusing SAD against 0`
			`paddusw xmm4, xmm2`

			`; pmaddubsw would be ideal if it took two unsigned values. instead,`
			`; it expects a signed and an unsigned value. so instead we zero extend`
			`; and operate on words.`
			`pxor xmm2, xmm2`
			`movdqa xmm0, xmm1`
			`punpcklbw xmm0, xmm2`
			`punpckhbw xmm1, xmm2`
			`pmaddwd xmm0, xmm0`
			`pmaddwd xmm1, xmm1`
			`paddd xmm5, xmm0`
			`paddd xmm5, xmm1`

			`sub rsi, 1`
			`jnz .accumulate`

			`; phaddd only operates on adjacent double words.`
			`; Finalize SAD and store`
			`movdqa xmm0, xmm3`
			`psrldq xmm0, 8`
			`paddusw xmm0, xmm3`
			`paddd xmm0, [GLOBAL(t128)]`
			`psrld xmm0, 8`

			`mov rax, arg(5)`
			`movd [rax], xmm0`

			`; Accumulate sum of src2`
			`movdqa xmm0, xmm4`
			`psrldq xmm0, 8`
			`paddusw xmm0, xmm4`
			`; Square src2. Ignore high value`
			`pmuludq xmm0, xmm0`
			`psrld xmm0, 8`

			`; phaddw could be used to sum adjacent values but we want`
			`; all the values summed. promote to doubles, accumulate,`
			`; shift and sum`
			`pxor xmm2, xmm2`
			`movdqa xmm1, xmm5`
			`punpckldq xmm1, xmm2`
			`punpckhdq xmm5, xmm2`
			`paddd xmm1, xmm5`
			`movdqa xmm2, xmm1`
			`psrldq xmm1, 8`
			`paddd xmm1, xmm2`

			`psubd xmm1, xmm0`

			`; (variance + 128) >> 8`
			`paddd xmm1, [GLOBAL(t128)]`
			`psrld xmm1, 8`
			`mov rax, arg(4)`

			`movd [rax], xmm1`


			`; begin epilog`
			`pop rdi`
			`pop rsi`
			`RESTORE_GOT`
			`UNSHADOW_ARGS`
			`pop rbp`
			`ret`

			`SECTION_RODATA`
			`align 16`
			`t128:`
Use dq instead of ddq with NASM Change-Id: Iffb7cd44b449dc10fa5c24405be909d051b7abb5 2013-02-07 14:03:11 +01:00			`%ifndef __NASM_VER__`
RFC: Reorganize MFQE loops Break MFQE code into its own file. It is currently only valid for 16x16 and 8x8 Y blocks. It also filters 4x4 U/V blocks. Refactor filtering and add associated assembly. Limited test cases show --mfqe introduces a penalty of ~20% with HD content. The assembly reduces the penalty to ~15%. Change-Id: I4b8de6b5cdff5413037de5b6c42f437033ee55bf 2012-01-27 19:23:52 +01:00			`ddq 128`
Use dq instead of ddq with NASM Change-Id: Iffb7cd44b449dc10fa5c24405be909d051b7abb5 2013-02-07 14:03:11 +01:00			`%elif CONFIG_BIG_ENDIAN`
			`dq 0, 128`
			`%else`
			`dq 128, 0`
			`%endif`
RFC: Reorganize MFQE loops Break MFQE code into its own file. It is currently only valid for 16x16 and 8x8 Y blocks. It also filters 4x4 U/V blocks. Refactor filtering and add associated assembly. Limited test cases show --mfqe introduces a penalty of ~20% with HD content. The assembly reduces the penalty to ~15%. Change-Id: I4b8de6b5cdff5413037de5b6c42f437033ee55bf 2012-01-27 19:23:52 +01:00			`align 16`
			`tMFQE: ; 1 << MFQE_PRECISION`
			`times 8 dw 0x10`
			`align 16`
			`tMFQE_round: ; 1 << (MFQE_PRECISION - 1)`
			`times 8 dw 0x08`