vpx/vpx_dsp/x86/sad_ssse3.asm

;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; PROCESS_16X2X3 first
;
; Handles two 16-byte rows for three horizontally adjacent reference
; candidates (ref, ref+1, ref+2) using unaligned reference loads.
;
;   %1      non-zero: first row pair, the SADs initialise xmm5/xmm6/xmm7
;           zero:     the SADs are added to xmm5/xmm6/xmm7
;
;   in      rsi = source row   (loaded with movdqa, so it must be 16-byte aligned)
;           rax = source stride
;           rdi = reference row (loaded with lddqu, any alignment)
;           rdx = reference stride
;   out     xmm5/xmm6/xmm7 = running psadbw sums for ref, ref+1, ref+2
;           rsi and rdi advanced by two rows
;   clobber xmm0-xmm3
%macro PROCESS_16X2X3 1
        ; Row 0: the source row is fetched the same way on both paths.
        movdqa          xmm0,       XMMWORD PTR [rsi]
%if %1
        ; First pair: load the candidates straight into the accumulators.
        lddqu           xmm5,       XMMWORD PTR [rdi]
        lddqu           xmm6,       XMMWORD PTR [rdi+1]
        lddqu           xmm7,       XMMWORD PTR [rdi+2]

        psadbw          xmm5,       xmm0
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        ; Later pairs: compute in scratch registers, then accumulate.
        lddqu           xmm1,       XMMWORD PTR [rdi]
        lddqu           xmm2,       XMMWORD PTR [rdi+1]
        lddqu           xmm3,       XMMWORD PTR [rdi+2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif
        ; Row 1: always accumulated on top of row 0.
        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
        lddqu           xmm1,       XMMWORD PTR [rdi+rdx]
        lddqu           xmm2,       XMMWORD PTR [rdi+rdx+1]
        lddqu           xmm3,       XMMWORD PTR [rdi+rdx+2]

        ; Step both pointers down two rows; the loads above are already issued.
        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro

; PROCESS_16X2X3_OFFSET first, offset
;
; Same job as PROCESS_16X2X3, but the reference row is read as two aligned
; 16-byte blocks and the three candidates are extracted with palignr.
;
;   %1      non-zero: first row pair, the SADs initialise xmm5/xmm6/xmm7
;           zero:     the SADs are added to xmm5/xmm6/xmm7
;   %2      byte offset of the first candidate inside the 32 bytes at [rdi];
;           the candidates are taken at %2, %2+1 and %2+2
;
;   in      rsi = source row (movdqa, must be 16-byte aligned)
;           rax = source stride
;           rdi = reference row rounded down to a 16-byte boundary
;                 (both [rdi] and [rdi+rdx] are read with movdqa)
;           rdx = reference stride
;   out     xmm5/xmm6/xmm7 = running psadbw sums for the three candidates
;           rsi and rdi advanced by two rows
;   clobber xmm0-xmm4
%macro PROCESS_16X2X3_OFFSET 2
        ; Row 0: source row and low reference block are common to both paths.
        movdqa          xmm0,       XMMWORD PTR [rsi]
        movdqa          xmm4,       XMMWORD PTR [rdi]
%if %1
        ; First pair: build the candidates directly in the accumulators.
        movdqa          xmm7,       XMMWORD PTR [rdi+16]

        movdqa          xmm5,       xmm7
        palignr         xmm5,       xmm4,       %2

        movdqa          xmm6,       xmm7
        palignr         xmm6,       xmm4,       (%2+1)

        palignr         xmm7,       xmm4,       (%2+2)

        psadbw          xmm5,       xmm0
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        ; Later pairs: build the candidates in scratch, then accumulate.
        movdqa          xmm3,       XMMWORD PTR [rdi+16]

        movdqa          xmm1,       xmm3
        palignr         xmm1,       xmm4,       %2

        movdqa          xmm2,       xmm3
        palignr         xmm2,       xmm4,       (%2+1)

        palignr         xmm3,       xmm4,       (%2+2)

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif
        ; Row 1: always accumulated on top of row 0.
        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
        movdqa          xmm4,       XMMWORD PTR [rdi+rdx]
        movdqa          xmm3,       XMMWORD PTR [rdi+rdx+16]

        movdqa          xmm1,       xmm3
        palignr         xmm1,       xmm4,       %2

        movdqa          xmm2,       xmm3
        palignr         xmm2,       xmm4,       (%2+1)

        palignr         xmm3,       xmm4,       (%2+2)

        ; Step both pointers down two rows.
        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro

; PROCESS_16X16X3_OFFSET offset, label_prefix
;
; Emits the 16-row SAD body for one reference misalignment.
;
;   %1      byte offset of the reference pointer inside its 16-byte block;
;           rdi is moved back by this amount and it is also the palignr shift
;   %2      label prefix of the enclosing function
;
; Defines the label <prefix>_aligned_by_<offset>, runs eight two-row steps
; (the first one initialises the accumulators) and then jumps to
; <prefix>_store_off.
%macro PROCESS_16X16X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi,        %1

        PROCESS_16X2X3_OFFSET 1, %1
%rep 7
        PROCESS_16X2X3_OFFSET 0, %1
%endrep

        jmp             %2_store_off

%endmacro

; PROCESS_16X8X3_OFFSET offset, label_prefix
;
; Emits the 8-row SAD body for one reference misalignment.
;
;   %1      byte offset of the reference pointer inside its 16-byte block;
;           rdi is moved back by this amount and it is also the palignr shift
;   %2      label prefix of the enclosing function
;
; Defines the label <prefix>_aligned_by_<offset>, runs four two-row steps
; (the first one initialises the accumulators) and then jumps to
; <prefix>_store_off.
%macro PROCESS_16X8X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi,        %1

        PROCESS_16X2X3_OFFSET 1, %1
%rep 3
        PROCESS_16X2X3_OFFSET 0, %1
%endrep

        jmp             %2_store_off

%endmacro

;void vpx_sad16x16x3_ssse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Computes the sum of absolute differences between a 16x16 source block and
; three reference blocks starting at ref_ptr, ref_ptr+1 and ref_ptr+2, and
; stores them in results[0], results[1] and results[2].
;
; Source rows are read with movdqa, so every source row must be 16-byte
; aligned. The code path is selected from the low four bits of ref_ptr:
; offsets 0-14 use aligned loads plus palignr, offset 15 uses lddqu.
; NOTE(review): the palignr paths read [rdi+rdx] with movdqa after aligning
; rdi, so they assume ref_stride preserves the row alignment - confirm that
; callers guarantee this.
;
; sym(), arg(), SHADOW_ARGS_TO_STACK, SAVE_XMM, RESTORE_XMM and UNSHADOW_ARGS
; come from the included x86_abi_support.asm (not shown here).
global sym(vpx_sad16x16x3_ssse3) PRIVATE
sym(vpx_sad16x16x3_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rcx
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        ; rdx = ref_ptr & 15, the index into the jump table below
        mov             rdx,        0xf
        and             rdx,        rdi

        ; The table lives in the instruction stream, so hop over it.
        jmp .vpx_sad16x16x3_ssse3_skiptable
        ; One 32-bit entry per alignment, each stored as a distance from
        ; do_jump so that no absolute address is needed.
.vpx_sad16x16x3_ssse3_jumptable:
        dd .vpx_sad16x16x3_ssse3_aligned_by_0  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_1  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_2  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_3  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_4  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_5  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_6  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_7  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_8  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_9  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_10 - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_11 - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_12 - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_13 - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_14 - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_15 - .vpx_sad16x16x3_ssse3_do_jump
.vpx_sad16x16x3_ssse3_skiptable:

        ; call pushes the address of do_jump as its return address; popping
        ; it yields the run-time address that the table entries are based on.
        call .vpx_sad16x16x3_ssse3_do_jump
.vpx_sad16x16x3_ssse3_do_jump:
        pop             rcx                         ; rcx = run-time address of do_jump
        mov             rax,  .vpx_sad16x16x3_ssse3_jumptable - .vpx_sad16x16x3_ssse3_do_jump
        add             rax,  rcx  ; rax = run-time address of the jump table

        movsxd          rax,  dword [rax + 4*rdx]   ; fetch the 32-bit entry for this alignment
        ; rcx = do_jump + entry = address of the matching aligned_by_N label
        add             rcx,        rax

        ; rax and rdx are free again; load the strides the row macros expect.
        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        jmp             rcx

        ; Each expansion defines its aligned_by_N label and ends with a jump
        ; to store_off, so the bodies never fall through into each other.
        PROCESS_16X16X3_OFFSET 0,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 1,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 2,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 3,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 4,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 5,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 6,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 7,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 8,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 9,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 10, .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 11, .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 12, .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 13, .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 14, .vpx_sad16x16x3_ssse3

        ; Offset 15 uses unaligned loads (8 row pairs = 16 rows) and falls
        ; through into store_off.
.vpx_sad16x16x3_ssse3_aligned_by_15:
        PROCESS_16X2X3 1
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0

        ; xmm5/xmm6/xmm7 each hold two partial sums, one per 64-bit half.
        ; Add the halves together and store each total as a 32-bit result.
.vpx_sad16x16x3_ssse3_store_off:
        mov             rdi,        arg(4) ;Results

        ; results[0]: SAD against ref_ptr
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi],      xmm0
;-
        ; results[1]: SAD against ref_ptr + 1
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+4],    xmm0
;-
        ; results[2]: SAD against ref_ptr + 2
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+8],    xmm0

    ; begin epilog
    pop         rcx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vpx_sad16x8x3_ssse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Computes the sum of absolute differences between a 16x8 source block and
; three reference blocks starting at ref_ptr, ref_ptr+1 and ref_ptr+2, and
; stores them in results[0], results[1] and results[2].
;
; Source rows are read with movdqa, so every source row must be 16-byte
; aligned. The code path is selected from the low four bits of ref_ptr:
; offsets 0-14 use aligned loads plus palignr, offset 15 uses lddqu.
; NOTE(review): the palignr paths read [rdi+rdx] with movdqa after aligning
; rdi, so they assume ref_stride preserves the row alignment - confirm that
; callers guarantee this.
;
; sym(), arg(), SHADOW_ARGS_TO_STACK, SAVE_XMM, RESTORE_XMM and UNSHADOW_ARGS
; come from the included x86_abi_support.asm (not shown here).
global sym(vpx_sad16x8x3_ssse3) PRIVATE
sym(vpx_sad16x8x3_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rcx
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        ; rdx = ref_ptr & 15, the index into the jump table below
        mov             rdx,        0xf
        and             rdx,        rdi

        ; The table lives in the instruction stream, so hop over it.
        jmp .vpx_sad16x8x3_ssse3_skiptable
        ; One 32-bit entry per alignment, each stored as a distance from
        ; do_jump so that no absolute address is needed.
.vpx_sad16x8x3_ssse3_jumptable:
        dd .vpx_sad16x8x3_ssse3_aligned_by_0  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_1  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_2  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_3  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_4  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_5  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_6  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_7  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_8  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_9  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_10 - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_11 - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_12 - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_13 - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_14 - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_15 - .vpx_sad16x8x3_ssse3_do_jump
.vpx_sad16x8x3_ssse3_skiptable:

        ; call pushes the address of do_jump as its return address; popping
        ; it yields the run-time address that the table entries are based on.
        call .vpx_sad16x8x3_ssse3_do_jump
.vpx_sad16x8x3_ssse3_do_jump:
        pop             rcx                         ; rcx = run-time address of do_jump
        mov             rax,  .vpx_sad16x8x3_ssse3_jumptable - .vpx_sad16x8x3_ssse3_do_jump
        add             rax,  rcx  ; rax = run-time address of the jump table

        movsxd          rax,  dword [rax + 4*rdx]   ; fetch the 32-bit entry for this alignment
        ; rcx = do_jump + entry = address of the matching aligned_by_N label
        add             rcx,        rax

        ; rax and rdx are free again; load the strides the row macros expect.
        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        jmp             rcx

        ; Each expansion defines its aligned_by_N label and ends with a jump
        ; to store_off, so the bodies never fall through into each other.
        PROCESS_16X8X3_OFFSET 0,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 1,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 2,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 3,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 4,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 5,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 6,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 7,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 8,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 9,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 10, .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 11, .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 12, .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 13, .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 14, .vpx_sad16x8x3_ssse3

        ; Offset 15 uses unaligned loads (4 row pairs = 8 rows) and falls
        ; through into store_off.
.vpx_sad16x8x3_ssse3_aligned_by_15:

        PROCESS_16X2X3 1
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0

        ; xmm5/xmm6/xmm7 each hold two partial sums, one per 64-bit half.
        ; Add the halves together and store each total as a 32-bit result.
.vpx_sad16x8x3_ssse3_store_off:
        mov             rdi,        arg(4) ;Results

        ; results[0]: SAD against ref_ptr
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi],      xmm0
;-
        ; results[1]: SAD against ref_ptr + 1
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+4],    xmm0
;-
        ; results[2]: SAD against ref_ptr + 2
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+8],    xmm0

    ; begin epilog
    pop         rcx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
Initial WebM release 2010-05-18 17:58:33 +02:00			`;`
Use WebM in copyright notice for consistency Changes 'The VP8 project' to 'The WebM project', for consistency with other webmproject.org repositories. Fixes issue #97. Change-Id: I37c13ed5fbdb9d334ceef71c6350e9febed9bbba 2010-09-09 14:16:39 +02:00			`; Copyright (c) 2010 The WebM project authors. All Rights Reserved.`
Initial WebM release 2010-05-18 17:58:33 +02:00			`;`
cosmetics: trim trailing whitespace When the license headers were updated, they accidentally contained trailing whitespace, so unfortunately we have to touch all the files again. Change-Id: I236c05fade06589e417179c0444cb39b09e4200d 2010-06-18 18:39:21 +02:00			`; Use of this source code is governed by a BSD-style license`
LICENSE: update with latest text Change-Id: Ieebea089095d9073b3a94932791099f614ce120c 2010-06-04 22:19:40 +02:00			`; that can be found in the LICENSE file in the root of the source`
			`; tree. An additional intellectual property rights grant can be found`
cosmetics: trim trailing whitespace When the license headers were updated, they accidentally contained trailing whitespace, so unfortunately we have to touch all the files again. Change-Id: I236c05fade06589e417179c0444cb39b09e4200d 2010-06-18 18:39:21 +02:00			`; in the file PATENTS. All contributing project authors may`
LICENSE: update with latest text Change-Id: Ieebea089095d9073b3a94932791099f614ce120c 2010-06-04 22:19:40 +02:00			`; be found in the AUTHORS file in the root of the source tree.`
Initial WebM release 2010-05-18 17:58:33 +02:00			`;`


			`%include "vpx_ports/x86_abi_support.asm"`

			`%macro PROCESS_16X2X3 1`
			`%if %1`
Correct QWORD usage in assembly files QWORD was being undefined because it was being used incorrectly. Change-Id: I3610cefa3d6f0da4054316760f78b9694cde3876 2010-10-14 01:57:57 +02:00			`movdqa xmm0, XMMWORD PTR [rsi]`
			`lddqu xmm5, XMMWORD PTR [rdi]`
			`lddqu xmm6, XMMWORD PTR [rdi+1]`
			`lddqu xmm7, XMMWORD PTR [rdi+2]`
Initial WebM release 2010-05-18 17:58:33 +02:00
			`psadbw xmm5, xmm0`
			`psadbw xmm6, xmm0`
			`psadbw xmm7, xmm0`
			`%else`
Correct QWORD usage in assembly files QWORD was being undefined because it was being used incorrectly. Change-Id: I3610cefa3d6f0da4054316760f78b9694cde3876 2010-10-14 01:57:57 +02:00			`movdqa xmm0, XMMWORD PTR [rsi]`
			`lddqu xmm1, XMMWORD PTR [rdi]`
			`lddqu xmm2, XMMWORD PTR [rdi+1]`
			`lddqu xmm3, XMMWORD PTR [rdi+2]`
Initial WebM release 2010-05-18 17:58:33 +02:00
			`psadbw xmm1, xmm0`
			`psadbw xmm2, xmm0`
			`psadbw xmm3, xmm0`

			`paddw xmm5, xmm1`
			`paddw xmm6, xmm2`
			`paddw xmm7, xmm3`
			`%endif`
Correct QWORD usage in assembly files QWORD was being undefined because it was being used incorrectly. Change-Id: I3610cefa3d6f0da4054316760f78b9694cde3876 2010-10-14 01:57:57 +02:00			`movdqa xmm0, XMMWORD PTR [rsi+rax]`
			`lddqu xmm1, XMMWORD PTR [rdi+rdx]`
			`lddqu xmm2, XMMWORD PTR [rdi+rdx+1]`
			`lddqu xmm3, XMMWORD PTR [rdi+rdx+2]`
Initial WebM release 2010-05-18 17:58:33 +02:00
			`lea rsi, [rsi+rax*2]`
			`lea rdi, [rdi+rdx*2]`

			`psadbw xmm1, xmm0`
			`psadbw xmm2, xmm0`
			`psadbw xmm3, xmm0`

			`paddw xmm5, xmm1`
			`paddw xmm6, xmm2`
			`paddw xmm7, xmm3`
			`%endmacro`

			`%macro PROCESS_16X2X3_OFFSET 2`
			`%if %1`
Correct QWORD usage in assembly files QWORD was being undefined because it was being used incorrectly. Change-Id: I3610cefa3d6f0da4054316760f78b9694cde3876 2010-10-14 01:57:57 +02:00			`movdqa xmm0, XMMWORD PTR [rsi]`
			`movdqa xmm4, XMMWORD PTR [rdi]`
			`movdqa xmm7, XMMWORD PTR [rdi+16]`
Initial WebM release 2010-05-18 17:58:33 +02:00
			`movdqa xmm5, xmm7`
			`palignr xmm5, xmm4, %2`

			`movdqa xmm6, xmm7`
			`palignr xmm6, xmm4, (%2+1)`

			`palignr xmm7, xmm4, (%2+2)`

			`psadbw xmm5, xmm0`
			`psadbw xmm6, xmm0`
			`psadbw xmm7, xmm0`
			`%else`
Correct QWORD usage in assembly files QWORD was being undefined because it was being used incorrectly. Change-Id: I3610cefa3d6f0da4054316760f78b9694cde3876 2010-10-14 01:57:57 +02:00			`movdqa xmm0, XMMWORD PTR [rsi]`
			`movdqa xmm4, XMMWORD PTR [rdi]`
			`movdqa xmm3, XMMWORD PTR [rdi+16]`
Initial WebM release 2010-05-18 17:58:33 +02:00
			`movdqa xmm1, xmm3`
			`palignr xmm1, xmm4, %2`

			`movdqa xmm2, xmm3`
			`palignr xmm2, xmm4, (%2+1)`

			`palignr xmm3, xmm4, (%2+2)`

			`psadbw xmm1, xmm0`
			`psadbw xmm2, xmm0`
			`psadbw xmm3, xmm0`

			`paddw xmm5, xmm1`
			`paddw xmm6, xmm2`
			`paddw xmm7, xmm3`
			`%endif`
Correct QWORD usage in assembly files QWORD was being undefined because it was being used incorrectly. Change-Id: I3610cefa3d6f0da4054316760f78b9694cde3876 2010-10-14 01:57:57 +02:00			`movdqa xmm0, XMMWORD PTR [rsi+rax]`
			`movdqa xmm4, XMMWORD PTR [rdi+rdx]`
			`movdqa xmm3, XMMWORD PTR [rdi+rdx+16]`
Initial WebM release 2010-05-18 17:58:33 +02:00
			`movdqa xmm1, xmm3`
			`palignr xmm1, xmm4, %2`

			`movdqa xmm2, xmm3`
			`palignr xmm2, xmm4, (%2+1)`

			`palignr xmm3, xmm4, (%2+2)`

			`lea rsi, [rsi+rax*2]`
			`lea rdi, [rdi+rdx*2]`

			`psadbw xmm1, xmm0`
			`psadbw xmm2, xmm0`
			`psadbw xmm3, xmm0`

			`paddw xmm5, xmm1`
			`paddw xmm6, xmm2`
			`paddw xmm7, xmm3`
			`%endmacro`

			`%macro PROCESS_16X16X3_OFFSET 2`
			`%2_aligned_by_%1:`

			`sub rdi, %1`

			`PROCESS_16X2X3_OFFSET 1, %1`
			`PROCESS_16X2X3_OFFSET 0, %1`
			`PROCESS_16X2X3_OFFSET 0, %1`
			`PROCESS_16X2X3_OFFSET 0, %1`
			`PROCESS_16X2X3_OFFSET 0, %1`
			`PROCESS_16X2X3_OFFSET 0, %1`
			`PROCESS_16X2X3_OFFSET 0, %1`
			`PROCESS_16X2X3_OFFSET 0, %1`

			`jmp %2_store_off`

			`%endmacro`

			`%macro PROCESS_16X8X3_OFFSET 2`
			`%2_aligned_by_%1:`

			`sub rdi, %1`

			`PROCESS_16X2X3_OFFSET 1, %1`
			`PROCESS_16X2X3_OFFSET 0, %1`
			`PROCESS_16X2X3_OFFSET 0, %1`
			`PROCESS_16X2X3_OFFSET 0, %1`

			`jmp %2_store_off`

			`%endmacro`

Move shared SAD code to vpx_dsp Create a new component, vpx_dsp, for code that can be shared between codecs. Move the SAD code into the component. This reduces the size of vpxenc/dec by 36k on x86_64 builds. Change-Id: I73f837ddaecac6b350bf757af0cfe19c4ab9327a 2015-04-17 22:11:38 +02:00			`;void int vpx_sad16x16x3_ssse3(`
Initial WebM release 2010-05-18 17:58:33 +02:00			`; unsigned char *src_ptr,`
			`; int src_stride,`
			`; unsigned char *ref_ptr,`
			`; int ref_stride,`
			`; int *results)`
Move shared SAD code to vpx_dsp Create a new component, vpx_dsp, for code that can be shared between codecs. Move the SAD code into the component. This reduces the size of vpxenc/dec by 36k on x86_64 builds. Change-Id: I73f837ddaecac6b350bf757af0cfe19c4ab9327a 2015-04-17 22:11:38 +02:00			`global sym(vpx_sad16x16x3_ssse3) PRIVATE`
			`sym(vpx_sad16x16x3_ssse3):`
Initial WebM release 2010-05-18 17:58:33 +02:00			`push rbp`
			`mov rbp, rsp`
			`SHADOW_ARGS_TO_STACK 5`
modify SAVE_XMM for potential 64bit use the win64 abi requires saving and restoring xmm6:xmm15. currently SAVE_XMM and RESTORE XMM only allow for saving xmm6:xmm7. allow specifying the highest register used and if the stack is unaligned. Change-Id: Ica5699622ffe3346d3a486f48eef0206c51cf867 2011-04-15 16:05:20 +02:00			`SAVE_XMM 7`
Initial WebM release 2010-05-18 17:58:33 +02:00			`push rsi`
			`push rdi`
			`push rcx`
			`; end prolog`

			`mov rsi, arg(0) ;src_ptr`
			`mov rdi, arg(2) ;ref_ptr`

			`mov rdx, 0xf`
			`and rdx, rdi`

Move shared SAD code to vpx_dsp Create a new component, vpx_dsp, for code that can be shared between codecs. Move the SAD code into the component. This reduces the size of vpxenc/dec by 36k on x86_64 builds. Change-Id: I73f837ddaecac6b350bf757af0cfe19c4ab9327a 2015-04-17 22:11:38 +02:00			`jmp .vpx_sad16x16x3_ssse3_skiptable`
			`.vpx_sad16x16x3_ssse3_jumptable:`
			`dd .vpx_sad16x16x3_ssse3_aligned_by_0 - .vpx_sad16x16x3_ssse3_do_jump`
			`dd .vpx_sad16x16x3_ssse3_aligned_by_1 - .vpx_sad16x16x3_ssse3_do_jump`
			`dd .vpx_sad16x16x3_ssse3_aligned_by_2 - .vpx_sad16x16x3_ssse3_do_jump`
			`dd .vpx_sad16x16x3_ssse3_aligned_by_3 - .vpx_sad16x16x3_ssse3_do_jump`
			`dd .vpx_sad16x16x3_ssse3_aligned_by_4 - .vpx_sad16x16x3_ssse3_do_jump`
			`dd .vpx_sad16x16x3_ssse3_aligned_by_5 - .vpx_sad16x16x3_ssse3_do_jump`
			`dd .vpx_sad16x16x3_ssse3_aligned_by_6 - .vpx_sad16x16x3_ssse3_do_jump`
			`dd .vpx_sad16x16x3_ssse3_aligned_by_7 - .vpx_sad16x16x3_ssse3_do_jump`
			`dd .vpx_sad16x16x3_ssse3_aligned_by_8 - .vpx_sad16x16x3_ssse3_do_jump`
			`dd .vpx_sad16x16x3_ssse3_aligned_by_9 - .vpx_sad16x16x3_ssse3_do_jump`
			`dd .vpx_sad16x16x3_ssse3_aligned_by_10 - .vpx_sad16x16x3_ssse3_do_jump`
			`dd .vpx_sad16x16x3_ssse3_aligned_by_11 - .vpx_sad16x16x3_ssse3_do_jump`
			`dd .vpx_sad16x16x3_ssse3_aligned_by_12 - .vpx_sad16x16x3_ssse3_do_jump`
			`dd .vpx_sad16x16x3_ssse3_aligned_by_13 - .vpx_sad16x16x3_ssse3_do_jump`
			`dd .vpx_sad16x16x3_ssse3_aligned_by_14 - .vpx_sad16x16x3_ssse3_do_jump`
			`dd .vpx_sad16x16x3_ssse3_aligned_by_15 - .vpx_sad16x16x3_ssse3_do_jump`
			`.vpx_sad16x16x3_ssse3_skiptable:`

			`call .vpx_sad16x16x3_ssse3_do_jump`
			`.vpx_sad16x16x3_ssse3_do_jump:`
Initial WebM release 2010-05-18 17:58:33 +02:00			`pop rcx ; get the address of do_jump`
Move shared SAD code to vpx_dsp Create a new component, vpx_dsp, for code that can be shared between codecs. Move the SAD code into the component. This reduces the size of vpxenc/dec by 36k on x86_64 builds. Change-Id: I73f837ddaecac6b350bf757af0cfe19c4ab9327a 2015-04-17 22:11:38 +02:00			`mov rax, .vpx_sad16x16x3_ssse3_jumptable - .vpx_sad16x16x3_ssse3_do_jump`
			`add rax, rcx ; get the absolute address of vpx_sad16x16x3_ssse3_jumptable`
Initial WebM release 2010-05-18 17:58:33 +02:00
			`movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable`
			`add rcx, rax`

			`movsxd rax, dword ptr arg(1) ;src_stride`
			`movsxd rdx, dword ptr arg(3) ;ref_stride`

			`jmp rcx`

Move shared SAD code to vpx_dsp Create a new component, vpx_dsp, for code that can be shared between codecs. Move the SAD code into the component. This reduces the size of vpxenc/dec by 36k on x86_64 builds. Change-Id: I73f837ddaecac6b350bf757af0cfe19c4ab9327a 2015-04-17 22:11:38 +02:00			`PROCESS_16X16X3_OFFSET 0, .vpx_sad16x16x3_ssse3`
			`PROCESS_16X16X3_OFFSET 1, .vpx_sad16x16x3_ssse3`
			`PROCESS_16X16X3_OFFSET 2, .vpx_sad16x16x3_ssse3`
			`PROCESS_16X16X3_OFFSET 3, .vpx_sad16x16x3_ssse3`
			`PROCESS_16X16X3_OFFSET 4, .vpx_sad16x16x3_ssse3`
			`PROCESS_16X16X3_OFFSET 5, .vpx_sad16x16x3_ssse3`
			`PROCESS_16X16X3_OFFSET 6, .vpx_sad16x16x3_ssse3`
			`PROCESS_16X16X3_OFFSET 7, .vpx_sad16x16x3_ssse3`
			`PROCESS_16X16X3_OFFSET 8, .vpx_sad16x16x3_ssse3`
			`PROCESS_16X16X3_OFFSET 9, .vpx_sad16x16x3_ssse3`
			`PROCESS_16X16X3_OFFSET 10, .vpx_sad16x16x3_ssse3`
			`PROCESS_16X16X3_OFFSET 11, .vpx_sad16x16x3_ssse3`
			`PROCESS_16X16X3_OFFSET 12, .vpx_sad16x16x3_ssse3`
			`PROCESS_16X16X3_OFFSET 13, .vpx_sad16x16x3_ssse3`
			`PROCESS_16X16X3_OFFSET 14, .vpx_sad16x16x3_ssse3`

			`.vpx_sad16x16x3_ssse3_aligned_by_15:`
Initial WebM release 2010-05-18 17:58:33 +02:00			`PROCESS_16X2X3 1`
			`PROCESS_16X2X3 0`
			`PROCESS_16X2X3 0`
			`PROCESS_16X2X3 0`
			`PROCESS_16X2X3 0`
			`PROCESS_16X2X3 0`
			`PROCESS_16X2X3 0`
			`PROCESS_16X2X3 0`

Move shared SAD code to vpx_dsp Create a new component, vpx_dsp, for code that can be shared between codecs. Move the SAD code into the component. This reduces the size of vpxenc/dec by 36k on x86_64 builds. Change-Id: I73f837ddaecac6b350bf757af0cfe19c4ab9327a 2015-04-17 22:11:38 +02:00			`.vpx_sad16x16x3_ssse3_store_off:`
Initial WebM release 2010-05-18 17:58:33 +02:00			`mov rdi, arg(4) ;Results`

			`movq xmm0, xmm5`
			`psrldq xmm5, 8`

			`paddw xmm0, xmm5`
			`movd [rdi], xmm0`
			`;-`
			`movq xmm0, xmm6`
			`psrldq xmm6, 8`

			`paddw xmm0, xmm6`
			`movd [rdi+4], xmm0`
			`;-`
			`movq xmm0, xmm7`
			`psrldq xmm7, 8`

			`paddw xmm0, xmm7`
			`movd [rdi+8], xmm0`

			`; begin epilog`
			`pop rcx`
			`pop rdi`
			`pop rsi`
Add save/restore xmm registers in x86 assembly code Went through the code and fixed it. Verified on Windows. Where possible, remove dependencies on xmm[67] Current code relies on pushing rbp to the stack to get 16 byte alignment. This broke when rbp wasn't pushed (vp8/encoder/x86/sad_sse3.asm). Work around this by using unaligned memory accesses. Revisit this and the offsets in vp8/encoder/x86/sad_sse3.asm in another change to SAVE_XMM. Change-Id: I5f940994d3ebfd977c3d68446cef20fd78b07877 2011-04-07 19:17:22 +02:00			`RESTORE_XMM`
Initial WebM release 2010-05-18 17:58:33 +02:00			`UNSHADOW_ARGS`
			`pop rbp`
			`ret`

Move shared SAD code to vpx_dsp Create a new component, vpx_dsp, for code that can be shared between codecs. Move the SAD code into the component. This reduces the size of vpxenc/dec by 36k on x86_64 builds. Change-Id: I73f837ddaecac6b350bf757af0cfe19c4ab9327a 2015-04-17 22:11:38 +02:00			`;void int vpx_sad16x8x3_ssse3(`
Initial WebM release 2010-05-18 17:58:33 +02:00			`; unsigned char *src_ptr,`
			`; int src_stride,`
			`; unsigned char *ref_ptr,`
			`; int ref_stride,`
			`; int *results)`
Move shared SAD code to vpx_dsp Create a new component, vpx_dsp, for code that can be shared between codecs. Move the SAD code into the component. This reduces the size of vpxenc/dec by 36k on x86_64 builds. Change-Id: I73f837ddaecac6b350bf757af0cfe19c4ab9327a 2015-04-17 22:11:38 +02:00			`global sym(vpx_sad16x8x3_ssse3) PRIVATE`
			`sym(vpx_sad16x8x3_ssse3):`
Initial WebM release 2010-05-18 17:58:33 +02:00			`push rbp`
			`mov rbp, rsp`
			`SHADOW_ARGS_TO_STACK 5`
modify SAVE_XMM for potential 64bit use the win64 abi requires saving and restoring xmm6:xmm15. currently SAVE_XMM and RESTORE XMM only allow for saving xmm6:xmm7. allow specifying the highest register used and if the stack is unaligned. Change-Id: Ica5699622ffe3346d3a486f48eef0206c51cf867 2011-04-15 16:05:20 +02:00			`SAVE_XMM 7`
Initial WebM release 2010-05-18 17:58:33 +02:00			`push rsi`
			`push rdi`
			`push rcx`
			`; end prolog`

			`mov rsi, arg(0) ;src_ptr`
			`mov rdi, arg(2) ;ref_ptr`

			`mov rdx, 0xf`
			`and rdx, rdi`

Move shared SAD code to vpx_dsp Create a new component, vpx_dsp, for code that can be shared between codecs. Move the SAD code into the component. This reduces the size of vpxenc/dec by 36k on x86_64 builds. Change-Id: I73f837ddaecac6b350bf757af0cfe19c4ab9327a 2015-04-17 22:11:38 +02:00			`jmp .vpx_sad16x8x3_ssse3_skiptable`
			`.vpx_sad16x8x3_ssse3_jumptable:`
			`dd .vpx_sad16x8x3_ssse3_aligned_by_0 - .vpx_sad16x8x3_ssse3_do_jump`
			`dd .vpx_sad16x8x3_ssse3_aligned_by_1 - .vpx_sad16x8x3_ssse3_do_jump`
			`dd .vpx_sad16x8x3_ssse3_aligned_by_2 - .vpx_sad16x8x3_ssse3_do_jump`
			`dd .vpx_sad16x8x3_ssse3_aligned_by_3 - .vpx_sad16x8x3_ssse3_do_jump`
			`dd .vpx_sad16x8x3_ssse3_aligned_by_4 - .vpx_sad16x8x3_ssse3_do_jump`
			`dd .vpx_sad16x8x3_ssse3_aligned_by_5 - .vpx_sad16x8x3_ssse3_do_jump`
			`dd .vpx_sad16x8x3_ssse3_aligned_by_6 - .vpx_sad16x8x3_ssse3_do_jump`
			`dd .vpx_sad16x8x3_ssse3_aligned_by_7 - .vpx_sad16x8x3_ssse3_do_jump`
			`dd .vpx_sad16x8x3_ssse3_aligned_by_8 - .vpx_sad16x8x3_ssse3_do_jump`
			`dd .vpx_sad16x8x3_ssse3_aligned_by_9 - .vpx_sad16x8x3_ssse3_do_jump`
			`dd .vpx_sad16x8x3_ssse3_aligned_by_10 - .vpx_sad16x8x3_ssse3_do_jump`
			`dd .vpx_sad16x8x3_ssse3_aligned_by_11 - .vpx_sad16x8x3_ssse3_do_jump`
			`dd .vpx_sad16x8x3_ssse3_aligned_by_12 - .vpx_sad16x8x3_ssse3_do_jump`
			`dd .vpx_sad16x8x3_ssse3_aligned_by_13 - .vpx_sad16x8x3_ssse3_do_jump`
			`dd .vpx_sad16x8x3_ssse3_aligned_by_14 - .vpx_sad16x8x3_ssse3_do_jump`
			`dd .vpx_sad16x8x3_ssse3_aligned_by_15 - .vpx_sad16x8x3_ssse3_do_jump`
			`.vpx_sad16x8x3_ssse3_skiptable:`

			`call .vpx_sad16x8x3_ssse3_do_jump`
			`.vpx_sad16x8x3_ssse3_do_jump:`
Initial WebM release 2010-05-18 17:58:33 +02:00			`pop rcx ; get the address of do_jump`
Move shared SAD code to vpx_dsp Create a new component, vpx_dsp, for code that can be shared between codecs. Move the SAD code into the component. This reduces the size of vpxenc/dec by 36k on x86_64 builds. Change-Id: I73f837ddaecac6b350bf757af0cfe19c4ab9327a 2015-04-17 22:11:38 +02:00			`mov rax, .vpx_sad16x8x3_ssse3_jumptable - .vpx_sad16x8x3_ssse3_do_jump`
			`add rax, rcx ; get the absolute address of vpx_sad16x8x3_ssse3_jumptable`
Initial WebM release 2010-05-18 17:58:33 +02:00
			`movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable`
			`add rcx, rax`

			`movsxd rax, dword ptr arg(1) ;src_stride`
			`movsxd rdx, dword ptr arg(3) ;ref_stride`

			`jmp rcx`

Move shared SAD code to vpx_dsp Create a new component, vpx_dsp, for code that can be shared between codecs. Move the SAD code into the component. This reduces the size of vpxenc/dec by 36k on x86_64 builds. Change-Id: I73f837ddaecac6b350bf757af0cfe19c4ab9327a 2015-04-17 22:11:38 +02:00			`PROCESS_16X8X3_OFFSET 0, .vpx_sad16x8x3_ssse3`
			`PROCESS_16X8X3_OFFSET 1, .vpx_sad16x8x3_ssse3`
			`PROCESS_16X8X3_OFFSET 2, .vpx_sad16x8x3_ssse3`
			`PROCESS_16X8X3_OFFSET 3, .vpx_sad16x8x3_ssse3`
			`PROCESS_16X8X3_OFFSET 4, .vpx_sad16x8x3_ssse3`
			`PROCESS_16X8X3_OFFSET 5, .vpx_sad16x8x3_ssse3`
			`PROCESS_16X8X3_OFFSET 6, .vpx_sad16x8x3_ssse3`
			`PROCESS_16X8X3_OFFSET 7, .vpx_sad16x8x3_ssse3`
			`PROCESS_16X8X3_OFFSET 8, .vpx_sad16x8x3_ssse3`
			`PROCESS_16X8X3_OFFSET 9, .vpx_sad16x8x3_ssse3`
			`PROCESS_16X8X3_OFFSET 10, .vpx_sad16x8x3_ssse3`
			`PROCESS_16X8X3_OFFSET 11, .vpx_sad16x8x3_ssse3`
			`PROCESS_16X8X3_OFFSET 12, .vpx_sad16x8x3_ssse3`
			`PROCESS_16X8X3_OFFSET 13, .vpx_sad16x8x3_ssse3`
			`PROCESS_16X8X3_OFFSET 14, .vpx_sad16x8x3_ssse3`

			`.vpx_sad16x8x3_ssse3_aligned_by_15:`
Initial WebM release 2010-05-18 17:58:33 +02:00
			`PROCESS_16X2X3 1`
			`PROCESS_16X2X3 0`
			`PROCESS_16X2X3 0`
			`PROCESS_16X2X3 0`

Move shared SAD code to vpx_dsp Create a new component, vpx_dsp, for code that can be shared between codecs. Move the SAD code into the component. This reduces the size of vpxenc/dec by 36k on x86_64 builds. Change-Id: I73f837ddaecac6b350bf757af0cfe19c4ab9327a 2015-04-17 22:11:38 +02:00			`.vpx_sad16x8x3_ssse3_store_off:`
Initial WebM release 2010-05-18 17:58:33 +02:00			`mov rdi, arg(4) ;Results`

			`movq xmm0, xmm5`
			`psrldq xmm5, 8`

			`paddw xmm0, xmm5`
			`movd [rdi], xmm0`
			`;-`
			`movq xmm0, xmm6`
			`psrldq xmm6, 8`

			`paddw xmm0, xmm6`
			`movd [rdi+4], xmm0`
			`;-`
			`movq xmm0, xmm7`
			`psrldq xmm7, 8`

			`paddw xmm0, xmm7`
			`movd [rdi+8], xmm0`

			`; begin epilog`
			`pop rcx`
			`pop rdi`
			`pop rsi`
Add save/restore xmm registers in x86 assembly code Went through the code and fixed it. Verified on Windows. Where possible, remove dependencies on xmm[67] Current code relies on pushing rbp to the stack to get 16 byte alignment. This broke when rbp wasn't pushed (vp8/encoder/x86/sad_sse3.asm). Work around this by using unaligned memory accesses. Revisit this and the offsets in vp8/encoder/x86/sad_sse3.asm in another change to SAVE_XMM. Change-Id: I5f940994d3ebfd977c3d68446cef20fd78b07877 2011-04-07 19:17:22 +02:00			`RESTORE_XMM`
Initial WebM release 2010-05-18 17:58:33 +02:00			`UNSHADOW_ARGS`
			`pop rbp`
			`ret`