vpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm

;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;/************************************************************************************
; Notes: the filter_block1d*_v8 and filter_block1d*_h8 routines in this file apply an
; 8 tap filter vertically (v8) or horizontally (h8) to the input pixels and produce
; output_height rows. Each loop iteration calculates ONE output row, 8 pixels at a time
; (twice per row in the 16 pixel wide variants), to take advantage of the 128 bit
; operations. output_height does not have to be even, but it must be non-zero: every
; row loop tests its counter only after a row has been written.
;
; This is an implementation of some of the SSE optimizations first seen in ffvp8
;
;*************************************************************************************/

;-----------------------------------------------------------------------------
;void vp9_filter_block1d8_v8_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
;
; Vertical 8 tap filter over a strip that is 8 pixels wide.
;
; For every output row the routine reads 8 bytes from each of the eight source
; rows src_ptr + n*src_pitch (n = 0..7), so src_ptr must already point at the
; row that meets tap 0.  Each result is (sum + 64) >> 7, accumulated with
; signed 16 bit saturation and clamped to 0..255 on the final pack.
;
; filter : eight 16 bit taps, read with movdqa (16 byte aligned); the taps
;          are narrowed to signed bytes with saturation before use.
; output_height : must be non-zero (the counter is tested after the row).
;
; Registers inside the row loop:
;   rsi = source row 0      rax = source row 1 (rsi + pitch)
;   rdx = source pitch      rbx = 6 * source pitch
;   rdi = destination row   rcx = rows remaining
;   r8  = destination pitch (64 bit builds only)
;-----------------------------------------------------------------------------
global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE
sym(vp9_filter_block1d8_v8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; Five 16 byte aligned stack slots: four broadcast tap pairs plus the
    ; rounding constant.
    ALIGN_STACK 16, rax
    sub         rsp, 16*5
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd [rsp + 16*4]

    mov         rdx, arg(5)                 ; rdx = filter
    mov         rcx, 0x00400040             ; 64 in each 16 bit half

    movdqa      xmm4, [rdx]                 ; k0 .. k7 as words
    packsswb    xmm4, xmm4                  ; k0 .. k7 as signed bytes
    movd        xmm5, rcx

    ; Replicate each byte pair across the low quadword ...
    pshuflw     xmm3, xmm4, 11111111b       ; k6 k7
    pshuflw     xmm2, xmm4, 10101010b       ; k4 k5
    pshuflw     xmm1, xmm4, 01010101b       ; k2 k3
    pshuflw     xmm0, xmm4, 00000000b       ; k0 k1
    pshufd      xmm5, xmm5, 0               ; 64 in all eight words

    ; ... and then across the whole register.
    punpcklqdq  xmm3, xmm3
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm0, xmm0

    movdqa      k6k7, xmm3
    movdqa      k4k5, xmm2
    movdqa      k2k3, xmm1
    movdqa      k0k1, xmm0
    movdqa      krd, xmm5

    mov         rsi, arg(0)                 ; rsi = src_ptr (row 0)
    mov         rdi, arg(2)                 ; rdi = output_ptr
    movsxd      rdx, DWORD PTR arg(1)       ; rdx = src_pitch

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)        ; r8  = out_pitch
%endif
    movsxd      rcx, DWORD PTR arg(4)       ; rcx = output_height

    lea         rax, [rsi + rdx]            ; rax = row 1
    lea         rbx, [rdx + rdx*2]
    add         rbx, rbx                    ; rbx = pitch * 6

.v8x8_next_row:
    ; Fetch the eight source rows and interleave them pairwise so that every
    ; 16 bit lane holds the two pixels that meet one tap pair.
    movq        xmm6, [rsi + rbx]           ; row 6
    movq        xmm7, [rax + rbx]           ; row 7
    movq        xmm4, [rsi + rdx * 4]       ; row 4
    movq        xmm5, [rax + rdx * 4]       ; row 5
    movq        xmm2, [rsi + rdx * 2]       ; row 2
    movq        xmm3, [rax + rdx * 2]       ; row 3
    movq        xmm0, [rsi]                 ; row 0
    movq        xmm1, [rax]                 ; row 1

    punpcklbw   xmm6, xmm7                  ; rows 6|7
    punpcklbw   xmm4, xmm5                  ; rows 4|5
    punpcklbw   xmm2, xmm3                  ; rows 2|3
    punpcklbw   xmm0, xmm1                  ; rows 0|1

    pmaddubsw   xmm6, k6k7
    pmaddubsw   xmm4, k4k5
    pmaddubsw   xmm2, k2k3
    pmaddubsw   xmm0, k0k1

    ; Saturating sum: ((taps 0..3) + rounding) + (taps 4..7).
    paddsw      xmm4, xmm6
    paddsw      xmm0, xmm2
    paddsw      xmm0, krd
    paddsw      xmm0, xmm4

    psraw       xmm0, 7
    packuswb    xmm0, xmm0                  ; clamp to 0..255

    movq        [rdi], xmm0

    ; Step every row pointer down by one line.
    add         rsi,  rdx
    add         rax,  rdx
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ; out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .v8x8_next_row

    ; Release the scratch slots, then undo ALIGN_STACK.
    add rsp, 16*5
    pop rsp
    pop rbx
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
;void vp9_filter_block1d16_v8_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
;
; Vertical 8 tap filter over a strip that is 16 pixels wide, handled as two
; independent groups of 8 columns per output row.
;
; For every output row, 16 bytes are read from each of the eight source rows
; src_ptr + n*src_pitch (n = 0..7).  Each result is (sum + 64) >> 7,
; accumulated with signed 16 bit saturation and clamped to 0..255.
;
; filter : eight 16 bit taps, read with movdqa (16 byte aligned); the taps
;          are narrowed to signed bytes with saturation before use.
; output_height : must be non-zero (the counter is tested after the row).
;-----------------------------------------------------------------------------

; Filter eight adjacent columns that start %1 bytes into the current row and
; store them %1 bytes into the destination row.
; In:    rsi = source row 0, rax = source row 1, rdx = pitch, rbx = 6*pitch,
;        rdi = destination row, k0k1/k2k3/k4k5/k6k7/krd already initialised
; Clobb: xmm0-xmm7
%macro VP9_V8_SSSE3_COLS8 1
    movq        xmm6, [rsi + rbx + %1]      ; row 6
    movq        xmm7, [rax + rbx + %1]      ; row 7
    movq        xmm4, [rsi + rdx * 4 + %1]  ; row 4
    movq        xmm5, [rax + rdx * 4 + %1]  ; row 5
    movq        xmm2, [rsi + rdx * 2 + %1]  ; row 2
    movq        xmm3, [rax + rdx * 2 + %1]  ; row 3
    movq        xmm0, [rsi + %1]            ; row 0
    movq        xmm1, [rax + %1]            ; row 1

    punpcklbw   xmm6, xmm7                  ; rows 6|7
    punpcklbw   xmm4, xmm5                  ; rows 4|5
    punpcklbw   xmm2, xmm3                  ; rows 2|3
    punpcklbw   xmm0, xmm1                  ; rows 0|1

    pmaddubsw   xmm6, k6k7
    pmaddubsw   xmm4, k4k5
    pmaddubsw   xmm2, k2k3
    pmaddubsw   xmm0, k0k1

    ; Saturating sum: ((taps 0..3) + rounding) + (taps 4..7).
    paddsw      xmm4, xmm6
    paddsw      xmm0, xmm2
    paddsw      xmm0, krd
    paddsw      xmm0, xmm4

    psraw       xmm0, 7
    packuswb    xmm0, xmm0                  ; clamp to 0..255

    movq        [rdi + %1], xmm0
%endmacro

global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE
sym(vp9_filter_block1d16_v8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; Five 16 byte aligned stack slots: four broadcast tap pairs plus the
    ; rounding constant.
    ALIGN_STACK 16, rax
    sub         rsp, 16*5
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd [rsp + 16*4]

    mov         rdx, arg(5)                 ; rdx = filter
    mov         rcx, 0x00400040             ; 64 in each 16 bit half

    movdqa      xmm4, [rdx]                 ; k0 .. k7 as words
    packsswb    xmm4, xmm4                  ; k0 .. k7 as signed bytes
    movd        xmm5, rcx

    pshuflw     xmm3, xmm4, 11111111b       ; k6 k7
    pshuflw     xmm2, xmm4, 10101010b       ; k4 k5
    pshuflw     xmm1, xmm4, 01010101b       ; k2 k3
    pshuflw     xmm0, xmm4, 00000000b       ; k0 k1
    pshufd      xmm5, xmm5, 0               ; 64 in all eight words

    punpcklqdq  xmm3, xmm3
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm0, xmm0

    movdqa      k6k7, xmm3
    movdqa      k4k5, xmm2
    movdqa      k2k3, xmm1
    movdqa      k0k1, xmm0
    movdqa      krd, xmm5

    mov         rsi, arg(0)                 ; rsi = src_ptr (row 0)
    mov         rdi, arg(2)                 ; rdi = output_ptr
    movsxd      rdx, DWORD PTR arg(1)       ; rdx = src_pitch

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)        ; r8  = out_pitch
%endif
    movsxd      rcx, DWORD PTR arg(4)       ; rcx = output_height

    lea         rax, [rsi + rdx]            ; rax = row 1
    lea         rbx, [rdx + rdx*2]
    add         rbx, rbx                    ; rbx = pitch * 6

.v8x16_next_row:
    VP9_V8_SSSE3_COLS8 0                    ; columns 0..7
    VP9_V8_SSSE3_COLS8 8                    ; columns 8..15

    ; Step every row pointer down by one line.
    add         rsi,  rdx
    add         rax,  rdx
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ; out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .v8x16_next_row

    ; Release the scratch slots, then undo ALIGN_STACK.
    add rsp, 16*5
    pop rsp
    pop rbx
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
;void vp9_filter_block1d8_h8_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
;
; Horizontal 8 tap filter producing 8 output pixels per row.
;
; Output pixel i of a row is built from src_ptr[i-3 .. i+4]; the routine
; loads the 16 bytes src_ptr[-3 .. 12] of every row.  Each result is
; (sum + 64) >> 7, accumulated with signed 16 bit saturation and clamped to
; 0..255 on the final pack.
;
; filter : eight 16 bit taps, read with movdqa (16 byte aligned); the taps
;          are narrowed to signed bytes with saturation before use.
; output_height : must be non-zero (the counter is tested after the row).
;
; Registers inside the row loop:
;   rsi = source row        rax = source pitch
;   rdi = destination row   rdx = destination pitch
;   rcx = rows remaining    xmm5 = rounding constant (64 in every word)
;   rbx is handed to GET_GOT and must not be used as scratch here.
;-----------------------------------------------------------------------------
global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE
sym(vp9_filter_block1d8_h8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16*5
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd [rsp + 16*4]

    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040              ;rounding: 64 in each 16 bit half

    movdqa      xmm4, [rdx]                 ;load filters
    movd        xmm5, rcx
    packsswb    xmm4, xmm4                  ;taps as signed bytes
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7

    punpcklqdq  xmm0, xmm0
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    pshufd      xmm5, xmm5, 0               ;64 in all eight words
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    ; The krd slot is intentionally left unwritten: xmm5 is never clobbered
    ; in the loop below, so the rounding constant is used straight from it.

    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ;output_pitch
    movsxd      rcx, dword ptr arg(4)       ;output_height

.filter_block1d8_h8_rowloop_ssse3:
    movq        xmm0,   [rsi - 3]    ; -3 -2 -1  0  1  2  3  4
    movq        xmm3,   [rsi + 5]    ;  5  6  7  8  9 10 11 12
;note: if we create a k0_k7 filter, we can save a pshufb
    punpcklqdq  xmm0,   xmm3         ; xmm0 = src[-3 .. 12]

    ; Gather the byte pair for each tap pair and multiply-accumulate.
    movdqa      xmm1,   xmm0
    pshufb      xmm0,   [GLOBAL(shuf_t0t1)]
    pmaddubsw   xmm0,   k0k1

    movdqa      xmm2,   xmm1
    pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
    pmaddubsw   xmm1,   k2k3

    movdqa      xmm4,   xmm2
    pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
    pmaddubsw   xmm2,   k4k5

    pshufb      xmm4,   [GLOBAL(shuf_t6t7)]
    pmaddubsw   xmm4,   k6k7

    ; paddsw saturates, so the order of the additions matters.  Adding the
    ; two centre terms together first can clamp the running sum at 32767
    ; before a negative outer term is applied, which darkens pixels that
    ; should clip to 255.  Add the outer pairs first, then the smaller
    ; centre term, then the larger one, and the rounding constant last.
    paddsw      xmm0,   xmm4         ; k0k1 + k6k7
    movdqa      xmm3,   xmm1         ; xmm3 is free once the row is merged
    pmaxsw      xmm1,   xmm2         ; max(k2k3, k4k5)
    pminsw      xmm2,   xmm3         ; min(k2k3, k4k5)
    paddsw      xmm0,   xmm2
    paddsw      xmm0,   xmm1
    paddsw      xmm0,   xmm5         ; + 64
    psraw       xmm0,   7
    packuswb    xmm0,   xmm0         ; clamp to 0..255

    lea         rsi,    [rsi + rax]
    movq        [rdi],  xmm0

    lea         rdi,    [rdi + rdx]
    dec         rcx
    jnz         .filter_block1d8_h8_rowloop_ssse3

    ; Release the scratch slots, then undo ALIGN_STACK.
    add rsp, 16*5
    pop rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
;void vp9_filter_block1d16_h8_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
;
; Horizontal 8 tap filter producing 16 output pixels per row, computed as
; two groups of 8 and written with a single 16 byte store.
;
; Output pixel i of a row is built from src_ptr[i-3 .. i+4]; the routine
; loads src_ptr[-3 .. 20] of every row.  Each result is (sum + 64) >> 7,
; accumulated with signed 16 bit saturation and clamped to 0..255.
;
; filter     : eight 16 bit taps, read with movdqa (16 byte aligned); the
;              taps are narrowed to signed bytes with saturation before use.
; output_ptr : every destination row is written with movdqa, so output_ptr
;              and output_pitch must keep each row 16 byte aligned.
; output_height : must be non-zero (the counter is tested after the row).
;
; Registers inside the row loop:
;   rsi = source row        rax = source pitch
;   rdi = destination row   rdx = destination pitch
;   rcx = rows remaining
;   rbx is handed to GET_GOT and must not be used as scratch here.
;-----------------------------------------------------------------------------
global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE
sym(vp9_filter_block1d16_h8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16*5
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd [rsp + 16*4]

    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040              ;rounding: 64 in each 16 bit half

    movdqa      xmm4, [rdx]                 ;load filters
    movd        xmm5, rcx
    packsswb    xmm4, xmm4                  ;taps as signed bytes
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7

    punpcklqdq  xmm0, xmm0
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    pshufd      xmm5, xmm5, 0               ;64 in all eight words
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5

    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ;output_pitch
    movsxd      rcx, dword ptr arg(4)       ;output_height

.filter_block1d16_h8_rowloop_ssse3:
    ; ---- output pixels 0..7 -------------------------------------------
    movq        xmm0,   [rsi - 3]    ; -3 -2 -1  0  1  2  3  4
    movq        xmm3,   [rsi + 5]    ;  5  6  7  8  9 10 11 12
;note: if we create a k0_k7 filter, we can save a pshufb
    punpcklqdq  xmm0,   xmm3         ; xmm0 = src[-3 .. 12]

    movdqa      xmm1,   xmm0
    pshufb      xmm0,   [GLOBAL(shuf_t0t1)]
    pmaddubsw   xmm0,   k0k1

    movdqa      xmm2,   xmm1
    pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
    pmaddubsw   xmm1,   k2k3

    movdqa      xmm4,   xmm2
    pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
    pmaddubsw   xmm2,   k4k5

    pshufb      xmm4,   [GLOBAL(shuf_t6t7)]
    pmaddubsw   xmm4,   k6k7

    ; paddsw saturates, so the order of the additions matters.  Adding the
    ; two centre terms together first can clamp the running sum at 32767
    ; before a negative outer term is applied, which darkens pixels that
    ; should clip to 255.  Both halves of the row use the same order: outer
    ; pairs first, then the smaller centre term, then the larger one, and
    ; the rounding constant last.
    paddsw      xmm0,   xmm4         ; k0k1 + k6k7
    movdqa      xmm3,   xmm1         ; xmm3 is reloaded below, free here
    pmaxsw      xmm1,   xmm2         ; max(k2k3, k4k5)
    pminsw      xmm2,   xmm3         ; min(k2k3, k4k5)
    paddsw      xmm0,   xmm2
    paddsw      xmm0,   xmm1
    paddsw      xmm0,   krd          ; + 64
    psraw       xmm0,   7
    packuswb    xmm0,   xmm0         ; pixels 0..7 in the low quadword

    ; ---- output pixels 8..15 ------------------------------------------
    movq        xmm3,   [rsi +  5]   ;  5  6  7  8  9 10 11 12
    movq        xmm7,   [rsi + 13]   ; 13 14 15 16 17 18 19 20
;note: same as above
    punpcklqdq  xmm3,   xmm7         ; xmm3 = src[5 .. 20]

    movdqa      xmm1,   xmm3
    pshufb      xmm3,   [GLOBAL(shuf_t0t1)]
    pmaddubsw   xmm3,   k0k1

    movdqa      xmm2,   xmm1
    pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
    pmaddubsw   xmm1,   k2k3

    movdqa      xmm4,   xmm2
    pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
    pmaddubsw   xmm2,   k4k5

    pshufb      xmm4,   [GLOBAL(shuf_t6t7)]
    pmaddubsw   xmm4,   k6k7

    paddsw      xmm3,   xmm4         ; k0k1 + k6k7
    movdqa      xmm7,   xmm1         ; xmm7 is free once the row is merged
    pmaxsw      xmm1,   xmm2         ; max(k2k3, k4k5)
    pminsw      xmm2,   xmm7         ; min(k2k3, k4k5)
    paddsw      xmm3,   xmm2
    paddsw      xmm3,   xmm1
    paddsw      xmm3,   krd          ; + 64
    psraw       xmm3,   7
    packuswb    xmm3,   xmm3         ; pixels 8..15 in the low quadword
    punpcklqdq  xmm0,   xmm3         ; xmm0 = pixels 0..15

    lea         rsi,    [rsi + rax]
    movdqa      [rdi],  xmm0         ; aligned store, see header

    lea         rdi,    [rdi + rdx]
    dec         rcx
    jnz         .filter_block1d16_h8_rowloop_ssse3

    ; Release the scratch slots, then undo ALIGN_STACK.
    add rsp, 16*5
    pop rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
; pshufb control masks used by the horizontal (h8) filters.
;
; The h8 row loops hold 16 consecutive source bytes in one register.  For
; output pixel i (i = 0..7) mask shuf_tNtM selects source bytes i+N and i+M,
; i.e. the two neighbouring pixels that are multiplied by taps N and M in the
; following pmaddubsw.  Each table is 16 byte aligned because it is used
; directly as a memory operand of pshufb.

; Byte pairs (i, i+1): multiplied with taps k0, k1.
align 16
shuf_t0t1:
    db  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
; Byte pairs (i+2, i+3): multiplied with taps k2, k3.
align 16
shuf_t2t3:
    db  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
; Byte pairs (i+4, i+5): multiplied with taps k4, k5.
align 16
shuf_t4t5:
    db  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
; Byte pairs (i+6, i+7): multiplied with taps k6, k7.
align 16
shuf_t6t7:
    db  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
Faster 8t filtering Quickly modified the ssse3 sixtap filters to support eight taps. For the test clip used, a 23+% boost in decoder performance was seen. We can revisit later and improve further. Change-Id: I5f59860459e80d6fa23e6cc0fd91296a969f5240 2012-10-26 02:24:50 +02:00			`;`
			`; Copyright (c) 2010 The WebM project authors. All Rights Reserved.`
			`;`
			`; Use of this source code is governed by a BSD-style license`
			`; that can be found in the LICENSE file in the root of the source`
			`; tree. An additional intellectual property rights grant can be found`
			`; in the file PATENTS. All contributing project authors may`
			`; be found in the AUTHORS file in the root of the source tree.`
			`;`


			`%include "vpx_ports/x86_abi_support.asm"`

			`;/************************************************************************************`
			`; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The`
			`; input pixel array has output_height rows. This routine assumes that output_height is an`
			`; even number. This function handles 8 pixels in horizontal direction, calculating ONE`
			`; rows each iteration to take advantage of the 128 bits operations.`
			`;`
			`; This is an implementation of some of the SSE optimizations first seen in ffvp8`
			`;`
			`;*************************************************************************************/`

Change common vp8_ public symbol prefixes to vp9_. Change-Id: Ic5a5f60e1ff9d9ccae4174160d36529466eeb509 2012-10-31 00:25:53 +01:00			`;void vp9_filter_block1d8_v8_ssse3`
Faster 8t filtering Quickly modified the ssse3 sixtap filters to support eight taps. For the test clip used, a 23+% boost in decoder performance was seen. We can revisit later and improve further. Change-Id: I5f59860459e80d6fa23e6cc0fd91296a969f5240 2012-10-26 02:24:50 +02:00			`;(`
			`; unsigned char *src_ptr,`
			`; unsigned int src_pitch,`
			`; unsigned char *output_ptr,`
			`; unsigned int out_pitch,`
			`; unsigned int output_height,`
			`; short *filter`
			`;)`
add private to assembly files to insure proper chromebuild Change-Id: I6e43ca73f35401a974ed8ee27738d4318f09fd37 2012-12-20 18:40:18 +01:00			`global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE`
Change common vp8_ public symbol prefixes to vp9_. Change-Id: Ic5a5f60e1ff9d9ccae4174160d36529466eeb509 2012-10-31 00:25:53 +01:00			`sym(vp9_filter_block1d8_v8_ssse3):`
Faster 8t filtering Quickly modified the ssse3 sixtap filters to support eight taps. For the test clip used, a 23+% boost in decoder performance was seen. We can revisit later and improve further. Change-Id: I5f59860459e80d6fa23e6cc0fd91296a969f5240 2012-10-26 02:24:50 +02:00			`push rbp`
			`mov rbp, rsp`
			`SHADOW_ARGS_TO_STACK 6`
			`SAVE_XMM 7`
			`push rsi`
			`push rdi`
			`push rbx`
			`; end prolog`

			`ALIGN_STACK 16, rax`
			`sub rsp, 16*5`
			`%define k0k1 [rsp + 16*0]`
			`%define k2k3 [rsp + 16*1]`
			`%define k4k5 [rsp + 16*2]`
			`%define k6k7 [rsp + 16*3]`
			`%define krd [rsp + 16*4]`

			`mov rdx, arg(5) ;filter ptr`
			`mov rsi, arg(0) ;src_ptr`
			`mov rdi, arg(2) ;output_ptr`
			`mov rcx, 0x0400040`

			`movdqa xmm4, [rdx] ;load filters`
			`movd xmm5, rcx`
			`packsswb xmm4, xmm4`
			`pshuflw xmm0, xmm4, 0b ;k0_k1`
			`pshuflw xmm1, xmm4, 01010101b ;k2_k3`
			`pshuflw xmm2, xmm4, 10101010b ;k4_k5`
			`pshuflw xmm3, xmm4, 11111111b ;k6_k7`

			`punpcklqdq xmm0, xmm0`
			`punpcklqdq xmm1, xmm1`
			`punpcklqdq xmm2, xmm2`
			`punpcklqdq xmm3, xmm3`

			`movdqa k0k1, xmm0`
			`movdqa k2k3, xmm1`
			`pshufd xmm5, xmm5, 0`
			`movdqa k4k5, xmm2`
			`movdqa k6k7, xmm3`
			`movdqa krd, xmm5`

			`movsxd rdx, DWORD PTR arg(1) ;pixels_per_line`

			`%if ABI_IS_32BIT=0`
			`movsxd r8, DWORD PTR arg(3) ;out_pitch`
			`%endif`
			`mov rax, rsi`
			`movsxd rcx, DWORD PTR arg(4) ;output_height`
			`add rax, rdx`

			`lea rbx, [rdx + rdx*4]`
			`add rbx, rdx ;pitch * 6`

Change common vp8_ public symbol prefixes to vp9_. Change-Id: Ic5a5f60e1ff9d9ccae4174160d36529466eeb509 2012-10-31 00:25:53 +01:00			`.vp9_filter_block1d8_v8_ssse3_loop:`
Faster 8t filtering Quickly modified the ssse3 sixtap filters to support eight taps. For the test clip used, a 23+% boost in decoder performance was seen. We can revisit later and improve further. Change-Id: I5f59860459e80d6fa23e6cc0fd91296a969f5240 2012-10-26 02:24:50 +02:00			`movq xmm0, [rsi] ;A`
			`movq xmm1, [rsi + rdx] ;B`
			`movq xmm2, [rsi + rdx * 2] ;C`
			`movq xmm3, [rax + rdx * 2] ;D`
			`movq xmm4, [rsi + rdx * 4] ;E`
			`movq xmm5, [rax + rdx * 4] ;F`

			`punpcklbw xmm0, xmm1 ;A B`
			`punpcklbw xmm2, xmm3 ;C D`
			`punpcklbw xmm4, xmm5 ;E F`

			`movq xmm6, [rsi + rbx] ;G`
			`movq xmm7, [rax + rbx] ;H`

			`pmaddubsw xmm0, k0k1`
			`pmaddubsw xmm2, k2k3`
			`punpcklbw xmm6, xmm7 ;G H`
			`pmaddubsw xmm4, k4k5`
			`pmaddubsw xmm6, k6k7`

			`paddsw xmm0, xmm2`
			`paddsw xmm0, krd`
			`paddsw xmm4, xmm6`
			`paddsw xmm0, xmm4`

			`psraw xmm0, 7`
			`packuswb xmm0, xmm0`

			`add rsi, rdx`
			`add rax, rdx`

			`movq [rdi], xmm0`

			`%if ABI_IS_32BIT`
			`add rdi, DWORD PTR arg(3) ;out_pitch`
			`%else`
			`add rdi, r8`
			`%endif`
			`dec rcx`
Change common vp8_ public symbol prefixes to vp9_. Change-Id: Ic5a5f60e1ff9d9ccae4174160d36529466eeb509 2012-10-31 00:25:53 +01:00			`jnz .vp9_filter_block1d8_v8_ssse3_loop`
Faster 8t filtering Quickly modified the ssse3 sixtap filters to support eight taps. For the test clip used, a 23+% boost in decoder performance was seen. We can revisit later and improve further. Change-Id: I5f59860459e80d6fa23e6cc0fd91296a969f5240 2012-10-26 02:24:50 +02:00
			`add rsp, 16*5`
			`pop rsp`
			`pop rbx`
			`; begin epilog`
			`pop rdi`
			`pop rsi`
			`RESTORE_XMM`
			`UNSHADOW_ARGS`
			`pop rbp`
			`ret`

Change common vp8_ public symbol prefixes to vp9_. Change-Id: Ic5a5f60e1ff9d9ccae4174160d36529466eeb509 2012-10-31 00:25:53 +01:00			`;void vp9_filter_block1d16_v8_ssse3`
Faster 8t filtering Quickly modified the ssse3 sixtap filters to support eight taps. For the test clip used, a 23+% boost in decoder performance was seen. We can revisit later and improve further. Change-Id: I5f59860459e80d6fa23e6cc0fd91296a969f5240 2012-10-26 02:24:50 +02:00			`;(`
			`; unsigned char *src_ptr,`
			`; unsigned int src_pitch,`
			`; unsigned char *output_ptr,`
			`; unsigned int out_pitch,`
			`; unsigned int output_height,`
			`; short *filter`
			`;)`
add private to assembly files to insure proper chromebuild Change-Id: I6e43ca73f35401a974ed8ee27738d4318f09fd37 2012-12-20 18:40:18 +01:00			`global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE`
Change common vp8_ public symbol prefixes to vp9_. Change-Id: Ic5a5f60e1ff9d9ccae4174160d36529466eeb509 2012-10-31 00:25:53 +01:00			`sym(vp9_filter_block1d16_v8_ssse3):`
Faster 8t filtering Quickly modified the ssse3 sixtap filters to support eight taps. For the test clip used, a 23+% boost in decoder performance was seen. We can revisit later and improve further. Change-Id: I5f59860459e80d6fa23e6cc0fd91296a969f5240 2012-10-26 02:24:50 +02:00			`push rbp`
			`mov rbp, rsp`
			`SHADOW_ARGS_TO_STACK 6`
			`SAVE_XMM 7`
			`push rsi`
			`push rdi`
			`push rbx`
			`; end prolog`

			`ALIGN_STACK 16, rax`
			`sub rsp, 16*5`
			`%define k0k1 [rsp + 16*0]`
			`%define k2k3 [rsp + 16*1]`
			`%define k4k5 [rsp + 16*2]`
			`%define k6k7 [rsp + 16*3]`
			`%define krd [rsp + 16*4]`

			`mov rdx, arg(5) ;filter ptr`
			`mov rsi, arg(0) ;src_ptr`
			`mov rdi, arg(2) ;output_ptr`
			`mov rcx, 0x0400040`

			`movdqa xmm4, [rdx] ;load filters`
			`movd xmm5, rcx`
			`packsswb xmm4, xmm4`
			`pshuflw xmm0, xmm4, 0b ;k0_k1`
			`pshuflw xmm1, xmm4, 01010101b ;k2_k3`
			`pshuflw xmm2, xmm4, 10101010b ;k4_k5`
			`pshuflw xmm3, xmm4, 11111111b ;k6_k7`

			`punpcklqdq xmm0, xmm0`
			`punpcklqdq xmm1, xmm1`
			`punpcklqdq xmm2, xmm2`
			`punpcklqdq xmm3, xmm3`

			`movdqa k0k1, xmm0`
			`movdqa k2k3, xmm1`
			`pshufd xmm5, xmm5, 0`
			`movdqa k4k5, xmm2`
			`movdqa k6k7, xmm3`
			`movdqa krd, xmm5`

			`movsxd rdx, DWORD PTR arg(1) ;pixels_per_line`

			`%if ABI_IS_32BIT=0`
			`movsxd r8, DWORD PTR arg(3) ;out_pitch`
			`%endif`
			`mov rax, rsi`
			`movsxd rcx, DWORD PTR arg(4) ;output_height`
			`add rax, rdx`

			`lea rbx, [rdx + rdx*4]`
			`add rbx, rdx ;pitch * 6`

Change common vp8_ public symbol prefixes to vp9_. Change-Id: Ic5a5f60e1ff9d9ccae4174160d36529466eeb509 2012-10-31 00:25:53 +01:00			`.vp9_filter_block1d16_v8_ssse3_loop:`
Faster 8t filtering Quickly modified the ssse3 sixtap filters to support eight taps. For the test clip used, a 23+% boost in decoder performance was seen. We can revisit later and improve further. Change-Id: I5f59860459e80d6fa23e6cc0fd91296a969f5240 2012-10-26 02:24:50 +02:00			`movq xmm0, [rsi] ;A`
			`movq xmm1, [rsi + rdx] ;B`
			`movq xmm2, [rsi + rdx * 2] ;C`
			`movq xmm3, [rax + rdx * 2] ;D`
			`movq xmm4, [rsi + rdx * 4] ;E`
			`movq xmm5, [rax + rdx * 4] ;F`

			`punpcklbw xmm0, xmm1 ;A B`
			`punpcklbw xmm2, xmm3 ;C D`
			`punpcklbw xmm4, xmm5 ;E F`

			`movq xmm6, [rsi + rbx] ;G`
			`movq xmm7, [rax + rbx] ;H`

			`pmaddubsw xmm0, k0k1`
			`pmaddubsw xmm2, k2k3`
			`punpcklbw xmm6, xmm7 ;G H`
			`pmaddubsw xmm4, k4k5`
			`pmaddubsw xmm6, k6k7`

			`paddsw xmm0, xmm2`
			`paddsw xmm0, krd`
			`paddsw xmm4, xmm6`
			`paddsw xmm0, xmm4`

			`psraw xmm0, 7`
			`packuswb xmm0, xmm0`

			`movq [rdi], xmm0`

			`movq xmm0, [rsi + 8] ;A`
			`movq xmm1, [rsi + rdx + 8] ;B`
			`movq xmm2, [rsi + rdx * 2 + 8] ;C`
			`movq xmm3, [rax + rdx * 2 + 8] ;D`
			`movq xmm4, [rsi + rdx * 4 + 8] ;E`
			`movq xmm5, [rax + rdx * 4 + 8] ;F`

			`punpcklbw xmm0, xmm1 ;A B`
			`punpcklbw xmm2, xmm3 ;C D`
			`punpcklbw xmm4, xmm5 ;E F`


			`movq xmm6, [rsi + rbx + 8] ;G`
			`movq xmm7, [rax + rbx + 8] ;H`
			`punpcklbw xmm6, xmm7 ;G H`


			`pmaddubsw xmm0, k0k1`
			`pmaddubsw xmm2, k2k3`
			`pmaddubsw xmm4, k4k5`
			`pmaddubsw xmm6, k6k7`

			`paddsw xmm0, xmm2`
			`paddsw xmm4, xmm6`
			`paddsw xmm0, krd`
			`paddsw xmm0, xmm4`

			`psraw xmm0, 7`
			`packuswb xmm0, xmm0`

			`add rsi, rdx`
			`add rax, rdx`

			`movq [rdi+8], xmm0`

			`%if ABI_IS_32BIT`
			`add rdi, DWORD PTR arg(3) ;out_pitch`
			`%else`
			`add rdi, r8`
			`%endif`
			`dec rcx`
Change common vp8_ public symbol prefixes to vp9_. Change-Id: Ic5a5f60e1ff9d9ccae4174160d36529466eeb509 2012-10-31 00:25:53 +01:00			`jnz .vp9_filter_block1d16_v8_ssse3_loop`
Faster 8t filtering Quickly modified the ssse3 sixtap filters to support eight taps. For the test clip used, a 23+% boost in decoder performance was seen. We can revisit later and improve further. Change-Id: I5f59860459e80d6fa23e6cc0fd91296a969f5240 2012-10-26 02:24:50 +02:00
			`add rsp, 16*5`
			`pop rsp`
			`pop rbx`
			`; begin epilog`
			`pop rdi`
			`pop rsi`
			`RESTORE_XMM`
			`UNSHADOW_ARGS`
			`pop rbp`
			`ret`

Change common vp8_ public symbol prefixes to vp9_. Change-Id: Ic5a5f60e1ff9d9ccae4174160d36529466eeb509 2012-10-31 00:25:53 +01:00			`;void vp9_filter_block1d8_h8_ssse3`
Faster 8t filtering Quickly modified the ssse3 sixtap filters to support eight taps. For the test clip used, a 23+% boost in decoder performance was seen. We can revisit later and improve further. Change-Id: I5f59860459e80d6fa23e6cc0fd91296a969f5240 2012-10-26 02:24:50 +02:00			`;(`
			`; unsigned char *src_ptr,`
			`; unsigned int src_pixels_per_line,`
			`; unsigned char *output_ptr,`
			`; unsigned int output_pitch,`
			`; unsigned int output_height,`
			`; short *filter`
			`;)`
add private to assembly files to insure proper chromebuild Change-Id: I6e43ca73f35401a974ed8ee27738d4318f09fd37 2012-12-20 18:40:18 +01:00			`global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE`
Change common vp8_ public symbol prefixes to vp9_. Change-Id: Ic5a5f60e1ff9d9ccae4174160d36529466eeb509 2012-10-31 00:25:53 +01:00			`sym(vp9_filter_block1d8_h8_ssse3):`
Faster 8t filtering Quickly modified the ssse3 sixtap filters to support eight taps. For the test clip used, a 23+% boost in decoder performance was seen. We can revisit later and improve further. Change-Id: I5f59860459e80d6fa23e6cc0fd91296a969f5240 2012-10-26 02:24:50 +02:00			`push rbp`
			`mov rbp, rsp`
			`SHADOW_ARGS_TO_STACK 6`
			`SAVE_XMM 7`
			`GET_GOT rbx`
			`push rsi`
			`push rdi`
			`; end prolog`

			`ALIGN_STACK 16, rax`
			`sub rsp, 16*5`
			`%define k0k1 [rsp + 16*0]`
			`%define k2k3 [rsp + 16*1]`
			`%define k4k5 [rsp + 16*2]`
			`%define k6k7 [rsp + 16*3]`
			`%define krd [rsp + 16*4]`

			`mov rdx, arg(5) ;filter ptr`
			`mov rsi, arg(0) ;src_ptr`
			`mov rdi, arg(2) ;output_ptr`
			`mov rcx, 0x0400040`

			`movdqa xmm4, [rdx] ;load filters`
			`movd xmm5, rcx`
			`packsswb xmm4, xmm4`
			`pshuflw xmm0, xmm4, 0b ;k0_k1`
			`pshuflw xmm1, xmm4, 01010101b ;k2_k3`
			`pshuflw xmm2, xmm4, 10101010b ;k4_k5`
			`pshuflw xmm3, xmm4, 11111111b ;k6_k7`

			`punpcklqdq xmm0, xmm0`
			`punpcklqdq xmm1, xmm1`
			`punpcklqdq xmm2, xmm2`
			`punpcklqdq xmm3, xmm3`

			`movdqa k0k1, xmm0`
			`movdqa k2k3, xmm1`
			`pshufd xmm5, xmm5, 0`
			`movdqa k4k5, xmm2`
			`movdqa k6k7, xmm3`
			`; movdqa krd, xmm5`

			`movsxd rax, dword ptr arg(1) ;src_pixels_per_line`
			`movsxd rdx, dword ptr arg(3) ;output_pitch`
			`movsxd rcx, dword ptr arg(4) ;output_height`

			`.filter_block1d8_h8_rowloop_ssse3:`
			`movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4`

			`; movq xmm3, [rsi + 4] ; 4 5 6 7 8 9 10 11`
			`movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12`
			`;note: if we create a k0_k7 filter, we can save a pshufb`
			`; punpcklbw xmm0, xmm3 ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11`
			`punpcklqdq xmm0, xmm3`

			`movdqa xmm1, xmm0`
			`pshufb xmm0, [GLOBAL(shuf_t0t1)]`
			`pmaddubsw xmm0, k0k1`

			`movdqa xmm2, xmm1`
			`pshufb xmm1, [GLOBAL(shuf_t2t3)]`
			`pmaddubsw xmm1, k2k3`

			`movdqa xmm4, xmm2`
			`pshufb xmm2, [GLOBAL(shuf_t4t5)]`
			`pmaddubsw xmm2, k4k5`

			`pshufb xmm4, [GLOBAL(shuf_t6t7)]`
			`pmaddubsw xmm4, k6k7`

			`paddsw xmm0, xmm1`
			`paddsw xmm0, xmm2`
			`paddsw xmm0, xmm5`
			`paddsw xmm0, xmm4`
			`psraw xmm0, 7`
			`packuswb xmm0, xmm0`

			`lea rsi, [rsi + rax]`
			`movq [rdi], xmm0`

			`lea rdi, [rdi + rdx]`
			`dec rcx`
			`jnz .filter_block1d8_h8_rowloop_ssse3`

			`add rsp, 16*5`
			`pop rsp`

			`; begin epilog`
			`pop rdi`
			`pop rsi`
			`RESTORE_GOT`
			`RESTORE_XMM`
			`UNSHADOW_ARGS`
			`pop rbp`
			`ret`

Change common vp8_ public symbol prefixes to vp9_. Change-Id: Ic5a5f60e1ff9d9ccae4174160d36529466eeb509 2012-10-31 00:25:53 +01:00			`;void vp9_filter_block1d16_h8_ssse3`
Faster 8t filtering Quickly modified the ssse3 sixtap filters to support eight taps. For the test clip used, a 23+% boost in decoder performance was seen. We can revisit later and improve further. Change-Id: I5f59860459e80d6fa23e6cc0fd91296a969f5240 2012-10-26 02:24:50 +02:00			`;(`
			`; unsigned char *src_ptr,`
			`; unsigned int src_pixels_per_line,`
			`; unsigned char *output_ptr,`
			`; unsigned int output_pitch,`
			`; unsigned int output_height,`
			`; short *filter`
			`;)`
add private to assembly files to insure proper chromebuild Change-Id: I6e43ca73f35401a974ed8ee27738d4318f09fd37 2012-12-20 18:40:18 +01:00			`global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE`
Change common vp8_ public symbol prefixes to vp9_. Change-Id: Ic5a5f60e1ff9d9ccae4174160d36529466eeb509 2012-10-31 00:25:53 +01:00			`sym(vp9_filter_block1d16_h8_ssse3):`
Faster 8t filtering Quickly modified the ssse3 sixtap filters to support eight taps. For the test clip used, a 23+% boost in decoder performance was seen. We can revisit later and improve further. Change-Id: I5f59860459e80d6fa23e6cc0fd91296a969f5240 2012-10-26 02:24:50 +02:00			`push rbp`
			`mov rbp, rsp`
			`SHADOW_ARGS_TO_STACK 6`
			`SAVE_XMM 7`
			`GET_GOT rbx`
			`push rsi`
			`push rdi`
			`; end prolog`

			`ALIGN_STACK 16, rax`
			`sub rsp, 16*5`
			`%define k0k1 [rsp + 16*0]`
			`%define k2k3 [rsp + 16*1]`
			`%define k4k5 [rsp + 16*2]`
			`%define k6k7 [rsp + 16*3]`
			`%define krd [rsp + 16*4]`

			`mov rdx, arg(5) ;filter ptr`
			`mov rsi, arg(0) ;src_ptr`
			`mov rdi, arg(2) ;output_ptr`
			`mov rcx, 0x0400040`

			`movdqa xmm4, [rdx] ;load filters`
			`movd xmm5, rcx`
			`packsswb xmm4, xmm4`
			`pshuflw xmm0, xmm4, 0b ;k0_k1`
			`pshuflw xmm1, xmm4, 01010101b ;k2_k3`
			`pshuflw xmm2, xmm4, 10101010b ;k4_k5`
			`pshuflw xmm3, xmm4, 11111111b ;k6_k7`

			`punpcklqdq xmm0, xmm0`
			`punpcklqdq xmm1, xmm1`
			`punpcklqdq xmm2, xmm2`
			`punpcklqdq xmm3, xmm3`

			`movdqa k0k1, xmm0`
			`movdqa k2k3, xmm1`
			`pshufd xmm5, xmm5, 0`
			`movdqa k4k5, xmm2`
			`movdqa k6k7, xmm3`
			`movdqa krd, xmm5`

			`movsxd rax, dword ptr arg(1) ;src_pixels_per_line`
			`movsxd rdx, dword ptr arg(3) ;output_pitch`
			`movsxd rcx, dword ptr arg(4) ;output_height`

			`.filter_block1d16_h8_rowloop_ssse3:`
			`movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4`

			`; movq xmm3, [rsi + 4] ; 4 5 6 7 8 9 10 11`
			`movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12`
			`;note: if we create a k0_k7 filter, we can save a pshufb`
			`; punpcklbw xmm0, xmm3 ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11`
			`punpcklqdq xmm0, xmm3`

			`movdqa xmm1, xmm0`
			`pshufb xmm0, [GLOBAL(shuf_t0t1)]`
			`pmaddubsw xmm0, k0k1`

			`movdqa xmm2, xmm1`
			`pshufb xmm1, [GLOBAL(shuf_t2t3)]`
			`pmaddubsw xmm1, k2k3`

			`movdqa xmm4, xmm2`
			`pshufb xmm2, [GLOBAL(shuf_t4t5)]`
			`pmaddubsw xmm2, k4k5`

			`pshufb xmm4, [GLOBAL(shuf_t6t7)]`
			`pmaddubsw xmm4, k6k7`

			`paddsw xmm0, xmm1`
			`paddsw xmm0, xmm4`
			`paddsw xmm0, xmm2`
			`paddsw xmm0, krd`
			`psraw xmm0, 7`
			`packuswb xmm0, xmm0`


			`movq xmm3, [rsi + 5]`
			`; movq xmm7, [rsi + 12]`
			`movq xmm7, [rsi + 13]`
			`;note: same as above`
			`; punpcklbw xmm3, xmm7`
			`punpcklqdq xmm3, xmm7`

			`movdqa xmm1, xmm3`
			`pshufb xmm3, [GLOBAL(shuf_t0t1)]`
			`pmaddubsw xmm3, k0k1`

			`movdqa xmm2, xmm1`
			`pshufb xmm1, [GLOBAL(shuf_t2t3)]`
			`pmaddubsw xmm1, k2k3`

			`movdqa xmm4, xmm2`
			`pshufb xmm2, [GLOBAL(shuf_t4t5)]`
			`pmaddubsw xmm2, k4k5`

			`pshufb xmm4, [GLOBAL(shuf_t6t7)]`
			`pmaddubsw xmm4, k6k7`

			`paddsw xmm3, xmm1`
			`paddsw xmm3, xmm2`
			`paddsw xmm3, krd`
			`paddsw xmm3, xmm4`
			`psraw xmm3, 7`
			`packuswb xmm3, xmm3`
			`punpcklqdq xmm0, xmm3`

			`lea rsi, [rsi + rax]`
			`movdqa [rdi], xmm0`

			`lea rdi, [rdi + rdx]`
			`dec rcx`
			`jnz .filter_block1d16_h8_rowloop_ssse3`

			`add rsp, 16*5`
			`pop rsp`

			`; begin epilog`
			`pop rdi`
			`pop rsi`
			`RESTORE_GOT`
			`RESTORE_XMM`
			`UNSHADOW_ARGS`
			`pop rbp`
			`ret`


			`SECTION_RODATA`
			`align 16`
			`shuf_t0t1:`
			`db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8`
			`align 16`
			`shuf_t2t3:`
			`db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10`
			`align 16`
			`shuf_t4t5:`
			`db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12`
			`align 16`
			`shuf_t6t7:`
			`db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14`