vpx/vp9/encoder/x86/vp9_dct_sse2.asm
Linfeng Zhang af7fb17c09 Upgrade fwht4x4_mmx() to fwht4x4_sse2() for vp9 and vp10.
Function level timing test shows about 27% time saving on
a Xeon E5-2680 v2 desktop.

Rename vp9_dct_sse2.c to vp9_dct_intrin_sse2.c for vp9 and
rename dct_sse2.c to dct_intrin_sse2.c for vp10 to avoid
duplicate basenames.

Actually vp9_fwht4x4_mmx/sse2() and vp10_fwht4x4_mmx/sse2()
are identical. TODO: They should be unified later if there is
no intention to keep a duplicate.

Change-Id: I3e537b7bbd9ba417c606cd7c68c4dbbfa583f77d
2016-05-27 09:51:16 -07:00

88 lines
2.7 KiB
NASM

;
; Copyright (c) 2016 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%define private_prefix vp9
%include "third_party/x86inc/x86inc.asm"
SECTION .text
%macro TRANSFORM_COLS 0
paddw m0, m1
movq m4, m0
psubw m3, m2
psubw m4, m3
psraw m4, 1
movq m5, m4
psubw m5, m1 ;b1
psubw m4, m2 ;c1
psubw m0, m4
paddw m3, m5
; m0 a0
SWAP 1, 4 ; m1 c1
SWAP 2, 3 ; m2 d1
SWAP 3, 5 ; m3 b1
%endmacro
%macro TRANSPOSE_4X4 0
; 00 01 02 03
; 10 11 12 13
; 20 21 22 23
; 30 31 32 33
punpcklwd m0, m1 ; 00 10 01 11 02 12 03 13
punpcklwd m2, m3 ; 20 30 21 31 22 32 23 33
mova m1, m0
punpckldq m0, m2 ; 00 10 20 30 01 11 21 31
punpckhdq m1, m2 ; 02 12 22 32 03 13 23 33
%endmacro
INIT_XMM sse2
cglobal fwht4x4, 3, 4, 8, input, output, stride
; TODO(linfeng): The duplication with vp10 should be resolved.
lea r3q, [inputq + strideq*4]
movq m0, [inputq] ;a1
movq m1, [inputq + strideq*2] ;b1
movq m2, [r3q] ;c1
movq m3, [r3q + strideq*2] ;d1
TRANSFORM_COLS
TRANSPOSE_4X4
SWAP 1, 2
psrldq m1, m0, 8
psrldq m3, m2, 8
TRANSFORM_COLS
TRANSPOSE_4X4
psllw m0, 2
psllw m1, 2
%if CONFIG_VP9_HIGHBITDEPTH
; sign extension
mova m2, m0
mova m3, m1
punpcklwd m0, m0
punpcklwd m1, m1
punpckhwd m2, m2
punpckhwd m3, m3
psrad m0, 16
psrad m1, 16
psrad m2, 16
psrad m3, 16
mova [outputq], m0
mova [outputq + 16], m2
mova [outputq + 32], m1
mova [outputq + 48], m3
%else
mova [outputq], m0
mova [outputq + 16], m1
%endif
RET