vpx/vp9/encoder/x86/vp9_dct_mmx.asm
Peter de Rivaz 7e40a55ef9 Added high bitdepth sse2 transform functions
Also removes some spurious changes in common/vp9_blockd.h which
was introduced by a rebase issue between nextgen and master branches.

Change-Id: If359f0e9a71bca9c2ba685a87a355873536bb282
(cherry picked from commit 005d80cd05)
(cherry picked from commit 08d2f54800)
(cherry picked from commit 4230c2306c)
2014-12-02 11:16:24 -08:00

102 lines
3.1 KiB
NASM

;
; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "third_party/x86inc/x86inc.asm"
SECTION .text
%macro TRANSFORM_COLS 0
paddw m0, m1
movq m4, m0
psubw m3, m2
psubw m4, m3
psraw m4, 1
movq m5, m4
psubw m5, m1 ;b1
psubw m4, m2 ;c1
psubw m0, m4
paddw m3, m5
; m0 a0
SWAP 1, 4 ; m1 c1
SWAP 2, 3 ; m2 d1
SWAP 3, 5 ; m3 b1
%endmacro
%macro TRANSPOSE_4X4 0
movq m4, m0
movq m5, m2
punpcklwd m4, m1
punpckhwd m0, m1
punpcklwd m5, m3
punpckhwd m2, m3
movq m1, m4
movq m3, m0
punpckldq m1, m5
punpckhdq m4, m5
punpckldq m3, m2
punpckhdq m0, m2
SWAP 2, 3, 0, 1, 4
%endmacro
INIT_MMX mmx
cglobal fwht4x4, 3, 4, 8, input, output, stride
lea r3q, [inputq + strideq*4]
movq m0, [inputq] ;a1
movq m1, [inputq + strideq*2] ;b1
movq m2, [r3q] ;c1
movq m3, [r3q + strideq*2] ;d1
TRANSFORM_COLS
TRANSPOSE_4X4
TRANSFORM_COLS
TRANSPOSE_4X4
psllw m0, 2
psllw m1, 2
psllw m2, 2
psllw m3, 2
%if CONFIG_VP9_HIGHBITDEPTH
pxor m4, m4
pxor m5, m5
pcmpgtw m4, m0
pcmpgtw m5, m1
movq m6, m0
movq m7, m1
punpcklwd m0, m4
punpcklwd m1, m5
punpckhwd m6, m4
punpckhwd m7, m5
movq [outputq], m0
movq [outputq + 8], m6
movq [outputq + 16], m1
movq [outputq + 24], m7
pxor m4, m4
pxor m5, m5
pcmpgtw m4, m2
pcmpgtw m5, m3
movq m6, m2
movq m7, m3
punpcklwd m2, m4
punpcklwd m3, m5
punpckhwd m6, m4
punpckhwd m7, m5
movq [outputq + 32], m2
movq [outputq + 40], m6
movq [outputq + 48], m3
movq [outputq + 56], m7
%else
movq [outputq], m0
movq [outputq + 8], m1
movq [outputq + 16], m2
movq [outputq + 24], m3
%endif
RET