vpx/vp8/decoder/x86/dequantize_mmx.asm
Ronald S. Bultje 9c9d6743d4 Sign-extend input argument so it can be used in pointer arithmetic.
Change-Id: I6cbd4de96f9dcc783cef170bfd7652f6cbee36a2
2012-06-25 14:16:39 -07:00

407 lines
14 KiB
NASM

;
; Copyright (c) 2012 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "third_party/x86inc/x86inc.asm"
; Read-only constants for the inverse-transform routines in this file.
; Each label is four identical 16-bit words, i.e. one 64-bit MMX operand.
SECTION_RODATA
align 16
; 0x8A8C = 35468 ~= sin(pi/8) * sqrt(2) * 65536.
; Read as a signed word this value is negative (35468 - 65536), so after a
; pmulhw by it the code adds the multiplicand back to get x * 35468 / 65536.
x_s1sqr2: times 4 dw 0x8A8C
align 16
; 0x4E7B = 20091 ~= (cos(pi/8) * sqrt(2) - 1) * 65536.
; Only the fractional part is stored ("less1"); the code adds the
; multiplicand back after pmulhw to restore the "+ 1".
x_c1sqr2less1: times 4 dw 0x4E7B
align 16
; Rounding bias added in the second transform pass before the arithmetic
; shift right by 5.
pw_16: times 4 dw 16
SECTION .text
; x86inc macro: selects the MMX register set for the m0..m7 / mova / movh
; names used by the functions that follow.
INIT_MMX
;-----------------------------------------------------------------------
; void dequantize_b_impl_mmx(short *sq, short *dq, short *q)
;
; Element-wise dequantization of one 16-coefficient block:
;     dq[i] = low 16 bits of (sq[i] * q[i]),  i = 0..15
;
;   sqq   - 16 source words (read only)
;   dqq   - 16 destination words (written)
;   arg3q - 16 multiplier words, the `q` argument (read only)
;
; The block is processed as four 8-byte groups, each one loaded, multiplied
; and stored before the next group is touched. Only m1 is used as scratch.
;-----------------------------------------------------------------------
cglobal dequantize_b_impl_mmx, 3,3,0,sq,dq,arg3
%assign deq_off 0
%rep 4
    mova   m1, [sqq+deq_off]          ; four source coefficients
    pmullw m1, [arg3q+deq_off]        ; keep the low word of each product
    mova   [dqq+deq_off], m1          ; store four dequantized coefficients
%assign deq_off deq_off+8
%endrep
%undef deq_off
    RET
;-----------------------------------------------------------------------
;void dequant_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride)
;
; Dequantizes a block of 16 words (input[i] * dq[i], low 16 bits kept),
; overwrites the 16 input words with zero, runs two butterfly passes with a
; 4x4 word transpose after each, adds the result to 4 rows of 4 prediction
; bytes and stores 4 rows of 4 unsigned-saturated bytes to dest.
;
;   inpq    - 16 words; read, then cleared
;   dqq     - 16 words of multipliers
;   predq   - prediction bytes; consecutive rows are pitq bytes apart
;   destq   - output bytes; consecutive rows are strideq bytes apart
;
; All of m0-m7 are used as scratch. predq and destq are modified.
; No emms is executed in this function.
; NOTE(review): presumably the caller clears the MMX state - confirm.
;-----------------------------------------------------------------------
; x86inc cglobal: 4 arguments made available in registers, 6 general-purpose
; registers used in total, 0 XMM registers; pit and stride are fetched below.
cglobal dequant_idct_add_mmx, 4,6,0,inp,dq,pred,dest,pit,stride
; pitch and stride are C ints. On x86-64 they are sign-extended to 64 bits so
; they can take part in pointer arithmetic.
%if ARCH_X86_64
movsxd strideq, dword stridem
movsxd pitq, dword pitm
%else
mov strideq, stridem
mov pitq, pitm
%endif
; --- dequantize: m0..m3 = input words 0-3, 4-7, 8-11, 12-15 times dq ---
mova m0, [inpq+ 0]
pmullw m0, [dqq]
mova m1, [inpq+ 8]
pmullw m1, [dqq+ 8]
mova m2, [inpq+16]
pmullw m2, [dqq+16]
mova m3, [inpq+24]
pmullw m3, [dqq+24]
; --- clear all 32 bytes of the input block ---
pxor m7, m7
mova [inpq], m7
mova [inpq+8], m7
mova [inpq+16], m7
mova [inpq+24], m7
; --- first butterfly pass; each of the 4 word lanes is processed independently ---
; The comments below call m0/m1/m2/m3 inputs 0/1/2/3.
psubw m0, m2 ; b1= 0-2
; m2 = 2*in2 + (in0 - in2) = in0 + in2, computed without an extra register
paddw m2, m2 ;
mova m5, m1
paddw m2, m0 ; a1 =0+2
; pmulhw by x_s1sqr2 followed by adding the operand back scales by 35468/65536
pmulhw m5, [x_s1sqr2];
paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2)
mova m7, m3 ;
; pmulhw by x_c1sqr2less1 followed by adding the operand back scales by 1 + 20091/65536
pmulhw m7, [x_c1sqr2less1];
paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2)
; m7 = scaled in3 (cos term) minus scaled in1 (sin term)
psubw m7, m5 ; c1
mova m5, m1
mova m4, m3
; m5 = in1 scaled by the cos term
pmulhw m5, [x_c1sqr2less1]
paddw m5, m1
; m3 = in3 scaled by the sin term (m4 holds the unscaled copy to add back)
pmulhw m3, [x_s1sqr2]
paddw m3, m4
paddw m3, m5 ; d1
mova m6, m2 ; a1
mova m4, m0 ; b1
; results: m2 = a1 + d1, m4 = b1 + c1, m0 = b1 - c1, m6 = a1 - d1
; NOTE(review): the ;1 / ;2 tags below do not match the lane annotations in
; the transpose that follows, which label m0 as row "1x" and m4 as row "2x".
paddw m2, m3 ;0
paddw m4, m7 ;1
psubw m0, m7 ;2
psubw m6, m3 ;3
; --- 4x4 word transpose of m2, m0, m4, m6 into m0, m1, m2, m5 ---
; Lane annotations list words from most significant to least significant.
mova m1, m2 ; 03 02 01 00
mova m3, m4 ; 23 22 21 20
punpcklwd m1, m0 ; 11 01 10 00
punpckhwd m2, m0 ; 13 03 12 02
punpcklwd m3, m6 ; 31 21 30 20
punpckhwd m4, m6 ; 33 23 32 22
mova m0, m1 ; 11 01 10 00
mova m5, m2 ; 13 03 12 02
punpckldq m0, m3 ; 30 20 10 00
punpckhdq m1, m3 ; 31 21 11 01
punpckldq m2, m4 ; 32 22 12 02
punpckhdq m5, m4 ; 33 23 13 03
; move the fourth transposed vector into m3 so the second pass can reuse the
; register layout of the first (inputs in m0, m1, m2, m3)
mova m3, m5 ; 33 23 13 03
; --- second butterfly pass: same arithmetic as the first ---
psubw m0, m2 ; b1= 0-2
paddw m2, m2 ;
mova m5, m1
paddw m2, m0 ; a1 =0+2
pmulhw m5, [x_s1sqr2];
paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2)
mova m7, m3 ;
pmulhw m7, [x_c1sqr2less1];
paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2)
psubw m7, m5 ; c1
mova m5, m1
mova m4, m3
pmulhw m5, [x_c1sqr2less1]
paddw m5, m1
pmulhw m3, [x_s1sqr2]
paddw m3, m4
paddw m3, m5 ; d1
; add the rounding bias of 16 to b1 and a1 once; all four results below are
; formed from one of these two, so each of them carries the bias
paddw m0, [pw_16]
paddw m2, [pw_16]
mova m6, m2 ; a1
mova m4, m0 ; b1
paddw m2, m3 ;0
paddw m4, m7 ;1
psubw m0, m7 ;2
psubw m6, m3 ;3
; arithmetic shift right by 5 completes the (x + 16) >> 5 rounding
psraw m2, 5
psraw m0, 5
psraw m4, 5
psraw m6, 5
; --- second 4x4 word transpose: m2, m0, m4, m6 -> m0, m1, m2, m5 ---
mova m1, m2 ; 03 02 01 00
mova m3, m4 ; 23 22 21 20
punpcklwd m1, m0 ; 11 01 10 00
punpckhwd m2, m0 ; 13 03 12 02
punpcklwd m3, m6 ; 31 21 30 20
punpckhwd m4, m6 ; 33 23 32 22
mova m0, m1 ; 11 01 10 00
mova m5, m2 ; 13 03 12 02
punpckldq m0, m3 ; 30 20 10 00
punpckhdq m1, m3 ; 31 21 11 01
punpckldq m2, m4 ; 32 22 12 02
punpckhdq m5, m4 ; 33 23 13 03
; --- add prediction and store: m0, m1, m2, m5 go to output rows 0, 1, 2, 3 ---
; m7 = 0 is the high half for the byte->word unpack and the unused half of the pack
pxor m7, m7
; row 0: load 4 prediction bytes, widen to words, add with signed saturation,
; pack to bytes with unsigned saturation, store 4 bytes
movh m4, [predq]
punpcklbw m4, m7
paddsw m0, m4
packuswb m0, m7
movh [destq], m0
; row 1
movh m4, [predq+pitq]
punpcklbw m4, m7
paddsw m1, m4
packuswb m1, m7
movh [destq+strideq], m1
; row 2
movh m4, [predq+2*pitq]
punpcklbw m4, m7
paddsw m2, m4
packuswb m2, m7
movh [destq+strideq*2], m2
; advance both pointers by one row so that base + 2*step addresses row 3
add destq, strideq
add predq, pitq
; row 3
movh m4, [predq+2*pitq]
punpcklbw m4, m7
paddsw m5, m4
packuswb m5, m7
movh [destq+strideq*2], m5
RET
;-----------------------------------------------------------------------
;void dequant_dc_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc)
;
; Same processing as a dequantize + two-pass transform + add-to-prediction,
; except that the lowest word of the first dequantized vector (the product
; input[0] * dq[0]) is replaced by the low 16 bits of the Dc argument before
; the transform.
;
;   inpq    - 16 words; read, then cleared
;   dqq     - 16 words of multipliers
;   predq   - prediction bytes; consecutive rows are pitq bytes apart
;   destq   - output bytes; consecutive rows are strideq bytes apart
;   Dcq     - only its low 16 bits are used
;
; All of m0-m7 are used as scratch. predq, destq and Dcq are modified.
; No emms is executed in this function.
; NOTE(review): presumably the caller clears the MMX state - confirm.
;-----------------------------------------------------------------------
; x86inc cglobal: 4 arguments made available in registers, 7 general-purpose
; registers used in total, 0 XMM registers; pit, stride and Dc are fetched below.
cglobal dequant_dc_idct_add_mmx, 4,7,0,inp,dq,pred,dest,pit,stride,Dc
; pitch and stride are C ints. On x86-64 they are sign-extended to 64 bits so
; they can take part in pointer arithmetic.
%if ARCH_X86_64
movsxd strideq, dword stridem
movsxd pitq, dword pitm
%else
mov strideq, stridem
mov pitq, pitm
%endif
; Dc needs no extension: it is masked to its low 16 bits before use
mov Dcq, Dcm
; --- dequantize: m0..m3 = input words 0-3, 4-7, 8-11, 12-15 times dq ---
mova m0, [inpq+ 0]
pmullw m0, [dqq+ 0]
mova m1, [inpq+ 8]
pmullw m1, [dqq+ 8]
mova m2, [inpq+16]
pmullw m2, [dqq+16]
mova m3, [inpq+24]
pmullw m3, [dqq+24]
; --- clear all 32 bytes of the input block ---
pxor m7, m7
mova [inpq+ 0], m7
mova [inpq+ 8], m7
mova [inpq+16], m7
mova [inpq+24], m7
; move lower word of Dc to lower word of m0
; shifting right then left by 16 bits zeroes the lowest word of m0 and keeps
; the other three words
psrlq m0, 16
psllq m0, 16
and Dcq, 0xFFFF ; If Dc < 0, we don't want the full dword precision.
; m7 = masked Dc in the lowest word, zero elsewhere; OR it into the cleared lane
movh m7, Dcq
por m0, m7
; --- first butterfly pass; each of the 4 word lanes is processed independently ---
; The comments below call m0/m1/m2/m3 inputs 0/1/2/3.
psubw m0, m2 ; b1= 0-2
; m2 = 2*in2 + (in0 - in2) = in0 + in2, computed without an extra register
paddw m2, m2 ;
mova m5, m1
paddw m2, m0 ; a1 =0+2
; pmulhw by x_s1sqr2 followed by adding the operand back scales by 35468/65536
pmulhw m5, [x_s1sqr2];
paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2)
mova m7, m3 ;
; pmulhw by x_c1sqr2less1 followed by adding the operand back scales by 1 + 20091/65536
pmulhw m7, [x_c1sqr2less1];
paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2)
; m7 = scaled in3 (cos term) minus scaled in1 (sin term)
psubw m7, m5 ; c1
mova m5, m1
mova m4, m3
; m5 = in1 scaled by the cos term
pmulhw m5, [x_c1sqr2less1]
paddw m5, m1
; m3 = in3 scaled by the sin term (m4 holds the unscaled copy to add back)
pmulhw m3, [x_s1sqr2]
paddw m3, m4
paddw m3, m5 ; d1
mova m6, m2 ; a1
mova m4, m0 ; b1
; results: m2 = a1 + d1, m4 = b1 + c1, m0 = b1 - c1, m6 = a1 - d1
; NOTE(review): the ;1 / ;2 tags below do not match the lane annotations in
; the transpose that follows, which label m0 as row "1x" and m4 as row "2x".
paddw m2, m3 ;0
paddw m4, m7 ;1
psubw m0, m7 ;2
psubw m6, m3 ;3
; --- 4x4 word transpose of m2, m0, m4, m6 into m0, m1, m2, m5 ---
; Lane annotations list words from most significant to least significant.
mova m1, m2 ; 03 02 01 00
mova m3, m4 ; 23 22 21 20
punpcklwd m1, m0 ; 11 01 10 00
punpckhwd m2, m0 ; 13 03 12 02
punpcklwd m3, m6 ; 31 21 30 20
punpckhwd m4, m6 ; 33 23 32 22
mova m0, m1 ; 11 01 10 00
mova m5, m2 ; 13 03 12 02
punpckldq m0, m3 ; 30 20 10 00
punpckhdq m1, m3 ; 31 21 11 01
punpckldq m2, m4 ; 32 22 12 02
punpckhdq m5, m4 ; 33 23 13 03
; move the fourth transposed vector into m3 so the second pass can reuse the
; register layout of the first (inputs in m0, m1, m2, m3)
mova m3, m5 ; 33 23 13 03
; --- second butterfly pass: same arithmetic as the first ---
psubw m0, m2 ; b1= 0-2
paddw m2, m2 ;
mova m5, m1
paddw m2, m0 ; a1 =0+2
pmulhw m5, [x_s1sqr2];
paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2)
mova m7, m3 ;
pmulhw m7, [x_c1sqr2less1];
paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2)
psubw m7, m5 ; c1
mova m5, m1
mova m4, m3
pmulhw m5, [x_c1sqr2less1]
paddw m5, m1
pmulhw m3, [x_s1sqr2]
paddw m3, m4
paddw m3, m5 ; d1
; add the rounding bias of 16 to b1 and a1 once; all four results below are
; formed from one of these two, so each of them carries the bias
paddw m0, [pw_16]
paddw m2, [pw_16]
mova m6, m2 ; a1
mova m4, m0 ; b1
paddw m2, m3 ;0
paddw m4, m7 ;1
psubw m0, m7 ;2
psubw m6, m3 ;3
; arithmetic shift right by 5 completes the (x + 16) >> 5 rounding
psraw m2, 5
psraw m0, 5
psraw m4, 5
psraw m6, 5
; --- second 4x4 word transpose: m2, m0, m4, m6 -> m0, m1, m2, m5 ---
mova m1, m2 ; 03 02 01 00
mova m3, m4 ; 23 22 21 20
punpcklwd m1, m0 ; 11 01 10 00
punpckhwd m2, m0 ; 13 03 12 02
punpcklwd m3, m6 ; 31 21 30 20
punpckhwd m4, m6 ; 33 23 32 22
mova m0, m1 ; 11 01 10 00
mova m5, m2 ; 13 03 12 02
punpckldq m0, m3 ; 30 20 10 00
punpckhdq m1, m3 ; 31 21 11 01
punpckldq m2, m4 ; 32 22 12 02
punpckhdq m5, m4 ; 33 23 13 03
; --- add prediction and store: m0, m1, m2, m5 go to output rows 0, 1, 2, 3 ---
; m7 = 0 is the high half for the byte->word unpack and the unused half of the pack
pxor m7, m7
; row 0: load 4 prediction bytes, widen to words, add with signed saturation,
; pack to bytes with unsigned saturation, store 4 bytes
movh m4, [predq]
punpcklbw m4, m7
paddsw m0, m4
packuswb m0, m7
movh [destq], m0
; row 1
movh m4, [predq+pitq]
punpcklbw m4, m7
paddsw m1, m4
packuswb m1, m7
movh [destq+strideq], m1
; row 2
movh m4, [predq+2*pitq]
punpcklbw m4, m7
paddsw m2, m4
packuswb m2, m7
movh [destq+strideq*2], m2
; advance both pointers by one row so that base + 2*step addresses row 3
add destq, strideq
add predq, pitq
; row 3
movh m4, [predq+2*pitq]
punpcklbw m4, m7
paddsw m5, m4
packuswb m5, m7
movh [destq+strideq*2], m5
RET