57f6bcc4b0
Previously the assembly sources had mixed indentation consisting of both spaces and tabs, making it quite hard to read unless the right tab size was used in the editor. Tabs have been interpreted as 4 spaces in most cases, matching the surrounding code.
371 lines
12 KiB
NASM
371 lines
12 KiB
NASM
;*!
|
|
;* \copy
|
|
;* Copyright (c) 2009-2013, Cisco Systems
|
|
;* All rights reserved.
|
|
;*
|
|
;* Redistribution and use in source and binary forms, with or without
|
|
;* modification, are permitted provided that the following conditions
|
|
;* are met:
|
|
;*
|
|
;* * Redistributions of source code must retain the above copyright
|
|
;* notice, this list of conditions and the following disclaimer.
|
|
;*
|
|
;* * Redistributions in binary form must reproduce the above copyright
|
|
;* notice, this list of conditions and the following disclaimer in
|
|
;* the documentation and/or other materials provided with the
|
|
;* distribution.
|
|
;*
|
|
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
|
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
|
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
|
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
|
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
;* POSSIBILITY OF SUCH DAMAGE.
|
|
;*
|
|
;*
|
|
;* quant.asm
|
|
;*
|
|
;* Abstract
|
|
;* sse2 quantize inter-block
|
|
;*
|
|
;* History
|
|
;* 7/6/2009 Created
|
|
;*
|
|
;*
|
|
;*************************************************************************/
|
|
|
|
%include "asm_inc.asm"
|
|
|
|
|
|
SECTION .text
|
|
;************************************************
|
|
;NEW_QUANT
|
|
;************************************************
|
|
|
|
%macro SSE2_Quant8 5
; In-place quantization of eight signed 16-bit coefficients:
;     coef = sign(coef) * (((|coef| + ff) * mf) >> 16)
; The addition saturates as unsigned (paddusw) and the multiply keeps the
; high 16 bits of the unsigned 16x16 product (pmulhuw).
;   %1  xmm work register; holds the value written back to %5
;   %2  xmm scratch; receives the per-word sign mask
;   %3  xmm with eight rounding offsets (ff); not modified
;   %4  xmm with eight multipliers (mf); not modified
;   %5  memory operand with the coefficients; read, then overwritten
; MOVDQ is not defined in this file (presumably provided by asm_inc.asm).
    pxor        %2, %2              ; %2 = 0
    MOVDQ       %1, %5              ; fetch eight coefficients
    pcmpgtw     %2, %1              ; %2 = 0xFFFF where coef < 0, else 0
    pxor        %1, %2              ; (x ^ mask) - mask  ==  |x|
    psubw       %1, %2
    paddusw     %1, %3              ; |x| + ff, unsigned saturation
    pmulhuw     %1, %4              ; high word of (|x| + ff) * mf
    pxor        %1, %2              ; same xor/sub pair puts the sign back
    psubw       %1, %2
    MOVDQ       %5, %1              ; write the quantized row back
%endmacro
|
|
|
|
%macro SSE2_QuantMax8 6
; Same in-place quantization as SSE2_Quant8, and additionally folds the
; quantized magnitudes (taken before the sign is restored) into a running
; per-word signed maximum.
;   %1  xmm work register; holds the value written back to %5
;   %2  xmm scratch; receives the per-word sign mask
;   %3  xmm with eight rounding offsets (ff); not modified
;   %4  xmm with eight multipliers (mf); not modified
;   %5  memory operand with the coefficients; read, then overwritten
;   %6  xmm accumulator: %6 = max(%6, quantized magnitude) per word
    pxor        %2, %2              ; %2 = 0
    MOVDQ       %1, %5              ; fetch eight coefficients
    pcmpgtw     %2, %1              ; %2 = 0xFFFF where coef < 0, else 0
    pxor        %1, %2              ; (x ^ mask) - mask  ==  |x|
    psubw       %1, %2
    paddusw     %1, %3              ; |x| + ff, unsigned saturation
    pmulhuw     %1, %4              ; high word of (|x| + ff) * mf
    pmaxsw      %6, %1              ; track the largest magnitude so far
    pxor        %1, %2              ; restore the original sign
    psubw       %1, %2
    MOVDQ       %5, %1              ; write the quantized row back
%endmacro
|
|
|
|
; Stack-relative slots named after the quantizer arguments.
; NOTE(review): no routine in the visible part of this file references these
; names (arguments are reached through r0..r4 instead); they look like
; leftovers -- confirm before relying on them or removing them.
%define pDct    esp + 4
%define ff      esp + 8
%define mf      esp + 12
%define max     esp + 16
|
|
;***********************************************************************
;   void WelsQuant4x4_sse2(int16_t *pDct, int16_t* ff, int16_t *mf);
;
;   Quantizes 16 coefficients (two rows of eight words) in place through
;   r0, using eight rounding offsets read from r1 and eight multipliers
;   read from r2. Both tables are fetched with movdqa and must therefore
;   be 16-byte aligned.
;   r0/r1/r2 are set up by LOAD_3_PARA (defined outside this file);
;   presumably they carry pDct/ff/mf in that order.
;***********************************************************************
WELS_EXTERN WelsQuant4x4_sse2
    %assign push_num 0
    LOAD_3_PARA
    movdqa      xmm3, [r2]                          ; multipliers
    movdqa      xmm2, [r1]                          ; rounding offsets

    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]        ; coefficients 0..7
    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10] ; coefficients 8..15

    ret
|
|
|
|
;***********************************************************************
;   void WelsQuant4x4Dc_sse2(int16_t *pDct, const int16_t ff, int16_t mf);
;
;   Quantizes 16 coefficients (two rows of eight words) in place through
;   r0 with one scalar offset and one scalar multiplier applied to every
;   coefficient.
;   SIGN_EXTENSIONW and SSE2_Copy8Times are defined outside this file;
;   presumably they widen the 16-bit argument and replicate it into all
;   eight word lanes of the destination register.
;***********************************************************************
WELS_EXTERN WelsQuant4x4Dc_sse2
    %assign push_num 0
    LOAD_3_PARA
    SIGN_EXTENSIONW     r1, r1w
    SIGN_EXTENSIONW     r2, r2w
    SSE2_Copy8Times     xmm3, r2d                   ; xmm3 <- mf in every lane
    SSE2_Copy8Times     xmm2, r1d                   ; xmm2 <- ff in every lane

    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]        ; coefficients 0..7
    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10] ; coefficients 8..15

    ret
|
|
|
|
;***********************************************************************
;   void WelsQuantFour4x4_sse2(int16_t *pDct, int16_t* ff, int16_t *mf);
;
;   Quantizes 64 coefficients (eight rows of eight words, 0x80 bytes) in
;   place through r0. The same eight offsets (from r1) and eight
;   multipliers (from r2) are applied to every row.
;***********************************************************************
WELS_EXTERN WelsQuantFour4x4_sse2
    %assign push_num 0
    LOAD_3_PARA
    MOVDQ       xmm2, [r1]                          ; rounding offsets
    MOVDQ       xmm3, [r2]                          ; multipliers

    ; Row 0 is written out explicitly; rows 1..7 are generated at assembly
    ; time and expand to the same sequence as eight hand-written calls.
    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
%assign quant_row_off 0x10
%rep 7
    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + quant_row_off]
%assign quant_row_off quant_row_off + 0x10
%endrep
%undef quant_row_off

    ret
|
|
|
|
;***********************************************************************
;   void WelsQuantFour4x4Max_sse2(int16_t *pDct, int32_t* f, int16_t *mf, int16_t *max);
;
;   Quantizes 64 coefficients (eight rows of eight words) in place through
;   r0 and stores four 16-bit maxima of the quantized magnitudes to r3.
;   Each pair of rows (0x20 bytes) feeds its own accumulator, so the four
;   results are presumably one maximum per 4x4 block -- the lane layout
;   after SSE2_TransTwo4x4W depends on that macro, defined elsewhere.
;   NOTE(review): the offsets at r1 are consumed as 16-bit words (paddusw),
;   so the int32_t* in the prototype above looks stale; confirm against the
;   C declaration.
;***********************************************************************
WELS_EXTERN WelsQuantFour4x4Max_sse2
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 8

    ; one magnitude accumulator per pair of rows
    pxor        xmm4, xmm4
    pxor        xmm5, xmm5
    pxor        xmm6, xmm6
    pxor        xmm7, xmm7

    MOVDQ       xmm2, [r1]                          ; rounding offsets
    MOVDQ       xmm3, [r2]                          ; multipliers

    SSE2_QuantMax8  xmm0, xmm1, xmm2, xmm3, [r0       ], xmm4
    SSE2_QuantMax8  xmm0, xmm1, xmm2, xmm3, [r0 + 0x10], xmm4
    SSE2_QuantMax8  xmm0, xmm1, xmm2, xmm3, [r0 + 0x20], xmm5
    SSE2_QuantMax8  xmm0, xmm1, xmm2, xmm3, [r0 + 0x30], xmm5
    SSE2_QuantMax8  xmm0, xmm1, xmm2, xmm3, [r0 + 0x40], xmm6
    SSE2_QuantMax8  xmm0, xmm1, xmm2, xmm3, [r0 + 0x50], xmm6
    SSE2_QuantMax8  xmm0, xmm1, xmm2, xmm3, [r0 + 0x60], xmm7
    SSE2_QuantMax8  xmm0, xmm1, xmm2, xmm3, [r0 + 0x70], xmm7

    ; Transpose the four accumulators, then reduce with pmaxsw. xmm6 is not
    ; folded in below; presumably the transpose leaves its results in
    ; xmm4, xmm5, xmm7 and xmm0 and uses xmm6 as scratch -- TODO confirm
    ; against the macro definition.
    SSE2_TransTwo4x4W   xmm4, xmm5, xmm6, xmm7, xmm0
    pmaxsw      xmm0, xmm4
    pmaxsw      xmm0, xmm5
    pmaxsw      xmm0, xmm7
    movdqa      xmm1, xmm0
    punpckhqdq  xmm0, xmm1                          ; low qword <- high qword
    pmaxsw      xmm0, xmm1                          ; max(high half, low half)

    movq        [r3], xmm0                          ; four 16-bit maxima
    POP_XMM
    LOAD_4_PARA_POP
    ret
|
|
|
|
%macro MMX_Copy4Times 2
; Replicates the low 16 bits of %2 into all four word lanes of %1.
;   %1  destination mm register
;   %2  32-bit source operand for movd (only its low word survives)
    movd        %1, %2              ; lanes: w0 w1 0 0
    punpcklwd   %1, %1              ; lanes: w0 w0 w1 w1
    punpckldq   %1, %1              ; lanes: w0 w0 w0 w0
%endmacro
|
|
|
|
SECTION .text
|
|
|
|
%macro MMX_Quant4 4
; Quantizes four signed 16-bit values held in a register:
;     x = sign(x) * (((|x| + ff) * mf) >> 16)
; with an unsigned-saturating add and an unsigned high-word multiply.
;   %1  mm register with the four values; replaced by the result
;   %2  mm scratch; receives the per-word sign mask
;   %3  mm with four rounding offsets (ff); not modified
;   %4  mm with four multipliers (mf); not modified
    pxor        %2, %2              ; %2 = 0
    pcmpgtw     %2, %1              ; %2 = 0xFFFF where x < 0, else 0
    pxor        %1, %2              ; (x ^ mask) - mask  ==  |x|
    psubw       %1, %2
    paddusw     %1, %3              ; |x| + ff, unsigned saturation
    pmulhuw     %1, %4              ; high word of (|x| + ff) * mf
    pxor        %1, %2              ; restore the original sign
    psubw       %1, %2
%endmacro
|
|
|
|
;***********************************************************************
;   int32_t WelsHadamardQuant2x2_mmx(int16_t *rs, const int16_t ff, int16_t mf, int16_t * pDct, int16_t * block);
;
;   Gathers the first word of each of four 0x20-byte-spaced groups at r0,
;   applies a 2x2 Hadamard transform, quantizes the four results with the
;   scalar ff/mf, writes them (8 bytes) to both r3 and r4, clears the four
;   source words, and returns how many quantized values are non-zero.
;***********************************************************************
WELS_EXTERN WelsHadamardQuant2x2_mmx
    %assign push_num 0
    LOAD_5_PARA
    SIGN_EXTENSIONW r1, r1w
    SIGN_EXTENSIONW r2, r2w

    ; gather: only the low word of each movd load is used below
    movd        mm0, [r0]
    movd        mm1, [r0 + 0x20]
    punpcklwd   mm0, mm1                ; mm0 low words = d0, d1
    movd        mm3, [r0 + 0x40]
    movd        mm1, [r0 + 0x60]
    punpcklwd   mm3, mm1                ; mm3 low words = d2, d3

    ; hdm_2x2, mm0 = dct0 dct1, mm3 = dct2 dct3
    movq        mm5, mm3
    paddw       mm3, mm0                ; d0+d2, d1+d3
    psubw       mm0, mm5                ; d0-d2, d1-d3
    punpcklwd   mm3, mm0                ; d0+d2, d0-d2, d1+d3, d1-d3
    movq        mm1, mm3
    psrlq       mm1, 32                 ; d1+d3, d1-d3, 0, 0
    movq        mm5, mm1
    paddw       mm1, mm3                ; sums of the two column pairs
    psubw       mm3, mm5                ; differences of the two column pairs
    punpcklwd   mm1, mm3                ; mm1 = the four transformed values

    ; quant_2x2_dc
    MMX_Copy4Times  mm3, r2d            ; mm3 = mf in every lane
    MMX_Copy4Times  mm2, r1d            ; mm2 = ff in every lane
    MMX_Quant4      mm1, mm0, mm2, mm3

    ; store dct_2x2
    movq        [r3], mm1
    movq        [r4], mm1

    ; pNonZeroCount of dct_2x2
    pcmpeqb     mm2, mm2                ; mm2 = FF
    pxor        mm3, mm3
    packsswb    mm1, mm3                ; four values -> low four bytes
    pcmpeqb     mm1, mm3                ; set FF if equal, 0 if not equal
    psubsb      mm1, mm2                ; set 0 if equal, 1 if not equal
    psadbw      mm1, mm3                ; byte sum = number of non-zero values

    ; Clear the four source words. These stores go through r0 and are kept
    ; ahead of the write to retrd below, in case retrd aliases r0 on some
    ; target (both names are defined outside this file).
    mov         r1w, 0
    mov         [r0], r1w
    mov         [r0 + 0x20], r1w
    mov         [r0 + 0x40], r1w
    mov         [r0 + 0x60], r1w

    movd        retrd, mm1              ; return the non-zero count

    WELSEMMS
    LOAD_5_PARA_POP
    ret
|
|
|
|
;***********************************************************************
;   int32_t WelsHadamardQuant2x2Skip_mmx(int16_t *pDct, int16_t ff, int16_t mf);
;
;   Gathers the first word of each of four 0x20-byte-spaced groups at r0,
;   applies a 2x2 Hadamard transform, quantizes the four results with the
;   scalar ff/mf, and returns how many of them are non-zero. Nothing is
;   written back to memory.
;***********************************************************************
WELS_EXTERN WelsHadamardQuant2x2Skip_mmx
    %assign push_num 0
    LOAD_3_PARA
    SIGN_EXTENSIONW r1, r1w
    SIGN_EXTENSIONW r2, r2w

    ; gather: only the low word of each movd load is used below
    movd        mm0, [r0]
    movd        mm1, [r0 + 0x20]
    punpcklwd   mm0, mm1                ; mm0 low words = d0, d1
    movd        mm3, [r0 + 0x40]
    movd        mm1, [r0 + 0x60]
    punpcklwd   mm3, mm1                ; mm3 low words = d2, d3

    ; hdm_2x2, mm0 = dct0 dct1, mm3 = dct2 dct3
    movq        mm5, mm3
    paddw       mm3, mm0                ; d0+d2, d1+d3
    psubw       mm0, mm5                ; d0-d2, d1-d3
    punpcklwd   mm3, mm0                ; d0+d2, d0-d2, d1+d3, d1-d3
    movq        mm1, mm3
    psrlq       mm1, 32                 ; d1+d3, d1-d3, 0, 0
    movq        mm5, mm1
    paddw       mm1, mm3                ; sums of the two column pairs
    psubw       mm3, mm5                ; differences of the two column pairs
    punpcklwd   mm1, mm3                ; mm1 = the four transformed values

    ; quant_2x2_dc
    MMX_Copy4Times  mm3, r2d            ; mm3 = mf in every lane
    MMX_Copy4Times  mm2, r1d            ; mm2 = ff in every lane
    MMX_Quant4      mm1, mm0, mm2, mm3

    ; pNonZeroCount of dct_2x2
    pcmpeqb     mm2, mm2                ; mm2 = FF
    pxor        mm3, mm3
    packsswb    mm1, mm3                ; four values -> low four bytes
    pcmpeqb     mm1, mm3                ; set FF if equal, 0 if not equal
    psubsb      mm1, mm2                ; set 0 if equal, 1 if not equal
    psadbw      mm1, mm3                ; byte sum = number of non-zero values
    movd        retrd, mm1              ; return the non-zero count

    WELSEMMS
    ret
|
|
|
|
|
|
%macro SSE2_DeQuant8 3
; In-place dequantization of eight 16-bit coefficients: each word at %1 is
; multiplied by the matching word of %3 and the low 16 bits are kept.
;   %1  memory operand with the coefficients; read, then overwritten
;   %2  xmm scratch register
;   %3  xmm with eight multipliers; not modified
    MOVDQ       %2, %1              ; fetch eight coefficients
    pmullw      %2, %3              ; low word of coef * multiplier
    MOVDQ       %1, %2              ; write the scaled row back
%endmacro
|
|
|
|
|
|
;***********************************************************************
;   void WelsDequant4x4_sse2(int16_t *pDct, const uint16_t* mf);
;
;   Multiplies 16 coefficients (two rows of eight words) in place through
;   r0 by the eight multipliers read from r1. The table is fetched with
;   movdqa and must therefore be 16-byte aligned.
;***********************************************************************
WELS_EXTERN WelsDequant4x4_sse2
    %assign push_num 0
    LOAD_2_PARA

    movdqa          xmm1, [r1]                  ; eight multipliers
    SSE2_DeQuant8   [r0       ], xmm0, xmm1     ; coefficients 0..7
    SSE2_DeQuant8   [r0 + 0x10], xmm0, xmm1     ; coefficients 8..15

    ret
|
|
|
|
;***********************************************************************
;   void WelsDequantFour4x4_sse2(int16_t *pDct, const uint16_t* mf);
;
;   Multiplies 64 coefficients (eight rows of eight words, 0x80 bytes) in
;   place through r0 by the eight multipliers read from r1; the same
;   multipliers are applied to every row. The table is fetched with movdqa
;   and must therefore be 16-byte aligned.
;***********************************************************************

WELS_EXTERN WelsDequantFour4x4_sse2
    %assign push_num 0
    LOAD_2_PARA

    movdqa          xmm1, [r1]                  ; eight multipliers

    ; Row 0 is written out explicitly; rows 1..7 are generated at assembly
    ; time and expand to the same sequence as eight hand-written calls.
    SSE2_DeQuant8   [r0], xmm0, xmm1
%assign dequant_row_off 0x10
%rep 7
    SSE2_DeQuant8   [r0 + dequant_row_off], xmm0, xmm1
%assign dequant_row_off dequant_row_off + 0x10
%endrep
%undef dequant_row_off

    ret
|
|
|
|
;***********************************************************************
;   void WelsDequantIHadamard4x4_sse2(int16_t *rs, const uint16_t mf);
;
;   Scales the 16 words at r0 by the scalar mf (low 16 bits of each
;   product), then runs two butterfly passes with a transpose after each,
;   and writes the 16 results back to r0.
;   SSE2_Copy8Times, SSE2_SumSub and SSE2_TransTwo4x4W are defined outside
;   this file; the register roles noted below follow the usage here and
;   should be checked against those definitions.
;***********************************************************************
WELS_EXTERN WelsDequantIHadamard4x4_sse2
    %assign push_num 0
    LOAD_2_PARA
%ifndef X86_32
    movzx       r1, r1w                 ; mf is unsigned: zero-extend the word
%endif

    ; WelsDequantLumaDc4x4
    SSE2_Copy8Times xmm1, r1d           ; presumably mf in every word lane
    ;psrlw      xmm1, 2                 ; for the (>>2) in ihdm
    MOVDQ       xmm0, [r0]              ; words 0..7
    MOVDQ       xmm2, [r0+0x10]         ; words 8..15
    pmullw      xmm0, xmm1
    pmullw      xmm2, xmm1

    ; ihdm_4x4
    ; split each 8-word register so every 4-word row sits in a low qword
    movdqa      xmm1, xmm0
    psrldq      xmm1, 8                 ; xmm1 low qword = words 4..7
    movdqa      xmm3, xmm2
    psrldq      xmm3, 8                 ; xmm3 low qword = words 12..15

    ; first butterfly pass (xmm5 is the macro's temporary)
    SSE2_SumSub xmm0, xmm3, xmm5        ; xmm0 = xmm0 - xmm3, xmm3 = xmm0 + xmm3
    SSE2_SumSub xmm1, xmm2, xmm5        ; xmm1 = xmm1 - xmm2, xmm2 = xmm1 + xmm2
    SSE2_SumSub xmm3, xmm2, xmm5        ; xmm3 = xmm3 - xmm2, xmm2 = xmm3 + xmm2
    SSE2_SumSub xmm0, xmm1, xmm5        ; xmm0 = xmm0 - xmm1, xmm1 = xmm0 + xmm1

    ; transpose, second butterfly pass, transpose back
    SSE2_TransTwo4x4W   xmm2, xmm1, xmm3, xmm0, xmm4
    SSE2_SumSub xmm2, xmm4, xmm5
    SSE2_SumSub xmm1, xmm0, xmm5
    SSE2_SumSub xmm4, xmm0, xmm5
    SSE2_SumSub xmm2, xmm1, xmm5
    SSE2_TransTwo4x4W   xmm0, xmm1, xmm4, xmm2, xmm3

    ; pack pairs of 4-word rows into full registers and store
    punpcklqdq  xmm0, xmm1
    MOVDQ       [r0], xmm0

    punpcklqdq  xmm2, xmm3
    MOVDQ       [r0+16], xmm2
    ret
|