; File: openh264/codec/encoder/core/asm/quant.asm
;*!
;* \copy
;* Copyright (c) 2009-2013, Cisco Systems
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* * Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;*
;* * Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in
;* the documentation and/or other materials provided with the
;* distribution.
;*
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
;* quant.asm
;*
;* Abstract
;* sse2 quantize inter-block
;*
;* History
;* 7/6/2009 Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"
SECTION .text
;************************************************
;NEW_QUANT
;************************************************

; SSE2_Quant8 %1=work xmm, %2=sign xmm, %3=ff (rounding offsets), %4=mf (multipliers), %5=memory (8 x int16 coeffs)
; Quantizes 8 coefficients in place:
;   sign = coeff >> 15 (all-ones if negative), abs = (coeff ^ sign) - sign
;   q    = ((abs + ff) * mf) >> 16   (pmulhuw keeps the high 16 bits)
;   coeff = (q ^ sign) - sign        (restore the sign)
%macro SSE2_Quant8 5
MOVDQ %1, %5
pxor %2, %2
pcmpgtw %2, %1               ; %2 = 0xFFFF where coeff < 0
pxor %1, %2
psubw %1, %2                 ; %1 = |coeff|
paddusw %1, %3               ; add rounding offset (unsigned saturate)
pmulhuw %1, %4               ; (|coeff| + ff) * mf >> 16
pxor %1, %2
psubw %1, %2                 ; reapply original sign
MOVDQ %5, %1
%endmacro
; SSE2_QuantMax8 %1=work xmm, %2=sign xmm, %3=ff, %4=mf, %5=memory (8 x int16), %6=max accumulator
; Same quantization as SSE2_Quant8, additionally folding the quantized
; magnitudes (taken before sign restore) into %6 via pmaxsw.
%macro SSE2_QuantMax8 6
MOVDQ %1, %5
pxor %2, %2
pcmpgtw %2, %1               ; %2 = 0xFFFF where coeff < 0
pxor %1, %2
psubw %1, %2                 ; %1 = |coeff|
paddusw %1, %3               ; add rounding offset (unsigned saturate)
pmulhuw %1, %4               ; (|coeff| + ff) * mf >> 16
pmaxsw %6, %1                ; track max quantized magnitude
pxor %1, %2
psubw %1, %2                 ; reapply original sign
MOVDQ %5, %1
%endmacro
; NOTE(review): legacy x86_32 cdecl stack-offset names; the routines below
; fetch arguments via LOAD_*_PARA (r0..r4) instead, so these appear unused.
%define pDct esp + 4
%define ff esp + 8
%define mf esp + 12
%define max esp + 16
;***********************************************************************
; void WelsQuant4x4_sse2(int16_t *pDct, int16_t* ff, int16_t *mf);
; Quantizes one 4x4 block (16 coefficients) of pDct in place.
; r0 = pDct, r1 = ff (rounding offsets), r2 = mf (multipliers)
;***********************************************************************
WELS_EXTERN WelsQuant4x4_sse2
align 16
WelsQuant4x4_sse2:
%assign push_num 0
LOAD_3_PARA
movdqa xmm2, [r1]            ; ff: 8 rounding offsets
movdqa xmm3, [r2]            ; mf: 8 multipliers
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
ret
;***********************************************************************
;void WelsQuant4x4Dc_sse2(int16_t *pDct, const int16_t ff, int16_t mf);
; Quantizes one 4x4 block in place using a single scalar ff/mf pair
; (DC case), broadcast across all lanes.
; r0 = pDct, r1 = ff (scalar), r2 = mf (scalar)
;***********************************************************************
WELS_EXTERN WelsQuant4x4Dc_sse2
align 16
WelsQuant4x4Dc_sse2:
%assign push_num 0
LOAD_3_PARA
%ifndef X86_32
movsx r1, r1w                ; scalar args arrive as 16-bit; sign-extend
movsx r2, r2w
%endif
SSE2_Copy8Times xmm3, r2d    ; broadcast mf to 8 words
SSE2_Copy8Times xmm2, r1d    ; broadcast ff to 8 words
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
ret
;***********************************************************************
; void WelsQuantFour4x4_sse2(int16_t *pDct, int16_t* ff, int16_t *mf);
; Quantizes four consecutive 4x4 blocks (64 coefficients) in place.
; r0 = pDct, r1 = ff (rounding offsets), r2 = mf (multipliers)
;***********************************************************************
WELS_EXTERN WelsQuantFour4x4_sse2
align 16
WelsQuantFour4x4_sse2:
%assign push_num 0
LOAD_3_PARA
MOVDQ xmm2, [r1]             ; ff: 8 rounding offsets
MOVDQ xmm3, [r2]             ; mf: 8 multipliers
; 8 coefficients per macro invocation, 0x10 bytes apart
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x20]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x30]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x40]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x50]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x60]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x70]
ret
;***********************************************************************
; void WelsQuantFour4x4Max_sse2(int16_t *pDct, int32_t* f, int16_t *mf, int16_t *max);
; Quantizes four 4x4 blocks in place and stores the per-block maximum
; quantized magnitude (4 x int16) to max.
; r0 = pDct, r1 = ff, r2 = mf, r3 = max
;***********************************************************************
WELS_EXTERN WelsQuantFour4x4Max_sse2
align 16
WelsQuantFour4x4Max_sse2:
%assign push_num 0
LOAD_4_PARA
MOVDQ xmm2, [r1]             ; ff: 8 rounding offsets
MOVDQ xmm3, [r2]             ; mf: 8 multipliers
; xmm4..xmm7 accumulate per-4x4-block maxima (one register per block)
pxor xmm4, xmm4
pxor xmm5, xmm5
pxor xmm6, xmm6
pxor xmm7, xmm7
SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0	  ], xmm4
SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10], xmm4
SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x20], xmm5
SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x30], xmm5
SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x40], xmm6
SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x50], xmm6
SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x60], xmm7
SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x70], xmm7
; reduce the four 8-wide max vectors down to one int16 max per block
SSE2_TransTwo4x4W xmm4, xmm5, xmm6, xmm7, xmm0
pmaxsw xmm0, xmm4
pmaxsw xmm0, xmm5
pmaxsw xmm0, xmm7
movdqa xmm1, xmm0
punpckhqdq xmm0, xmm1
pmaxsw xmm0, xmm1
movq [r3], xmm0              ; store 4 x int16 maxima
LOAD_4_PARA_POP
ret
; MMX_Copy4Times %1=mm reg, %2=32-bit gp reg
; Broadcasts the low 16 bits of %2 into all four words of %1.
%macro MMX_Copy4Times 2
movd %1, %2
punpcklwd %1, %1
punpckldq %1, %1
%endmacro
; NOTE(review): redundant — .text is already the active section (declared at top of file).
SECTION .text
; MMX_Quant4 %1=coeffs (4 x int16, modified in register), %2=sign scratch, %3=ff, %4=mf
; Same abs / round / scale / sign-restore scheme as SSE2_Quant8, on an MMX register.
%macro MMX_Quant4 4
pxor %2, %2
pcmpgtw %2, %1               ; %2 = 0xFFFF where coeff < 0
pxor %1, %2
psubw %1, %2                 ; %1 = |coeff|
paddusw %1, %3               ; add rounding offset (unsigned saturate)
pmulhuw %1, %4               ; (|coeff| + ff) * mf >> 16
pxor %1, %2
psubw %1, %2                 ; reapply original sign
%endmacro
; NOTE(review): legacy x86_32 stack-offset names; unused by the LOAD_5_PARA path below.
%define dct2x2 esp + 16
%define iChromaDc esp + 20
;***********************************************************************
;int32_t WelsHadamardQuant2x2_mmx(int16_t *rs, const int16_t ff, int16_t mf, int16_t * pDct, int16_t * block);
; 2x2 Hadamard transform of the four chroma DC coefficients in rs,
; quantization of the result, store to both pDct and block, zeroing of
; the DC positions in rs. Returns the count of non-zero quantized values.
; r0 = rs, r1 = ff (scalar), r2 = mf (scalar), r3 = pDct, r4 = block
;***********************************************************************
WELS_EXTERN WelsHadamardQuant2x2_mmx
align 16
WelsHadamardQuant2x2_mmx:
%assign push_num 0
LOAD_5_PARA
%ifndef X86_32
movsx r1, r1w                ; scalar args arrive as 16-bit; sign-extend
movsx r2, r2w
%endif
; gather the four DC coefficients (one per 4x4 block, 0x20 bytes apart)
movd mm0, [r0]
movd mm1, [r0 + 0x20]
punpcklwd mm0, mm1
movd mm3, [r0 + 0x40]
movd mm1, [r0 + 0x60]
punpcklwd mm3, mm1
;hdm_2x2, mm0 = dct0 dct1, mm3 = dct2 dct3
movq mm5, mm3
paddw mm3, mm0
psubw mm0, mm5
punpcklwd mm3, mm0
movq mm1, mm3
psrlq mm1, 32
movq mm5, mm1
paddw mm1, mm3
psubw mm3, mm5
punpcklwd mm1, mm3
;quant_2x2_dc
MMX_Copy4Times mm3, r2d      ; broadcast mf
MMX_Copy4Times mm2, r1d      ; broadcast ff
MMX_Quant4 mm1, mm0, mm2, mm3
; store quantized dct_2x2 to both destinations
movq [r3], mm1
movq [r4], mm1
; pNonZeroCount of dct_2x2
pcmpeqb mm2, mm2 ; mm2 = FF
pxor mm3, mm3
packsswb mm1, mm3
pcmpeqb mm1, mm3 ; set FF if equal, 0 if not equal
psubsb mm1, mm2 ; set 0 if equal, 1 if not equal
psadbw mm1, mm3 ; horizontal byte-sum -> non-zero count in low word
; clear the DC positions in the residual block
mov r1w, 0
mov [r0], r1w
mov [r0 + 0x20], r1w
mov [r0 + 0x40], r1w
mov [r0 + 0x60], r1w
movd retrd, mm1
WELSEMMS
LOAD_5_PARA_POP
ret
2013-12-09 13:51:09 +01:00
;***********************************************************************
;int32_t WelsHadamardQuant2x2Skip_mmx(int16_t *pDct, int16_t ff, int16_t mf);
;***********************************************************************
WELS_EXTERN WelsHadamardQuant2x2Skip_mmx
align 16
WelsHadamardQuant2x2Skip_mmx:
2014-01-03 07:49:45 +01:00
%assign push_num 0
LOAD_3_PARA
%ifndef X86_32
movsx r1, r1w
movsx r2, r2w
%endif
;mov eax, [pDct]
movd mm0, [r0]
movd mm1, [r0 + 0x20]
2013-12-09 13:51:09 +01:00
punpcklwd mm0, mm1
2014-01-03 07:49:45 +01:00
movd mm3, [r0 + 0x40]
movd mm1, [r0 + 0x60]
2013-12-09 13:51:09 +01:00
punpcklwd mm3, mm1
2013-12-09 13:51:09 +01:00
;hdm_2x2, mm0 = dct0 dct1, mm3 = dct2 dct3
movq mm5, mm3
paddw mm3, mm0
psubw mm0, mm5
punpcklwd mm3, mm0
movq mm1, mm3
psrlq mm1, 32
movq mm5, mm1
paddw mm1, mm3
psubw mm3, mm5
punpcklwd mm1, mm3
2013-12-09 13:51:09 +01:00
;quant_2x2_dc
2014-01-03 07:49:45 +01:00
;mov ax, [mf]
MMX_Copy4Times mm3, r2d
;mov cx, [ff]
MMX_Copy4Times mm2, r1d
2013-12-09 13:51:09 +01:00
MMX_Quant4 mm1, mm0, mm2, mm3
2013-12-09 13:51:09 +01:00
; pNonZeroCount of dct_2x2
pcmpeqb mm2, mm2 ; mm2 = FF
2013-12-09 13:51:09 +01:00
pxor mm3, mm3
packsswb mm1, mm3
pcmpeqb mm1, mm3 ; set FF if equal, 0 if not equal
psubsb mm1, mm2 ; set 0 if equal, 1 if not equal
psadbw mm1, mm3 ;
2014-01-03 07:49:45 +01:00
movd retrd, mm1
WELSEMMS
ret
; SSE2_DeQuant8 %1=memory (8 x int16 coeffs), %2=scratch xmm, %3=mf (dequant multipliers)
; In-place dequantization: coeff *= mf, keeping the low 16 bits of each product.
%macro SSE2_DeQuant8 3
MOVDQ %2, %1
pmullw %2, %3
MOVDQ %1, %2
%endmacro
;***********************************************************************
; void WelsDequant4x4_sse2(int16_t *pDct, const uint16_t* mf);
; Dequantizes one 4x4 block (16 coefficients) of pDct in place.
; r0 = pDct, r1 = dequant_mf[qp]
;***********************************************************************
WELS_EXTERN WelsDequant4x4_sse2
align 16
WelsDequant4x4_sse2:
%assign push_num 0
LOAD_2_PARA
movdqa xmm1, [r1]            ; 8 dequant multipliers
SSE2_DeQuant8 [r0	 ], xmm0, xmm1
SSE2_DeQuant8 [r0 + 0x10], xmm0, xmm1
ret
;***********************************************************************
;void WelsDequantFour4x4_sse2(int16_t *pDct, const uint16_t* mf);
; Dequantizes four consecutive 4x4 blocks (64 coefficients) in place.
; r0 = pDct, r1 = dequant_mf[qp]
;***********************************************************************
align 16
WELS_EXTERN WelsDequantFour4x4_sse2
WelsDequantFour4x4_sse2:
%assign push_num 0
LOAD_2_PARA
movdqa xmm1, [r1]            ; 8 dequant multipliers
SSE2_DeQuant8 [r0	 ], xmm0, xmm1
SSE2_DeQuant8 [r0+0x10	 ], xmm0, xmm1
SSE2_DeQuant8 [r0+0x20	 ], xmm0, xmm1
SSE2_DeQuant8 [r0+0x30	 ], xmm0, xmm1
SSE2_DeQuant8 [r0+0x40	 ], xmm0, xmm1
SSE2_DeQuant8 [r0+0x50	 ], xmm0, xmm1
SSE2_DeQuant8 [r0+0x60	 ], xmm0, xmm1
SSE2_DeQuant8 [r0+0x70	 ], xmm0, xmm1
ret
;***********************************************************************
;void WelsDequantIHadamard4x4_sse2(int16_t *rs, const uint16_t mf);
; Scales all 16 luma DC coefficients by mf, then applies the 4x4 inverse
; Hadamard transform in place.
; r0 = rs, r1 = mf (scalar)
;***********************************************************************
WELS_EXTERN WelsDequantIHadamard4x4_sse2
align 16
WelsDequantIHadamard4x4_sse2:
%assign push_num 0
LOAD_2_PARA
%ifndef X86_32
movzx r1, r1w                ; mf arrives as 16-bit; zero-extend
%endif
; WelsDequantLumaDc4x4: multiply all coefficients by mf
SSE2_Copy8Times xmm1, r1d
;psrlw xmm1, 2 ; for the (>>2) in ihdm
MOVDQ xmm0, [r0]
MOVDQ xmm2, [r0+0x10]
pmullw xmm0, xmm1
pmullw xmm2, xmm1
; ihdm_4x4: butterfly rows, transpose, butterfly columns, transpose back
movdqa xmm1, xmm0
psrldq xmm1, 8
movdqa xmm3, xmm2
psrldq xmm3, 8
SSE2_SumSub xmm0, xmm3, xmm5 ; xmm0 = xmm0 - xmm3, xmm3 = xmm0 + xmm3
SSE2_SumSub xmm1, xmm2, xmm5 ; xmm1 = xmm1 - xmm2, xmm2 = xmm1 + xmm2
SSE2_SumSub xmm3, xmm2, xmm5 ; xmm3 = xmm3 - xmm2, xmm2 = xmm3 + xmm2
SSE2_SumSub xmm0, xmm1, xmm5 ; xmm0 = xmm0 - xmm1, xmm1 = xmm0 + xmm1
SSE2_TransTwo4x4W xmm2, xmm1, xmm3, xmm0, xmm4
SSE2_SumSub xmm2, xmm4, xmm5
SSE2_SumSub xmm1, xmm0, xmm5
SSE2_SumSub xmm4, xmm0, xmm5
SSE2_SumSub xmm2, xmm1, xmm5
SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
; write the two 16-byte halves back
punpcklqdq xmm0, xmm1
MOVDQ [r0], xmm0
punpcklqdq xmm2, xmm3
MOVDQ [r0+16], xmm2
ret