openh264/codec/decoder/core/asm/block_add.asm

;*!
;* \copy
;* Copyright (c) 2009-2013, Cisco Systems
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* * Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;*
;* * Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in
;* the documentation and/or other materials provided with the
;* distribution.
;*
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
;* block_add.asm
;*
;* Abstract
;* add residual block to prediction (block add, residual zero and zero-check helpers)
;*
;* History
;* 09/21/2009 Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"
BITS 32
;*******************************************************************************
; Macros and other preprocessor constants
;*******************************************************************************
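; BLOCK_ADD_16_SSE2 %1=dst, %2=src (uint8_t row), %3=residual (int16_t row), %4=stride
; Adds one 16-sample row of 16-bit residual to the 8-bit source row and stores the
; saturated 8-bit result at dst. xmm7 must be zero for the byte->word unpack.
; dst/src advance by %4 bytes, the residual pointer by %4*2 bytes (word elements).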
%macro BLOCK_ADD_16_SSE2 4
movdqa xmm0, [%2]
movdqa xmm1, [%3]
movdqa xmm2, [%3+10h]
movdqa xmm6, xmm0
punpcklbw xmm0, xmm7
punpckhbw xmm6, xmm7
paddw xmm0, xmm1
paddw xmm6, xmm2
packuswb xmm0, xmm6
movdqa [%1], xmm0
lea %2, [%2+%4]
lea %3, [%3+%4*2]
lea %1, [%1+%4]
%endmacro
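; BLOCK_ADD_8_MMXEXT %1=dst, %2=src (uint8_t), %3=residual (int16_t), %4=stride
; MMX variant of the row add for 8 samples; mm7 must be zero. Pointers advance
; as in BLOCK_ADD_16_SSE2.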
%macro BLOCK_ADD_8_MMXEXT 4
movq mm0, [%2]
movq mm1, [%3]
movq mm2, [%3+08h]
movq mm6, mm0
punpcklbw mm0, mm7
punpckhbw mm6, mm7
paddw mm0, mm1
paddw mm6, mm2
packuswb mm0, mm6
movq [%1], mm0
lea %2, [%2+%4]
lea %3, [%3+%4*2]
lea %1, [%1+%4]
%endmacro
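; BLOCK_ADD_16_STRIDE_SSE2 %1=dst, %2=src, %3=residual, %4=pixel stride, %5=residual stride
; Same 16-sample row add as BLOCK_ADD_16_SSE2, but the pixel rows and the residual
; use separate strides: %1/%2 advance by %4 bytes, %3 by %5*2 bytes (int16_t data).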
%macro BLOCK_ADD_16_STRIDE_SSE2 5
movdqa xmm0, [%2]
movdqa xmm1, [%3]
movdqa xmm2, [%3+10h]
movdqa xmm6, xmm0
punpcklbw xmm0, xmm7
punpckhbw xmm6, xmm7
paddw xmm0, xmm1
paddw xmm6, xmm2
packuswb xmm0, xmm6
movdqa [%1], xmm0
lea %2, [%2+%4]
lea %3, [%3+%5*2]
lea %1, [%1+%4]
%endmacro
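; BLOCK_ADD_8_STRIDE_MMXEXT %1=dst, %2=src, %3=residual, %4=pixel stride, %5=residual stride
; 8-sample MMX variant of the separate-stride row add; mm7 must be zero.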
%macro BLOCK_ADD_8_STRIDE_MMXEXT 5
movq mm0, [%2]
movq mm1, [%3]
movq mm2, [%3+08h]
movq mm6, mm0
punpcklbw mm0, mm7
punpckhbw mm6, mm7
paddw mm0, mm1
paddw mm6, mm2
packuswb mm0, mm6
movq [%1], mm0
lea %2, [%2+%4]
lea %3, [%3+%5*2]
lea %1, [%1+%4]
%endmacro
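; BLOCK_ADD_8_STRIDE_2_LINES_SSE2 %1=dst, %2=src, %3=residual, %4=pixel stride, %5=residual stride
; Adds two consecutive 8-sample rows of 16-bit residual to the 8-bit source rows
; (xmm7 must be zero), then advances dst/src by %4*2 and the residual by %5*4.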
%macro BLOCK_ADD_8_STRIDE_2_LINES_SSE2 5
movdqa xmm1, [%3]
movq xmm0, [%2]
punpcklbw xmm0, xmm7
paddw xmm0, xmm1
packuswb xmm0, xmm7
movq [%1], xmm0
movdqa xmm3, [%3+%5*2]
movq xmm2, [%2+%4]
punpcklbw xmm2, xmm7
paddw xmm2, xmm3
packuswb xmm2, xmm7
movq [%1+%4], xmm2
lea %1, [%1+%4*2]
lea %2, [%2+%4*2]
lea %3, [%3+%5*4]
%endmacro
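; CHECK_DATA_16_ZERO_SSE4 %1=residual ptr, %2=flag array base, %3=non-zero marker (reg)
; Tests whether the 16 int16_t coefficients at %1 are all zero. xmm7 is assumed to be
; zero, so PTEST sets CF only for an all-zero block and CMOVAE loads the marker when
; any coefficient is non-zero. The 0/marker byte is stored at [%2 + index], the index
; being read from [ecx] (presumably a sub-block scan table). Advances %1 by 32, ecx by 4.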
%macro CHECK_DATA_16_ZERO_SSE4 3
mov eax, 0h
movdqa xmm0, [%1]
movdqa xmm1, [%1+10h]
mov ebx, [ecx]
por xmm0, xmm1
ptest xmm7, xmm0
cmovae eax, %3
add %1, 20h
add ecx, 04h
mov byte [%2+ebx], al
%endmacro
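; CHECK_RS_4x4_BLOCK_2_ZERO_SSE4 %1=residual ptr, %2=flag ptr, %3=stride (bytes), %4=stride*3, %5=non-zero marker (reg)
; Checks two horizontally adjacent 4x4 int16_t blocks: four rows of 8 coefficients are
; split into their low/high quadwords (left/right 4x4), OR-combined, and PTESTed against
; the assumed-zero xmm7. A 0/marker byte is written for each block to [%2] and [%2+1].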
%macro CHECK_RS_4x4_BLOCK_2_ZERO_SSE4 5
movdqa xmm0, [%1]
movdqa xmm1, [%1+%3]
movdqa xmm2, [%1+%3*2]
movdqa xmm3, [%1+%4]
mov eax, 0h
mov ebx, 0h
movdqa xmm4, xmm0
movdqa xmm5, xmm2
punpcklqdq xmm0, xmm1
punpckhqdq xmm4, xmm1
punpcklqdq xmm2, xmm3
punpckhqdq xmm5, xmm3
por xmm0, xmm2
por xmm4, xmm5
ptest xmm7, xmm0
cmovae eax, %5
ptest xmm7, xmm4
cmovae ebx, %5
mov byte [%2], al
mov byte [%2+1], bl
%endmacro
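; DATA_COPY_16x2_SSE2 %1=src, %2=dst, %3=src stride (bytes)
; Copies two 32-byte rows (16 int16_t coefficients each) from the strided source
; into a packed destination, advancing src by %3*2 and dst by 64 bytes.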
%macro DATA_COPY_16x2_SSE2 3
movdqa xmm0, [%1]
movdqa xmm1, [%1+10h]
movdqa xmm2, [%1+%3]
movdqa xmm3, [%1+%3+10h]
movdqa [%2], xmm0
movdqa [%2+10h], xmm1
movdqa [%2+20h], xmm2
movdqa [%2+30h], xmm3
lea %1, [%1+%3*2]
lea %2, [%2+40h]
%endmacro
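; DATA_COPY_8x4_SSE2 %1=src, %2=dst, %3=src stride (bytes), %4=stride*3
; Copies four 16-byte rows (8 int16_t coefficients each) from the strided source
; into a packed destination, advancing src by %3*4 and dst by 64 bytes.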
%macro DATA_COPY_8x4_SSE2 4
movdqa xmm0, [%1]
movdqa xmm1, [%1+%3]
movdqa xmm2, [%1+%3*2]
movdqa xmm3, [%1+%4]
movdqa [%2], xmm0
movdqa [%2+10h], xmm1
movdqa [%2+20h], xmm2
movdqa [%2+30h], xmm3
lea %1, [%1+%3*4]
lea %2, [%2+40h]
%endmacro
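; CHECK_DATA_16_ZERO_SSE2 %1=residual ptr, %2=flag array base, %3=non-zero marker
; SSE2 version of CHECK_DATA_16_ZERO_SSE4: the 16 coefficients are compared word-wise
; against the assumed-zero xmm7 and packed into a PMOVMSKB mask; a mask below 0xffff
; means a non-zero coefficient, so the marker (held in ebp here) is stored at
; [%2 + index], with the index read from [ecx].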
%macro CHECK_DATA_16_ZERO_SSE2 3
mov eax, 0h
movdqa xmm0, [%1]
movdqa xmm1, [%1+10h]
mov ebx, [ecx]
pcmpeqw xmm0, xmm7
pcmpeqw xmm1, xmm7
packsswb xmm0, xmm1
pmovmskb edx, xmm0
sub edx, 0ffffh ; CF=1 iff the mask is below 0xffff, i.e. some coefficient is non-zero
cmovb eax, ebp ; in that case eax = non-zero marker (ebp), otherwise it stays 0
add ecx, 4
add %1, 20h
mov byte [%2+ebx], al
%endmacro
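; CHECK_RS_4x4_BLOCK_2_ZERO_SSE2 %1=residual ptr, %2=flag ptr, %3=stride (bytes), %4=stride*3, %5=non-zero marker (reg)
; SSE2 version of CHECK_RS_4x4_BLOCK_2_ZERO_SSE4: the two 4x4 halves are compared
; word-wise against the assumed-zero xmm7; a PMOVMSKB mask below 0xffff marks a
; non-zero block and CMOVB then selects the marker byte.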
%macro CHECK_RS_4x4_BLOCK_2_ZERO_SSE2 5
movdqa xmm0, [%1]
movdqa xmm1, [%1 + %3]
movdqa xmm2, [%1 + %3*2]
movdqa xmm3, [%1 + %4]
movdqa xmm4, xmm0
movdqa xmm5, xmm2
punpcklqdq xmm0, xmm1
punpckhqdq xmm4, xmm1
punpcklqdq xmm2, xmm3
punpckhqdq xmm5, xmm3
pcmpeqw xmm0, xmm7
pcmpeqw xmm2, xmm7
pcmpeqw xmm4, xmm7
pcmpeqw xmm5, xmm7
packsswb xmm0, xmm2
packsswb xmm4, xmm5
pmovmskb eax, xmm0
pmovmskb ebx, xmm4
sub eax, 0ffffh ; CF=1 iff the mask is below 0xffff, i.e. some coefficient is non-zero
mov eax, 0 ; MOV does not change CF, so the borrow from the SUB survives
cmovb eax, %5
sub ebx, 0ffffh
mov ebx, 0
cmovb ebx, %5
mov byte [%2], al
mov byte [%2+1], bl
%endmacro
;*******************************************************************************
; Data
;*******************************************************************************
%ifdef FORMAT_COFF
SECTION .rodata data
%else
SECTION .rodata align=16
%endif
ALIGN 16
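; SubMbScanIdx appears to map raster-order 4x4 sub-block indices into the per-macroblock
; scan order used when recording the non-zero flags (16 luma entries followed by the
; chroma blocks); interpretation inferred from its use through ecx in the zero-check
; macros above, it is not documented in this file.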
SubMbScanIdx:
dd 0x0, 0x1, 0x4, 0x5,
dd 0x2, 0x3, 0x6, 0x7,
dd 0x8, 0x9, 0xc, 0xd,
dd 0xa, 0xb, 0xe, 0xf,
dd 0x10, 0x11, 0x14, 0x15,
dd 0x12, 0x13, 0x16, 0x17,
;*******************************************************************************
; Code
;*******************************************************************************
SECTION .text
WELS_EXTERN WelsResBlockZero16x16_sse2
ALIGN 16
;*******************************************************************************
;* void_t WelsResBlockZero16x16_sse2(int16_t* pBlock, int32_t iStride)
;*******************************************************************************
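; Zeroes a 16x16 block of int16_t residual samples. iStride is given in int16_t
; elements, so it is doubled to a byte stride in ecx; eax holds stride*3 so that
; four 32-byte rows are cleared per group of stores.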
WelsResBlockZero16x16_sse2:
push esi
mov esi, [esp+08h]
mov ecx, [esp+0ch]
lea ecx, [ecx*2]
lea eax, [ecx*3]
pxor xmm7, xmm7
; four lines
movdqa [esi], xmm7
movdqa [esi+10h], xmm7
movdqa [esi+ecx], xmm7
movdqa [esi+ecx+10h], xmm7
movdqa [esi+ecx*2], xmm7
movdqa [esi+ecx*2+10h], xmm7
movdqa [esi+eax], xmm7
movdqa [esi+eax+10h], xmm7
; four lines
lea esi, [esi+ecx*4]
movdqa [esi], xmm7
movdqa [esi+10h], xmm7
movdqa [esi+ecx], xmm7
movdqa [esi+ecx+10h], xmm7
movdqa [esi+ecx*2], xmm7
movdqa [esi+ecx*2+10h], xmm7
movdqa [esi+eax], xmm7
movdqa [esi+eax+10h], xmm7
; four lines
lea esi, [esi+ecx*4]
movdqa [esi], xmm7
movdqa [esi+10h], xmm7
movdqa [esi+ecx], xmm7
movdqa [esi+ecx+10h], xmm7
movdqa [esi+ecx*2], xmm7
movdqa [esi+ecx*2+10h], xmm7
movdqa [esi+eax], xmm7
movdqa [esi+eax+10h], xmm7
; four lines
lea esi, [esi+ecx*4]
movdqa [esi], xmm7
movdqa [esi+10h], xmm7
movdqa [esi+ecx], xmm7
movdqa [esi+ecx+10h], xmm7
movdqa [esi+ecx*2], xmm7
movdqa [esi+ecx*2+10h], xmm7
movdqa [esi+eax], xmm7
movdqa [esi+eax+10h], xmm7
pop esi
ret
WELS_EXTERN WelsResBlockZero8x8_sse2
ALIGN 16
;*******************************************************************************
; void_t WelsResBlockZero8x8_sse2(int16_t * pBlock, int32_t iStride)
;*******************************************************************************
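; Zeroes an 8x8 block of int16_t residual samples (16 bytes per row); as above,
; iStride is doubled to a byte stride and eax holds stride*3.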
WelsResBlockZero8x8_sse2:
push esi
mov esi, [esp+08h]
mov ecx, [esp+0ch]
lea ecx, [ecx*2]
lea eax, [ecx*3]
pxor xmm7, xmm7
movdqa [esi], xmm7
movdqa [esi+ecx], xmm7
movdqa [esi+ecx*2], xmm7
movdqa [esi+eax], xmm7
lea esi, [esi+ecx*4]
movdqa [esi], xmm7
movdqa [esi+ecx], xmm7
movdqa [esi+ecx*2], xmm7
movdqa [esi+eax], xmm7
pop esi
ret