; openh264/codec/common/vaa.asm

;*!
;* \copy
;*     Copyright (c)  2010-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*	vaa.asm
;*
;*	Abstract
;*      sse2 for pVaa routines
;*
;*  History
;*      04/14/2010	Created
;*		06/07/2010	Added AnalysisVaaInfoIntra_sse2(ssse3)
;*		06/10/2010	Tune rc_sad_frame_sse2 and got about 40% improvement
;*		08/11/2010	Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
;*
;*************************************************************************/
%include "asm_inc.asm"


;***********************************************************************
; Macros and other preprocessor constants
;***********************************************************************

; SUM_WORD_8x2_SSE2 dst, tmp
;   Horizontal sum of the eight 16-bit words held in %1.
;   %1 : source and destination (overwritten)
;   %2 : scratch register (clobbered)
;   On exit the full total sits in word 0 of %1 (words 1..3 carry the same
;   value); words 4..7 only hold partial sums and must not be used.
;   Author's note kept from the original: this shuffle/add chain measured
;   faster than the equivalent phaddw (SSSE3) sequence.
%macro SUM_WORD_8x2_SSE2 2              ; dst(pSrc), tmp
        pshufd    %2, %1, 01001110b     ; tmp = dst with its two qwords swapped
        paddw     %1, %2                ; word i += word (i+4)
        pshuflw   %2, %1, 01001110b     ; tmp = dst with low dwords swapped
        paddw     %1, %2                ; low words: word i += word (i+2)
        pshuflw   %2, %1, 10110001b     ; tmp = dst with adjacent low words swapped
        paddw     %1, %2                ; word 0 = sum of all eight input words
%endmacro                               ; SUM_WORD_8x2_SSE2


%macro VAA_AVG_BLOCK_SSE2 6             ; dst, t0, t1, t2, t3, t4
        ; Averages the four side-by-side 4x4 pixel blocks of a 16x4 strip.
        ;   r0        : address of the strip's first row (movdqa => 16-byte aligned rows)
        ;   r1/r2/r3  : byte offsets of rows 1/2/3 from r0 (callers in this
        ;               file pass 1x, 2x and 3x the line size)
        ;   xmm7      : must be all zeros (used to widen bytes to words)
        ;   %1        : result; words 0..3 = (pixel sum of block 0..3) >> 4,
        ;               words 4..7 repeat the same four values
        ;   %2..%6    : scratch, clobbered

        ; --- rows 0 and 1: widen to words and add column-wise ---
        movdqa    %1, [r0]              ; row 0
        movdqa    %2, [r0 + r1]         ; row 1
        movdqa    %3, %1
        punpckhbw %3, xmm7              ; row 0, columns 8..15 as words
        punpcklbw %1, xmm7              ; row 0, columns 0..7  as words
        movdqa    %4, %2
        punpckhbw %2, xmm7              ; row 1, columns 8..15
        punpcklbw %4, xmm7              ; row 1, columns 0..7
        paddw     %1, %4                ; columns 0..7 : row0 + row1
        paddw     %2, %3                ; columns 8..15: row0 + row1

        ; --- rows 2 and 3: same treatment ---
        movdqa    %3, [r0 + r2]         ; row 2
        movdqa    %4, [r0 + r3]         ; row 3
        movdqa    %5, %3
        punpcklbw %3, xmm7
        punpckhbw %5, xmm7
        movdqa    %6, %4
        punpcklbw %6, xmm7
        punpckhbw %4, xmm7
        paddw     %3, %6                ; columns 0..7 : row2 + row3
        paddw     %4, %5                ; columns 8..15: row2 + row3

        ; --- per-column totals over the four rows ---
        paddw     %1, %3                ; columns 0..7  -> blocks 0 and 1
        paddw     %2, %4                ; columns 8..15 -> blocks 2 and 3

        ; --- fold the four column totals of each block into one word ---
        pshufd    %3, %1, 0xB1          ; swap adjacent dwords
        pshufd    %4, %2, 0xB1
        paddw     %1, %3                ; word i += word (i xor 2)
        paddw     %2, %4
        movdqa    %3, %1
        movdqa    %4, %2
        pshuflw   %5, %1, 0xB1          ; swap adjacent words, low half
        pshufhw   %6, %3, 0xB1          ; swap adjacent words, high half
        paddw     %1, %5                ; low words  = block 0 total
        paddw     %3, %6                ; high words = block 1 total
        pshuflw   %5, %2, 0xB1
        pshufhw   %6, %4, 0xB1
        paddw     %2, %5                ; low words  = block 2 total
        paddw     %4, %6                ; high words = block 3 total

        ; --- gather the four totals as b0,b1,b2,b3 and divide by 16 ---
        punpcklwd %1, %2                ; b0,b2,b0,b2,...
        punpckhwd %3, %4                ; b1,b3,b1,b3,...
        punpcklwd %1, %3                ; b0,b1,b2,b3,b0,b1,b2,b3
        psraw     %1, 4                 ; 16 pixels per block -> average
%endmacro

%macro VAA_AVG_BLOCK_SSSE3 6            ; dst, t0, t1, t2, t3, t4
        ; SSSE3 flavour of the 16x4 strip averager: same inputs as the SSE2
        ; macro, but the per-block reduction uses phaddw.
        ;   r0        : address of the strip's first row (movdqa => 16-byte aligned rows)
        ;   r1/r2/r3  : byte offsets of rows 1/2/3 from r0
        ;   xmm7      : must be all zeros (byte widening and phaddw padding)
        ;   %1        : result; words 0..3 = (pixel sum of block 0..3) >> 4,
        ;               words 4..7 = 0
        ;   %2..%6    : scratch, clobbered

        ; --- rows 0 and 1: widen to words and add column-wise ---
        movdqa    %1, [r0]              ; row 0
        movdqa    %2, [r0 + r1]         ; row 1
        movdqa    %3, %1
        punpckhbw %3, xmm7              ; row 0, columns 8..15 as words
        punpcklbw %1, xmm7              ; row 0, columns 0..7  as words
        movdqa    %4, %2
        punpckhbw %2, xmm7              ; row 1, columns 8..15
        punpcklbw %4, xmm7              ; row 1, columns 0..7
        paddw     %1, %4                ; columns 0..7 : row0 + row1
        paddw     %2, %3                ; columns 8..15: row0 + row1

        ; --- rows 2 and 3: same treatment ---
        movdqa    %3, [r0 + r2]         ; row 2
        movdqa    %4, [r0 + r3]         ; row 3
        movdqa    %5, %3
        punpcklbw %3, xmm7
        punpckhbw %5, xmm7
        movdqa    %6, %4
        punpcklbw %6, xmm7
        punpckhbw %4, xmm7
        paddw     %3, %6                ; columns 0..7 : row2 + row3
        paddw     %4, %5                ; columns 8..15: row2 + row3

        ; --- per-column totals over the four rows ---
        paddw     %1, %3                ; columns 0..7  -> blocks 0 and 1
        paddw     %2, %4                ; columns 8..15 -> blocks 2 and 3

        ; --- two pairwise horizontal adds collapse 4 columns into 1 word ---
        phaddw    %1, %2                ; 8 half-block sums (2 columns each)
        phaddw    %1, xmm7              ; words 0..3 = block totals, words 4..7 = 0
        psraw     %1, 4                 ; 16 pixels per block -> average
%endmacro


;***********************************************************************
; Local Data (Read Only)
;***********************************************************************

;SECTION .rodata align=16

;ALIGN 16
;pack1_8x2:
;	dw 1, 1, 1, 1, 1, 1, 1, 1

;***********************************************************************
; Code
;***********************************************************************

SECTION .text

; AnalysisVaaInfoIntra_sse2 / _ssse3 added 6/7/2010

WELS_EXTERN AnalysisVaaInfoIntra_sse2
;***********************************************************************
;	int32_t AnalysisVaaInfoIntra_sse2(	uint8_t *pDataY, const int32_t iLineSize );
;
;	Reads a 16x16 block of bytes starting at pDataY (rows iLineSize bytes
;	apart), reduces it to sixteen 4x4-block averages
;	    avg[i] = (sum of the 16 pixels of block i) >> 4
;	and returns
;	    sum(avg[i]^2) - ((sum(avg[i]))^2 >> 4)
;
;	Requirements visible in the code:
;	  * every row is loaded with movdqa, so pDataY must be 16-byte aligned
;	    and iLineSize a multiple of 16.
;	Registers:
;	  r0 = current row pointer, r1 = iLineSize, r2/r3/r4 = 2x/3x/4x iLineSize,
;	  r5 = stack alignment slack, r7 = stack pointer, xmm7 = zero constant.
;	Stack frame (16-byte aligned, based at r7):
;	  [r7+0 .. r7+31]  sixteen 16-bit block averages
;	  [r7+32.. r7+47]  WIN64 only: caller's xmm7
;***********************************************************************
ALIGN 16
AnalysisVaaInfoIntra_sse2:

        %assign push_num 0
        LOAD_2_PARA
        SIGN_EXTENTION r1,r1d

%ifdef X86_32
        ; r3..r6 are preserved for the caller on 32-bit builds.
        push    r3
        push    r4
        push    r5
        push    r6
        %assign push_num push_num+4
%endif

        ; Carve a 16-byte aligned scratch frame; r5 keeps the alignment slack
        ; so the epilogue can put the stack pointer back exactly.
        mov     r5, r7
        and     r5, 0fh
        sub     r7, r5
%ifdef WIN64
        ; xmm6-xmm15 are callee-saved in the Microsoft x64 ABI and xmm7 is
        ; overwritten below, so spill it next to the scratch area.
        ; NOTE(review): WIN64 is assumed to be the build define that selects
        ; the Windows x64 register mapping in asm_inc.asm - confirm.
        sub     r7, 48
        movdqa  [r7+32], xmm7
%else
        sub     r7, 32
%endif

        mov     r2, r1
        sal     r2, 1                   ; r2 = 2*iLineSize
        mov     r3, r2
        add     r3, r1                  ; r3 = 3*iLineSize
        mov     r4, r2
        sal     r4, 1                   ; r4 = 4*iLineSize

        pxor    xmm7, xmm7              ; zero constant required by the macro

        ; Four 16x4 strips, four block averages (8 bytes) stored per strip.
        VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
        movq    [r7], xmm0

        lea     r0, [r0+r4]
        VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
        movq    [r7+8], xmm0

        lea     r0, [r0+r4]
        VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
        movq    [r7+16], xmm0

        lea     r0, [r0+r4]
        VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
        movq    [r7+24], xmm0

        movdqa  xmm0, [r7]              ; averages of blocks 0~7
        movdqa  xmm1, [r7+16]           ; averages of blocks 8~15
        movdqa  xmm2, xmm0
        paddw   xmm0, xmm1
        SUM_WORD_8x2_SSE2 xmm0, xmm3    ; word 0 of xmm0 = sum of all 16 averages

        ; Sum of squares: each average is at most 255, so its square fits an
        ; unsigned word; widen with zeros and accumulate as dwords.
        pmullw  xmm1, xmm1
        pmullw  xmm2, xmm2
        movdqa  xmm3, xmm1
        movdqa  xmm4, xmm2
        punpcklwd xmm1, xmm7
        punpckhwd xmm3, xmm7
        punpcklwd xmm2, xmm7
        punpckhwd xmm4, xmm7
        paddd   xmm1, xmm2
        paddd   xmm3, xmm4
        paddd   xmm1, xmm3
        pshufd  xmm2, xmm1, 01Bh
        paddd   xmm1, xmm2
        pshufd  xmm2, xmm1, 0B1h
        paddd   xmm1, xmm2              ; every dword = sum(avg^2)

        movd    r2d, xmm0
        and     r2, 0ffffh              ; keep only the low word (the sum of averages)
        imul    r2, r2                  ; (sum of averages)^2
        sar     r2, 4                   ; / 16 blocks
        movd    retrd, xmm1
        sub     retrd, r2d              ; sum(avg^2) - (sum(avg))^2 / 16

%ifdef WIN64
        movdqa  xmm7, [r7+32]           ; give the caller its xmm7 back
        add     r7, 48
%else
        add     r7, 32
%endif
        add     r7, r5

%ifdef X86_32
        pop     r6
        pop     r5
        pop     r4
        pop     r3
%endif

        ret

WELS_EXTERN AnalysisVaaInfoIntra_ssse3
;***********************************************************************
;	int32_t AnalysisVaaInfoIntra_ssse3(	uint8_t *pDataY, const int32_t iLineSize );
;
;	SSSE3 build of the intra analysis: reads a 16x16 block of bytes starting
;	at pDataY (rows iLineSize bytes apart), reduces it to sixteen 4x4-block
;	averages
;	    avg[i] = (sum of the 16 pixels of block i) >> 4
;	and returns
;	    sum(avg[i]^2) - ((sum(avg[i]))^2 >> 4)
;
;	Requirements visible in the code:
;	  * every row is loaded with movdqa, so pDataY must be 16-byte aligned
;	    and iLineSize a multiple of 16.
;	Registers:
;	  r0 = current row pointer, r1 = iLineSize, r2/r3/r4 = 2x/3x/4x iLineSize,
;	  r5 = stack alignment slack, r7 = stack pointer, xmm7 = zero constant,
;	  xmm6 = macro temporary.
;	Stack frame (16-byte aligned, based at r7):
;	  [r7+0 .. r7+31]  sixteen 16-bit block averages
;	  [r7+32.. r7+63]  WIN64 only: caller's xmm6 and xmm7
;***********************************************************************
ALIGN 16
AnalysisVaaInfoIntra_ssse3:

        %assign push_num 0
        LOAD_2_PARA
        SIGN_EXTENTION r1,r1d

%ifdef X86_32
        ; r3..r6 are preserved for the caller on 32-bit builds.
        push    r3
        push    r4
        push    r5
        push    r6
        %assign push_num push_num+4
%endif

        ; Carve a 16-byte aligned scratch frame; r5 keeps the alignment slack
        ; so the epilogue can put the stack pointer back exactly.
        mov     r5, r7
        and     r5, 0fh
        sub     r7, r5
%ifdef WIN64
        ; xmm6-xmm15 are callee-saved in the Microsoft x64 ABI; both xmm6 and
        ; xmm7 are overwritten below, so spill them next to the scratch area.
        ; NOTE(review): WIN64 is assumed to be the build define that selects
        ; the Windows x64 register mapping in asm_inc.asm - confirm.
        sub     r7, 64
        movdqa  [r7+32], xmm6
        movdqa  [r7+48], xmm7
%else
        sub     r7, 32
%endif

        mov     r2, r1
        sal     r2, 1                   ; r2 = 2*iLineSize
        mov     r3, r2
        add     r3, r1                  ; r3 = 3*iLineSize
        mov     r4, r2
        sal     r4, 1                   ; r4 = 4*iLineSize

        pxor    xmm7, xmm7              ; zero constant required by the macro

        ; Four 16x4 strips, four block averages (8 bytes) stored per strip.
        ; Consecutive strips alternate between two register sets.
        VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
        movq    [r7], xmm0

        lea     r0, [r0+r4]
        VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
        movq    [r7+8], xmm1

        lea     r0, [r0+r4]
        VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
        movq    [r7+16], xmm0

        lea     r0, [r0+r4]
        VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
        movq    [r7+24], xmm1

        movdqa  xmm0, [r7]              ; averages of blocks 0~7
        movdqa  xmm1, [r7+16]           ; averages of blocks 8~15
        movdqa  xmm2, xmm0
        paddw   xmm0, xmm1
        SUM_WORD_8x2_SSE2 xmm0, xmm3    ; better performance than that of phaddw sets

        ; Sum of squares: each average is at most 255, so its square fits an
        ; unsigned word; widen with zeros and accumulate as dwords.
        pmullw  xmm1, xmm1
        pmullw  xmm2, xmm2
        movdqa  xmm3, xmm1
        movdqa  xmm4, xmm2
        punpcklwd xmm1, xmm7
        punpckhwd xmm3, xmm7
        punpcklwd xmm2, xmm7
        punpckhwd xmm4, xmm7
        paddd   xmm1, xmm2
        paddd   xmm3, xmm4
        paddd   xmm1, xmm3
        pshufd  xmm2, xmm1, 01Bh
        paddd   xmm1, xmm2
        pshufd  xmm2, xmm1, 0B1h
        paddd   xmm1, xmm2              ; every dword = sum(avg^2)

        movd    r2d, xmm0
        and     r2, 0ffffh              ; keep only the low word (the sum of averages)
        imul    r2, r2                  ; (sum of averages)^2
        sar     r2, 4                   ; / 16 blocks
        movd    retrd, xmm1
        sub     retrd, r2d              ; sum(avg^2) - (sum(avg))^2 / 16

%ifdef WIN64
        movdqa  xmm6, [r7+32]           ; give the caller its xmm6/xmm7 back
        movdqa  xmm7, [r7+48]
        add     r7, 64
%else
        add     r7, 32
%endif
        add     r7, r5

%ifdef X86_32
        pop     r6
        pop     r5
        pop     r4
        pop     r3
%endif

        ret

WELS_EXTERN MdInterAnalysisVaaInfo_sse41
;***********************************************************************
;	uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 )
;
;	pSad8x8 points at four 32-bit SAD values, loaded with movdqa (so the
;	pointer must be 16-byte aligned).
;	    avg = (s0 + s1 + s2 + s3) >> 2
;	    var = sum over i of ((s[i] >> 6) - (avg >> 6))^2
;	Returns 15 when var < 20 (unsigned compare); otherwise a 4-bit mask in
;	which bit 3 is set when s0 > avg, bit 2 for s1, bit 1 for s2 and bit 0
;	for s3 (signed compares).
;***********************************************************************
ALIGN 16
MdInterAnalysisVaaInfo_sse41:
        %assign push_num 0
        LOAD_1_PARA
        movdqa    xmm0, [r0]            ; s0, s1, s2, s3

        ; broadcast the average into every lane of xmm1
        pshufd    xmm1, xmm0, 0x1B      ; lanes reversed
        paddd     xmm1, xmm0
        pshufd    xmm2, xmm1, 0xB1      ; adjacent lanes swapped
        paddd     xmm1, xmm2            ; every lane = s0+s1+s2+s3
        psrad     xmm1, 2               ; iAverageSad

        ; per-lane deviation in units of 64
        movdqa    xmm3, xmm0            ; iSadBlock
        psrad     xmm3, 6
        movdqa    xmm2, xmm1
        psrad     xmm2, 6
        psubd     xmm3, xmm2

        ; square and add up the four deviations
        pmulld    xmm3, xmm3            ; SSE4.1 32-bit multiply
        pshufd    xmm4, xmm3, 0x1B
        paddd     xmm4, xmm3
        pshufd    xmm3, xmm4, 0xB1
        paddd     xmm3, xmm4            ; every lane = variance
        movd      r0d, xmm3             ; the pointer in r0 is no longer needed

        cmp       r0d, 20               ; INTER_VARIANCE_SAD_THRESHOLD
        jb        .low_variance

        pshufd    xmm0, xmm0, 0x1B      ; lane 0 = s3 ... lane 3 = s0
        pcmpgtd   xmm0, xmm1            ; iSadBlock > iAverageSad
        movmskps  retrd, xmm0           ; one sign bit per lane
        ret

.low_variance:
        mov       retrd, 15
        ret

WELS_EXTERN MdInterAnalysisVaaInfo_sse2
;***********************************************************************
;	uint8_t MdInterAnalysisVaaInfo_sse2( int32_t *pSad8x8 )
;
;	SSE2-only build: the 32-bit squaring is assembled from pmuludq instead
;	of the SSE4.1 pmulld.
;	pSad8x8 points at four 32-bit SAD values, loaded with movdqa (so the
;	pointer must be 16-byte aligned).
;	    avg = (s0 + s1 + s2 + s3) >> 2
;	    var = sum over i of ((s[i] >> 6) - (avg >> 6))^2
;	Returns 15 when var < 20 (unsigned compare); otherwise a 4-bit mask in
;	which bit 3 is set when s0 > avg, bit 2 for s1, bit 1 for s2 and bit 0
;	for s3 (signed compares).
;***********************************************************************
ALIGN 16
MdInterAnalysisVaaInfo_sse2:
        %assign push_num 0
        LOAD_1_PARA
        movdqa    xmm0, [r0]            ; s0, s1, s2, s3

        ; broadcast the average into every lane of xmm1
        pshufd    xmm1, xmm0, 0x1B      ; lanes reversed
        paddd     xmm1, xmm0
        pshufd    xmm2, xmm1, 0xB1      ; adjacent lanes swapped
        paddd     xmm1, xmm2            ; every lane = s0+s1+s2+s3
        psrad     xmm1, 2               ; iAverageSad

        ; per-lane deviation in units of 64: d0, d1, d2, d3
        movdqa    xmm3, xmm0            ; iSadBlock
        psrad     xmm3, 6
        movdqa    xmm2, xmm1
        psrad     xmm2, 6
        psubd     xmm3, xmm2

        ; pmulld stand-in: pmuludq squares lanes 0 and 2 of its operands, so
        ; handle even and odd lanes separately and re-pack the low 32 bits.
        movdqa    xmm2, xmm3
        pmuludq   xmm2, xmm2            ; qwords: d0*d0, d2*d2
        pshufd    xmm4, xmm3, 0xB1      ; d1, d0, d3, d2
        pmuludq   xmm4, xmm4            ; qwords: d1*d1, d3*d3
        movdqa    xmm5, xmm2
        punpckldq xmm5, xmm4            ; lo(d0^2), lo(d1^2), hi, hi
        punpckhdq xmm2, xmm4            ; lo(d2^2), lo(d3^2), hi, hi
        punpcklqdq xmm5, xmm2           ; lo(d0^2), lo(d1^2), lo(d2^2), lo(d3^2)

        ; add up the four squares
        pshufd    xmm4, xmm5, 0x1B
        paddd     xmm4, xmm5
        pshufd    xmm5, xmm4, 0xB1
        paddd     xmm5, xmm4            ; every lane = variance
        movd      r0d, xmm5             ; the pointer in r0 is no longer needed

        cmp       r0d, 20               ; INTER_VARIANCE_SAD_THRESHOLD
        jb        .low_variance

        pshufd    xmm0, xmm0, 0x1B      ; lane 0 = s3 ... lane 3 = s0
        pcmpgtd   xmm0, xmm1            ; iSadBlock > iAverageSad
        movmskps  retrd, xmm0           ; one sign bit per lane
        ret

.low_variance:
        mov       retrd, 15
        ret
Initial Commit 2013-12-09 13:51:09 +01:00			`;*!`
			`;* \copy`
			`;* Copyright (c) 2010-2013, Cisco Systems`
			`;* All rights reserved.`
			`;*`
			`;* Redistribution and use in source and binary forms, with or without`
			`;* modification, are permitted provided that the following conditions`
			`;* are met:`
			`;*`
			`;* * Redistributions of source code must retain the above copyright`
			`;* notice, this list of conditions and the following disclaimer.`
			`;*`
			`;* * Redistributions in binary form must reproduce the above copyright`
			`;* notice, this list of conditions and the following disclaimer in`
			`;* the documentation and/or other materials provided with the`
			`;* distribution.`
			`;*`
			`;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS`
			`;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT`
			`;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS`
			`;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE`
			`;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,`
			`;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,`
			`;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;`
			`;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER`
			`;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT`
			`;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN`
			`;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE`
			`;* POSSIBILITY OF SUCH DAMAGE.`
			`;*`
			`;*`
			`;* vaa.asm`
			`;*`
			`;* Abstract`
			`;* sse2 for pVaa routines`
			`;*`
			`;* History`
			`;* 04/14/2010 Created`
			`;* 06/07/2010 Added AnalysisVaaInfoIntra_sse2(ssse3)`
			`;* 06/10/2010 Tune rc_sad_frame_sse2 and got about 40% improvement`
Remove trailing whitespace Most of it was removed in ff6b669176 from C++ source files, but other files were left unchanged. 2013-12-13 09:06:44 +01:00			`;* 08/11/2010 Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2`
Initial Commit 2013-12-09 13:51:09 +01:00			`;*`
			`;*************************************************************************/`
			`%include "asm_inc.asm"`
resolve conflict 2014-01-03 07:49:45 +01:00
Initial Commit 2013-12-09 13:51:09 +01:00
			`;***********************************************************************`
			`; Macros and other preprocessor constants`
			`;***********************************************************************`

			`; by comparing it outperforms than phaddw(SSSE3) sets`
			`%macro SUM_WORD_8x2_SSE2 2 ; dst(pSrc), tmp`
			`; @sum_8x2 begin`
			`pshufd %2, %1, 04Eh ; 01001110 B`
			`paddw %1, %2`
			`pshuflw %2, %1, 04Eh ; 01001110 B`
			`paddw %1, %2`
			`pshuflw %2, %1, 0B1h ; 10110001 B`
			`paddw %1, %2`
			`; end of @sum_8x2`
			`%endmacro ; END of SUM_WORD_8x2_SSE2`


			`%macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4`
resolve conflict 2014-01-03 07:49:45 +01:00			`movdqa %1, [r0 ] ; line 0`
			`movdqa %2, [r0+r1] ; line 1`
Initial Commit 2013-12-09 13:51:09 +01:00			`movdqa %3, %1`
			`punpcklbw %1, xmm7`
			`punpckhbw %3, xmm7`
			`movdqa %4, %2`
			`punpcklbw %4, xmm7`
			`punpckhbw %2, xmm7`
			`paddw %1, %4`
			`paddw %2, %3`
resolve conflict 2014-01-03 07:49:45 +01:00			`movdqa %3, [r0+r2] ; line 2`
			`movdqa %4, [r0+r3] ; line 3`
Initial Commit 2013-12-09 13:51:09 +01:00			`movdqa %5, %3`
			`punpcklbw %3, xmm7`
			`punpckhbw %5, xmm7`
			`movdqa %6, %4`
			`punpcklbw %6, xmm7`
			`punpckhbw %4, xmm7`
			`paddw %3, %6`
			`paddw %4, %5`
			`paddw %1, %3 ; block 0, 1`
			`paddw %2, %4 ; block 2, 3`
			`pshufd %3, %1, 0B1h`
			`pshufd %4, %2, 0B1h`
			`paddw %1, %3`
			`paddw %2, %4`
			`movdqa %3, %1`
			`movdqa %4, %2`
			`pshuflw %5, %1, 0B1h`
			`pshufhw %6, %3, 0B1h`
			`paddw %1, %5`
			`paddw %3, %6`
			`pshuflw %5, %2, 0B1h`
			`pshufhw %6, %4, 0B1h`
			`paddw %2, %5`
			`paddw %4, %6`
			`punpcklwd %1, %2`
			`punpckhwd %3, %4`
			`punpcklwd %1, %3`
			`psraw %1, $4`
			`%endmacro`

			`%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4`
resolve conflict 2014-01-03 07:49:45 +01:00			`movdqa %1, [r0 ] ; line 0`
			`movdqa %2, [r0+r1] ; line 1`
Initial Commit 2013-12-09 13:51:09 +01:00			`movdqa %3, %1`
			`punpcklbw %1, xmm7`
			`punpckhbw %3, xmm7`
			`movdqa %4, %2`
			`punpcklbw %4, xmm7`
			`punpckhbw %2, xmm7`
			`paddw %1, %4`
			`paddw %2, %3`
resolve conflict 2014-01-03 07:49:45 +01:00			`movdqa %3, [r0+r2] ; line 2`
			`movdqa %4, [r0+r3] ; line 3`
Initial Commit 2013-12-09 13:51:09 +01:00			`movdqa %5, %3`
			`punpcklbw %3, xmm7`
			`punpckhbw %5, xmm7`
			`movdqa %6, %4`
			`punpcklbw %6, xmm7`
			`punpckhbw %4, xmm7`
			`paddw %3, %6`
			`paddw %4, %5`
			`paddw %1, %3 ; block 0, 1`
			`paddw %2, %4 ; block 2, 3`
			`phaddw %1, %2 ; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..`
			`phaddw %1, xmm7 ; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....`
			`psraw %1, $4`
			`%endmacro`



			`;***********************************************************************`
			`; Local Data (Read Only)`
			`;***********************************************************************`

			`;SECTION .rodata align=16`

			`;ALIGN 16`
			`;pack1_8x2:`
			`; dw 1, 1, 1, 1, 1, 1, 1, 1`

			`;***********************************************************************`
			`; Code`
			`;***********************************************************************`

			`SECTION .text`

			`; , 6/7/2010`

			`WELS_EXTERN AnalysisVaaInfoIntra_sse2`
			`;***********************************************************************`
			`; int32_t AnalysisVaaInfoIntra_sse2( uint8_t *pDataY, const int32_t iLineSize );`
			`;***********************************************************************`
			`ALIGN 16`
			`AnalysisVaaInfoIntra_sse2:`
Remove trailing whitespace Most of it was removed in ff6b669176 from C++ source files, but other files were left unchanged. 2013-12-13 09:06:44 +01:00
resolve conflict 2014-01-03 07:49:45 +01:00			`%assign push_num 0`
Get rid of trailing whitespace in the assembly source files 2014-01-05 13:16:22 +01:00			`LOAD_2_PARA`
resolve conflict 2014-01-03 07:49:45 +01:00			`SIGN_EXTENTION r1,r1d`

			`%ifdef X86_32`
			`push r3`
			`push r4`
			`push r5`
			`push r6`
			`%assign push_num push_num+4`
			`%endif`

			`mov r5,r7`
			`and r5,0fh`
			`sub r7,r5`
			`sub r7,32`
Get rid of trailing whitespace in the assembly source files 2014-01-05 13:16:22 +01:00

			`mov r2,r1`
resolve conflict 2014-01-03 07:49:45 +01:00			`sal r2,$1 ;r2 = 2*iLineSize`
			`mov r3,r2`
			`add r3,r1 ;r3 = 3*iLineSize`
Get rid of trailing whitespace in the assembly source files 2014-01-05 13:16:22 +01:00
resolve conflict 2014-01-03 07:49:45 +01:00			`mov r4,r2`
			`sal r4,$1 ;r4 = 4*iLineSize`
Get rid of trailing whitespace in the assembly source files 2014-01-05 13:16:22 +01:00
Initial Commit 2013-12-09 13:51:09 +01:00			`pxor xmm7, xmm7`
Remove trailing whitespace Most of it was removed in ff6b669176 from C++ source files, but other files were left unchanged. 2013-12-13 09:06:44 +01:00
Initial Commit 2013-12-09 13:51:09 +01:00			`; loops`
			`VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5`
resolve conflict 2014-01-03 07:49:45 +01:00			`movq [r7], xmm0`
Initial Commit 2013-12-09 13:51:09 +01:00
resolve conflict 2014-01-03 07:49:45 +01:00			`lea r0, [r0+r4]`
Initial Commit 2013-12-09 13:51:09 +01:00			`VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5`
resolve conflict 2014-01-03 07:49:45 +01:00			`movq [r7+8], xmm0`
Initial Commit 2013-12-09 13:51:09 +01:00
resolve conflict 2014-01-03 07:49:45 +01:00			`lea r0, [r0+r4]`
Initial Commit 2013-12-09 13:51:09 +01:00			`VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5`
resolve conflict 2014-01-03 07:49:45 +01:00			`movq [r7+16], xmm0`
Initial Commit 2013-12-09 13:51:09 +01:00
resolve conflict 2014-01-03 07:49:45 +01:00			`lea r0, [r0+r4]`
Initial Commit 2013-12-09 13:51:09 +01:00			`VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5`
resolve conflict 2014-01-03 07:49:45 +01:00			`movq [r7+24], xmm0`
Remove trailing whitespace Most of it was removed in ff6b669176 from C++ source files, but other files were left unchanged. 2013-12-13 09:06:44 +01:00
resolve conflict 2014-01-03 07:49:45 +01:00			`movdqa xmm0, [r7] ; block 0~7`
			`movdqa xmm1, [r7+16] ; block 8~15`
Initial Commit 2013-12-09 13:51:09 +01:00			`movdqa xmm2, xmm0`
			`paddw xmm0, xmm1`
			`SUM_WORD_8x2_SSE2 xmm0, xmm3`
Remove trailing whitespace Most of it was removed in ff6b669176 from C++ source files, but other files were left unchanged. 2013-12-13 09:06:44 +01:00
Initial Commit 2013-12-09 13:51:09 +01:00			`pmullw xmm1, xmm1`
			`pmullw xmm2, xmm2`
			`movdqa xmm3, xmm1`
			`movdqa xmm4, xmm2`
			`punpcklwd xmm1, xmm7`
			`punpckhwd xmm3, xmm7`
			`punpcklwd xmm2, xmm7`
			`punpckhwd xmm4, xmm7`
			`paddd xmm1, xmm2`
			`paddd xmm3, xmm4`
			`paddd xmm1, xmm3`
			`pshufd xmm2, xmm1, 01Bh`
			`paddd xmm1, xmm2`
			`pshufd xmm2, xmm1, 0B1h`
			`paddd xmm1, xmm2`
Remove trailing whitespace Most of it was removed in ff6b669176 from C++ source files, but other files were left unchanged. 2013-12-13 09:06:44 +01:00
Get rid of trailing whitespace in the assembly source files 2014-01-05 13:16:22 +01:00

resolve conflict 2014-01-03 07:49:45 +01:00			`movd r2d, xmm0`
			`and r2, 0ffffh ; effective low work truncated`
			`mov r3, r2`
			`imul r2, r3`
			`sar r2, $4`
			`movd retrd, xmm1`
			`sub retrd, r2d`
Get rid of trailing whitespace in the assembly source files 2014-01-05 13:16:22 +01:00
resolve conflict 2014-01-03 07:49:45 +01:00			`add r7,32`
			`add r7,r5`

			`%ifdef X86_32`
			`pop r6`
			`pop r5`
			`pop r4`
			`pop r3`
			`%endif`
Get rid of trailing whitespace in the assembly source files 2014-01-05 13:16:22 +01:00
Initial Commit 2013-12-09 13:51:09 +01:00			`ret`

			`WELS_EXTERN AnalysisVaaInfoIntra_ssse3`
			`;***********************************************************************`
			`; int32_t AnalysisVaaInfoIntra_ssse3( uint8_t *pDataY, const int32_t iLineSize );`
			`;***********************************************************************`
			`ALIGN 16`
			`AnalysisVaaInfoIntra_ssse3:`
Remove trailing whitespace Most of it was removed in ff6b669176 from C++ source files, but other files were left unchanged. 2013-12-13 09:06:44 +01:00
resolve conflict 2014-01-03 07:49:45 +01:00			`%assign push_num 0`
Get rid of trailing whitespace in the assembly source files 2014-01-05 13:16:22 +01:00			`LOAD_2_PARA`
resolve conflict 2014-01-03 07:49:45 +01:00			`SIGN_EXTENTION r1,r1d`

			`%ifdef X86_32`
			`push r3`
			`push r4`
			`push r5`
			`push r6`
			`%assign push_num push_num+4`
			`%endif`
Get rid of trailing whitespace in the assembly source files 2014-01-05 13:16:22 +01:00
resolve conflict 2014-01-03 07:49:45 +01:00			`mov r5,r7`
			`and r5,0fh`
			`sub r7,r5`
			`sub r7,32`

Get rid of trailing whitespace in the assembly source files 2014-01-05 13:16:22 +01:00
			`mov r2,r1`
resolve conflict 2014-01-03 07:49:45 +01:00			`sal r2,$1 ;r2 = 2*iLineSize`
			`mov r3,r2`
			`add r3,r1 ;r3 = 3*iLineSize`
Get rid of trailing whitespace in the assembly source files 2014-01-05 13:16:22 +01:00
resolve conflict 2014-01-03 07:49:45 +01:00			`mov r4,r2`
			`sal r4,$1 ;r4 = 4*iLineSize`
Get rid of trailing whitespace in the assembly source files 2014-01-05 13:16:22 +01:00
Initial Commit 2013-12-09 13:51:09 +01:00			`pxor xmm7, xmm7`
Remove trailing whitespace Most of it was removed in ff6b669176 from C++ source files, but other files were left unchanged. 2013-12-13 09:06:44 +01:00
Initial Commit 2013-12-09 13:51:09 +01:00			`; loops`
			`VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5`
resolve conflict 2014-01-03 07:49:45 +01:00			`movq [r7],xmm0`
Get rid of trailing whitespace in the assembly source files 2014-01-05 13:16:22 +01:00
resolve conflict 2014-01-03 07:49:45 +01:00			`lea r0,[r0+r4]`
Initial Commit 2013-12-09 13:51:09 +01:00			`VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6`
resolve conflict 2014-01-03 07:49:45 +01:00			`movq [r7+8],xmm1`
Get rid of trailing whitespace in the assembly source files 2014-01-05 13:16:22 +01:00

resolve conflict 2014-01-03 07:49:45 +01:00			`lea r0,[r0+r4]`
Initial Commit 2013-12-09 13:51:09 +01:00			`VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5`
resolve conflict 2014-01-03 07:49:45 +01:00			`movq [r7+16],xmm0`
Get rid of trailing whitespace in the assembly source files 2014-01-05 13:16:22 +01:00
resolve conflict 2014-01-03 07:49:45 +01:00			`lea r0,[r0+r4]`
Initial Commit 2013-12-09 13:51:09 +01:00			`VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6`
resolve conflict 2014-01-03 07:49:45 +01:00			`movq [r7+24],xmm1`
Get rid of trailing whitespace in the assembly source files 2014-01-05 13:16:22 +01:00

resolve conflict 2014-01-03 07:49:45 +01:00			`movdqa xmm0,[r7]`
			`movdqa xmm1,[r7+16]`
Initial Commit 2013-12-09 13:51:09 +01:00			`movdqa xmm2, xmm0`
			`paddw xmm0, xmm1`
			`SUM_WORD_8x2_SSE2 xmm0, xmm3 ; better performance than that of phaddw sets`

			`pmullw xmm1, xmm1`
			`pmullw xmm2, xmm2`
			`movdqa xmm3, xmm1`
			`movdqa xmm4, xmm2`
			`punpcklwd xmm1, xmm7`
			`punpckhwd xmm3, xmm7`
			`punpcklwd xmm2, xmm7`
			`punpckhwd xmm4, xmm7`
			`paddd xmm1, xmm2`
			`paddd xmm3, xmm4`
			`paddd xmm1, xmm3`
			`pshufd xmm2, xmm1, 01Bh`
			`paddd xmm1, xmm2`
			`pshufd xmm2, xmm1, 0B1h`
			`paddd xmm1, xmm2`
Remove trailing whitespace Most of it was removed in ff6b669176 from C++ source files, but other files were left unchanged. 2013-12-13 09:06:44 +01:00
Get rid of trailing whitespace in the assembly source files 2014-01-05 13:16:22 +01:00
resolve conflict 2014-01-03 07:49:45 +01:00			`movd r2d, xmm0`
			`and r2, 0ffffh ; effective low work truncated`
			`mov r3, r2`
			`imul r2, r3`
			`sar r2, $4`
			`movd retrd, xmm1`
			`sub retrd, r2d`

			`add r7,32`
			`add r7,r5`
			`%ifdef X86_32`
			`pop r6`
			`pop r5`
			`pop r4`
			`pop r3`
			`%endif`
Get rid of trailing whitespace in the assembly source files 2014-01-05 13:16:22 +01:00
Initial Commit 2013-12-09 13:51:09 +01:00			`ret`
Remove trailing whitespace Most of it was removed in ff6b669176 from C++ source files, but other files were left unchanged. 2013-12-13 09:06:44 +01:00
Initial Commit 2013-12-09 13:51:09 +01:00			`WELS_EXTERN MdInterAnalysisVaaInfo_sse41`
			`;***********************************************************************`
			`; uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 )`
			`;***********************************************************************`
			`ALIGN 16`
			`MdInterAnalysisVaaInfo_sse41:`
resolve conflict 2014-01-03 07:49:45 +01:00			`%assign push_num 0`
			`LOAD_1_PARA`
			`movdqa xmm0,[r0]`
Initial Commit 2013-12-09 13:51:09 +01:00			`pshufd xmm1, xmm0, 01Bh`
			`paddd xmm1, xmm0`
			`pshufd xmm2, xmm1, 0B1h`
Remove trailing whitespace Most of it was removed in ff6b669176 from C++ source files, but other files were left unchanged. 2013-12-13 09:06:44 +01:00			`paddd xmm1, xmm2`
Initial Commit 2013-12-09 13:51:09 +01:00			`psrad xmm1, 02h ; iAverageSad`
			`movdqa xmm2, xmm1`
			`psrad xmm2, 06h`
			`movdqa xmm3, xmm0 ; iSadBlock`
			`psrad xmm3, 06h`
			`psubd xmm3, xmm2`
Remove trailing whitespace Most of it was removed in ff6b669176 from C++ source files, but other files were left unchanged. 2013-12-13 09:06:44 +01:00			`pmulld xmm3, xmm3 ; [comment]: pmulld from SSE4.1 instruction sets`
Initial Commit 2013-12-09 13:51:09 +01:00			`pshufd xmm4, xmm3, 01Bh`
			`paddd xmm4, xmm3`
			`pshufd xmm3, xmm4, 0B1h`
			`paddd xmm3, xmm4`
resolve conflict 2014-01-03 07:49:45 +01:00			`movd r0d, xmm3`
			`cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD`
Get rid of trailing whitespace in the assembly source files 2014-01-05 13:16:22 +01:00
Initial Commit 2013-12-09 13:51:09 +01:00			`jb near .threshold_exit`
resolve conflict 2014-01-03 07:49:45 +01:00			`pshufd xmm0, xmm0, 01Bh`
Initial Commit 2013-12-09 13:51:09 +01:00			`pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad`
resolve conflict 2014-01-03 07:49:45 +01:00			`movmskps retrd, xmm0`
Initial Commit 2013-12-09 13:51:09 +01:00			`ret`
Remove trailing whitespace Most of it was removed in ff6b669176 from C++ source files, but other files were left unchanged. 2013-12-13 09:06:44 +01:00			`.threshold_exit:`
resolve conflict 2014-01-03 07:49:45 +01:00			`mov retrd, 15`
Initial Commit 2013-12-09 13:51:09 +01:00			`ret`

			`WELS_EXTERN MdInterAnalysisVaaInfo_sse2`
			`;***********************************************************************`
			`; uint8_t MdInterAnalysisVaaInfo_sse2( int32_t *pSad8x8 )`
			`;***********************************************************************`
			`ALIGN 16`
			`MdInterAnalysisVaaInfo_sse2:`
resolve conflict 2014-01-03 07:49:45 +01:00			`%assign push_num 0`
			`LOAD_1_PARA`
			`movdqa xmm0, [r0]`
Initial Commit 2013-12-09 13:51:09 +01:00			`pshufd xmm1, xmm0, 01Bh`
			`paddd xmm1, xmm0`
			`pshufd xmm2, xmm1, 0B1h`
Remove trailing whitespace Most of it was removed in ff6b669176 from C++ source files, but other files were left unchanged. 2013-12-13 09:06:44 +01:00			`paddd xmm1, xmm2`
Initial Commit 2013-12-09 13:51:09 +01:00			`psrad xmm1, 02h ; iAverageSad`
			`movdqa xmm2, xmm1`
			`psrad xmm2, 06h`
			`movdqa xmm3, xmm0 ; iSadBlock`
			`psrad xmm3, 06h`
			`psubd xmm3, xmm2`
Remove trailing whitespace Most of it was removed in ff6b669176 from C++ source files, but other files were left unchanged. 2013-12-13 09:06:44 +01:00
Initial Commit 2013-12-09 13:51:09 +01:00			`; to replace pmulld functionality as below`
Remove trailing whitespace Most of it was removed in ff6b669176 from C++ source files, but other files were left unchanged. 2013-12-13 09:06:44 +01:00			`movdqa xmm2, xmm3`
Initial Commit 2013-12-09 13:51:09 +01:00			`pmuludq xmm2, xmm3`
			`pshufd xmm4, xmm3, 0B1h`
			`pmuludq xmm4, xmm4`
			`movdqa xmm5, xmm2`
			`punpckldq xmm5, xmm4`
			`punpckhdq xmm2, xmm4`
Remove trailing whitespace Most of it was removed in ff6b669176 from C++ source files, but other files were left unchanged. 2013-12-13 09:06:44 +01:00			`punpcklqdq xmm5, xmm2`

Initial Commit 2013-12-09 13:51:09 +01:00			`pshufd xmm4, xmm5, 01Bh`
			`paddd xmm4, xmm5`
			`pshufd xmm5, xmm4, 0B1h`
			`paddd xmm5, xmm4`
Get rid of trailing whitespace in the assembly source files 2014-01-05 13:16:22 +01:00
resolve conflict 2014-01-03 07:49:45 +01:00			`movd r0d, xmm5`
			`cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD`
Initial Commit 2013-12-09 13:51:09 +01:00			`jb near .threshold_exit`
resolve conflict 2014-01-03 07:49:45 +01:00			`pshufd xmm0, xmm0, 01Bh`
Initial Commit 2013-12-09 13:51:09 +01:00			`pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad`
resolve conflict 2014-01-03 07:49:45 +01:00			`movmskps retrd, xmm0`
Initial Commit 2013-12-09 13:51:09 +01:00			`ret`
Remove trailing whitespace Most of it was removed in ff6b669176 from C++ source files, but other files were left unchanged. 2013-12-13 09:06:44 +01:00			`.threshold_exit:`
resolve conflict 2014-01-03 07:49:45 +01:00			`mov retrd, 15`
Initial Commit 2013-12-09 13:51:09 +01:00			`ret`