openh264/codec/encoder/core/x86/matrix_transpose.asm

396 lines
9.9 KiB
NASM
Raw Normal View History

;*!
;* \copy
;* Copyright (c) 2009-2013, Cisco Systems
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* * Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;*
;* * Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in
;* the documentation and/or other materials provided with the
;* distribution.
;*
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;* POSSIBILITY OF SUCH DAMAGE.
;*************************************************************************/
%include "asm_inc.asm"
; TRANSPOSE_8x8B_MMX
; Shuffles eight 8-byte rows through three rounds of MMX_XSwap (bw, wd, dq),
; four MMX_XSwap invocations per round.
; MMX_XSwap is not defined in this file (presumably it comes from asm_inc.asm).
;   %1..%8 : MMX registers; every call site in this file passes mm0..mm7 in order
;   %9     : operand copied into %6 after the third bw step; the call sites in
;            this file pass the address of the eighth source row here
;   %10    : 8-byte memory scratch slot, used to spill one register between steps
; Only eight registers are available, so one value is parked in %10 before the
; last MMX_XSwap of each round and picked up again afterwards.
; The in/out lists below are expressed in terms of mm0..mm7 as passed by the callers.
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m0, m3, m5, m2, m7, m1, m6, m4
%macro TRANSPOSE_8x8B_MMX 10
; round 1: bw
MMX_XSwap bw, %1, %2, %8
MMX_XSwap bw, %3, %4, %2
MMX_XSwap bw, %5, %6, %4
movq %6, %9 ; fetch the operand supplied as %9 into %6
movq %10, %4 ; spill %4 so it can be handed to the next MMX_XSwap
MMX_XSwap bw, %7, %6, %4
; round 2: wd
MMX_XSwap wd, %1, %3, %6
MMX_XSwap wd, %8, %2, %3
MMX_XSwap wd, %5, %7, %2
movq %7, %10 ; reload the value spilled in round 1
movq %10, %3 ; spill %3 for the same reason as above
MMX_XSwap wd, %7, %4, %3
; round 3: dq
MMX_XSwap dq, %1, %5, %4
MMX_XSwap dq, %6, %2, %5
MMX_XSwap dq, %8, %7, %2
movq %7, %10 ; reload the value spilled in round 2
movq %10, %5 ; spill %5 before the final step
MMX_XSwap dq, %7, %3, %5
movq %3, %10 ; reload the last spilled value into %3
%endmacro
; TRANSPOSE8x8_WRITE_MMX
; Stores eight 8-byte rows to memory, two rows per step.
; The source registers are hard-coded in the order mm0, mm3, mm5, mm2, mm7,
; mm1, mm6, mm4, i.e. the "out" order documented for TRANSPOSE_8x8B_MMX when
; that macro is invoked with mm0..mm7.
;   %1 : destination pointer register; MODIFIED, left at dst + 6 * stride
;   %2 : destination stride register (bytes between rows)
;in: m0, m3, m5, m2, m7, m1, m6, m4
%macro TRANSPOSE8x8_WRITE_MMX 2 ; dst, dst_stride
    movq    [%1], mm0               ; row 1 (8 bytes per row)
    movq    [%1 + %2], mm3          ; row 2
    lea     %1, [%1 + %2*2]         ; step down two rows

    movq    [%1], mm5               ; row 3
    movq    [%1 + %2], mm2          ; row 4
    lea     %1, [%1 + %2*2]

    movq    [%1], mm7               ; row 5
    movq    [%1 + %2], mm1          ; row 6
    lea     %1, [%1 + %2*2]

    movq    [%1], mm6               ; row 7
    movq    [%1 + %2], mm4          ; row 8
%endmacro ; end of TRANSPOSE8x8_WRITE_MMX
; TRANSPOSE8x8_WRITE_ALT_MMX
; Same stores as TRANSPOSE8x8_WRITE_MMX, but the row pointer is walked in a
; separate scratch register so the destination operand itself is not written.
; The source registers are hard-coded: mm0, mm3, mm5, mm2, mm7, mm1, mm6, mm4.
;   %1 : destination address expression (not written by this macro)
;   %2 : destination stride register (bytes between rows)
;   %3 : scratch register; left at dst + 6 * stride
;in: m0, m3, m5, m2, m7, m1, m6, m4
%macro TRANSPOSE8x8_WRITE_ALT_MMX 3 ; dst, dst_stride, reg32
    movq    [%1], mm0               ; row 1 (8 bytes per row)
    movq    [%1 + %2], mm3          ; row 2
    lea     %3, [%1 + %2*2]         ; scratch = dst + 2 rows

    movq    [%3], mm5               ; row 3
    movq    [%3 + %2], mm2          ; row 4
    lea     %3, [%3 + %2*2]

    movq    [%3], mm7               ; row 5
    movq    [%3 + %2], mm1          ; row 6
    lea     %3, [%3 + %2*2]

    movq    [%3], mm6               ; row 7
    movq    [%3 + %2], mm4          ; row 8
%endmacro ; end of TRANSPOSE8x8_WRITE_ALT_MMX
; for transpose 16x8
; TRANSPOSE_8x16B_SSE2
; Shuffles eight 16-byte rows through four rounds of SSE2_XSawp
; (bw, wd, dq, qdq), four SSE2_XSawp invocations per round.
; SSE2_XSawp is not defined in this file (presumably it comes from asm_inc.asm).
;   %1..%8 : XMM registers; every call site in this file passes xmm0..xmm7 in order
;   %9     : operand copied into %6 with movdqa after the third bw step; the call
;            sites in this file pass the address of the eighth source row here
;   %10    : 16-byte memory scratch slot, used to spill one register between steps
; %9 and %10 are accessed with movdqa, so both must be 16-byte aligned when they
; are memory operands.
; Only eight registers are available, so one value is parked in %10 before the
; last SSE2_XSawp of each round and picked up again afterwards.
; The in/out lists below are expressed in terms of xmm0..xmm7 as passed by the callers.
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m4, m2, m3, m7, m5, m1, m6, m0
%macro TRANSPOSE_8x16B_SSE2 10
; round 1: bw
SSE2_XSawp bw, %1, %2, %8
SSE2_XSawp bw, %3, %4, %2
SSE2_XSawp bw, %5, %6, %4
movdqa %6, %9 ; fetch the operand supplied as %9 into %6
movdqa %10, %4 ; spill %4 so it can be handed to the next SSE2_XSawp
SSE2_XSawp bw, %7, %6, %4
; round 2: wd
SSE2_XSawp wd, %1, %3, %6
SSE2_XSawp wd, %8, %2, %3
SSE2_XSawp wd, %5, %7, %2
movdqa %7, %10 ; reload the value spilled in round 1
movdqa %10, %3 ; spill %3 for the same reason as above
SSE2_XSawp wd, %7, %4, %3
; round 3: dq
SSE2_XSawp dq, %1, %5, %4
SSE2_XSawp dq, %6, %2, %5
SSE2_XSawp dq, %8, %7, %2
movdqa %7, %10 ; reload the value spilled in round 2
movdqa %10, %5 ; spill %5
SSE2_XSawp dq, %7, %3, %5
; round 4: qdq
SSE2_XSawp qdq, %1, %8, %3
SSE2_XSawp qdq, %4, %2, %8
SSE2_XSawp qdq, %6, %7, %2
movdqa %7, %10 ; reload the value spilled in round 3
movdqa %10, %1 ; spill %1 before the final step
SSE2_XSawp qdq, %7, %5, %1
movdqa %5, %10 ; reload the last spilled value into %5
%endmacro ; end of TRANSPOSE_8x16B_SSE2
; TRANSPOSE8x16_WRITE_SSE2
; Stores sixteen 8-byte rows to memory from eight XMM registers:
; rows 1..8 come from the low 64 bits (movq), rows 9..16 from the high 64 bits
; (movhpd) of the same registers.
; The source registers are hard-coded in the order xmm4, xmm2, xmm3, xmm7,
; xmm5, xmm1, xmm6, xmm0, i.e. the "out" order documented for
; TRANSPOSE_8x16B_SSE2 when that macro is invoked with xmm0..xmm7.
;   %1 : destination pointer register; MODIFIED, left at dst + 14 * stride
;   %2 : destination stride register (bytes between rows)
%macro TRANSPOSE8x16_WRITE_SSE2 2 ; dst, dst_stride
    ; ---- rows 1..8: low quadwords ----
    movq    [%1], xmm4              ; row 1 (8 bytes per row)
    movq    [%1 + %2], xmm2         ; row 2
    lea     %1, [%1 + %2*2]         ; step down two rows
    movq    [%1], xmm3              ; row 3
    movq    [%1 + %2], xmm7         ; row 4
    lea     %1, [%1 + %2*2]
    movq    [%1], xmm5              ; row 5
    movq    [%1 + %2], xmm1         ; row 6
    lea     %1, [%1 + %2*2]
    movq    [%1], xmm6              ; row 7
    movq    [%1 + %2], xmm0         ; row 8
    lea     %1, [%1 + %2*2]

    ; ---- rows 9..16: high quadwords ----
    movhpd  [%1], xmm4              ; row 9
    movhpd  [%1 + %2], xmm2         ; row 10
    lea     %1, [%1 + %2*2]
    movhpd  [%1], xmm3              ; row 11
    movhpd  [%1 + %2], xmm7         ; row 12
    lea     %1, [%1 + %2*2]
    movhpd  [%1], xmm5              ; row 13
    movhpd  [%1 + %2], xmm1         ; row 14
    lea     %1, [%1 + %2*2]
    movhpd  [%1], xmm6              ; row 15
    movhpd  [%1 + %2], xmm0         ; row 16
%endmacro ; end of TRANSPOSE8x16_WRITE_SSE2
; TRANSPOSE8x16_WRITE_ALT_SSE2
; Same stores as TRANSPOSE8x16_WRITE_SSE2, but the row pointer is walked in a
; separate scratch register so the destination operand itself is not written.
; Rows 1..8 come from the low 64 bits (movq), rows 9..16 from the high 64 bits
; (movhpd) of xmm4, xmm2, xmm3, xmm7, xmm5, xmm1, xmm6, xmm0 (hard-coded).
;   %1 : destination address expression (not written by this macro)
;   %2 : destination stride register (bytes between rows)
;   %3 : scratch register; left at dst + 14 * stride
%macro TRANSPOSE8x16_WRITE_ALT_SSE2 3 ; dst, dst_stride, reg32
    ; ---- rows 1..8: low quadwords ----
    movq    [%1], xmm4              ; row 1 (8 bytes per row)
    movq    [%1 + %2], xmm2         ; row 2
    lea     %3, [%1 + %2*2]         ; scratch = dst + 2 rows
    movq    [%3], xmm3              ; row 3
    movq    [%3 + %2], xmm7         ; row 4
    lea     %3, [%3 + %2*2]
    movq    [%3], xmm5              ; row 5
    movq    [%3 + %2], xmm1         ; row 6
    lea     %3, [%3 + %2*2]
    movq    [%3], xmm6              ; row 7
    movq    [%3 + %2], xmm0         ; row 8
    lea     %3, [%3 + %2*2]

    ; ---- rows 9..16: high quadwords ----
    movhpd  [%3], xmm4              ; row 9
    movhpd  [%3 + %2], xmm2         ; row 10
    lea     %3, [%3 + %2*2]
    movhpd  [%3], xmm3              ; row 11
    movhpd  [%3 + %2], xmm7         ; row 12
    lea     %3, [%3 + %2*2]
    movhpd  [%3], xmm5              ; row 13
    movhpd  [%3 + %2], xmm1         ; row 14
    lea     %3, [%3 + %2*2]
    movhpd  [%3], xmm6              ; row 15
    movhpd  [%3 + %2], xmm0         ; row 16
%endmacro ; end of TRANSPOSE8x16_WRITE_ALT_SSE2
SECTION .text
WELS_EXTERN TransposeMatrixBlock16x16_sse2
; void TransposeMatrixBlock16x16_sse2( void *dst/*16x16*/, const int32_t dst_stride, void *src/*16x16*/, const int32_t src_stride );
;
; Transposes one 16x16 byte block in two passes of eight source rows each.
; Register use below: r0 = dst, r1 = dst_stride, r2 = src, r3 = src_stride,
; r4 = stack misalignment saved for the epilogue, r5 = 3 * src_stride (later
; reused as 16 * dst_stride), r7 = stack pointer.
; Source rows are read with movdqa, so every source row must be 16-byte aligned.
; Destination rows are written 8 bytes at a time with movq/movhpd.
    push    r4
    push    r5
    %assign push_num 2
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    ; Reserve a 16-byte scratch slot at [r7], rounded down to a 16-byte
    ; boundary because the transpose macro spills to it with movdqa.
    mov     r4, r7
    and     r4, 15                  ; r4 = r7 mod 16
    sub     r7, 16
    sub     r7, r4

    lea     r5, [r3 + r3*2]         ; r5 = 3 * src_stride

    ; ---- source rows 0..7 -> destination columns 0..7 ----
    movdqa  xmm0, [r2]
    movdqa  xmm1, [r2 + r3]
    movdqa  xmm2, [r2 + r3*2]
    movdqa  xmm3, [r2 + r5]
    lea     r2, [r2 + r3*4]
    movdqa  xmm4, [r2]
    movdqa  xmm5, [r2 + r3]
    movdqa  xmm6, [r2 + r3*2]
    ; row 7 is handed to the macro as a memory operand ([r2+r5])
    ; registers in:  xmm0..xmm7
    ; registers out: xmm4, xmm2, xmm3, xmm7, xmm5, xmm1, xmm6, xmm0
    TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]
    TRANSPOSE8x16_WRITE_SSE2 r0, r1 ; leaves r0 = dst + 14 * dst_stride

    ; ---- source rows 8..15 -> destination columns 8..15 ----
    lea     r2, [r2 + r3*4]
    movdqa  xmm0, [r2]
    movdqa  xmm1, [r2 + r3]
    movdqa  xmm2, [r2 + r3*2]
    movdqa  xmm3, [r2 + r5]
    lea     r2, [r2 + r3*4]
    movdqa  xmm4, [r2]
    movdqa  xmm5, [r2 + r3]
    movdqa  xmm6, [r2 + r3*2]
    TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]

    ; Rewind r0 from dst + 14*stride to dst + 8:
    ;   r0 - 16*stride + 2*stride + 8
    mov     r5, r1
    shl     r5, 4                   ; r5 = 16 * dst_stride
    sub     r0, r5
    lea     r0, [r0 + r1*2 + 8]
    TRANSPOSE8x16_WRITE_SSE2 r0, r1

    ; release the aligned scratch slot
    add     r7, r4
    add     r7, 16
    POP_XMM
    LOAD_4_PARA_POP
    pop     r5
    pop     r4
    ret
WELS_EXTERN TransposeMatrixBlocksx16_sse2
; void TransposeMatrixBlocksx16_sse2( void *dst/*W16x16*/, const int32_t dst_stride, void *src/*16xW16*/, const int32_t src_stride, const int32_t num_blocks );
;
; Transposes num_blocks consecutive 16x16 byte blocks.
; Each iteration consumes 16 source rows (r2 advances by 16 * src_stride) and
; writes a 16x16 block at r0, after which r0 advances by 16 bytes.
; Register use below: r0 = dst, r1 = dst_stride, r2 = src, r3 = src_stride,
; r4 = blocks remaining, r5 = stack misalignment saved for the epilogue,
; r6 = scratch pointer, r7 = stack pointer.
; Source rows are read with movdqa, so every source row must be 16-byte aligned.
; Nothing is read or written when num_blocks <= 0.
    push    r5
    push    r6
    %assign push_num 2
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d

    ; The loop below is bottom-tested; bail out here so that a non-positive
    ; block count does not transpose (and write) one block anyway.
    test    r4, r4
    jle     near .done

    ; Reserve a 16-byte scratch slot at [r7], rounded down to a 16-byte
    ; boundary because the transpose macro spills to it with movdqa.
    mov     r5, r7
    and     r5, 0Fh
    sub     r7, 10h
    sub     r7, r5

.block_loop:
    ; Explicitly touch upcoming source rows (8..23 relative to r2) to pull them
    ; into cache. Rows 16..23 belong to the next block, so this is skipped on
    ; the last block to avoid reading past the end of the source.
    cmp     r4, 1
    jle     near .transpose
    lea     r6, [r2+r3*8]
    push    r4                      ; r4 doubles as the throw-away load target
%rep 8
    mov     r4, [r6]
    mov     r4, [r6+r3]
    lea     r6, [r6+r3*2]
%endrep
    pop     r4

.transpose:
    ; ---- source rows 0..7 -> destination columns 0..7 ----
    movdqa  xmm0, [r2]
    movdqa  xmm1, [r2+r3]
    lea     r2, [r2+r3*2]
    movdqa  xmm2, [r2]
    movdqa  xmm3, [r2+r3]
    lea     r2, [r2+r3*2]
    movdqa  xmm4, [r2]
    movdqa  xmm5, [r2+r3]
    lea     r2, [r2+r3*2]
    movdqa  xmm6, [r2]
    ; row 7 is handed to the macro as a memory operand ([r2+r3])
    ; registers in:  xmm0..xmm7
    ; registers out: xmm4, xmm2, xmm3, xmm7, xmm5, xmm1, xmm6, xmm0
    TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
    TRANSPOSE8x16_WRITE_ALT_SSE2 r0, r1, r6
    lea     r2, [r2+r3*2]

    ; ---- source rows 8..15 -> destination columns 8..15 ----
    movdqa  xmm0, [r2]
    movdqa  xmm1, [r2+r3]
    lea     r2, [r2+r3*2]
    movdqa  xmm2, [r2]
    movdqa  xmm3, [r2+r3]
    lea     r2, [r2+r3*2]
    movdqa  xmm4, [r2]
    movdqa  xmm5, [r2+r3]
    lea     r2, [r2+r3*2]
    movdqa  xmm6, [r2]
    TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
    TRANSPOSE8x16_WRITE_ALT_SSE2 r0+8, r1, r6
    lea     r2, [r2+r3*2]           ; r2 -> first row of the next source block

    lea     r0, [r0+16]             ; next destination block, 16 bytes to the right
    dec     r4
    jg      near .block_loop

    ; release the aligned scratch slot
    add     r7, r5
    add     r7, 10h
.done:
    POP_XMM
    LOAD_5_PARA_POP
    pop     r6
    pop     r5
    ret
WELS_EXTERN TransposeMatrixBlock8x8_mmx
; void TransposeMatrixBlock8x8_mmx( void *dst/*8x8*/, const int32_t dst_stride, void *src/*8x8*/, const int32_t src_stride );
;
; Transposes one 8x8 byte block using MMX registers.
; Register use below: r0 = dst, r1 = dst_stride, r2 = src, r3 = src_stride,
; r7 = stack pointer (8 bytes are reserved at [r7] as spill space).
    %assign push_num 0
    LOAD_4_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    sub     r7, 8                   ; 8-byte scratch slot for the transpose macro

    ; load source rows 0..6; row 7 is passed to the macro as [r2+r3]
    movq    mm0, [r2]
    movq    mm1, [r2 + r3]
    lea     r2, [r2 + r3*2]
    movq    mm2, [r2]
    movq    mm3, [r2 + r3]
    lea     r2, [r2 + r3*2]
    movq    mm4, [r2]
    movq    mm5, [r2 + r3]
    lea     r2, [r2 + r3*2]
    movq    mm6, [r2]

    ; registers in:  mm0..mm7
    ; registers out: mm0, mm3, mm5, mm2, mm7, mm1, mm6, mm4
    TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
    TRANSPOSE8x8_WRITE_MMX r0, r1

    emms                            ; leave MMX state before returning
    add     r7, 8                   ; release the scratch slot
    LOAD_4_PARA_POP
    ret
WELS_EXTERN TransposeMatrixBlocksx8_mmx
; void TransposeMatrixBlocksx8_mmx( void *dst/*8xW8*/, const int32_t dst_stride, void *src/*W8x8*/, const int32_t src_stride, const int32_t num_blocks );
;
; Transposes num_blocks consecutive 8x8 byte blocks using MMX registers.
; Each iteration consumes 8 source rows (r2 advances by 8 * src_stride) and
; writes an 8x8 block at r0, after which r0 advances by 8 bytes.
; Register use below: r0 = dst, r1 = dst_stride, r2 = src, r3 = src_stride,
; r4 = blocks remaining, r5 = pointer to the first row of the next source
; block, r6 = scratch, r7 = stack pointer (8 bytes reserved at [r7]).
; Nothing is read or written when num_blocks <= 0.
    push    r5
    push    r6
    %assign push_num 2
    LOAD_5_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d

    ; The loop below is bottom-tested; bail out here so that a non-positive
    ; block count does not transpose (and write) one block anyway. No MMX
    ; instruction has run yet, so the early exit needs no emms.
    test    r4, r4
    jle     near .done

    sub     r7, 8                   ; 8-byte scratch slot for the transpose macro
    lea     r5, [r2+r3*8]           ; r5 -> first row of the next source block

.block_loop:
    ; Explicitly touch the 8 rows of the next block to pull them into cache.
    ; Skipped on the last block, where those rows lie past the end of the source.
    cmp     r4, 1
    jle     near .transpose
%rep 4
    mov     r6, [r5]
    mov     r6, [r5+r3]
    lea     r5, [r5+r3*2]
%endrep

.transpose:
    ; load source rows 0..6; row 7 is passed to the macro as [r2+r3]
    movq    mm0, [r2]
    movq    mm1, [r2+r3]
    lea     r2, [r2+2*r3]
    movq    mm2, [r2]
    movq    mm3, [r2+r3]
    lea     r2, [r2+2*r3]
    movq    mm4, [r2]
    movq    mm5, [r2+r3]
    lea     r2, [r2+2*r3]
    movq    mm6, [r2]
    ; registers in:  mm0..mm7
    ; registers out: mm0, mm3, mm5, mm2, mm7, mm1, mm6, mm4
    TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
    TRANSPOSE8x8_WRITE_ALT_MMX r0, r1, r6

    lea     r0, [r0+8]              ; next destination block, 8 bytes to the right
    lea     r2, [r2+2*r3]           ; r2 -> first row of the next source block
    dec     r4
    jg      near .block_loop

    emms                            ; leave MMX state before returning
    add     r7, 8                   ; release the scratch slot
.done:
    LOAD_5_PARA_POP
    pop     r6
    pop     r5
    ret