; openh264/codec/encoder/core/x86/matrix_transpose.asm
;*!
;* \copy
;* Copyright (c) 2009-2013, Cisco Systems
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;*  * Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;*
;*  * Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in
;* the documentation and/or other materials provided with the
;* distribution.
;*
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;* POSSIBILITY OF SUCH DAMAGE.
;*************************************************************************/
%include "asm_inc.asm"
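; Byte-matrix transpose routines (MMX and SSE2) for 8x8 and 16x16 blocks.
; The register aliases r0-r6, the stack-pointer alias r7 and the helper
; macros (WELS_EXTERN, LOAD_*_PARA, MMX_XSwap, SSE2_XSawp, ...) come from
; asm_inc.asm.
;
; As a rough scalar reference (a sketch, not part of the build), every
; routine below computes:
;     for (i = 0; i < rows; i++)
;         for (j = 0; j < cols; j++)
;             dst[j * dst_stride + i] = src[i * src_stride + j];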
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m0, m3, m5, m2, m7, m1, m6, m4
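; The transpose is done as a butterfly: three interleave rounds (bw: bytes
; to word pairs, wd: words to dwords, dq: dwords to qwords), each round
; doubling the span of elements already in transposed order. Each MMX_XSwap
; interleaves a register pair, leaving (as used here) the low half in its
; first register operand and the high half in its temporary. Only eight mm
; registers exist for nine live values, so the eighth source row is passed
; as memory operand %9 and one intermediate is spilled to the 8-byte
; scratch slot %10 around each round.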
%macro TRANSPOSE_8x8B_MMX 10
MMX_XSwap bw, %1, %2, %8
MMX_XSwap bw, %3, %4, %2
MMX_XSwap bw, %5, %6, %4
movq %6, %9
movq %10, %4
MMX_XSwap bw, %7, %6, %4
MMX_XSwap wd, %1, %3, %6
MMX_XSwap wd, %8, %2, %3
MMX_XSwap wd, %5, %7, %2
movq %7, %10
movq %10, %3
MMX_XSwap wd, %7, %4, %3
MMX_XSwap dq, %1, %5, %4
MMX_XSwap dq, %6, %2, %5
MMX_XSwap dq, %8, %7, %2
movq %7, %10
movq %10, %5
MMX_XSwap dq, %7, %3, %5
movq %3, %10
%endmacro
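; The write macros store the transposed rows in the permuted register order
; produced above, two rows per step, advancing dst by 2*dst_stride.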
;in: m0, m3, m5, m2, m7, m1, m6, m4
%macro TRANSPOSE8x8_WRITE_MMX 2 ; dst, dst_stride
movq [%1], mm0 ; result of line 1, x8 bytes
movq [%1+%2], mm3 ; result of line 2
lea %1, [%1+2*%2]
movq [%1], mm5 ; result of line 3
movq [%1+%2], mm2 ; result of line 4
lea %1, [%1+2*%2]
movq [%1], mm7 ; result of line 5
movq [%1+%2], mm1 ; result of line 6
lea %1, [%1+2*%2]
movq [%1], mm6 ; result of line 7
movq [%1+%2], mm4 ; result of line 8
%endmacro
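; ALT variant: a spare general-purpose register (%3) does the row stepping,
; so the dst pointer in %1 is left untouched; the multi-block loops rely on
; this to advance dst by only one block width per iteration.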
;in: m0, m3, m5, m2, m7, m1, m6, m4
%macro TRANSPOSE8x8_WRITE_ALT_MMX 3 ; dst, dst_stride, reg32
movq [%1], mm0 ; result of line 1, x8 bytes
movq [%1+%2], mm3 ; result of line 2
lea %3, [%1+2*%2]
movq [%3], mm5 ; result of line 3
movq [%3+%2], mm2 ; result of line 4
lea %3, [%3+2*%2]
movq [%3], mm7 ; result of line 5
movq [%3+%2], mm1 ; result of line 6
lea %3, [%3+2*%2]
movq [%3], mm6 ; result of line 7
movq [%3+%2], mm4 ; result of line 8
%endmacro ; end of TRANSPOSE8x8_WRITE_ALT_MMX
; for an 8x16 transpose (8 rows x 16 bytes in, 16 rows x 8 bytes out)
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m4, m2, m3, m7, m5, m1, m6, m0
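; Same butterfly as the MMX version plus a fourth interleave round (qdq:
; qwords to double qwords), since xmm registers hold 16 bytes. On output,
; each xmm register carries two transposed rows: row n in its low qword and
; row n+8 in its high qword. %9 is the memory operand of the eighth source
; row; %10 must be a 16-byte-aligned scratch slot because movdqa is used
; for the spill.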
%macro TRANSPOSE_8x16B_SSE2 10
SSE2_XSawp bw, %1, %2, %8
SSE2_XSawp bw, %3, %4, %2
SSE2_XSawp bw, %5, %6, %4
movdqa %6, %9
movdqa %10, %4
SSE2_XSawp bw, %7, %6, %4
SSE2_XSawp wd, %1, %3, %6
SSE2_XSawp wd, %8, %2, %3
SSE2_XSawp wd, %5, %7, %2
movdqa %7, %10
movdqa %10, %3
SSE2_XSawp wd, %7, %4, %3
SSE2_XSawp dq, %1, %5, %4
SSE2_XSawp dq, %6, %2, %5
SSE2_XSawp dq, %8, %7, %2
movdqa %7, %10
movdqa %10, %5
SSE2_XSawp dq, %7, %3, %5
SSE2_XSawp qdq, %1, %8, %3
SSE2_XSawp qdq, %4, %2, %8
SSE2_XSawp qdq, %6, %7, %2
movdqa %7, %10
movdqa %10, %1
SSE2_XSawp qdq, %7, %5, %1
movdqa %5, %10
%endmacro ; end of TRANSPOSE_8x16B_SSE2
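; Rows 1-8 are stored with movq (the low qword of each register), rows 9-16
; with movhpd (the high qword).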
%macro TRANSPOSE8x16_WRITE_SSE2 2 ; dst, dst_stride
movq [%1], xmm4 ; result of line 1, x8 bytes
movq [%1+%2], xmm2 ; result of line 2
lea %1, [%1+2*%2]
movq [%1], xmm3 ; result of line 3
movq [%1+%2], xmm7 ; result of line 4
lea %1, [%1+2*%2]
movq [%1], xmm5 ; result of line 5
movq [%1+%2], xmm1 ; result of line 6
lea %1, [%1+2*%2]
movq [%1], xmm6 ; result of line 7
movq [%1+%2], xmm0 ; result of line 8
lea %1, [%1+2*%2]
movhpd [%1], xmm4 ; result of line 9
movhpd [%1+%2], xmm2 ; result of line 10
lea %1, [%1+2*%2]
movhpd [%1], xmm3 ; result of line 11
movhpd [%1+%2], xmm7 ; result of line 12
lea %1, [%1+2*%2]
movhpd [%1], xmm5 ; result of line 13
movhpd [%1+%2], xmm1 ; result of line 14
lea %1, [%1+2*%2]
movhpd [%1], xmm6 ; result of line 15
movhpd [%1+%2], xmm0 ; result of line 16
%endmacro ; end of TRANSPOSE8x16_WRITE_SSE2
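; ALT variant: as with the MMX one, a spare register (%3) does the row
; stepping and %1 is only read, which also allows an expression such as
; r0+8 to be passed as the dst argument.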
%macro TRANSPOSE8x16_WRITE_ALT_SSE2 3 ; dst, dst_stride, reg32
movq [%1], xmm4 ; result of line 1, x8 bytes
movq [%1+%2], xmm2 ; result of line 2
lea %3, [%1+2*%2]
movq [%3], xmm3 ; result of line 3
movq [%3+%2], xmm7 ; result of line 4
lea %3, [%3+2*%2]
movq [%3], xmm5 ; result of line 5
movq [%3+%2], xmm1 ; result of line 6
lea %3, [%3+2*%2]
movq [%3], xmm6 ; result of line 7
movq [%3+%2], xmm0 ; result of line 8
lea %3, [%3+2*%2]
movhpd [%3], xmm4 ; result of line 9
movhpd [%3+%2], xmm2 ; result of line 10
lea %3, [%3+2*%2]
movhpd [%3], xmm3 ; result of line 11
movhpd [%3+%2], xmm7 ; result of line 12
lea %3, [%3+2*%2]
movhpd [%3], xmm5 ; result of line 13
movhpd [%3+%2], xmm1 ; result of line 14
lea %3, [%3+2*%2]
movhpd [%3], xmm6 ; result of line 15
movhpd [%3+%2], xmm0 ; result of line 16
%endmacro ; end of TRANSPOSE8x16_WRITE_ALT_SSE2
SECTION .text
WELS_EXTERN TransposeMatrixBlock16x16_sse2
; void TransposeMatrixBlock16x16_sse2( void *dst/*16x16*/, const int32_t dst_stride, void *src/*16x16*/, const int32_t src_stride );
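; Handles the 16x16 tile as two 8x16 slabs: source rows 0-7 become dst
; columns 0-7, source rows 8-15 become dst columns 8-15. The movdqa loads
; assume each source row is 16-byte aligned.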
push r4
push r5
%assign push_num 2
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
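; Carve a 16-byte-aligned scratch slot off the stack for the macro's movdqa
; spill; r4 records the misalignment of r7 (the stack pointer) so that the
; exact adjustment can be undone before returning.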
mov r4, r7
and r4, 0Fh
sub r7, 10h
sub r7, r4
lea r5, [r3+r3*2]
; top 8x16 block
movdqa xmm0, [r2]
movdqa xmm1, [r2+r3]
movdqa xmm2, [r2+r3*2]
movdqa xmm3, [r2+r5]
lea r2, [r2+r3*4]
movdqa xmm4, [r2]
movdqa xmm5, [r2+r3]
movdqa xmm6, [r2+r3*2]
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m4, m2, m3, m7, m5, m1, m6, m0
TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]
TRANSPOSE8x16_WRITE_SSE2 r0, r1
; bottom 8x16 block
lea r2, [r2+r3*4]
movdqa xmm0, [r2]
movdqa xmm1, [r2+r3]
movdqa xmm2, [r2+r3*2]
movdqa xmm3, [r2+r5]
lea r2, [r2+r3*4]
movdqa xmm4, [r2]
movdqa xmm5, [r2+r3]
movdqa xmm6, [r2+r3*2]
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m4, m2, m3, m7, m5, m1, m6, m0
TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]
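; Move dst back to row 0, column 8: the first write macro advanced r0 by
; 14*dst_stride, so r0 - 16*dst_stride + 2*dst_stride + 8 yields
; &dst[0*dst_stride + 8].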
mov r5, r1
sal r5, 4
sub r0, r5
lea r0, [r0+r1*2+8]
TRANSPOSE8x16_WRITE_SSE2 r0, r1
add r7, r4
add r7, 10h
POP_XMM
LOAD_4_PARA_POP
pop r5
pop r4
ret
WELS_EXTERN TransposeMatrixBlocksx16_sse2
; void TransposeMatrixBlocksx16_sse2( void *dst/*W16x16*/, const int32_t dst_stride, void *src/*16xW16*/, const int32_t src_stride, const int32_t num_blocks );
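; Transposes num_blocks 16x16 tiles: the source tiles are stacked
; vertically (r2 advances 16 rows per iteration) while the transposed tiles
; are laid out side by side (r0 advances 16 bytes per iteration).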
push r5
push r6
%assign push_num 2
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
mov r5, r7
and r5, 0Fh
sub r7, 10h
sub r7, r5
TRANSPOSE_LOOP_SSE2:
; explicitly pre-load the rows ahead to warm the cache (dummy loads into
; r4, which is saved and restored around the %rep)
lea r6, [r2+r3*8]
push r4
%rep 8
mov r4, [r6]
mov r4, [r6+r3]
lea r6, [r6+r3*2]
%endrep
pop r4
; top 8x16 block
movdqa xmm0, [r2]
movdqa xmm1, [r2+r3]
lea r2, [r2+r3*2]
movdqa xmm2, [r2]
movdqa xmm3, [r2+r3]
lea r2, [r2+r3*2]
movdqa xmm4, [r2]
movdqa xmm5, [r2+r3]
lea r2, [r2+r3*2]
movdqa xmm6, [r2]
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m4, m2, m3, m7, m5, m1, m6, m0
TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
TRANSPOSE8x16_WRITE_ALT_SSE2 r0, r1, r6
lea r2, [r2+r3*2]
; bottom 8x16 block
movdqa xmm0, [r2]
movdqa xmm1, [r2+r3]
lea r2, [r2+r3*2]
movdqa xmm2, [r2]
movdqa xmm3, [r2+r3]
lea r2, [r2+r3*2]
movdqa xmm4, [r2]
movdqa xmm5, [r2+r3]
lea r2, [r2+r3*2]
movdqa xmm6, [r2]
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m4, m2, m3, m7, m5, m1, m6, m0
TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
TRANSPOSE8x16_WRITE_ALT_SSE2 r0+8, r1, r6
lea r2, [r2+r3*2]
lea r0, [r0+16]
dec r4
jg near TRANSPOSE_LOOP_SSE2
add r7, r5
add r7, 10h
POP_XMM
LOAD_5_PARA_POP
pop r6
pop r5
ret
WELS_EXTERN TransposeMatrixBlock8x8_mmx
; void TransposeMatrixBlock8x8_mmx( void *dst/*8x8*/, const int32_t dst_stride, void *src/*8x8*/, const int32_t src_stride );
%assign push_num 0
LOAD_4_PARA
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
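; Only 8 bytes of stack scratch are needed: movq has no alignment
; requirement, so no alignment fix-up of r7 is necessary. The emms at the
; end clears the MMX state so the x87 FPU can be used again.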
sub r7, 8
movq mm0, [r2]
movq mm1, [r2+r3]
lea r2, [r2+2*r3]
movq mm2, [r2]
movq mm3, [r2+r3]
lea r2, [r2+2*r3]
movq mm4, [r2]
movq mm5, [r2+r3]
lea r2, [r2+2*r3]
movq mm6, [r2]
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m0, m3, m5, m2, m7, m1, m6, m4
TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
TRANSPOSE8x8_WRITE_MMX r0, r1
emms
add r7, 8
LOAD_4_PARA_POP
ret
WELS_EXTERN TransposeMatrixBlocksx8_mmx
; void TransposeMatrixBlocksx8_mmx( void *dst/*8xW8*/, const int32_t dst_stride, void *src/*W8x8*/, const int32_t src_stride, const int32_t num_blocks );
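; Transposes num_blocks 8x8 tiles: source tiles are stacked vertically (8
; rows each) while the transposed tiles are written side by side (r0
; advances 8 bytes per block); r5 runs one tile ahead of r2 to warm the
; cache.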
push r5
push r6
%assign push_num 2
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
sub r7, 8
lea r5, [r2+r3*8]
TRANSPOSE_BLOCKS_X8_LOOP_MMX:
; explicitly pre-load the next tile's rows to warm the cache (dummy loads;
; the values read into r6 are discarded)
%rep 4
mov r6, [r5]
mov r6, [r5+r3]
lea r5, [r5+r3*2]
%endrep
movq mm0, [r2]
movq mm1, [r2+r3]
lea r2, [r2+2*r3]
movq mm2, [r2]
movq mm3, [r2+r3]
lea r2, [r2+2*r3]
movq mm4, [r2]
movq mm5, [r2+r3]
lea r2, [r2+2*r3]
movq mm6, [r2]
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m0, m3, m5, m2, m7, m1, m6, m4
TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
TRANSPOSE8x8_WRITE_ALT_MMX r0, r1, r6
lea r0, [r0+8]
lea r2, [r2+2*r3]
dec r4
jg near TRANSPOSE_BLOCKS_X8_LOOP_MMX
emms
add r7, 8
LOAD_5_PARA_POP
pop r6
pop r5
ret