;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*************************************************************************/

%include "asm_inc.asm"

;in:  m0, m1, m2, m3, m4, m5, m6, m7
;out: m0, m3, m5, m2, m7, m1, m6, m4
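; Note on the parameters, as used by the callers below: %1-%8 name the eight
; mm registers holding the 8x8 byte block, %9 is a memory operand supplying
; the eighth input row (loaded once %6 has been consumed), and %10 is an
; 8-byte scratch memory slot used to spill one register, since only eight MMX
; registers are available. The transpose is three interleave passes (byte,
; word, dword) built on the MMX_XSwap helper, presumably from asm_inc.asm.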
%macro TRANSPOSE_8x8B_MMX 10
    MMX_XSwap bw, %1, %2, %8
    MMX_XSwap bw, %3, %4, %2
    MMX_XSwap bw, %5, %6, %4
    movq    %6, %9
    movq    %10, %4
    MMX_XSwap bw, %7, %6, %4

    MMX_XSwap wd, %1, %3, %6
    MMX_XSwap wd, %8, %2, %3
    MMX_XSwap wd, %5, %7, %2
    movq    %7, %10
    movq    %10, %3
    MMX_XSwap wd, %7, %4, %3

    MMX_XSwap dq, %1, %5, %4
    MMX_XSwap dq, %6, %2, %5
    MMX_XSwap dq, %8, %7, %2
    movq    %7, %10
    movq    %10, %5
    MMX_XSwap dq, %7, %3, %5

    movq    %3, %10
%endmacro

;in: m0, m3, m5, m2, m7, m1, m6, m4
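; Writes the transposed 8x8 block to dst (%1) with stride %2, two rows per
; step. Note that %1 itself is advanced by the lea instructions, so the dst
; pointer is clobbered; see the _ALT_ variant below for a version that
; preserves it.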
%macro TRANSPOSE8x8_WRITE_MMX 2     ; dst, dst_stride
    movq [%1], mm0          ; result of line 1, x8 bytes
    movq [%1+%2], mm3       ; result of line 2
    lea %1, [%1+2*%2]
    movq [%1], mm5          ; result of line 3
    movq [%1+%2], mm2       ; result of line 4
    lea %1, [%1+2*%2]
    movq [%1], mm7          ; result of line 5
    movq [%1+%2], mm1       ; result of line 6
    lea %1, [%1+2*%2]
    movq [%1], mm6          ; result of line 7
    movq [%1+%2], mm4       ; result of line 8
%endmacro

;in: m0, m3, m5, m2, m7, m1, m6, m4
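; Same stores as TRANSPOSE8x8_WRITE_MMX, but the row addressing goes through
; the spare register %3, so the dst pointer %1 is left untouched by the macro.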
%macro TRANSPOSE8x8_WRITE_ALT_MMX 3     ; dst, dst_stride, reg32
    movq [%1], mm0          ; result of line 1, x8 bytes
    movq [%1+%2], mm3       ; result of line 2
    lea %3, [%1+2*%2]
    movq [%3], mm5          ; result of line 3
    movq [%3+%2], mm2       ; result of line 4
    lea %3, [%3+2*%2]
    movq [%3], mm7          ; result of line 5
    movq [%3+%2], mm1       ; result of line 6
    lea %3, [%3+2*%2]
    movq [%3], mm6          ; result of line 7
    movq [%3+%2], mm4       ; result of line 8
%endmacro ; end of TRANSPOSE8x8_WRITE_ALT_MMX

; for transpose 16x8

;in:  m0, m1, m2, m3, m4, m5, m6, m7
;out: m4, m2, m3, m7, m5, m1, m6, m0
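; Same pattern as TRANSPOSE_8x8B_MMX, widened to xmm registers: %1-%8 hold
; eight 16-byte rows, %9 supplies the eighth input row from memory, and %10
; is a 16-byte scratch slot for spills. A fourth (qdq) pass is added because
; each register is 16 bytes wide, so each output register ends up holding two
; 8-byte result rows (low/high halves, written out by the WRITE macros below).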
%macro TRANSPOSE_8x16B_SSE2 10
    SSE2_XSawp bw,  %1, %2, %8
    SSE2_XSawp bw,  %3, %4, %2
    SSE2_XSawp bw,  %5, %6, %4
    movdqa  %6, %9
    movdqa  %10, %4
    SSE2_XSawp bw,  %7, %6, %4

    SSE2_XSawp wd,  %1, %3, %6
    SSE2_XSawp wd,  %8, %2, %3
    SSE2_XSawp wd,  %5, %7, %2
    movdqa  %7, %10
    movdqa  %10, %3
    SSE2_XSawp wd,  %7, %4, %3

    SSE2_XSawp dq,  %1, %5, %4
    SSE2_XSawp dq,  %6, %2, %5
    SSE2_XSawp dq,  %8, %7, %2
    movdqa  %7, %10
    movdqa  %10, %5
    SSE2_XSawp dq,  %7, %3, %5

    SSE2_XSawp qdq, %1, %8, %3
    SSE2_XSawp qdq, %4, %2, %8
    SSE2_XSawp qdq, %6, %7, %2
    movdqa  %7, %10
    movdqa  %10, %1
    SSE2_XSawp qdq, %7, %5, %1
    movdqa  %5, %10
%endmacro ; end of TRANSPOSE_8x16B_SSE2


%macro TRANSPOSE8x16_WRITE_SSE2 2 ; dst, dst_stride
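    ; Rows 1-8 of the transposed block are the low qwords (movq) and rows
    ; 9-16 the high qwords (movhpd) of xmm4/2/3/7/5/1/6/0. %1 is advanced as
    ; the rows are stored, so the dst pointer is clobbered.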
    movq [%1], xmm4         ; result of line 1, x8 bytes
    movq [%1+%2], xmm2      ; result of line 2
    lea %1, [%1+2*%2]
    movq [%1], xmm3         ; result of line 3
    movq [%1+%2], xmm7      ; result of line 4

    lea %1, [%1+2*%2]
    movq [%1], xmm5         ; result of line 5
    movq [%1+%2], xmm1      ; result of line 6
    lea %1, [%1+2*%2]
    movq [%1], xmm6         ; result of line 7
    movq [%1+%2], xmm0      ; result of line 8

    lea %1, [%1+2*%2]
    movhpd [%1], xmm4       ; result of line 9
    movhpd [%1+%2], xmm2    ; result of line 10
    lea %1, [%1+2*%2]
    movhpd [%1], xmm3       ; result of line 11
    movhpd [%1+%2], xmm7    ; result of line 12

    lea %1, [%1+2*%2]
    movhpd [%1], xmm5       ; result of line 13
    movhpd [%1+%2], xmm1    ; result of line 14
    lea %1, [%1+2*%2]
    movhpd [%1], xmm6       ; result of line 15
    movhpd [%1+%2], xmm0    ; result of line 16
%endmacro ; end of TRANSPOSE8x16_WRITE_SSE2

%macro TRANSPOSE8x16_WRITE_ALT_SSE2 3 ; dst, dst_stride, reg32
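    ; As above, but the addressing is done through the spare register %3, so
    ; the dst pointer %1 survives the macro (used by the multi-block loop in
    ; TransposeMatrixBlocksx16_sse2 below).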
    movq [%1], xmm4         ; result of line 1, x8 bytes
    movq [%1+%2], xmm2      ; result of line 2
    lea %3, [%1+2*%2]
    movq [%3], xmm3         ; result of line 3
    movq [%3+%2], xmm7      ; result of line 4

    lea %3, [%3+2*%2]
    movq [%3], xmm5         ; result of line 5
    movq [%3+%2], xmm1      ; result of line 6
    lea %3, [%3+2*%2]
    movq [%3], xmm6         ; result of line 7
    movq [%3+%2], xmm0      ; result of line 8

    lea %3, [%3+2*%2]
    movhpd [%3], xmm4       ; result of line 9
    movhpd [%3+%2], xmm2    ; result of line 10
    lea %3, [%3+2*%2]
    movhpd [%3], xmm3       ; result of line 11
    movhpd [%3+%2], xmm7    ; result of line 12

    lea %3, [%3+2*%2]
    movhpd [%3], xmm5       ; result of line 13
    movhpd [%3+%2], xmm1    ; result of line 14
    lea %3, [%3+2*%2]
    movhpd [%3], xmm6       ; result of line 15
    movhpd [%3+%2], xmm0    ; result of line 16
%endmacro ; end of TRANSPOSE8x16_WRITE_ALT_SSE2


SECTION .text

WELS_EXTERN TransposeMatrixBlock16x16_sse2
; void TransposeMatrixBlock16x16_sse2( void *dst/*16x16*/, const int32_t dst_stride, void *src/*16x16*/, const int32_t src_stride );
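; Transposes one 16x16 byte block as two 8x16 halves. For reference, the
; scalar equivalent is roughly:
;     for (y = 0; y < 16; y++)
;         for (x = 0; x < 16; x++)
;             dst[x*dst_stride + y] = src[y*src_stride + x];
; r7 (the stack-pointer alias in this codebase's register naming) is aligned
; down to a 16-byte boundary to create the movdqa-safe spill slot passed to
; TRANSPOSE_8x16B_SSE2, and is restored before returning.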
    push r4
    push r5
    %assign push_num 2
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    mov r4, r7
    and r4, 0Fh
    sub r7, 10h
    sub r7, r4
    lea r5, [r3+r3*2]
    ; top 8x16 block
    movdqa xmm0, [r2]
    movdqa xmm1, [r2+r3]
    movdqa xmm2, [r2+r3*2]
    movdqa xmm3, [r2+r5]
    lea r2, [r2+r3*4]
    movdqa xmm4, [r2]
    movdqa xmm5, [r2+r3]
    movdqa xmm6, [r2+r3*2]

    ;in:  m0, m1, m2, m3, m4, m5, m6, m7
    ;out: m4, m2, m3, m7, m5, m1, m6, m0
    TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]

    TRANSPOSE8x16_WRITE_SSE2 r0, r1

    ; bottom 8x16 block
    lea r2, [r2+r3*4]
    movdqa xmm0, [r2]
    movdqa xmm1, [r2+r3]
    movdqa xmm2, [r2+r3*2]
    movdqa xmm3, [r2+r5]
    lea r2, [r2+r3*4]
    movdqa xmm4, [r2]
    movdqa xmm5, [r2+r3]
    movdqa xmm6, [r2+r3*2]

    ;in:  m0, m1, m2, m3, m4, m5, m6, m7
    ;out: m4, m2, m3, m7, m5, m1, m6, m0
    TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]

    mov r5, r1
    sal r5, 4
    sub r0, r5
    lea r0, [r0+r1*2+8]
    TRANSPOSE8x16_WRITE_SSE2 r0, r1

    add r7, r4
    add r7, 10h
    POP_XMM
    LOAD_4_PARA_POP
    pop r5
    pop r4
    ret

WELS_EXTERN TransposeMatrixBlocksx16_sse2
; void TransposeMatrixBlocksx16_sse2( void *dst/*W16x16*/, const int32_t dst_stride, void *src/*16xW16*/, const int32_t src_stride, const int32_t num_blocks );
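; Transposes num_blocks consecutive 16x16 blocks: src advances down by 16
; rows per block while dst advances right by 16 bytes per block. The %rep
; block of throwaway reads at the top of the loop touches the next block's
; rows ahead of time; the values read into r4 are discarded.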
    push r5
    push r6
    %assign push_num 2
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    mov r5, r7
    and r5, 0Fh
    sub r7, 10h
    sub r7, r5
TRANSPOSE_LOOP_SSE2:
    ; explicitly load the next loop's source data (dummy reads, likely to warm the cache)
    lea r6, [r2+r3*8]
    push r4
    %rep 8
    mov r4, [r6]
    mov r4, [r6+r3]
    lea r6, [r6+r3*2]
    %endrep
    pop r4
    ; top 8x16 block
    movdqa xmm0, [r2]
    movdqa xmm1, [r2+r3]
    lea r2, [r2+r3*2]
    movdqa xmm2, [r2]
    movdqa xmm3, [r2+r3]
    lea r2, [r2+r3*2]
    movdqa xmm4, [r2]
    movdqa xmm5, [r2+r3]
    lea r2, [r2+r3*2]
    movdqa xmm6, [r2]

    ;in:  m0, m1, m2, m3, m4, m5, m6, m7
    ;out: m4, m2, m3, m7, m5, m1, m6, m0
    TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
    TRANSPOSE8x16_WRITE_ALT_SSE2 r0, r1, r6
    lea r2, [r2+r3*2]

    ; bottom 8x16 block
    movdqa xmm0, [r2]
    movdqa xmm1, [r2+r3]
    lea r2, [r2+r3*2]
    movdqa xmm2, [r2]
    movdqa xmm3, [r2+r3]
    lea r2, [r2+r3*2]
    movdqa xmm4, [r2]
    movdqa xmm5, [r2+r3]
    lea r2, [r2+r3*2]
    movdqa xmm6, [r2]

    ;in:  m0, m1, m2, m3, m4, m5, m6, m7
    ;out: m4, m2, m3, m7, m5, m1, m6, m0
    TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
    TRANSPOSE8x16_WRITE_ALT_SSE2 r0+8, r1, r6
    lea r2, [r2+r3*2]
    lea r0, [r0+16]
    dec r4
    jg near TRANSPOSE_LOOP_SSE2

    add r7, r5
    add r7, 10h
    POP_XMM
    LOAD_5_PARA_POP
    pop r6
    pop r5
    ret

WELS_EXTERN TransposeMatrixBlock8x8_mmx
; void TransposeMatrixBlock8x8_mmx( void *dst/*8x8*/, const int32_t dst_stride, void *src/*8x8*/, const int32_t src_stride );
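; Transposes a single 8x8 byte block in MMX registers; 8 bytes of stack below
; r7 serve as the spill slot for TRANSPOSE_8x8B_MMX, and emms is issued before
; returning because MMX state was touched.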
    %assign push_num 0
    LOAD_4_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    sub r7, 8

    movq mm0, [r2]
    movq mm1, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm2, [r2]
    movq mm3, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm4, [r2]
    movq mm5, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm6, [r2]

    ;in:  m0, m1, m2, m3, m4, m5, m6, m7
    ;out: m0, m3, m5, m2, m7, m1, m6, m4
    TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]

    TRANSPOSE8x8_WRITE_MMX r0, r1

    emms
    add r7, 8
    LOAD_4_PARA_POP
    ret

WELS_EXTERN TransposeMatrixBlocksx8_mmx
; void TransposeMatrixBlocksx8_mmx( void *dst/*8xW8*/, const int32_t dst_stride, void *src/*W8x8*/, const int32_t src_stride, const int32_t num_blocks );
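; Transposes num_blocks consecutive 8x8 blocks: src advances down by 8 rows
; per block, dst advances right by 8 bytes per block. As in the SSE2 version,
; the %rep of throwaway reads at the loop head touches the next block's rows
; early (the values read into r6 are discarded).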
    push r5
    push r6
    %assign push_num 2
    LOAD_5_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    sub r7, 8

    lea r5, [r2+r3*8]

TRANSPOSE_BLOCKS_X8_LOOP_MMX:
    ; explicitly load the next loop's source data (dummy reads, likely to warm the cache)
    %rep 4
    mov r6, [r5]
    mov r6, [r5+r3]
    lea r5, [r5+r3*2]
    %endrep
    movq mm0, [r2]
    movq mm1, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm2, [r2]
    movq mm3, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm4, [r2]
    movq mm5, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm6, [r2]

    ;in:  m0, m1, m2, m3, m4, m5, m6, m7
    ;out: m0, m3, m5, m2, m7, m1, m6, m4
    TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]

    TRANSPOSE8x8_WRITE_ALT_MMX r0, r1, r6
    lea r0, [r0+8]
    lea r2, [r2+2*r3]
    dec r4
    jg near TRANSPOSE_BLOCKS_X8_LOOP_MMX

    emms
    add r7, 8
    LOAD_5_PARA_POP
    pop r6
    pop r5
    ret