Convert all tabs to spaces in assembly sources, unify indentation

Previously, the assembly sources used a mix of spaces and tabs for indentation,
making them hard to read unless the editor was set to the matching tab size.

Tabs have been interpreted as 4 spaces in most cases, matching
the surrounding code.
Martin Storsjö 2014-05-31 14:13:34 +03:00
parent faaf62afad
commit 57f6bcc4b0
38 changed files with 19904 additions and 19904 deletions
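As a rough illustration of the conversion described in the commit message, a script along the following lines could perform the same tab-to-space expansion. This is only a sketch: the file globs, the 4-space width constant, and the use of Python are assumptions for illustration, not the tool actually used for this commit.

import glob

TAB_WIDTH = 4  # the commit message says tabs were interpreted as 4 spaces

def convert(path, width=TAB_WIDTH):
    with open(path, "r", encoding="utf-8") as f:
        original = f.read()
    # str.expandtabs aligns to tab stops (resetting at each newline), which
    # matches how an editor set to 4-space tabs displayed the old sources.
    converted = original.expandtabs(width)
    if converted != original:
        with open(path, "w", encoding="utf-8") as f:
            f.write(converted)

# Hypothetical file globs; the real commit touched 38 assembly files.
for pattern in ("codec/**/*.asm", "codec/**/*.S"):
    for path in glob.glob(pattern, recursive=True):
        convert(path)

Note that expandtabs aligns to tab stops rather than blindly replacing each tab with four spaces, which is what "interpreted as 4 spaces" amounts to for the leading indentation handled here.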


@@ -36,75 +36,75 @@
#ifdef __APPLE__
.macro LOAD_ALIGNED_DATA_WITH_STRIDE
// { // input: $0~$3, src*, src_stride
    vld1.64 {$0}, [$4,:128], $5
    vld1.64 {$1}, [$4,:128], $5
    vld1.64 {$2}, [$4,:128], $5
    vld1.64 {$3}, [$4,:128], $5
// }
.endm
.macro STORE_ALIGNED_DATA_WITH_STRIDE
// { // input: $0~$3, dst*, dst_stride
    vst1.64 {$0}, [$4,:128], $5
    vst1.64 {$1}, [$4,:128], $5
    vst1.64 {$2}, [$4,:128], $5
    vst1.64 {$3}, [$4,:128], $5
// }
.endm
.macro LOAD_UNALIGNED_DATA_WITH_STRIDE
// { // input: $0~$3, src*, src_stride
    vld1.64 {$0}, [$4], $5
    vld1.64 {$1}, [$4], $5
    vld1.64 {$2}, [$4], $5
    vld1.64 {$3}, [$4], $5
// }
.endm
.macro STORE_UNALIGNED_DATA_WITH_STRIDE
// { // input: $0~$3, dst*, dst_stride
    vst1.64 {$0}, [$4], $5
    vst1.64 {$1}, [$4], $5
    vst1.64 {$2}, [$4], $5
    vst1.64 {$3}, [$4], $5
// }
.endm
#else
.macro LOAD_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
// { // input: \arg0~\arg3, src*, src_stride
    vld1.64 {\arg0}, [\arg4,:128], \arg5
    vld1.64 {\arg1}, [\arg4,:128], \arg5
    vld1.64 {\arg2}, [\arg4,:128], \arg5
    vld1.64 {\arg3}, [\arg4,:128], \arg5
// }
.endm
.macro STORE_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
// { // input: \arg0~\arg3, dst*, dst_stride
    vst1.64 {\arg0}, [\arg4,:128], \arg5
    vst1.64 {\arg1}, [\arg4,:128], \arg5
    vst1.64 {\arg2}, [\arg4,:128], \arg5
    vst1.64 {\arg3}, [\arg4,:128], \arg5
// }
.endm
.macro LOAD_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
// { // input: \arg0~\arg3, src*, src_stride
    vld1.64 {\arg0}, [\arg4], \arg5
    vld1.64 {\arg1}, [\arg4], \arg5
    vld1.64 {\arg2}, [\arg4], \arg5
    vld1.64 {\arg3}, [\arg4], \arg5
// }
.endm
.macro STORE_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
// { // input: \arg0~\arg3, dst*, dst_stride
    vst1.64 {\arg0}, [\arg4], \arg5
    vst1.64 {\arg1}, [\arg4], \arg5
    vst1.64 {\arg2}, [\arg4], \arg5
    vst1.64 {\arg3}, [\arg4], \arg5
// }
.endm
#endif
@@ -112,89 +112,89 @@
WELS_ASM_FUNC_BEGIN WelsCopy8x8_neon
    LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3
    STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1
    LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3
    STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsCopy16x16_neon
    LOAD_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
    STORE_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
    LOAD_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
    STORE_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
    LOAD_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
    STORE_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
    LOAD_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
    STORE_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsCopy16x16NotAligned_neon
    LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
    STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
    LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
    STORE_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
    LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
    STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
    LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
    STORE_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsCopy16x8NotAligned_neon
    LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
    STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
    LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
    STORE_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsCopy8x16_neon
    LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3
    STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1
    LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3
    STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1
    LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3
    STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1
    LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3
    STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1
WELS_ASM_FUNC_END

File diff suppressed because it is too large


@@ -37,119 +37,119 @@
WELS_ASM_FUNC_BEGIN ExpandPictureLuma_neon
    stmdb sp!, {r4-r8}
    //Save the dst
    mov r7, r0
    mov r8, r3
    add r4, r7, r2
    sub r4, #1
    //For the left and right expand
_expand_picture_luma_loop2:
    sub r5, r7, #32
    add r6, r4, #1
    vld1.8 {d0[], d1[]}, [r7], r1
    vld1.8 {d2[], d3[]}, [r4], r1
    vst1.8 {q0}, [r5]!
    vst1.8 {q0}, [r5]
    vst1.8 {q1}, [r6]!
    vst1.8 {q1}, [r6]
    subs r8, #1
    bne _expand_picture_luma_loop2
    //for the top and bottom expand
    add r2, #64
    sub r0, #32
    mla r4, r1, r3, r0
    sub r4, r1
_expand_picture_luma_loop0:
    mov r5, #32
    mls r5, r5, r1, r0
    add r6, r4, r1
    vld1.8 {q0}, [r0]!
    vld1.8 {q1}, [r4]!
    mov r8, #32
_expand_picture_luma_loop1:
    vst1.8 {q0}, [r5], r1
    vst1.8 {q1}, [r6], r1
    subs r8, #1
    bne _expand_picture_luma_loop1
    subs r2, #16
    bne _expand_picture_luma_loop0
    //vldreq.32 d0, [r0]
    ldmia sp!, {r4-r8}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN ExpandPictureChroma_neon
    stmdb sp!, {r4-r9}
    //Save the dst
    mov r7, r0
    mov r8, r3
    add r4, r7, r2
    sub r4, #1
    //For the left and right expand
_expand_picture_chroma_loop2:
    sub r5, r7, #16
    add r6, r4, #1
    vld1.8 {d0[], d1[]}, [r7], r1
    vld1.8 {d2[], d3[]}, [r4], r1
    vst1.8 {q0}, [r5]
    vst1.8 {q1}, [r6]
    subs r8, #1
    bne _expand_picture_chroma_loop2
    //for the top and bottom expand
    add r2, #32
    mov r9, r2
    bic r2, #15
    sub r0, #16
    mla r4, r1, r3, r0
    sub r4, r1
_expand_picture_chroma_loop0:
    mov r5, #16
    mls r5, r5, r1, r0
    add r6, r4, r1
    vld1.8 {q0}, [r0]!
    vld1.8 {q1}, [r4]!
    mov r8, #16
_expand_picture_chroma_loop1:
    vst1.8 {q0}, [r5], r1
    vst1.8 {q1}, [r6], r1
    subs r8, #1
    bne _expand_picture_chroma_loop1
    subs r2, #16
    bne _expand_picture_chroma_loop0
    //vldreq.32 d0, [r0]
    and r9, #15
    cmp r9, #8
    bne _expand_picture_chroma_end
    mov r5, #16
    mls r5, r5, r1, r0
    add r6, r4, r1
    vld1.8 {d0}, [r0]!
    vld1.8 {d2}, [r4]!
    mov r8, #16
_expand_picture_chroma_loop3:
    vst1.8 {d0}, [r5], r1
    vst1.8 {d2}, [r6], r1
    subs r8, #1
    bne _expand_picture_chroma_loop3
_expand_picture_chroma_end:
    ldmia sp!, {r4-r9}
WELS_ASM_FUNC_END
#endif

File diff suppressed because it is too large


@@ -53,88 +53,88 @@ _expand_picture_luma_loop2:
    sub x8, x8, #1
    cbnz x8, _expand_picture_luma_loop2
    //for the top and bottom expand
    add x2, x2, #64
    sub x0, x0, #32
    madd x4, x1, x3, x0
    sub x4, x4, x1
_expand_picture_luma_loop0:
    mov x5, #32
    msub x5, x5, x1, x0
    add x6, x4, x1
    ld1 {v0.16b}, [x0], x10
    ld1 {v1.16b}, [x4], x10
    mov x8, #32
_expand_picture_luma_loop1:
    st1 {v0.16b}, [x5], x1
    st1 {v1.16b}, [x6], x1
    sub x8, x8, #1
    cbnz x8, _expand_picture_luma_loop1
    sub x2, x2, #16
    cbnz x2, _expand_picture_luma_loop0
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN ExpandPictureChroma_AArch64_neon
    //Save the dst
    mov x7, x0
    mov x8, x3
    mov x10, #16
    add x4, x7, x2
    sub x4, x4, #1
    //For the left and right expand
_expand_picture_chroma_loop2:
    sub x5, x7, #16
    add x6, x4, #1
    ld1r {v0.16b}, [x7], x1
    ld1r {v1.16b}, [x4], x1
    st1 {v0.16b}, [x5]
    st1 {v1.16b}, [x6]
    sub x8, x8, #1
    cbnz x8, _expand_picture_chroma_loop2
    //for the top and bottom expand
    add x2, x2, #32
    //
    mov x9, x2
    mov x11, #15
    bic x2, x2, x11
    //
    sub x0, x0, #16
    madd x4, x1, x3, x0
    sub x4, x4, x1
_expand_picture_chroma_loop0:
    mov x5, #16
    msub x5, x5, x1, x0
    add x6, x4, x1
    ld1 {v0.16b}, [x0], x10
    ld1 {v1.16b}, [x4], x10
    mov x8, #16
_expand_picture_chroma_loop1:
    st1 {v0.16b}, [x5], x1
    st1 {v1.16b}, [x6], x1
    sub x8, x8, #1
    cbnz x8, _expand_picture_chroma_loop1
    sub x2, x2, #16
    cbnz x2, _expand_picture_chroma_loop0
    and x9, x9, #15
    sub x9, x9, #8
    cbnz x9, _expand_picture_chroma_end
    mov x5, #16
    msub x5, x5, x1, x0
    add x6, x4, x1
    ld1 {v0.8b}, [x0]
    ld1 {v1.8b}, [x4]
    mov x8, #16
_expand_picture_chroma_loop3:
    st1 {v0.8b}, [x5], x1
    st1 {v1.8b}, [x6], x1
    sub x8, x8, #1
    cbnz x8, _expand_picture_chroma_loop3
_expand_picture_chroma_end:

File diff suppressed because it is too large


@@ -44,15 +44,15 @@
;***********************************************************************
%if 1
    %define MOVDQ movdqa
%else
    %define MOVDQ movdqu
%endif
%if 1
    %define WELSEMMS emms
%else
    %define WELSEMMS
%endif
@@ -220,7 +220,7 @@ BITS 32
%macro LOAD_1_PARA 0
%ifdef X86_32
    mov r0, [esp + push_num*4 + 4]
%endif
%endmacro
@@ -234,8 +234,8 @@ BITS 32
%macro LOAD_3_PARA 0
%ifdef X86_32
    mov r0, [esp + push_num*4 + 4]
    mov r1, [esp + push_num*4 + 8]
    mov r2, [esp + push_num*4 + 12]
%endif
%endmacro
@@ -267,7 +267,7 @@ BITS 32
%macro LOAD_6_PARA 0
%ifdef X86_32
    push r3
    push r4
    push r5
    %assign push_num push_num+3
@@ -310,22 +310,22 @@ BITS 32
%macro LOAD_4_PARA_POP 0
%ifdef X86_32
    pop r3
%endif
%endmacro
%macro LOAD_5_PARA_POP 0
%ifdef X86_32
    pop r4
    pop r3
%endif
%endmacro
%macro LOAD_6_PARA_POP 0
%ifdef X86_32
    pop r5
    pop r4
    pop r3
%endif
%endmacro
@@ -416,13 +416,13 @@ BITS 32
%macro SIGN_EXTENSION 2
%ifndef X86_32
    movsxd %1, %2
%endif
%endmacro
%macro SIGN_EXTENSIONW 2
%ifndef X86_32
    movsx %1, %2
%endif
%endmacro
@@ -438,13 +438,13 @@ BITS 32
%endmacro
%macro WELS_AbsW 2
    pxor %2, %2
    psubw %2, %1
    pmaxsw %1, %2
%endmacro
%macro MMX_XSwap 4
    movq %4, %2
    punpckh%1 %4, %3
    punpckl%1 %2, %3
%endmacro
@@ -485,35 +485,35 @@ BITS 32
;in: m1, m2, m3, m4, m5, m6, m7, m8
;pOut: m5, m3, m4, m8, m6, m2, m7, m1
%macro SSE2_TransTwo8x8B 9
    movdqa %9, %8
    SSE2_XSawp bw, %1, %2, %8
    SSE2_XSawp bw, %3, %4, %2
    SSE2_XSawp bw, %5, %6, %4
    movdqa %6, %9
    movdqa %9, %4
    SSE2_XSawp bw, %7, %6, %4
    SSE2_XSawp wd, %1, %3, %6
    SSE2_XSawp wd, %8, %2, %3
    SSE2_XSawp wd, %5, %7, %2
    movdqa %7, %9
    movdqa %9, %3
    SSE2_XSawp wd, %7, %4, %3
    SSE2_XSawp dq, %1, %5, %4
    SSE2_XSawp dq, %6, %2, %5
    SSE2_XSawp dq, %8, %7, %2
    movdqa %7, %9
    movdqa %9, %5
    SSE2_XSawp dq, %7, %3, %5
    SSE2_XSawp qdq, %1, %8, %3
    SSE2_XSawp qdq, %4, %2, %8
    SSE2_XSawp qdq, %6, %7, %2
    movdqa %7, %9
    movdqa %9, %1
    SSE2_XSawp qdq, %7, %5, %1
    movdqa %5, %9
%endmacro
;xmm0, xmm6, xmm7, [eax], [ecx]
@@ -528,32 +528,32 @@ BITS 32
; m2 = m1 + m2, m1 = m1 - m2
%macro SSE2_SumSub 3
    movdqa %3, %2
    paddw %2, %1
    psubw %1, %3
%endmacro
%macro butterfly_1to16_sse 3 ; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
    mov %3h, %3l
    movd %1, e%3x ; i.e, 1% = eax (=b0)
    pshuflw %2, %1, 00h ; ..., b0 b0 b0 b0 b0 b0 b0 b0
    pshufd %1, %2, 00h ; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
%endmacro
;copy a dw into a xmm for 8 times
%macro SSE2_Copy8Times 2
    movd %1, %2
    punpcklwd %1, %1
    pshufd %1, %1, 0
%endmacro
;copy a db into a xmm for 16 times
%macro SSE2_Copy16Times 2
    movd %1, %2
    pshuflw %1, %1, 0
    punpcklqdq %1, %1
    packuswb %1, %1
%endmacro
@@ -564,35 +564,35 @@ BITS 32
;dw 32,32,32,32,32,32,32,32 for xmm
;dw 32,32,32,32 for mm
%macro WELS_DW32 1
    pcmpeqw %1,%1
    psrlw %1,15
    psllw %1,5
%endmacro
;dw 1, 1, 1, 1, 1, 1, 1, 1 for xmm
;dw 1, 1, 1, 1 for mm
%macro WELS_DW1 1
    pcmpeqw %1,%1
    psrlw %1,15
%endmacro
;all 0 for xmm and mm
%macro WELS_Zero 1
    pxor %1, %1
%endmacro
;dd 1, 1, 1, 1 for xmm
;dd 1, 1 for mm
%macro WELS_DD1 1
    pcmpeqw %1,%1
    psrld %1,31
%endmacro
;dB 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
%macro WELS_DB1 1
    pcmpeqw %1,%1
    psrlw %1,15
    packuswb %1,%1
%endmacro


@@ -29,13 +29,13 @@
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
;* cpu_mmx.asm
;*
;* Abstract
;* verify cpuid feature support and cpuid detection
;*
;* History
;* 04/29/2009 Created
;*
;*************************************************************************/
@@ -115,13 +115,13 @@ WELS_EXTERN WelsCPUId
%elifdef X86_32
WELS_EXTERN WelsCPUId
    push ebx
    push edi
    mov eax, [esp+12] ; operating index
    mov edi, [esp+24]
    mov ecx, [edi]
    cpuid ; cpuid
    ; processing various information return
    mov edi, [esp+16]
@@ -133,7 +133,7 @@ WELS_EXTERN WelsCPUId
    mov edi, [esp+28]
    mov [edi], edx
    pop edi
    pop ebx
    ret
@@ -145,31 +145,31 @@ WELS_EXTERN WelsCPUId
;****************************************************************************************************
WELS_EXTERN WelsCPUSupportAVX
%ifdef WIN64
    mov eax, ecx
    mov ecx, edx
%elifdef UNIX64
    mov eax, edi
    mov ecx, esi
%else
    mov eax, [esp+4]
    mov ecx, [esp+8]
%endif
    ; refer to detection of AVX addressed in INTEL AVX manual document
    and ecx, 018000000H
    cmp ecx, 018000000H ; check both OSXSAVE and AVX feature flags
    jne avx_not_supported
    ; processor supports AVX instructions and XGETBV is enabled by OS
    mov ecx, 0 ; specify 0 for XFEATURE_ENABLED_MASK register
    XGETBV ; result in EDX:EAX
    and eax, 06H
    cmp eax, 06H ; check OS has enabled both XMM and YMM state support
    jne avx_not_supported
    mov eax, 1
    ret
avx_not_supported:
    mov eax, 0
    ret
; need call after cpuid=1 and eax, ecx flag got then
@@ -178,35 +178,35 @@ avx_not_supported:
;****************************************************************************************************
WELS_EXTERN WelsCPUSupportFMA
%ifdef WIN64
    mov eax, ecx
    mov ecx, edx
%elifdef UNIX64
    mov eax, edi
    mov ecx, esi
%else
    mov eax, [esp+4]
    mov ecx, [esp+8]
%endif
    ; refer to detection of FMA addressed in INTEL AVX manual document
    and ecx, 018001000H
    cmp ecx, 018001000H ; check OSXSAVE, AVX, FMA feature flags
    jne fma_not_supported
    ; processor supports AVX,FMA instructions and XGETBV is enabled by OS
    mov ecx, 0 ; specify 0 for XFEATURE_ENABLED_MASK register
    XGETBV ; result in EDX:EAX
    and eax, 06H
    cmp eax, 06H ; check OS has enabled both XMM and YMM state support
    jne fma_not_supported
    mov eax, 1
    ret
fma_not_supported:
    mov eax, 0
    ret
;******************************************************************************************
; void WelsEmms()
;******************************************************************************************
WELS_EXTERN WelsEmms
    emms ; empty mmx technology states
    ret

File diff suppressed because it is too large


@@ -77,280 +77,280 @@ SECTION .text
;cccc|ceeeeeeeeeeeeeeeed|dddd
;cccc|ceeeeeeeeeeeeeeeed|dddd
%macro mov_line_8x4_mmx 3 ; dst, stride, mm?
    movq [%1], %3
    movq [%1+%2], %3
    lea %1, [%1+2*%2]
    movq [%1], %3
    movq [%1+%2], %3
    lea %1, [%1+2*%2]
%endmacro
%macro mov_line_end8x4_mmx 3 ; dst, stride, mm?
    movq [%1], %3
    movq [%1+%2], %3
    lea %1, [%1+2*%2]
    movq [%1], %3
    movq [%1+%2], %3
    lea %1, [%1+%2]
%endmacro
%macro mov_line_16x4_sse2 4 ; dst, stride, xmm?, u/a
    movdq%4 [%1], %3 ; top(bottom)_0
    movdq%4 [%1+%2], %3 ; top(bottom)_1
    lea %1, [%1+2*%2]
    movdq%4 [%1], %3 ; top(bottom)_2
    movdq%4 [%1+%2], %3 ; top(bottom)_3
    lea %1, [%1+2*%2]
%endmacro
%macro mov_line_end16x4_sse2 4 ; dst, stride, xmm?, u/a
    movdq%4 [%1], %3 ; top(bottom)_0
    movdq%4 [%1+%2], %3 ; top(bottom)_1
    lea %1, [%1+2*%2]
    movdq%4 [%1], %3 ; top(bottom)_2
    movdq%4 [%1+%2], %3 ; top(bottom)_3
    lea %1, [%1+%2]
%endmacro
%macro mov_line_32x4_sse2 3 ; dst, stride, xmm?
    movdqa [%1], %3 ; top(bottom)_0
    movdqa [%1+16], %3 ; top(bottom)_0
    movdqa [%1+%2], %3 ; top(bottom)_1
    movdqa [%1+%2+16], %3 ; top(bottom)_1
    lea %1, [%1+2*%2]
    movdqa [%1], %3 ; top(bottom)_2
    movdqa [%1+16], %3 ; top(bottom)_2
    movdqa [%1+%2], %3 ; top(bottom)_3
    movdqa [%1+%2+16], %3 ; top(bottom)_3
    lea %1, [%1+2*%2]
%endmacro
%macro mov_line_end32x4_sse2 3 ; dst, stride, xmm?
    movdqa [%1], %3 ; top(bottom)_0
    movdqa [%1+16], %3 ; top(bottom)_0
    movdqa [%1+%2], %3 ; top(bottom)_1
    movdqa [%1+%2+16], %3 ; top(bottom)_1
    lea %1, [%1+2*%2]
    movdqa [%1], %3 ; top(bottom)_2
    movdqa [%1+16], %3 ; top(bottom)_2
    movdqa [%1+%2], %3 ; top(bottom)_3
    movdqa [%1+%2+16], %3 ; top(bottom)_3
    lea %1, [%1+%2]
%endmacro
%macro exp_top_bottom_sse2 1 ; iPaddingSize [luma(32)/chroma(16)]
;r2 [width/16(8)]
;r0 [pSrc +0], r5 [pSrc -width] r1[-stride], 32(16) ;top
;r3 [pSrc +(h-1)*stride], r4 [pSrc + (h+31)*stride],32(16); bottom
%if %1 == 32 ; for luma
    sar r2, 04h ; width / 16(8) pixels
.top_bottom_loops:
    ; top
    movdqa xmm0, [r0] ; first line of picture pData
    mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm?
    mov_line_16x4_sse2 r5, r1, xmm0, a
    mov_line_16x4_sse2 r5, r1, xmm0, a
    mov_line_16x4_sse2 r5, r1, xmm0, a
    mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm?
    mov_line_16x4_sse2 r5, r1, xmm0, a
    mov_line_16x4_sse2 r5, r1, xmm0, a
    mov_line_end16x4_sse2 r5, r1, xmm0, a
    ; bottom
    movdqa xmm1, [r3] ; last line of picture pData
    mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm?
    mov_line_16x4_sse2 r4, r1, xmm1, a
    mov_line_16x4_sse2 r4, r1, xmm1, a
    mov_line_16x4_sse2 r4, r1, xmm1, a
    mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm?
    mov_line_16x4_sse2 r4, r1, xmm1, a
    mov_line_16x4_sse2 r4, r1, xmm1, a
    mov_line_end16x4_sse2 r4, r1, xmm1, a
    lea r0, [r0+16] ; top pSrc
    lea r5, [r5+16] ; top dst
    lea r3, [r3+16] ; bottom pSrc
    lea r4, [r4+16] ; bottom dst
    neg r1 ; positive/negative stride need for next loop?
    dec r2
    jnz near .top_bottom_loops
%elif %1 == 16 ; for chroma ??
    mov r6, r2
    sar r2, 04h ; (width / 16) pixels
.top_bottom_loops:
    ; top
    movdqa xmm0, [r0] ; first line of picture pData
    mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm?
    mov_line_16x4_sse2 r5, r1, xmm0, a
    mov_line_16x4_sse2 r5, r1, xmm0, a
    mov_line_end16x4_sse2 r5, r1, xmm0, a
    ; bottom
    movdqa xmm1, [r3] ; last line of picture pData
    mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm?
    mov_line_16x4_sse2 r4, r1, xmm1, a
    mov_line_16x4_sse2 r4, r1, xmm1, a
    mov_line_end16x4_sse2 r4, r1, xmm1, a
    lea r0, [r0+16] ; top pSrc
    lea r5, [r5+16] ; top dst
    lea r3, [r3+16] ; bottom pSrc
    lea r4, [r4+16] ; bottom dst
    neg r1 ; positive/negative stride need for next loop?
    dec r2
    jnz near .top_bottom_loops
    ; for remaining 8 bytes
    and r6, 0fh ; any 8 bytes left?
    test r6, r6
    jz near .to_be_continued ; no left to exit here
    ; top
    movq mm0, [r0] ; remained 8 byte
    mov_line_8x4_mmx r5, r1, mm0 ; dst, stride, mm?
    mov_line_8x4_mmx r5, r1, mm0 ; dst, stride, mm?
    mov_line_8x4_mmx r5, r1, mm0 ; dst, stride, mm?
    mov_line_end8x4_mmx r5, r1, mm0 ; dst, stride, mm?
    ; bottom
    movq mm1, [r3]
    mov_line_8x4_mmx r4, r1, mm1 ; dst, stride, mm?
    mov_line_8x4_mmx r4, r1, mm1 ; dst, stride, mm?
    mov_line_8x4_mmx r4, r1, mm1 ; dst, stride, mm?
    mov_line_end8x4_mmx r4, r1, mm1 ; dst, stride, mm?
    WELSEMMS
.to_be_continued:
%endif
%endmacro
%macro exp_left_right_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
;r6 [height]
;r0 [pSrc+0] r5[pSrc-32] r1[stride]
;r3 [pSrc+(w-1)] r4[pSrc+w]
%if %1 == 32 ; for luma
.left_right_loops:
    ; left
    movzx r2d, byte [r0] ; pixel pData for left border
    SSE2_Copy16Times xmm0, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
    movdqa [r5], xmm0
    movdqa [r5+16], xmm0
    ; right
    movzx r2d, byte [r3]
    SSE2_Copy16Times xmm1, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
    movdqa [r4], xmm1
    movdqa [r4+16], xmm1
    lea r0, [r0+r1] ; left pSrc
    lea r5, [r5+r1] ; left dst
    lea r3, [r3+r1] ; right pSrc
    lea r4, [r4+r1] ; right dst
    dec r6
    jnz near .left_right_loops
%elif %1 == 16 ; for chroma ??
.left_right_loops:
    ; left
    movzx r2d, byte [r0] ; pixel pData for left border
    SSE2_Copy16Times xmm0, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
    movdqa [r5], xmm0
    ; right
    movzx r2d, byte [r3]
    SSE2_Copy16Times xmm1, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
    movdq%2 [r4], xmm1 ; might not be aligned 16 bytes in case chroma planes
    lea r0, [r0+r1] ; left pSrc
    lea r5, [r5+r1] ; left dst
    lea r3, [r3+r1] ; right pSrc
    lea r4, [r4+r1] ; right dst
    dec r6
    jnz near .left_right_loops
%endif
%endmacro
%macro exp_cross_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
; top-left: (x)mm3, top-right: (x)mm4, bottom-left: (x)mm5, bottom-right: (x)mm6
; edi: TL, ebp: TR, eax: BL, ebx: BR, ecx, -stride
;r3:TL ,r4:TR,r5:BL,r6:BR r1:-stride
%if %1 == 32 ; luma
    ; TL
    mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
    mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
    mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
    mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
    mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
    mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
    mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
    mov_line_end32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
    ; TR
    mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
    mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
    mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
    mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
    mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
    mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
    mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
    mov_line_end32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
    ; BL
    mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
    mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
    mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
    mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
    mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
    mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
    mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
    mov_line_end32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
    ; BR
    mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
    mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
    mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
    mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
    mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
    mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
    mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
    mov_line_end32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
%elif %1 == 16 ; chroma
    ; TL
    mov_line_16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
    mov_line_16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
    mov_line_16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
    mov_line_end16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
    ; TR
    mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
    mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
    mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
    mov_line_end16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
    ; BL
    mov_line_16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
    mov_line_16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
    mov_line_16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
    mov_line_end16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
    ; BR
    mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
    mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
    mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
    mov_line_end16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
%endif
%endmacro
;***********************************************************************----------------
; void ExpandPictureLuma_sse2( uint8_t *pDst,
;                              const int32_t iStride,
;                              const int32_t iWidth,
;                              const int32_t iHeight );
;***********************************************************************----------------
WELS_EXTERN ExpandPictureLuma_sse2
@@ -403,8 +403,8 @@ WELS_EXTERN ExpandPictureLuma_sse2
    exp_top_bottom_sse2 32
    ; for both left and right border
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    pop r2
    pop r1
@@ -416,8 +416,8 @@ WELS_EXTERN ExpandPictureLuma_sse2
    lea r4,[r3+1] ;right border dst
    ;prepare for cross border data: top-rigth with xmm4
    movzx r6d,byte [r3] ;top -rigth
    SSE2_Copy16Times xmm4,r6d
    neg r1 ;r1 = stride
@@ -438,8 +438,8 @@ WELS_EXTERN ExpandPictureLuma_sse2
    pop r1
    pop r0
    ; for cross border [top-left, top-right, bottom-left, bottom-right]
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    ; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
    neg r1 ;r1 = -stride
@@ -472,13 +472,13 @@ WELS_EXTERN ExpandPictureLuma_sse2
    %assign push_num 0
    ret
;***********************************************************************----------------
; void ExpandPictureChromaAlign_sse2( uint8_t *pDst,
;                                     const int32_t iStride,
;                                     const int32_t iWidth,
;                                     const int32_t iHeight );
;***********************************************************************----------------
WELS_EXTERN ExpandPictureChromaAlign_sse2
@@ -531,8 +531,8 @@ WELS_EXTERN ExpandPictureChromaAlign_sse2
    exp_top_bottom_sse2 16
    ; for both left and right border
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    pop r2
    pop r1
@@ -557,7 +557,7 @@ WELS_EXTERN ExpandPictureChromaAlign_sse2
    push r0
    push r1
    push r2
    push r6
    exp_left_right_sse2 16,a
    pop r6
@@ -565,8 +565,8 @@ WELS_EXTERN ExpandPictureChromaAlign_sse2
    pop r1
    pop r0
    ; for cross border [top-left, top-right, bottom-left, bottom-right]
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    ; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
    neg r1 ;r1 = -stride
@@ -599,16 +599,16 @@ WELS_EXTERN ExpandPictureChromaAlign_sse2
    %assign push_num 0
    ret
;***********************************************************************----------------
; void ExpandPictureChromaUnalign_sse2( uint8_t *pDst,
;                                       const int32_t iStride,
;                                       const int32_t iWidth,
;                                       const int32_t iHeight );
;***********************************************************************----------------
WELS_EXTERN ExpandPictureChromaUnalign_sse2
    push r4
    push r5
    push r6
@@ -657,8 +657,8 @@ WELS_EXTERN ExpandPictureChromaUnalign_sse2
    exp_top_bottom_sse2 16
    ; for both left and right border
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    pop r2
    pop r1
@@ -683,7 +683,7 @@ WELS_EXTERN ExpandPictureChromaUnalign_sse2
    push r0
    push r1
    push r2
    push r6
    exp_left_right_sse2 16,u
    pop r6
@@ -691,8 +691,8 @@ WELS_EXTERN ExpandPictureChromaUnalign_sse2
    pop r1
    pop r0
    ; for cross border [top-left, top-right, bottom-left, bottom-right]
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    ; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
    neg r1 ;r1 = -stride
@@ -725,4 +725,4 @@ WELS_EXTERN ExpandPictureChromaUnalign_sse2
    %assign push_num 0
    ret


@ -36,9 +36,9 @@
;* ;*
;* History ;* History
;* 15/09/2009 Created ;* 15/09/2009 Created
;* 12/28/2009 Modified with larger throughput ;* 12/28/2009 Modified with larger throughput
;* 12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2, ;* 12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2,
;* WelsCopy16x8_mmx, WelsCopy8x16_mmx etc; ;* WelsCopy16x8_mmx, WelsCopy8x16_mmx etc;
;* ;*
;* ;*
;*********************************************************************************************/ ;*********************************************************************************************/
@ -56,174 +56,174 @@ SECTION .text
;*********************************************************************** ;***********************************************************************
; void WelsCopy16x16_sse2( uint8_t* Dst, ; void WelsCopy16x16_sse2( uint8_t* Dst,
; int32_t iStrideD, ; int32_t iStrideD,
; uint8_t* Src, ; uint8_t* Src,
; int32_t iStrideS ) ; int32_t iStrideS )
;*********************************************************************** ;***********************************************************************
WELS_EXTERN WelsCopy16x16_sse2 WELS_EXTERN WelsCopy16x16_sse2
push r4 push r4
push r5 push r5
%assign push_num 2 %assign push_num 2
LOAD_4_PARA LOAD_4_PARA
PUSH_XMM 8 PUSH_XMM 8
lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3 lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3 lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
movdqa xmm0, [r2] movdqa xmm0, [r2]
movdqa xmm1, [r2+r3] movdqa xmm1, [r2+r3]
movdqa xmm2, [r2+2*r3] movdqa xmm2, [r2+2*r3]
movdqa xmm3, [r2+r5] movdqa xmm3, [r2+r5]
lea r2, [r2+4*r3] lea r2, [r2+4*r3]
movdqa xmm4, [r2] movdqa xmm4, [r2]
movdqa xmm5, [r2+r3] movdqa xmm5, [r2+r3]
movdqa xmm6, [r2+2*r3] movdqa xmm6, [r2+2*r3]
movdqa xmm7, [r2+r5] movdqa xmm7, [r2+r5]
lea r2, [r2+4*r3] lea r2, [r2+4*r3]
movdqa [r0], xmm0 movdqa [r0], xmm0
movdqa [r0+r1], xmm1 movdqa [r0+r1], xmm1
movdqa [r0+2*r1], xmm2 movdqa [r0+2*r1], xmm2
movdqa [r0+r4], xmm3 movdqa [r0+r4], xmm3
lea r0, [r0+4*r1] lea r0, [r0+4*r1]
movdqa [r0], xmm4 movdqa [r0], xmm4
movdqa [r0+r1], xmm5 movdqa [r0+r1], xmm5
movdqa [r0+2*r1], xmm6 movdqa [r0+2*r1], xmm6
movdqa [r0+r4], xmm7 movdqa [r0+r4], xmm7
lea r0, [r0+4*r1] lea r0, [r0+4*r1]
movdqa xmm0, [r2] movdqa xmm0, [r2]
movdqa xmm1, [r2+r3] movdqa xmm1, [r2+r3]
movdqa xmm2, [r2+2*r3] movdqa xmm2, [r2+2*r3]
movdqa xmm3, [r2+r5] movdqa xmm3, [r2+r5]
lea r2, [r2+4*r3] lea r2, [r2+4*r3]
movdqa xmm4, [r2] movdqa xmm4, [r2]
movdqa xmm5, [r2+r3] movdqa xmm5, [r2+r3]
movdqa xmm6, [r2+2*r3] movdqa xmm6, [r2+2*r3]
movdqa xmm7, [r2+r5] movdqa xmm7, [r2+r5]
movdqa [r0], xmm0 movdqa [r0], xmm0
movdqa [r0+r1], xmm1 movdqa [r0+r1], xmm1
movdqa [r0+2*r1], xmm2 movdqa [r0+2*r1], xmm2
movdqa [r0+r4], xmm3 movdqa [r0+r4], xmm3
lea r0, [r0+4*r1] lea r0, [r0+4*r1]
movdqa [r0], xmm4 movdqa [r0], xmm4
movdqa [r0+r1], xmm5 movdqa [r0+r1], xmm5
movdqa [r0+2*r1], xmm6 movdqa [r0+2*r1], xmm6
movdqa [r0+r4], xmm7 movdqa [r0+r4], xmm7
POP_XMM POP_XMM
LOAD_4_PARA_POP LOAD_4_PARA_POP
pop r5 pop r5
pop r4 pop r4
ret ret
;*********************************************************************** ;***********************************************************************
; void WelsCopy16x16NotAligned_sse2( uint8_t* Dst, ; void WelsCopy16x16NotAligned_sse2( uint8_t* Dst,
; int32_t iStrideD, ; int32_t iStrideD,
; uint8_t* Src, ; uint8_t* Src,
; int32_t iStrideS ) ; int32_t iStrideS )
;*********************************************************************** ;***********************************************************************
; dst can be align with 16 bytes, but not sure about pSrc, 12/29/2011 ; dst can be align with 16 bytes, but not sure about pSrc, 12/29/2011
WELS_EXTERN WelsCopy16x16NotAligned_sse2 WELS_EXTERN WelsCopy16x16NotAligned_sse2
push r4 push r4
push r5 push r5
%assign push_num 2 %assign push_num 2
LOAD_4_PARA LOAD_4_PARA
PUSH_XMM 8 PUSH_XMM 8
lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3 lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3 lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
movdqu xmm0, [r2] movdqu xmm0, [r2]
movdqu xmm1, [r2+r3] movdqu xmm1, [r2+r3]
movdqu xmm2, [r2+2*r3] movdqu xmm2, [r2+2*r3]
movdqu xmm3, [r2+r5] movdqu xmm3, [r2+r5]
lea r2, [r2+4*r3] lea r2, [r2+4*r3]
movdqu xmm4, [r2] movdqu xmm4, [r2]
movdqu xmm5, [r2+r3] movdqu xmm5, [r2+r3]
movdqu xmm6, [r2+2*r3] movdqu xmm6, [r2+2*r3]
movdqu xmm7, [r2+r5] movdqu xmm7, [r2+r5]
lea r2, [r2+4*r3] lea r2, [r2+4*r3]
movdqa [r0], xmm0 movdqa [r0], xmm0
movdqa [r0+r1], xmm1 movdqa [r0+r1], xmm1
movdqa [r0+2*r1], xmm2 movdqa [r0+2*r1], xmm2
movdqa [r0+r4], xmm3 movdqa [r0+r4], xmm3
lea r0, [r0+4*r1] lea r0, [r0+4*r1]
movdqa [r0], xmm4 movdqa [r0], xmm4
movdqa [r0+r1], xmm5 movdqa [r0+r1], xmm5
movdqa [r0+2*r1], xmm6 movdqa [r0+2*r1], xmm6
movdqa [r0+r4], xmm7 movdqa [r0+r4], xmm7
lea r0, [r0+4*r1] lea r0, [r0+4*r1]
movdqu xmm0, [r2] movdqu xmm0, [r2]
movdqu xmm1, [r2+r3] movdqu xmm1, [r2+r3]
movdqu xmm2, [r2+2*r3] movdqu xmm2, [r2+2*r3]
movdqu xmm3, [r2+r5] movdqu xmm3, [r2+r5]
lea r2, [r2+4*r3] lea r2, [r2+4*r3]
movdqu xmm4, [r2] movdqu xmm4, [r2]
movdqu xmm5, [r2+r3] movdqu xmm5, [r2+r3]
movdqu xmm6, [r2+2*r3] movdqu xmm6, [r2+2*r3]
movdqu xmm7, [r2+r5] movdqu xmm7, [r2+r5]
movdqa [r0], xmm0 movdqa [r0], xmm0
movdqa [r0+r1], xmm1 movdqa [r0+r1], xmm1
movdqa [r0+2*r1], xmm2 movdqa [r0+2*r1], xmm2
movdqa [r0+r4], xmm3 movdqa [r0+r4], xmm3
lea r0, [r0+4*r1] lea r0, [r0+4*r1]
movdqa [r0], xmm4 movdqa [r0], xmm4
movdqa [r0+r1], xmm5 movdqa [r0+r1], xmm5
movdqa [r0+2*r1], xmm6 movdqa [r0+2*r1], xmm6
movdqa [r0+r4], xmm7 movdqa [r0+r4], xmm7
POP_XMM POP_XMM
LOAD_4_PARA_POP LOAD_4_PARA_POP
pop r5 pop r5
pop r4 pop r4
ret ret
; , 12/29/2011 ; , 12/29/2011
;*********************************************************************** ;***********************************************************************
; void WelsCopy16x8NotAligned_sse2(uint8_t* Dst, ; void WelsCopy16x8NotAligned_sse2(uint8_t* Dst,
; int32_t iStrideD, ; int32_t iStrideD,
; uint8_t* Src, ; uint8_t* Src,
; int32_t iStrideS ) ; int32_t iStrideS )
;*********************************************************************** ;***********************************************************************
WELS_EXTERN WelsCopy16x8NotAligned_sse2 WELS_EXTERN WelsCopy16x8NotAligned_sse2
push r4 push r4
push r5 push r5
%assign push_num 2 %assign push_num 2
LOAD_4_PARA LOAD_4_PARA
PUSH_XMM 8 PUSH_XMM 8
lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3 lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3 lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
movdqu xmm0, [r2] movdqu xmm0, [r2]
movdqu xmm1, [r2+r3] movdqu xmm1, [r2+r3]
movdqu xmm2, [r2+2*r3] movdqu xmm2, [r2+2*r3]
movdqu xmm3, [r2+r5] movdqu xmm3, [r2+r5]
lea r2, [r2+4*r3] lea r2, [r2+4*r3]
movdqu xmm4, [r2] movdqu xmm4, [r2]
movdqu xmm5, [r2+r3] movdqu xmm5, [r2+r3]
movdqu xmm6, [r2+2*r3] movdqu xmm6, [r2+2*r3]
movdqu xmm7, [r2+r5] movdqu xmm7, [r2+r5]
movdqa [r0], xmm0 movdqa [r0], xmm0
movdqa [r0+r1], xmm1 movdqa [r0+r1], xmm1
movdqa [r0+2*r1], xmm2 movdqa [r0+2*r1], xmm2
movdqa [r0+r4], xmm3 movdqa [r0+r4], xmm3
lea r0, [r0+4*r1] lea r0, [r0+4*r1]
movdqa [r0], xmm4 movdqa [r0], xmm4
movdqa [r0+r1], xmm5 movdqa [r0+r1], xmm5
movdqa [r0+2*r1], xmm6 movdqa [r0+2*r1], xmm6
movdqa [r0+r4], xmm7 movdqa [r0+r4], xmm7
POP_XMM POP_XMM
LOAD_4_PARA_POP LOAD_4_PARA_POP
pop r5 pop r5
pop r4 pop r4
ret ret
;*********************************************************************** ;***********************************************************************
@ -233,62 +233,62 @@ WELS_EXTERN WelsCopy16x8NotAligned_sse2
; int32_t iStrideS ) ; int32_t iStrideS )
;*********************************************************************** ;***********************************************************************
WELS_EXTERN WelsCopy8x16_mmx WELS_EXTERN WelsCopy8x16_mmx
%assign push_num 0 %assign push_num 0
LOAD_4_PARA LOAD_4_PARA
movq mm0, [r2] movq mm0, [r2]
movq mm1, [r2+r3] movq mm1, [r2+r3]
lea r2, [r2+2*r3] lea r2, [r2+2*r3]
movq mm2, [r2] movq mm2, [r2]
movq mm3, [r2+r3] movq mm3, [r2+r3]
lea r2, [r2+2*r3] lea r2, [r2+2*r3]
movq mm4, [r2] movq mm4, [r2]
movq mm5, [r2+r3] movq mm5, [r2+r3]
lea r2, [r2+2*r3] lea r2, [r2+2*r3]
movq mm6, [r2] movq mm6, [r2]
movq mm7, [r2+r3] movq mm7, [r2+r3]
lea r2, [r2+2*r3] lea r2, [r2+2*r3]
movq [r0], mm0 movq [r0], mm0
movq [r0+r1], mm1 movq [r0+r1], mm1
lea r0, [r0+2*r1] lea r0, [r0+2*r1]
movq [r0], mm2 movq [r0], mm2
movq [r0+r1], mm3 movq [r0+r1], mm3
lea r0, [r0+2*r1] lea r0, [r0+2*r1]
movq [r0], mm4 movq [r0], mm4
movq [r0+r1], mm5 movq [r0+r1], mm5
lea r0, [r0+2*r1] lea r0, [r0+2*r1]
movq [r0], mm6 movq [r0], mm6
movq [r0+r1], mm7 movq [r0+r1], mm7
lea r0, [r0+2*r1] lea r0, [r0+2*r1]
movq mm0, [r2] movq mm0, [r2]
movq mm1, [r2+r3] movq mm1, [r2+r3]
lea r2, [r2+2*r3] lea r2, [r2+2*r3]
movq mm2, [r2] movq mm2, [r2]
movq mm3, [r2+r3] movq mm3, [r2+r3]
lea r2, [r2+2*r3] lea r2, [r2+2*r3]
movq mm4, [r2] movq mm4, [r2]
movq mm5, [r2+r3] movq mm5, [r2+r3]
lea r2, [r2+2*r3] lea r2, [r2+2*r3]
movq mm6, [r2] movq mm6, [r2]
movq mm7, [r2+r3] movq mm7, [r2+r3]
movq [r0], mm0 movq [r0], mm0
movq [r0+r1], mm1 movq [r0+r1], mm1
lea r0, [r0+2*r1] lea r0, [r0+2*r1]
movq [r0], mm2 movq [r0], mm2
movq [r0+r1], mm3 movq [r0+r1], mm3
lea r0, [r0+2*r1] lea r0, [r0+2*r1]
movq [r0], mm4 movq [r0], mm4
movq [r0+r1], mm5 movq [r0+r1], mm5
lea r0, [r0+2*r1] lea r0, [r0+2*r1]
movq [r0], mm6 movq [r0], mm6
movq [r0+r1], mm7 movq [r0+r1], mm7
WELSEMMS WELSEMMS
LOAD_4_PARA_POP LOAD_4_PARA_POP
ret ret
;*********************************************************************** ;***********************************************************************
; void WelsCopy8x8_mmx( uint8_t* Dst, ; void WelsCopy8x8_mmx( uint8_t* Dst,
@@ -297,48 +297,48 @@ WELS_EXTERN WelsCopy8x16_mmx
; int32_t iStrideS ) ; int32_t iStrideS )
;*********************************************************************** ;***********************************************************************
WELS_EXTERN WelsCopy8x8_mmx WELS_EXTERN WelsCopy8x8_mmx
push r4 push r4
%assign push_num 1 %assign push_num 1
LOAD_4_PARA LOAD_4_PARA
lea r4, [r3+2*r3] ;edx, [ebx+2*ebx] lea r4, [r3+2*r3] ;edx, [ebx+2*ebx]
; to prefetch next loop ; to prefetch next loop
prefetchnta [r2+2*r3] prefetchnta [r2+2*r3]
prefetchnta [r2+r4] prefetchnta [r2+r4]
movq mm0, [r2] movq mm0, [r2]
movq mm1, [r2+r3] movq mm1, [r2+r3]
lea r2, [r2+2*r3] lea r2, [r2+2*r3]
; to prefetch next loop ; to prefetch next loop
prefetchnta [r2+2*r3] prefetchnta [r2+2*r3]
prefetchnta [r2+r4] prefetchnta [r2+r4]
movq mm2, [r2] movq mm2, [r2]
movq mm3, [r2+r3] movq mm3, [r2+r3]
lea r2, [r2+2*r3] lea r2, [r2+2*r3]
; to prefetch next loop ; to prefetch next loop
prefetchnta [r2+2*r3] prefetchnta [r2+2*r3]
prefetchnta [r2+r4] prefetchnta [r2+r4]
movq mm4, [r2] movq mm4, [r2]
movq mm5, [r2+r3] movq mm5, [r2+r3]
lea r2, [r2+2*r3] lea r2, [r2+2*r3]
movq mm6, [r2] movq mm6, [r2]
movq mm7, [r2+r3] movq mm7, [r2+r3]
movq [r0], mm0 movq [r0], mm0
movq [r0+r1], mm1 movq [r0+r1], mm1
lea r0, [r0+2*r1] lea r0, [r0+2*r1]
movq [r0], mm2 movq [r0], mm2
movq [r0+r1], mm3 movq [r0+r1], mm3
lea r0, [r0+2*r1] lea r0, [r0+2*r1]
movq [r0], mm4 movq [r0], mm4
movq [r0+r1], mm5 movq [r0+r1], mm5
lea r0, [r0+2*r1] lea r0, [r0+2*r1]
movq [r0], mm6 movq [r0], mm6
movq [r0+r1], mm7 movq [r0+r1], mm7
WELSEMMS WELSEMMS
LOAD_4_PARA_POP LOAD_4_PARA_POP
pop r4 pop r4
ret ret
; (dunhuang@cisco), 12/21/2011 ; (dunhuang@cisco), 12/21/2011
;*********************************************************************** ;***********************************************************************
@@ -349,13 +349,13 @@ WELS_EXTERN UpdateMbMv_sse2
%assign push_num 0 %assign push_num 0
LOAD_2_PARA LOAD_2_PARA
movd xmm0, r1d ; _mv movd xmm0, r1d ; _mv
pshufd xmm1, xmm0, $00 pshufd xmm1, xmm0, $00
movdqa [r0 ], xmm1 movdqa [r0 ], xmm1
movdqa [r0+0x10], xmm1 movdqa [r0+0x10], xmm1
movdqa [r0+0x20], xmm1 movdqa [r0+0x20], xmm1
movdqa [r0+0x30], xmm1 movdqa [r0+0x30], xmm1
ret ret
;******************************************************************************* ;*******************************************************************************
; Macros and other preprocessor constants ; Macros and other preprocessor constants
@@ -381,14 +381,14 @@ WELS_EXTERN PixelAvgWidthEq4_mmx
%assign push_num 0 %assign push_num 0
LOAD_7_PARA LOAD_7_PARA
SIGN_EXTENSION r1, r1d SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r5, r5d SIGN_EXTENSION r5, r5d
SIGN_EXTENSION r6, r6d SIGN_EXTENSION r6, r6d
ALIGN 4 ALIGN 4
.height_loop: .height_loop:
movd mm0, [r4] movd mm0, [r4]
pavgb mm0, [r2] pavgb mm0, [r2]
movd [r0], mm0 movd [r0], mm0
@@ -398,8 +398,8 @@ ALIGN 4
lea r4, [r4+r5] lea r4, [r4+r5]
jne .height_loop jne .height_loop
WELSEMMS WELSEMMS
LOAD_7_PARA_POP LOAD_7_PARA_POP
ret ret
@@ -413,29 +413,29 @@ WELS_EXTERN PixelAvgWidthEq8_mmx
%assign push_num 0 %assign push_num 0
LOAD_7_PARA LOAD_7_PARA
SIGN_EXTENSION r1, r1d SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r5, r5d SIGN_EXTENSION r5, r5d
SIGN_EXTENSION r6, r6d SIGN_EXTENSION r6, r6d
ALIGN 4 ALIGN 4
.height_loop: .height_loop:
movq mm0, [r2] movq mm0, [r2]
pavgb mm0, [r4] pavgb mm0, [r4]
movq [r0], mm0 movq [r0], mm0
movq mm0, [r2+r3] movq mm0, [r2+r3]
pavgb mm0, [r4+r5] pavgb mm0, [r4+r5]
movq [r0+r1], mm0 movq [r0+r1], mm0
lea r2, [r2+2*r3] lea r2, [r2+2*r3]
lea r4, [r4+2*r5] lea r4, [r4+2*r5]
lea r0, [r0+2*r1] lea r0, [r0+2*r1]
sub r6, 2 sub r6, 2
jnz .height_loop jnz .height_loop
WELSEMMS WELSEMMS
LOAD_7_PARA_POP LOAD_7_PARA_POP
ret ret
@@ -450,46 +450,46 @@ WELS_EXTERN PixelAvgWidthEq16_sse2
%assign push_num 0 %assign push_num 0
LOAD_7_PARA LOAD_7_PARA
SIGN_EXTENSION r1, r1d SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r5, r5d SIGN_EXTENSION r5, r5d
SIGN_EXTENSION r6, r6d SIGN_EXTENSION r6, r6d
ALIGN 4 ALIGN 4
.height_loop: .height_loop:
movdqu xmm0, [r2] movdqu xmm0, [r2]
movdqu xmm1, [r4] movdqu xmm1, [r4]
pavgb xmm0, xmm1 pavgb xmm0, xmm1
;pavgb xmm0, [r4] ;pavgb xmm0, [r4]
movdqu [r0], xmm0 movdqu [r0], xmm0
movdqu xmm0, [r2+r3] movdqu xmm0, [r2+r3]
movdqu xmm1, [r4+r5] movdqu xmm1, [r4+r5]
pavgb xmm0, xmm1 pavgb xmm0, xmm1
movdqu [r0+r1], xmm0 movdqu [r0+r1], xmm0
movdqu xmm0, [r2+2*r3] movdqu xmm0, [r2+2*r3]
movdqu xmm1, [r4+2*r5] movdqu xmm1, [r4+2*r5]
pavgb xmm0, xmm1 pavgb xmm0, xmm1
movdqu [r0+2*r1], xmm0 movdqu [r0+2*r1], xmm0
lea r2, [r2+2*r3] lea r2, [r2+2*r3]
lea r4, [r4+2*r5] lea r4, [r4+2*r5]
lea r0, [r0+2*r1] lea r0, [r0+2*r1]
movdqu xmm0, [r2+r3] movdqu xmm0, [r2+r3]
movdqu xmm1, [r4+r5] movdqu xmm1, [r4+r5]
pavgb xmm0, xmm1 pavgb xmm0, xmm1
movdqu [r0+r1], xmm0 movdqu [r0+r1], xmm0
lea r2, [r2+2*r3] lea r2, [r2+2*r3]
lea r4, [r4+2*r5] lea r4, [r4+2*r5]
lea r0, [r0+2*r1] lea r0, [r0+2*r1]
sub r6, 4 sub r6, 4
jne .height_loop jne .height_loop
WELSEMMS WELSEMMS
LOAD_7_PARA_POP LOAD_7_PARA_POP
ret ret
;******************************************************************************* ;*******************************************************************************
@@ -497,26 +497,26 @@ ALIGN 4
; uint8_t *pDst, int iDstStride, int iHeight ) ; uint8_t *pDst, int iDstStride, int iHeight )
;******************************************************************************* ;*******************************************************************************
WELS_EXTERN McCopyWidthEq4_mmx WELS_EXTERN McCopyWidthEq4_mmx
push r5 push r5
%assign push_num 1 %assign push_num 1
LOAD_5_PARA LOAD_5_PARA
SIGN_EXTENSION r1, r1d SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d SIGN_EXTENSION r4, r4d
ALIGN 4 ALIGN 4
.height_loop: .height_loop:
mov r5d, [r0] mov r5d, [r0]
mov [r2], r5d mov [r2], r5d
add r0, r1 add r0, r1
add r2, r3 add r2, r3
dec r4 dec r4
jnz .height_loop jnz .height_loop
WELSEMMS WELSEMMS
LOAD_5_PARA_POP LOAD_5_PARA_POP
pop r5 pop r5
ret ret
;******************************************************************************* ;*******************************************************************************
@@ -527,21 +527,21 @@ WELS_EXTERN McCopyWidthEq8_mmx
%assign push_num 0 %assign push_num 0
LOAD_5_PARA LOAD_5_PARA
SIGN_EXTENSION r1, r1d SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d SIGN_EXTENSION r4, r4d
ALIGN 4 ALIGN 4
.height_loop: .height_loop:
movq mm0, [r0] movq mm0, [r0]
movq [r2], mm0 movq [r2], mm0
add r0, r1 add r0, r1
add r2, r3 add r2, r3
dec r4 dec r4
jnz .height_loop jnz .height_loop
WELSEMMS WELSEMMS
LOAD_5_PARA_POP LOAD_5_PARA_POP
ret ret
@@ -550,32 +550,32 @@ ALIGN 4
;******************************************************************************* ;*******************************************************************************
;read unaligned memory ;read unaligned memory
%macro SSE_READ_UNA 2 %macro SSE_READ_UNA 2
movq %1, [%2] movq %1, [%2]
movhps %1, [%2+8] movhps %1, [%2+8]
%endmacro %endmacro
;write unaligned memory ;write unaligned memory
%macro SSE_WRITE_UNA 2 %macro SSE_WRITE_UNA 2
movq [%1], %2 movq [%1], %2
movhps [%1+8], %2 movhps [%1+8], %2
%endmacro %endmacro
WELS_EXTERN McCopyWidthEq16_sse2 WELS_EXTERN McCopyWidthEq16_sse2
%assign push_num 0 %assign push_num 0
LOAD_5_PARA LOAD_5_PARA
SIGN_EXTENSION r1, r1d SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d SIGN_EXTENSION r4, r4d
ALIGN 4 ALIGN 4
.height_loop: .height_loop:
SSE_READ_UNA xmm0, r0 SSE_READ_UNA xmm0, r0
SSE_READ_UNA xmm1, r0+r1 SSE_READ_UNA xmm1, r0+r1
SSE_WRITE_UNA r2, xmm0 SSE_WRITE_UNA r2, xmm0
SSE_WRITE_UNA r2+r3, xmm1 SSE_WRITE_UNA r2+r3, xmm1
sub r4, 2 sub r4, 2
lea r0, [r0+r1*2] lea r0, [r0+r1*2]
lea r2, [r2+r3*2] lea r2, [r2+r3*2]
jnz .height_loop jnz .height_loop
LOAD_5_PARA_POP LOAD_5_PARA_POP
ret ret
View File
@@ -53,10 +53,10 @@ SECTION .rodata align=16
ALIGN 16 ALIGN 16
h264_d0x20_sse2: h264_d0x20_sse2:
dw 32,32,32,32,32,32,32,32 dw 32,32,32,32,32,32,32,32
ALIGN 16 ALIGN 16
h264_d0x20_mmx: h264_d0x20_mmx:
dw 32,32,32,32 dw 32,32,32,32
;============================================================================= ;=============================================================================
@@ -67,171 +67,171 @@ SECTION .text
;******************************************************************************* ;*******************************************************************************
; void McChromaWidthEq4_mmx( const uint8_t *src, ; void McChromaWidthEq4_mmx( const uint8_t *src,
; int32_t iSrcStride, ; int32_t iSrcStride,
; uint8_t *pDst, ; uint8_t *pDst,
; int32_t iDstStride, ; int32_t iDstStride,
; const uint8_t *pABCD, ; const uint8_t *pABCD,
; int32_t iHeigh ); ; int32_t iHeigh );
;******************************************************************************* ;*******************************************************************************
WELS_EXTERN McChromaWidthEq4_mmx WELS_EXTERN McChromaWidthEq4_mmx
%assign push_num 0 %assign push_num 0
LOAD_6_PARA LOAD_6_PARA
SIGN_EXTENSION r1, r1d SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r5, r5d SIGN_EXTENSION r5, r5d
movd mm3, [r4]; [eax] movd mm3, [r4]; [eax]
WELS_Zero mm7 WELS_Zero mm7
punpcklbw mm3, mm3 punpcklbw mm3, mm3
movq mm4, mm3 movq mm4, mm3
punpcklwd mm3, mm3 punpcklwd mm3, mm3
punpckhwd mm4, mm4 punpckhwd mm4, mm4
movq mm5, mm3 movq mm5, mm3
punpcklbw mm3, mm7 punpcklbw mm3, mm7
punpckhbw mm5, mm7 punpckhbw mm5, mm7
movq mm6, mm4 movq mm6, mm4
punpcklbw mm4, mm7 punpcklbw mm4, mm7
punpckhbw mm6, mm7 punpckhbw mm6, mm7
lea r4, [r0 + r1] ;lea ebx, [esi + eax] lea r4, [r0 + r1] ;lea ebx, [esi + eax]
movd mm0, [r0] movd mm0, [r0]
movd mm1, [r0+1] movd mm1, [r0+1]
punpcklbw mm0, mm7 punpcklbw mm0, mm7
punpcklbw mm1, mm7 punpcklbw mm1, mm7
.xloop: .xloop:
pmullw mm0, mm3 pmullw mm0, mm3
pmullw mm1, mm5 pmullw mm1, mm5
paddw mm0, mm1 paddw mm0, mm1
movd mm1, [r4] movd mm1, [r4]
punpcklbw mm1, mm7 punpcklbw mm1, mm7
movq mm2, mm1 movq mm2, mm1
pmullw mm1, mm4 pmullw mm1, mm4
paddw mm0, mm1 paddw mm0, mm1
movd mm1, [r4+1] movd mm1, [r4+1]
punpcklbw mm1, mm7 punpcklbw mm1, mm7
movq mm7, mm1 movq mm7, mm1
pmullw mm1,mm6 pmullw mm1,mm6
paddw mm0, mm1 paddw mm0, mm1
movq mm1,mm7 movq mm1,mm7
paddw mm0, [h264_d0x20_mmx] paddw mm0, [h264_d0x20_mmx]
psrlw mm0, 6 psrlw mm0, 6
WELS_Zero mm7 WELS_Zero mm7
packuswb mm0, mm7 packuswb mm0, mm7
movd [r2], mm0 movd [r2], mm0
movq mm0, mm2 movq mm0, mm2
lea r2, [r2 + r3] lea r2, [r2 + r3]
lea r4, [r4 + r1] lea r4, [r4 + r1]
dec r5 dec r5
jnz near .xloop jnz near .xloop
WELSEMMS WELSEMMS
LOAD_6_PARA_POP LOAD_6_PARA_POP
ret ret
;******************************************************************************* ;*******************************************************************************
; void McChromaWidthEq8_sse2( const uint8_t *pSrc, ; void McChromaWidthEq8_sse2( const uint8_t *pSrc,
; int32_t iSrcStride, ; int32_t iSrcStride,
; uint8_t *pDst, ; uint8_t *pDst,
; int32_t iDstStride, ; int32_t iDstStride,
; const uint8_t *pABCD, ; const uint8_t *pABCD,
; int32_t iheigh ); ; int32_t iheigh );
;******************************************************************************* ;*******************************************************************************
WELS_EXTERN McChromaWidthEq8_sse2 WELS_EXTERN McChromaWidthEq8_sse2
%assign push_num 0 %assign push_num 0
LOAD_6_PARA LOAD_6_PARA
PUSH_XMM 8 PUSH_XMM 8
SIGN_EXTENSION r1, r1d SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r5, r5d SIGN_EXTENSION r5, r5d
movd xmm3, [r4] movd xmm3, [r4]
WELS_Zero xmm7 WELS_Zero xmm7
punpcklbw xmm3, xmm3 punpcklbw xmm3, xmm3
punpcklwd xmm3, xmm3 punpcklwd xmm3, xmm3
movdqa xmm4, xmm3 movdqa xmm4, xmm3
punpckldq xmm3, xmm3 punpckldq xmm3, xmm3
punpckhdq xmm4, xmm4 punpckhdq xmm4, xmm4
movdqa xmm5, xmm3 movdqa xmm5, xmm3
movdqa xmm6, xmm4 movdqa xmm6, xmm4
punpcklbw xmm3, xmm7 punpcklbw xmm3, xmm7
punpckhbw xmm5, xmm7 punpckhbw xmm5, xmm7
punpcklbw xmm4, xmm7 punpcklbw xmm4, xmm7
punpckhbw xmm6, xmm7 punpckhbw xmm6, xmm7
lea r4, [r0 + r1] ;lea ebx, [esi + eax] lea r4, [r0 + r1] ;lea ebx, [esi + eax]
movq xmm0, [r0] movq xmm0, [r0]
movq xmm1, [r0+1] movq xmm1, [r0+1]
punpcklbw xmm0, xmm7 punpcklbw xmm0, xmm7
punpcklbw xmm1, xmm7 punpcklbw xmm1, xmm7
.xloop: .xloop:
pmullw xmm0, xmm3 pmullw xmm0, xmm3
pmullw xmm1, xmm5 pmullw xmm1, xmm5
paddw xmm0, xmm1 paddw xmm0, xmm1
movq xmm1, [r4] movq xmm1, [r4]
punpcklbw xmm1, xmm7 punpcklbw xmm1, xmm7
movdqa xmm2, xmm1 movdqa xmm2, xmm1
pmullw xmm1, xmm4 pmullw xmm1, xmm4
paddw xmm0, xmm1 paddw xmm0, xmm1
movq xmm1, [r4+1] movq xmm1, [r4+1]
punpcklbw xmm1, xmm7 punpcklbw xmm1, xmm7
movdqa xmm7, xmm1 movdqa xmm7, xmm1
pmullw xmm1, xmm6 pmullw xmm1, xmm6
paddw xmm0, xmm1 paddw xmm0, xmm1
movdqa xmm1,xmm7 movdqa xmm1,xmm7
paddw xmm0, [h264_d0x20_sse2] paddw xmm0, [h264_d0x20_sse2]
psrlw xmm0, 6 psrlw xmm0, 6
WELS_Zero xmm7 WELS_Zero xmm7
packuswb xmm0, xmm7 packuswb xmm0, xmm7
movq [r2], xmm0 movq [r2], xmm0
movdqa xmm0, xmm2 movdqa xmm0, xmm2
lea r2, [r2 + r3] lea r2, [r2 + r3]
lea r4, [r4 + r1] lea r4, [r4 + r1]
dec r5 dec r5
jnz near .xloop jnz near .xloop
POP_XMM POP_XMM
LOAD_6_PARA_POP LOAD_6_PARA_POP
ret ret
;*********************************************************************** ;***********************************************************************
; void McChromaWidthEq8_ssse3( const uint8_t *pSrc, ; void McChromaWidthEq8_ssse3( const uint8_t *pSrc,
; int32_t iSrcStride, ; int32_t iSrcStride,
; uint8_t *pDst, ; uint8_t *pDst,
; int32_t iDstStride, ; int32_t iDstStride,
; const uint8_t *pABCD, ; const uint8_t *pABCD,
; int32_t iHeigh); ; int32_t iHeigh);
;*********************************************************************** ;***********************************************************************
WELS_EXTERN McChromaWidthEq8_ssse3 WELS_EXTERN McChromaWidthEq8_ssse3
%assign push_num 0 %assign push_num 0
LOAD_6_PARA LOAD_6_PARA
PUSH_XMM 8 PUSH_XMM 8
SIGN_EXTENSION r1, r1d SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r5, r5d SIGN_EXTENSION r5, r5d
pxor xmm7, xmm7 pxor xmm7, xmm7
movd xmm5, [r4] movd xmm5, [r4]
@@ -243,27 +243,27 @@ WELS_EXTERN McChromaWidthEq8_ssse3
sub r2, r3 ;sub esi, edi sub r2, r3 ;sub esi, edi
sub r2, r3 sub r2, r3
movdqa xmm7, [h264_d0x20_sse2] movdqa xmm7, [h264_d0x20_sse2]
movdqu xmm0, [r0] movdqu xmm0, [r0]
movdqa xmm1, xmm0 movdqa xmm1, xmm0
psrldq xmm1, 1 psrldq xmm1, 1
punpcklbw xmm0, xmm1 punpcklbw xmm0, xmm1
.hloop_chroma: .hloop_chroma:
lea r2, [r2+2*r3] lea r2, [r2+2*r3]
movdqu xmm2, [r0+r1] movdqu xmm2, [r0+r1]
movdqa xmm3, xmm2 movdqa xmm3, xmm2
psrldq xmm3, 1 psrldq xmm3, 1
punpcklbw xmm2, xmm3 punpcklbw xmm2, xmm3
movdqa xmm4, xmm2 movdqa xmm4, xmm2
pmaddubsw xmm0, xmm5 pmaddubsw xmm0, xmm5
pmaddubsw xmm2, xmm6 pmaddubsw xmm2, xmm6
paddw xmm0, xmm2 paddw xmm0, xmm2
paddw xmm0, xmm7 paddw xmm0, xmm7
psrlw xmm0, 6 psrlw xmm0, 6
packuswb xmm0, xmm0 packuswb xmm0, xmm0
movq [r2],xmm0 movq [r2],xmm0
@@ -278,16 +278,16 @@ WELS_EXTERN McChromaWidthEq8_ssse3
pmaddubsw xmm2, xmm6 pmaddubsw xmm2, xmm6
paddw xmm4, xmm2 paddw xmm4, xmm2
paddw xmm4, xmm7 paddw xmm4, xmm7
psrlw xmm4, 6 psrlw xmm4, 6
packuswb xmm4, xmm4 packuswb xmm4, xmm4
movq [r2+r3],xmm4 movq [r2+r3],xmm4
sub r5, 2 sub r5, 2
jnz .hloop_chroma jnz .hloop_chroma
POP_XMM POP_XMM
LOAD_6_PARA_POP LOAD_6_PARA_POP
ret ret
File diff suppressed because it is too large
File diff suppressed because it is too large
View File
@@ -29,16 +29,16 @@
;* POSSIBILITY OF SUCH DAMAGE. ;* POSSIBILITY OF SUCH DAMAGE.
;* ;*
;* ;*
;* vaa.asm ;* vaa.asm
;* ;*
;* Abstract ;* Abstract
;* sse2 for pVaa routines ;* sse2 for pVaa routines
;* ;*
;* History ;* History
;* 04/14/2010 Created ;* 04/14/2010 Created
;* 06/07/2010 Added AnalysisVaaInfoIntra_sse2(ssse3) ;* 06/07/2010 Added AnalysisVaaInfoIntra_sse2(ssse3)
;* 06/10/2010 Tune rc_sad_frame_sse2 and got about 40% improvement ;* 06/10/2010 Tune rc_sad_frame_sse2 and got about 40% improvement
;* 08/11/2010 Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2 ;* 08/11/2010 Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
;* ;*
;*************************************************************************/ ;*************************************************************************/
%include "asm_inc.asm" %include "asm_inc.asm"
@@ -49,87 +49,87 @@
;*********************************************************************** ;***********************************************************************
; by comparing it outperforms than phaddw(SSSE3) sets ; by comparing it outperforms than phaddw(SSSE3) sets
%macro SUM_WORD_8x2_SSE2 2 ; dst(pSrc), tmp %macro SUM_WORD_8x2_SSE2 2 ; dst(pSrc), tmp
; @sum_8x2 begin ; @sum_8x2 begin
pshufd %2, %1, 04Eh ; 01001110 B pshufd %2, %1, 04Eh ; 01001110 B
paddw %1, %2 paddw %1, %2
pshuflw %2, %1, 04Eh ; 01001110 B pshuflw %2, %1, 04Eh ; 01001110 B
paddw %1, %2 paddw %1, %2
pshuflw %2, %1, 0B1h ; 10110001 B pshuflw %2, %1, 0B1h ; 10110001 B
paddw %1, %2 paddw %1, %2
; end of @sum_8x2 ; end of @sum_8x2
%endmacro ; END of SUM_WORD_8x2_SSE2 %endmacro ; END of SUM_WORD_8x2_SSE2
%macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4 %macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4
movdqa %1, [r0 ] ; line 0 movdqa %1, [r0 ] ; line 0
movdqa %2, [r0+r1] ; line 1 movdqa %2, [r0+r1] ; line 1
movdqa %3, %1 movdqa %3, %1
punpcklbw %1, xmm7 punpcklbw %1, xmm7
punpckhbw %3, xmm7 punpckhbw %3, xmm7
movdqa %4, %2 movdqa %4, %2
punpcklbw %4, xmm7 punpcklbw %4, xmm7
punpckhbw %2, xmm7 punpckhbw %2, xmm7
paddw %1, %4 paddw %1, %4
paddw %2, %3 paddw %2, %3
movdqa %3, [r0+r2] ; line 2 movdqa %3, [r0+r2] ; line 2
movdqa %4, [r0+r3] ; line 3 movdqa %4, [r0+r3] ; line 3
movdqa %5, %3 movdqa %5, %3
punpcklbw %3, xmm7 punpcklbw %3, xmm7
punpckhbw %5, xmm7 punpckhbw %5, xmm7
movdqa %6, %4 movdqa %6, %4
punpcklbw %6, xmm7 punpcklbw %6, xmm7
punpckhbw %4, xmm7 punpckhbw %4, xmm7
paddw %3, %6 paddw %3, %6
paddw %4, %5 paddw %4, %5
paddw %1, %3 ; block 0, 1 paddw %1, %3 ; block 0, 1
paddw %2, %4 ; block 2, 3 paddw %2, %4 ; block 2, 3
pshufd %3, %1, 0B1h pshufd %3, %1, 0B1h
pshufd %4, %2, 0B1h pshufd %4, %2, 0B1h
paddw %1, %3 paddw %1, %3
paddw %2, %4 paddw %2, %4
movdqa %3, %1 movdqa %3, %1
movdqa %4, %2 movdqa %4, %2
pshuflw %5, %1, 0B1h pshuflw %5, %1, 0B1h
pshufhw %6, %3, 0B1h pshufhw %6, %3, 0B1h
paddw %1, %5 paddw %1, %5
paddw %3, %6 paddw %3, %6
pshuflw %5, %2, 0B1h pshuflw %5, %2, 0B1h
pshufhw %6, %4, 0B1h pshufhw %6, %4, 0B1h
paddw %2, %5 paddw %2, %5
paddw %4, %6 paddw %4, %6
punpcklwd %1, %2 punpcklwd %1, %2
punpckhwd %3, %4 punpckhwd %3, %4
punpcklwd %1, %3 punpcklwd %1, %3
psraw %1, $04 psraw %1, $04
%endmacro %endmacro
%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4 %macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4
movdqa %1, [r0 ] ; line 0 movdqa %1, [r0 ] ; line 0
movdqa %2, [r0+r1] ; line 1 movdqa %2, [r0+r1] ; line 1
movdqa %3, %1 movdqa %3, %1
punpcklbw %1, xmm7 punpcklbw %1, xmm7
punpckhbw %3, xmm7 punpckhbw %3, xmm7
movdqa %4, %2 movdqa %4, %2
punpcklbw %4, xmm7 punpcklbw %4, xmm7
punpckhbw %2, xmm7 punpckhbw %2, xmm7
paddw %1, %4 paddw %1, %4
paddw %2, %3 paddw %2, %3
movdqa %3, [r0+r2] ; line 2 movdqa %3, [r0+r2] ; line 2
movdqa %4, [r0+r3] ; line 3 movdqa %4, [r0+r3] ; line 3
movdqa %5, %3 movdqa %5, %3
punpcklbw %3, xmm7 punpcklbw %3, xmm7
punpckhbw %5, xmm7 punpckhbw %5, xmm7
movdqa %6, %4 movdqa %6, %4
punpcklbw %6, xmm7 punpcklbw %6, xmm7
punpckhbw %4, xmm7 punpckhbw %4, xmm7
paddw %3, %6 paddw %3, %6
paddw %4, %5 paddw %4, %5
paddw %1, %3 ; block 0, 1 paddw %1, %3 ; block 0, 1
paddw %2, %4 ; block 2, 3 paddw %2, %4 ; block 2, 3
phaddw %1, %2 ; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; .. phaddw %1, %2 ; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
phaddw %1, xmm7 ; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; .... phaddw %1, xmm7 ; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
psraw %1, $04 psraw %1, $04
%endmacro %endmacro
@@ -143,7 +143,7 @@ SECTION .text
; , 6/7/2010 ; , 6/7/2010
;*********************************************************************** ;***********************************************************************
; int32_t AnalysisVaaInfoIntra_sse2( uint8_t *pDataY, const int32_t iLineSize ); ; int32_t AnalysisVaaInfoIntra_sse2( uint8_t *pDataY, const int32_t iLineSize );
;*********************************************************************** ;***********************************************************************
WELS_EXTERN AnalysisVaaInfoIntra_sse2 WELS_EXTERN AnalysisVaaInfoIntra_sse2
@@ -174,71 +174,71 @@ WELS_EXTERN AnalysisVaaInfoIntra_sse2
mov r4,r2 mov r4,r2
sal r4,$01 ;r4 = 4*iLineSize sal r4,$01 ;r4 = 4*iLineSize
pxor xmm7, xmm7 pxor xmm7, xmm7
; loops ; loops
VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
movq [r7], xmm0 movq [r7], xmm0
lea r0, [r0+r4] lea r0, [r0+r4]
VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
movq [r7+8], xmm0 movq [r7+8], xmm0
lea r0, [r0+r4] lea r0, [r0+r4]
VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
movq [r7+16], xmm0 movq [r7+16], xmm0
lea r0, [r0+r4] lea r0, [r0+r4]
VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
movq [r7+24], xmm0 movq [r7+24], xmm0
movdqa xmm0, [r7] ; block 0~7 movdqa xmm0, [r7] ; block 0~7
movdqa xmm1, [r7+16] ; block 8~15 movdqa xmm1, [r7+16] ; block 8~15
movdqa xmm2, xmm0 movdqa xmm2, xmm0
paddw xmm0, xmm1 paddw xmm0, xmm1
SUM_WORD_8x2_SSE2 xmm0, xmm3 SUM_WORD_8x2_SSE2 xmm0, xmm3
pmullw xmm1, xmm1 pmullw xmm1, xmm1
pmullw xmm2, xmm2 pmullw xmm2, xmm2
movdqa xmm3, xmm1 movdqa xmm3, xmm1
movdqa xmm4, xmm2 movdqa xmm4, xmm2
punpcklwd xmm1, xmm7 punpcklwd xmm1, xmm7
punpckhwd xmm3, xmm7 punpckhwd xmm3, xmm7
punpcklwd xmm2, xmm7 punpcklwd xmm2, xmm7
punpckhwd xmm4, xmm7 punpckhwd xmm4, xmm7
paddd xmm1, xmm2 paddd xmm1, xmm2
paddd xmm3, xmm4 paddd xmm3, xmm4
paddd xmm1, xmm3 paddd xmm1, xmm3
pshufd xmm2, xmm1, 01Bh pshufd xmm2, xmm1, 01Bh
paddd xmm1, xmm2 paddd xmm1, xmm2
pshufd xmm2, xmm1, 0B1h pshufd xmm2, xmm1, 0B1h
paddd xmm1, xmm2 paddd xmm1, xmm2
movd r2d, xmm0 movd r2d, xmm0
and r2, 0ffffh ; effective low work truncated and r2, 0ffffh ; effective low work truncated
mov r3, r2 mov r3, r2
imul r2, r3 imul r2, r3
sar r2, $04 sar r2, $04
movd retrd, xmm1 movd retrd, xmm1
sub retrd, r2d sub retrd, r2d
add r7,32 add r7,32
add r7,r5 add r7,r5
%ifdef X86_32 %ifdef X86_32
pop r6 pop r6
pop r5 pop r5
pop r4 pop r4
pop r3 pop r3
%endif %endif
POP_XMM POP_XMM
ret ret
;*********************************************************************** ;***********************************************************************
; int32_t AnalysisVaaInfoIntra_ssse3( uint8_t *pDataY, const int32_t iLineSize ); ; int32_t AnalysisVaaInfoIntra_ssse3( uint8_t *pDataY, const int32_t iLineSize );
;*********************************************************************** ;***********************************************************************
WELS_EXTERN AnalysisVaaInfoIntra_ssse3 WELS_EXTERN AnalysisVaaInfoIntra_ssse3
@@ -269,47 +269,47 @@ WELS_EXTERN AnalysisVaaInfoIntra_ssse3
mov r4,r2 mov r4,r2
sal r4,$01 ;r4 = 4*iLineSize sal r4,$01 ;r4 = 4*iLineSize
pxor xmm7, xmm7 pxor xmm7, xmm7
; loops ; loops
VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
movq [r7],xmm0 movq [r7],xmm0
lea r0,[r0+r4] lea r0,[r0+r4]
VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6 VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
movq [r7+8],xmm1 movq [r7+8],xmm1
lea r0,[r0+r4] lea r0,[r0+r4]
VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
movq [r7+16],xmm0 movq [r7+16],xmm0
lea r0,[r0+r4] lea r0,[r0+r4]
VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6 VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
movq [r7+24],xmm1 movq [r7+24],xmm1
movdqa xmm0,[r7] movdqa xmm0,[r7]
movdqa xmm1,[r7+16] movdqa xmm1,[r7+16]
movdqa xmm2, xmm0 movdqa xmm2, xmm0
paddw xmm0, xmm1 paddw xmm0, xmm1
SUM_WORD_8x2_SSE2 xmm0, xmm3 ; better performance than that of phaddw sets SUM_WORD_8x2_SSE2 xmm0, xmm3 ; better performance than that of phaddw sets
pmullw xmm1, xmm1 pmullw xmm1, xmm1
pmullw xmm2, xmm2 pmullw xmm2, xmm2
movdqa xmm3, xmm1 movdqa xmm3, xmm1
movdqa xmm4, xmm2 movdqa xmm4, xmm2
punpcklwd xmm1, xmm7 punpcklwd xmm1, xmm7
punpckhwd xmm3, xmm7 punpckhwd xmm3, xmm7
punpcklwd xmm2, xmm7 punpcklwd xmm2, xmm7
punpckhwd xmm4, xmm7 punpckhwd xmm4, xmm7
paddd xmm1, xmm2 paddd xmm1, xmm2
paddd xmm3, xmm4 paddd xmm3, xmm4
paddd xmm1, xmm3 paddd xmm1, xmm3
pshufd xmm2, xmm1, 01Bh pshufd xmm2, xmm1, 01Bh
paddd xmm1, xmm2 paddd xmm1, xmm2
pshufd xmm2, xmm1, 0B1h pshufd xmm2, xmm1, 0B1h
paddd xmm1, xmm2 paddd xmm1, xmm2
movd r2d, xmm0 movd r2d, xmm0
@@ -318,94 +318,94 @@ WELS_EXTERN AnalysisVaaInfoIntra_ssse3
imul r2, r3 imul r2, r3
sar r2, $04 sar r2, $04
movd retrd, xmm1 movd retrd, xmm1
sub retrd, r2d sub retrd, r2d
add r7,32 add r7,32
add r7,r5 add r7,r5
%ifdef X86_32 %ifdef X86_32
pop r6 pop r6
pop r5 pop r5
pop r4 pop r4
pop r3 pop r3
%endif %endif
POP_XMM POP_XMM
ret ret
;*********************************************************************** ;***********************************************************************
; uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 ) ; uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 )
;*********************************************************************** ;***********************************************************************
WELS_EXTERN MdInterAnalysisVaaInfo_sse41 WELS_EXTERN MdInterAnalysisVaaInfo_sse41
%assign push_num 0 %assign push_num 0
LOAD_1_PARA LOAD_1_PARA
movdqa xmm0,[r0] movdqa xmm0,[r0]
pshufd xmm1, xmm0, 01Bh pshufd xmm1, xmm0, 01Bh
paddd xmm1, xmm0 paddd xmm1, xmm0
pshufd xmm2, xmm1, 0B1h pshufd xmm2, xmm1, 0B1h
paddd xmm1, xmm2 paddd xmm1, xmm2
psrad xmm1, 02h ; iAverageSad psrad xmm1, 02h ; iAverageSad
movdqa xmm2, xmm1 movdqa xmm2, xmm1
psrad xmm2, 06h psrad xmm2, 06h
movdqa xmm3, xmm0 ; iSadBlock movdqa xmm3, xmm0 ; iSadBlock
psrad xmm3, 06h psrad xmm3, 06h
psubd xmm3, xmm2 psubd xmm3, xmm2
pmulld xmm3, xmm3 ; [comment]: pmulld from SSE4.1 instruction sets pmulld xmm3, xmm3 ; [comment]: pmulld from SSE4.1 instruction sets
pshufd xmm4, xmm3, 01Bh pshufd xmm4, xmm3, 01Bh
paddd xmm4, xmm3 paddd xmm4, xmm3
pshufd xmm3, xmm4, 0B1h pshufd xmm3, xmm4, 0B1h
paddd xmm3, xmm4 paddd xmm3, xmm4
movd r0d, xmm3 movd r0d, xmm3
cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD
jb near .threshold_exit jb near .threshold_exit
pshufd xmm0, xmm0, 01Bh pshufd xmm0, xmm0, 01Bh
pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad
movmskps retrd, xmm0 movmskps retrd, xmm0
ret ret
.threshold_exit: .threshold_exit:
mov retrd, 15 mov retrd, 15
ret ret
;*********************************************************************** ;***********************************************************************
; uint8_t MdInterAnalysisVaaInfo_sse2( int32_t *pSad8x8 ) ; uint8_t MdInterAnalysisVaaInfo_sse2( int32_t *pSad8x8 )
;*********************************************************************** ;***********************************************************************
WELS_EXTERN MdInterAnalysisVaaInfo_sse2 WELS_EXTERN MdInterAnalysisVaaInfo_sse2
%assign push_num 0 %assign push_num 0
LOAD_1_PARA LOAD_1_PARA
movdqa xmm0, [r0] movdqa xmm0, [r0]
pshufd xmm1, xmm0, 01Bh pshufd xmm1, xmm0, 01Bh
paddd xmm1, xmm0 paddd xmm1, xmm0
pshufd xmm2, xmm1, 0B1h pshufd xmm2, xmm1, 0B1h
paddd xmm1, xmm2 paddd xmm1, xmm2
psrad xmm1, 02h ; iAverageSad psrad xmm1, 02h ; iAverageSad
movdqa xmm2, xmm1 movdqa xmm2, xmm1
psrad xmm2, 06h psrad xmm2, 06h
movdqa xmm3, xmm0 ; iSadBlock movdqa xmm3, xmm0 ; iSadBlock
psrad xmm3, 06h psrad xmm3, 06h
psubd xmm3, xmm2 psubd xmm3, xmm2
; to replace pmulld functionality as below ; to replace pmulld functionality as below
movdqa xmm2, xmm3 movdqa xmm2, xmm3
pmuludq xmm2, xmm3 pmuludq xmm2, xmm3
pshufd xmm4, xmm3, 0B1h pshufd xmm4, xmm3, 0B1h
pmuludq xmm4, xmm4 pmuludq xmm4, xmm4
movdqa xmm5, xmm2 movdqa xmm5, xmm2
punpckldq xmm5, xmm4 punpckldq xmm5, xmm4
punpckhdq xmm2, xmm4 punpckhdq xmm2, xmm4
punpcklqdq xmm5, xmm2 punpcklqdq xmm5, xmm2
pshufd xmm4, xmm5, 01Bh pshufd xmm4, xmm5, 01Bh
paddd xmm4, xmm5 paddd xmm4, xmm5
pshufd xmm5, xmm4, 0B1h pshufd xmm5, xmm4, 0B1h
paddd xmm5, xmm4 paddd xmm5, xmm4
movd r0d, xmm5 movd r0d, xmm5
cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD
jb near .threshold_exit jb near .threshold_exit
pshufd xmm0, xmm0, 01Bh pshufd xmm0, xmm0, 01Bh
pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad
movmskps retrd, xmm0 movmskps retrd, xmm0
ret ret
.threshold_exit: .threshold_exit:
mov retrd, 15 mov retrd, 15
ret ret
View File
@@ -36,128 +36,128 @@
#ifdef __APPLE__ #ifdef __APPLE__
.macro ROW_TRANSFORM_1_STEP .macro ROW_TRANSFORM_1_STEP
// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9 // { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2]; vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2];
vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2]; vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2];
vshr.s16 $8, $1, #1 vshr.s16 $8, $1, #1
vshr.s16 $9, $3, #1 vshr.s16 $9, $3, #1
vsubl.s16 $6, $8, $3 //int32 e[i][2] = (src[1]>>1)-src[3]; vsubl.s16 $6, $8, $3 //int32 e[i][2] = (src[1]>>1)-src[3];
vaddl.s16 $7, $1, $9 //int32 e[i][3] = src[1] + (src[3]>>1); vaddl.s16 $7, $1, $9 //int32 e[i][3] = src[1] + (src[3]>>1);
// } // }
.endm .endm
.macro TRANSFORM_4BYTES // both row & col transform used .macro TRANSFORM_4BYTES // both row & col transform used
// { // output: f_q[0]~[3], input: e_q[0]~[3]; // { // output: f_q[0]~[3], input: e_q[0]~[3];
vadd.s32 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3]; vadd.s32 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3];
vadd.s32 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2]; vadd.s32 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2];
vsub.s32 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2]; vsub.s32 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2];
vsub.s32 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3]; vsub.s32 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3];
// } // }
.endm .endm
.macro COL_TRANSFORM_1_STEP .macro COL_TRANSFORM_1_STEP
// { // input: src_q[0]~[3], output: e_q[0]~[3]; // { // input: src_q[0]~[3], output: e_q[0]~[3];
vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j]; vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j];
vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j]; vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j];
vshr.s32 $6, $1, #1 vshr.s32 $6, $1, #1
vshr.s32 $7, $3, #1 vshr.s32 $7, $3, #1
vsub.s32 $6, $6, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j]; vsub.s32 $6, $6, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
vadd.s32 $7, $1, $7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1); vadd.s32 $7, $1, $7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// } // }
.endm .endm
#else #else
.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9 .macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9 // { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2]; vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2];
vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2]; vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2];
vshr.s16 \arg8, \arg1, #1 vshr.s16 \arg8, \arg1, #1
vshr.s16 \arg9, \arg3, #1 vshr.s16 \arg9, \arg3, #1
vsubl.s16 \arg6, \arg8, \arg3 //int32 e[i][2] = (src[1]>>1)-src[3]; vsubl.s16 \arg6, \arg8, \arg3 //int32 e[i][2] = (src[1]>>1)-src[3];
vaddl.s16 \arg7, \arg1, \arg9 //int32 e[i][3] = src[1] + (src[3]>>1); vaddl.s16 \arg7, \arg1, \arg9 //int32 e[i][3] = src[1] + (src[3]>>1);
// } // }
.endm .endm
.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used .macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used
// { // output: f_q[0]~[3], input: e_q[0]~[3]; // { // output: f_q[0]~[3], input: e_q[0]~[3];
vadd.s32 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3]; vadd.s32 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3];
vadd.s32 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2]; vadd.s32 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2];
vsub.s32 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2]; vsub.s32 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2];
vsub.s32 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3]; vsub.s32 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3];
// } // }
.endm .endm
.macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 .macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// { // input: src_q[0]~[3], output: e_q[0]~[3]; // { // input: src_q[0]~[3], output: e_q[0]~[3];
vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j]; vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j];
vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j]; vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j];
vshr.s32 \arg6, \arg1, #1 vshr.s32 \arg6, \arg1, #1
vshr.s32 \arg7, \arg3, #1 vshr.s32 \arg7, \arg3, #1
vsub.s32 \arg6, \arg6, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j]; vsub.s32 \arg6, \arg6, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
vadd.s32 \arg7, \arg1, \arg7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1); vadd.s32 \arg7, \arg1, \arg7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// } // }
.endm .endm
#endif #endif
// r0 int16_t* block, // r0 int16_t* block,
// r1 int8_t* non_zero_count, // r1 int8_t* non_zero_count,
WELS_ASM_FUNC_BEGIN SetNonZeroCount_neon WELS_ASM_FUNC_BEGIN SetNonZeroCount_neon
vld1.64 {d0-d2}, [r1] vld1.64 {d0-d2}, [r1]
vceq.s8 q0, q0, #0 vceq.s8 q0, q0, #0
vceq.s8 d2, d2, #0 vceq.s8 d2, d2, #0
vmvn q0, q0 vmvn q0, q0
vmvn d2, d2 vmvn d2, d2
vabs.s8 q0, q0 vabs.s8 q0, q0
vabs.s8 d2, d2 vabs.s8 d2, d2
vst1.64 {d0-d2}, [r1] vst1.64 {d0-d2}, [r1]
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
// uint8_t *pred, const int32_t stride, int16_t *rs // uint8_t *pred, const int32_t stride, int16_t *rs
WELS_ASM_FUNC_BEGIN IdctResAddPred_neon WELS_ASM_FUNC_BEGIN IdctResAddPred_neon
vld4.s16 {d0, d1, d2, d3}, [r2] // cost 3 cycles! vld4.s16 {d0, d1, d2, d3}, [r2] // cost 3 cycles!
ROW_TRANSFORM_1_STEP d0, d1, d2, d3, q8, q9, q10, q11, d4, d5 ROW_TRANSFORM_1_STEP d0, d1, d2, d3, q8, q9, q10, q11, d4, d5
TRANSFORM_4BYTES q0, q1, q2, q3, q8, q9, q10, q11 TRANSFORM_4BYTES q0, q1, q2, q3, q8, q9, q10, q11
// transform element 32bits // transform element 32bits
vtrn.s32 q0, q1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7] vtrn.s32 q0, q1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
vtrn.s32 q2, q3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15] vtrn.s32 q2, q3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
vswp d1, d4 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14] vswp d1, d4 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
vswp d3, d6 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15] vswp d3, d6 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
COL_TRANSFORM_1_STEP q0, q1, q2, q3, q8, q9, q10, q11 COL_TRANSFORM_1_STEP q0, q1, q2, q3, q8, q9, q10, q11
TRANSFORM_4BYTES q0, q1, q2, q3, q8, q9, q10, q11 TRANSFORM_4BYTES q0, q1, q2, q3, q8, q9, q10, q11
//after clip_table[MAX_NEG_CROP] into [0, 255] //after clip_table[MAX_NEG_CROP] into [0, 255]
mov r2, r0 mov r2, r0
vld1.32 {d20[0]},[r0],r1 vld1.32 {d20[0]},[r0],r1
vld1.32 {d20[1]},[r0],r1 vld1.32 {d20[1]},[r0],r1
vld1.32 {d22[0]},[r0],r1 vld1.32 {d22[0]},[r0],r1
vld1.32 {d22[1]},[r0] vld1.32 {d22[1]},[r0]
vrshrn.s32 d16, q0, #6 vrshrn.s32 d16, q0, #6
vrshrn.s32 d17, q1, #6 vrshrn.s32 d17, q1, #6
vrshrn.s32 d18, q2, #6 vrshrn.s32 d18, q2, #6
vrshrn.s32 d19, q3, #6 vrshrn.s32 d19, q3, #6
vmovl.u8 q0,d20 vmovl.u8 q0,d20
vmovl.u8 q1,d22 vmovl.u8 q1,d22
vadd.s16 q0,q8 vadd.s16 q0,q8
vadd.s16 q1,q9 vadd.s16 q1,q9
vqmovun.s16 d20,q0 vqmovun.s16 d20,q0
vqmovun.s16 d22,q1 vqmovun.s16 d22,q1
vst1.32 {d20[0]},[r2],r1 vst1.32 {d20[0]},[r2],r1
vst1.32 {d20[1]},[r2],r1 vst1.32 {d20[1]},[r2],r1
vst1.32 {d22[0]},[r2],r1 vst1.32 {d22[0]},[r2],r1
vst1.32 {d22[1]},[r2] vst1.32 {d22[1]},[r2]
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
#endif #endif
View File
@@ -38,104 +38,104 @@
#ifdef __APPLE__ #ifdef __APPLE__
//Global macro //Global macro
.macro GET_8BYTE_DATA .macro GET_8BYTE_DATA
vld1.8 {$0[0]}, [$1], $2 vld1.8 {$0[0]}, [$1], $2
vld1.8 {$0[1]}, [$1], $2 vld1.8 {$0[1]}, [$1], $2
vld1.8 {$0[2]}, [$1], $2 vld1.8 {$0[2]}, [$1], $2
vld1.8 {$0[3]}, [$1], $2 vld1.8 {$0[3]}, [$1], $2
vld1.8 {$0[4]}, [$1], $2 vld1.8 {$0[4]}, [$1], $2
vld1.8 {$0[5]}, [$1], $2 vld1.8 {$0[5]}, [$1], $2
vld1.8 {$0[6]}, [$1], $2 vld1.8 {$0[6]}, [$1], $2
vld1.8 {$0[7]}, [$1], $2 vld1.8 {$0[7]}, [$1], $2
.endmacro .endmacro
#else #else
//Global macro //Global macro
.macro GET_8BYTE_DATA arg0, arg1, arg2 .macro GET_8BYTE_DATA arg0, arg1, arg2
vld1.8 {\arg0[0]}, [\arg1], \arg2 vld1.8 {\arg0[0]}, [\arg1], \arg2
vld1.8 {\arg0[1]}, [\arg1], \arg2 vld1.8 {\arg0[1]}, [\arg1], \arg2
vld1.8 {\arg0[2]}, [\arg1], \arg2 vld1.8 {\arg0[2]}, [\arg1], \arg2
vld1.8 {\arg0[3]}, [\arg1], \arg2 vld1.8 {\arg0[3]}, [\arg1], \arg2
vld1.8 {\arg0[4]}, [\arg1], \arg2 vld1.8 {\arg0[4]}, [\arg1], \arg2
vld1.8 {\arg0[5]}, [\arg1], \arg2 vld1.8 {\arg0[5]}, [\arg1], \arg2
vld1.8 {\arg0[6]}, [\arg1], \arg2 vld1.8 {\arg0[6]}, [\arg1], \arg2
vld1.8 {\arg0[7]}, [\arg1], \arg2 vld1.8 {\arg0[7]}, [\arg1], \arg2
.endm .endm
#endif #endif
WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredV_neon WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredV_neon
//Get the top line data to 'q0' //Get the top line data to 'q0'
sub r2, r0, r1 sub r2, r0, r1
vldm r2, {d0, d1} vldm r2, {d0, d1}
mov r2, r0 mov r2, r0
mov r3, #4 mov r3, #4
//Set the top line to the each line of MB(16*16) //Set the top line to the each line of MB(16*16)
loop_0_get_i16x16_luma_pred_v: loop_0_get_i16x16_luma_pred_v:
vst1.8 {d0,d1}, [r2], r1 vst1.8 {d0,d1}, [r2], r1
vst1.8 {d0,d1}, [r2], r1 vst1.8 {d0,d1}, [r2], r1
vst1.8 {d0,d1}, [r2], r1 vst1.8 {d0,d1}, [r2], r1
vst1.8 {d0,d1}, [r2], r1 vst1.8 {d0,d1}, [r2], r1
subs r3, #1 subs r3, #1
bne loop_0_get_i16x16_luma_pred_v bne loop_0_get_i16x16_luma_pred_v
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredH_neon WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredH_neon
sub r2, r0, #1 sub r2, r0, #1
mov r3, #4 mov r3, #4
loop_0_get_i16x16_luma_pred_h: loop_0_get_i16x16_luma_pred_h:
//Get one byte data from left side //Get one byte data from left side
vld1.8 {d0[],d1[]}, [r2], r1 vld1.8 {d0[],d1[]}, [r2], r1
vld1.8 {d2[],d3[]}, [r2], r1 vld1.8 {d2[],d3[]}, [r2], r1
vld1.8 {d4[],d5[]}, [r2], r1 vld1.8 {d4[],d5[]}, [r2], r1
vld1.8 {d6[],d7[]}, [r2], r1 vld1.8 {d6[],d7[]}, [r2], r1
//Set the line of MB using the left side byte data //Set the line of MB using the left side byte data
vst1.8 {d0,d1}, [r0], r1 vst1.8 {d0,d1}, [r0], r1
vst1.8 {d2,d3}, [r0], r1 vst1.8 {d2,d3}, [r0], r1
vst1.8 {d4,d5}, [r0], r1 vst1.8 {d4,d5}, [r0], r1
vst1.8 {d6,d7}, [r0], r1 vst1.8 {d6,d7}, [r0], r1
subs r3, #1 subs r3, #1
bne loop_0_get_i16x16_luma_pred_h bne loop_0_get_i16x16_luma_pred_h
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredDc_neon WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredDc_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
//Get the left vertical line data //Get the left vertical line data
sub r2, r0, #1 sub r2, r0, #1
GET_8BYTE_DATA d0, r2, r1 GET_8BYTE_DATA d0, r2, r1
GET_8BYTE_DATA d1, r2, r1 GET_8BYTE_DATA d1, r2, r1
//Get the top horizontal line data //Get the top horizontal line data
sub r2, r0, r1 sub r2, r0, r1
vldm r2, {d2, d3} vldm r2, {d2, d3}
//Calculate the sum of top horizontal line data and vertical line data //Calculate the sum of top horizontal line data and vertical line data
vpaddl.u8 q0, q0 vpaddl.u8 q0, q0
vpaddl.u8 q1, q1 vpaddl.u8 q1, q1
vadd.u16 q0, q0, q1 vadd.u16 q0, q0, q1
vadd.u16 d0, d0, d1 vadd.u16 d0, d0, d1
vpaddl.u16 d0, d0 vpaddl.u16 d0, d0
vpaddl.u32 d0, d0 vpaddl.u32 d0, d0
//Calculate the mean value //Calculate the mean value
vrshr.u16 d0, d0, #5 vrshr.u16 d0, d0, #5
vdup.8 q0, d0[0] vdup.8 q0, d0[0]
//Set the mean value to the all of member of MB //Set the mean value to the all of member of MB
mov r2, #4 mov r2, #4
loop_0_get_i16x16_luma_pred_dc_both: loop_0_get_i16x16_luma_pred_dc_both:
vst1.8 {d0,d1}, [r0], r1 vst1.8 {d0,d1}, [r0], r1
vst1.8 {d0,d1}, [r0], r1 vst1.8 {d0,d1}, [r0], r1
vst1.8 {d0,d1}, [r0], r1 vst1.8 {d0,d1}, [r0], r1
vst1.8 {d0,d1}, [r0], r1 vst1.8 {d0,d1}, [r0], r1
subs r2, #1 subs r2, #1
bne loop_0_get_i16x16_luma_pred_dc_both bne loop_0_get_i16x16_luma_pred_dc_both
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
@@ -149,386 +149,386 @@ CONST1_GET_I16X16_LUMA_PRED_PLANE: .long 0xfcfbfaf9, 0x00fffefd
WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredPlane_neon WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredPlane_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
//Load the table {(8,7,6,5,4,3,2,1) * 5} //Load the table {(8,7,6,5,4,3,2,1) * 5}
adr r2, CONST0_GET_I16X16_LUMA_PRED_PLANE adr r2, CONST0_GET_I16X16_LUMA_PRED_PLANE
vldr d0, [r2] vldr d0, [r2]
//Pack the top[-1] ~ top[6] to d1 //Pack the top[-1] ~ top[6] to d1
sub r2, r0, r1 sub r2, r0, r1
sub r3, r2, #1 sub r3, r2, #1
vld1.8 d1, [r3] vld1.8 d1, [r3]
//Pack the top[8] ~ top[15] to d2 //Pack the top[8] ~ top[15] to d2
add r3, #9 add r3, #9
vld1.8 d2, [r3] vld1.8 d2, [r3]
//Save the top[15] to d6 for next step //Save the top[15] to d6 for next step
vdup.u8 d6, d2[7] vdup.u8 d6, d2[7]
//Get and pack left[-1] ~ left[6] to d4 //Get and pack left[-1] ~ left[6] to d4
sub r3, r2, #1 sub r3, r2, #1
GET_8BYTE_DATA d4, r3, r1 GET_8BYTE_DATA d4, r3, r1
//Get and pack left[8] ~ left[15] to d3 //Get and pack left[8] ~ left[15] to d3
add r3, r1 add r3, r1
GET_8BYTE_DATA d3, r3, r1 GET_8BYTE_DATA d3, r3, r1
//Save the left[15] to d7 for next step //Save the left[15] to d7 for next step
vdup.u8 d7, d3[7] vdup.u8 d7, d3[7]
//revert the sequence of d2,d3 //revert the sequence of d2,d3
vrev64.8 q1, q1 vrev64.8 q1, q1
vsubl.u8 q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...} vsubl.u8 q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...}
vsubl.u8 q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...} vsubl.u8 q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...}
vmovl.u8 q0, d0 vmovl.u8 q0, d0
vmul.s16 q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5} vmul.s16 q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5}
vmul.s16 q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5} vmul.s16 q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5}
//Calculate the sum of items of q1, q2 //Calculate the sum of items of q1, q2
vpadd.s16 d0, d2, d3 vpadd.s16 d0, d2, d3
vpadd.s16 d1, d4, d5 vpadd.s16 d1, d4, d5
vpaddl.s16 q0, q0 vpaddl.s16 q0, q0
vpaddl.s32 q0, q0 vpaddl.s32 q0, q0
//Get the value of 'b', 'c' and extend to q1, q2. //Get the value of 'b', 'c' and extend to q1, q2.
vrshr.s64 q0, #6 vrshr.s64 q0, #6
vdup.s16 q1, d0[0] vdup.s16 q1, d0[0]
vdup.s16 q2, d1[0] vdup.s16 q2, d1[0]
//Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0 //Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0
adr r2, CONST1_GET_I16X16_LUMA_PRED_PLANE adr r2, CONST1_GET_I16X16_LUMA_PRED_PLANE
vld1.32 {d0}, [r2] vld1.32 {d0}, [r2]
//Get the value of 'a' and save to q3 //Get the value of 'a' and save to q3
vaddl.u8 q3, d6, d7 vaddl.u8 q3, d6, d7
vshl.u16 q3, #4 vshl.u16 q3, #4
//calculate a+'b'*{-7,-6,-5,-4,-3,-2,-1,0} + c*{-7} //calculate a+'b'*{-7,-6,-5,-4,-3,-2,-1,0} + c*{-7}
vmovl.s8 q0, d0 vmovl.s8 q0, d0
vmla.s16 q3, q0, q1 vmla.s16 q3, q0, q1
vmla.s16 q3, q2, d0[0] vmla.s16 q3, q2, d0[0]
//Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7} //Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7}
vshl.s16 q8, q1, #3 vshl.s16 q8, q1, #3
vadd.s16 q8, q3 vadd.s16 q8, q3
//right shift 5 bits and rounding //right shift 5 bits and rounding
vqrshrun.s16 d0, q3, #5 vqrshrun.s16 d0, q3, #5
vqrshrun.s16 d1, q8, #5 vqrshrun.s16 d1, q8, #5
//Set the line of MB //Set the line of MB
vst1.u32 {d0,d1}, [r0], r1 vst1.u32 {d0,d1}, [r0], r1
//Do the same processing for setting other lines //Do the same processing for setting other lines
mov r2, #15 mov r2, #15
loop_0_get_i16x16_luma_pred_plane: loop_0_get_i16x16_luma_pred_plane:
vadd.s16 q3, q2 vadd.s16 q3, q2
vadd.s16 q8, q2 vadd.s16 q8, q2
vqrshrun.s16 d0, q3, #5 vqrshrun.s16 d0, q3, #5
vqrshrun.s16 d1, q8, #5 vqrshrun.s16 d1, q8, #5
vst1.u32 {d0,d1}, [r0], r1 vst1.u32 {d0,d1}, [r0], r1
subs r2, #1 subs r2, #1
bne loop_0_get_i16x16_luma_pred_plane bne loop_0_get_i16x16_luma_pred_plane
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredV_neon WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredV_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
//Load the top row (4 bytes) //Load the top row (4 bytes)
sub r2, r0, r1 sub r2, r0, r1
ldr r2, [r2] ldr r2, [r2]
//Set the luma MB using top line //Set the luma MB using top line
str r2, [r0], r1 str r2, [r0], r1
str r2, [r0], r1 str r2, [r0], r1
str r2, [r0], r1 str r2, [r0], r1
str r2, [r0] str r2, [r0]
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredH_neon WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredH_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
//Load the left column (4 bytes) //Load the left column (4 bytes)
sub r2, r0, #1 sub r2, r0, #1
vld1.8 {d0[]}, [r2], r1 vld1.8 {d0[]}, [r2], r1
vld1.8 {d1[]}, [r2], r1 vld1.8 {d1[]}, [r2], r1
vld1.8 {d2[]}, [r2], r1 vld1.8 {d2[]}, [r2], r1
vld1.8 {d3[]}, [r2] vld1.8 {d3[]}, [r2]
//Set the luma MB using the left side byte //Set the luma MB using the left side byte
vst1.32 {d0[0]}, [r0], r1 vst1.32 {d0[0]}, [r0], r1
vst1.32 {d1[0]}, [r0], r1 vst1.32 {d1[0]}, [r0], r1
vst1.32 {d2[0]}, [r0], r1 vst1.32 {d2[0]}, [r0], r1
vst1.32 {d3[0]}, [r0] vst1.32 {d3[0]}, [r0]
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredDDL_neon WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredDDL_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
//Load the top row data(8 bytes) //Load the top row data(8 bytes)
sub r2, r0, r1 sub r2, r0, r1
vld1.32 {d0}, [r2] vld1.32 {d0}, [r2]
//For "t7 + (t7<<1)" //For "t7 + (t7<<1)"
vdup.8 d1, d0[7] vdup.8 d1, d0[7]
//calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7" //calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7"
vext.8 d1, d0, d1, #1 vext.8 d1, d0, d1, #1
vaddl.u8 q1, d1, d0 vaddl.u8 q1, d1, d0
//calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7" //calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7"
vext.8 q2, q1, q1, #14 vext.8 q2, q1, q1, #14
vadd.u16 q0, q1, q2 vadd.u16 q0, q1, q2
//right shift 2 bits and rounding //right shift 2 bits and rounding
vqrshrn.u16 d0, q0, #2 vqrshrn.u16 d0, q0, #2
//Save "ddl0, ddl1, ddl2, ddl3" //Save "ddl0, ddl1, ddl2, ddl3"
vext.8 d1, d0, d0, #1 vext.8 d1, d0, d0, #1
vst1.32 d1[0], [r0], r1 vst1.32 d1[0], [r0], r1
//Save "ddl1, ddl2, ddl3, ddl4" //Save "ddl1, ddl2, ddl3, ddl4"
vext.8 d1, d0, d0, #2 vext.8 d1, d0, d0, #2
vst1.32 d1[0], [r0], r1 vst1.32 d1[0], [r0], r1
//Save "ddl2, ddl3, ddl4, ddl5" //Save "ddl2, ddl3, ddl4, ddl5"
vext.8 d1, d0, d0, #3 vext.8 d1, d0, d0, #3
vst1.32 d1[0], [r0], r1 vst1.32 d1[0], [r0], r1
//Save "ddl3, ddl4, ddl5, ddl6" //Save "ddl3, ddl4, ddl5, ddl6"
vst1.32 d0[1], [r0] vst1.32 d0[1], [r0]
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredDDR_neon WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredDDR_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
//Load the top row (4 bytes) //Load the top row (4 bytes)
sub r2, r0, r1 sub r2, r0, r1
vld1.32 {d0[1]}, [r2] vld1.32 {d0[1]}, [r2]
//Load the left column (5 bytes) //Load the left column (5 bytes)
sub r2, #1 sub r2, #1
vld1.8 {d0[3]}, [r2], r1 vld1.8 {d0[3]}, [r2], r1
vld1.8 {d0[2]}, [r2], r1 vld1.8 {d0[2]}, [r2], r1
vld1.8 {d0[1]}, [r2], r1 vld1.8 {d0[1]}, [r2], r1
vld1.8 {d0[0]}, [r2], r1 vld1.8 {d0[0]}, [r2], r1
vld1.8 {d1[7]}, [r2] //For packing the right sequence to do SIMD processing vld1.8 {d1[7]}, [r2] //For packing the right sequence to do SIMD processing
vext.8 d2, d1, d0, #7 //d0:{L2,L1,L0,LT,T0,T1,T2,T3} vext.8 d2, d1, d0, #7 //d0:{L2,L1,L0,LT,T0,T1,T2,T3}
//d2:{L3,L2,L1,L0,LT,T0,T1,T2} //d2:{L3,L2,L1,L0,LT,T0,T1,T2}
//q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3} //q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3}
vaddl.u8 q2, d2, d0 vaddl.u8 q2, d2, d0
//q1:{TL0+LT0,LT0+T01,...L12+L23} //q1:{TL0+LT0,LT0+T01,...L12+L23}
vext.8 q3, q3, q2, #14 vext.8 q3, q3, q2, #14
vadd.u16 q1, q2, q3 vadd.u16 q1, q2, q3
//right shift 2 bits and rounding //right shift 2 bits and rounding
vqrshrn.u16 d0, q1, #2 vqrshrn.u16 d0, q1, #2
//Adjust the data sequence for setting luma MB of 'pred' //Adjust the data sequence for setting luma MB of 'pred'
vst1.32 d0[1], [r0], r1 vst1.32 d0[1], [r0], r1
vext.8 d0, d0, d0, #7 vext.8 d0, d0, d0, #7
vst1.32 d0[1], [r0], r1 vst1.32 d0[1], [r0], r1
vext.8 d0, d0, d0, #7 vext.8 d0, d0, d0, #7
vst1.32 d0[1], [r0], r1 vst1.32 d0[1], [r0], r1
vext.8 d0, d0, d0, #7 vext.8 d0, d0, d0, #7
vst1.32 d0[1], [r0] vst1.32 d0[1], [r0]
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredVL_neon WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredVL_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
//Load the top row (8 bytes) //Load the top row (8 bytes)
sub r2, r0, r1 sub r2, r0, r1
vld1.32 {d0}, [r2] vld1.32 {d0}, [r2]
vext.8 d1, d0, d0, #1 vext.8 d1, d0, d0, #1
vaddl.u8 q1, d1, d0 //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x} vaddl.u8 q1, d1, d0 //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x}
vext.8 q2, q1, q1, #2 vext.8 q2, q1, q1, #2
vadd.u16 q2, q1, q2 //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x} vadd.u16 q2, q1, q2 //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x}
//calculate the "vl0,vl1,vl2,vl3,vl4" //calculate the "vl0,vl1,vl2,vl3,vl4"
vqrshrn.u16 d0, q1, #1 vqrshrn.u16 d0, q1, #1
//calculate the "vl5,vl6,vl7,vl8,vl9" //calculate the "vl5,vl6,vl7,vl8,vl9"
vqrshrn.u16 d1, q2, #2 vqrshrn.u16 d1, q2, #2
//Adjust the data sequence for setting the luma MB //Adjust the data sequence for setting the luma MB
vst1.32 d0[0], [r0], r1 vst1.32 d0[0], [r0], r1
vst1.32 d1[0], [r0], r1 vst1.32 d1[0], [r0], r1
vext.8 d0, d0, d0, #1 vext.8 d0, d0, d0, #1
vext.8 d1, d1, d1, #1 vext.8 d1, d1, d1, #1
vst1.32 d0[0], [r0], r1 vst1.32 d0[0], [r0], r1
vst1.32 d1[0], [r0] vst1.32 d1[0], [r0]
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredVR_neon WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredVR_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
//Load the top row (4 bytes) //Load the top row (4 bytes)
sub r2, r0, r1 sub r2, r0, r1
vld1.32 {d0[1]}, [r2] vld1.32 {d0[1]}, [r2]
//Load the left column (4 bytes) //Load the left column (4 bytes)
sub r2, #1 sub r2, #1
vld1.8 {d0[3]}, [r2], r1 vld1.8 {d0[3]}, [r2], r1
vld1.8 {d0[2]}, [r2], r1 vld1.8 {d0[2]}, [r2], r1
vld1.8 {d0[1]}, [r2], r1 vld1.8 {d0[1]}, [r2], r1
vld1.8 {d0[0]}, [r2] vld1.8 {d0[0]}, [r2]
vext.8 d1, d0, d0, #7 vext.8 d1, d0, d0, #7
vaddl.u8 q1, d0, d1 //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3} vaddl.u8 q1, d0, d1 //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}
vext.u8 q2, q1, q1, #14 vext.u8 q2, q1, q1, #14
vadd.u16 q2, q2, q1 //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3} vadd.u16 q2, q2, q1 //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3}
//Calculate the vr0 ~ vr9 //Calculate the vr0 ~ vr9
vqrshrn.u16 d1, q2, #2 vqrshrn.u16 d1, q2, #2
vqrshrn.u16 d0, q1, #1 vqrshrn.u16 d0, q1, #1
//Adjust the data sequence for setting the luma MB //Adjust the data sequence for setting the luma MB
vst1.32 d0[1], [r0], r1 vst1.32 d0[1], [r0], r1
vst1.32 d1[1], [r0], r1 vst1.32 d1[1], [r0], r1
add r2, r0, r1 add r2, r0, r1
vst1.8 d1[3], [r0]! vst1.8 d1[3], [r0]!
vst1.16 d0[2], [r0]! vst1.16 d0[2], [r0]!
vst1.8 d0[6], [r0]! vst1.8 d0[6], [r0]!
vst1.8 d1[2], [r2]! vst1.8 d1[2], [r2]!
vst1.16 d1[2], [r2]! vst1.16 d1[2], [r2]!
vst1.8 d1[6], [r2] vst1.8 d1[6], [r2]
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredHU_neon WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredHU_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
//Load the left column data //Load the left column data
sub r2, r0, #1 sub r2, r0, #1
mov r3, #3 mov r3, #3
mul r3, r1 mul r3, r1
add r3, r2 add r3, r2
vld1.8 {d0[]}, [r3] vld1.8 {d0[]}, [r3]
vld1.8 {d0[4]}, [r2], r1 vld1.8 {d0[4]}, [r2], r1
vld1.8 {d0[5]}, [r2], r1 vld1.8 {d0[5]}, [r2], r1
vld1.8 {d0[6]}, [r2], r1 //d0:{L3,L3,L3,L3,L0,L1,L2,L3} vld1.8 {d0[6]}, [r2], r1 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}
vext.8 d1, d0, d0, #1 vext.8 d1, d0, d0, #1
vaddl.u8 q2, d0, d1 //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3} vaddl.u8 q2, d0, d1 //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
vext.u8 d2, d5, d4, #2 vext.u8 d2, d5, d4, #2
vadd.u16 d3, d2, d5 //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3} vadd.u16 d3, d2, d5 //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}
//Calculate the hu0 ~ hu5 //Calculate the hu0 ~ hu5
vqrshrn.u16 d2, q2, #1 vqrshrn.u16 d2, q2, #1
vqrshrn.u16 d1, q1, #2 vqrshrn.u16 d1, q1, #2
//Adjust the data sequence for setting the luma MB //Adjust the data sequence for setting the luma MB
vzip.8 d2, d1 vzip.8 d2, d1
vst1.32 d1[0], [r0], r1 vst1.32 d1[0], [r0], r1
vext.8 d2, d1, d1, #2 vext.8 d2, d1, d1, #2
vst1.32 d2[0], [r0], r1 vst1.32 d2[0], [r0], r1
vst1.32 d1[1], [r0], r1 vst1.32 d1[1], [r0], r1
vst1.32 d0[0], [r0] vst1.32 d0[0], [r0]
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredHD_neon WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredHD_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
//Load the data //Load the data
sub r2, r0, r1 sub r2, r0, r1
sub r2, #1 sub r2, #1
vld1.32 {d0[1]}, [r2], r1 vld1.32 {d0[1]}, [r2], r1
vld1.8 {d0[3]}, [r2], r1 vld1.8 {d0[3]}, [r2], r1
vld1.8 {d0[2]}, [r2], r1 vld1.8 {d0[2]}, [r2], r1
vld1.8 {d0[1]}, [r2], r1 vld1.8 {d0[1]}, [r2], r1
vld1.8 {d0[0]}, [r2] //d0:{L3,L2,L1,L0,LT,T0,T1,T2} vld1.8 {d0[0]}, [r2] //d0:{L3,L2,L1,L0,LT,T0,T1,T2}
vext.8 d1, d0, d0, #7 vext.8 d1, d0, d0, #7
vaddl.u8 q1, d0, d1 //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2} vaddl.u8 q1, d0, d1 //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2}
vext.u8 q2, q1, q1, #14 //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1} vext.u8 q2, q1, q1, #14 //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1}
vadd.u16 q3, q2, q1 //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2} vadd.u16 q3, q2, q1 //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2}
//Calculate the hd0~hd9 //Calculate the hd0~hd9
vqrshrn.u16 d1, q3, #2 vqrshrn.u16 d1, q3, #2
vqrshrn.u16 d0, q2, #1 vqrshrn.u16 d0, q2, #1
//Adjust the data sequence for setting the luma MB //Adjust the data sequence for setting the luma MB
vmov d3, d1 vmov d3, d1
vtrn.8 d0, d1 vtrn.8 d0, d1
vext.u8 d2, d1, d1, #6 vext.u8 d2, d1, d1, #6
vst2.16 {d2[3], d3[3]}, [r0], r1 vst2.16 {d2[3], d3[3]}, [r0], r1
vst2.16 {d0[2], d1[2]}, [r0], r1 vst2.16 {d0[2], d1[2]}, [r0], r1
vmov d3, d0 vmov d3, d0
vst2.16 {d2[2], d3[2]}, [r0], r1 vst2.16 {d2[2], d3[2]}, [r0], r1
vst2.16 {d0[1], d1[1]}, [r0] vst2.16 {d0[1], d1[1]}, [r0]
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredV_neon WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredV_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
//Get the top row (8 byte) //Get the top row (8 byte)
sub r2, r0, r1 sub r2, r0, r1
vldr d0, [r2] vldr d0, [r2]
//Set the chroma MB using top row data //Set the chroma MB using top row data
vst1.8 {d0}, [r0], r1 vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1 vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1 vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1 vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1 vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1 vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1 vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0] vst1.8 {d0}, [r0]
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredH_neon WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredH_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
////Get the left column (8 byte) ////Get the left column (8 byte)
sub r2, r0, #1 sub r2, r0, #1
vld1.8 {d0[]}, [r2], r1 vld1.8 {d0[]}, [r2], r1
vld1.8 {d1[]}, [r2], r1 vld1.8 {d1[]}, [r2], r1
vld1.8 {d2[]}, [r2], r1 vld1.8 {d2[]}, [r2], r1
vld1.8 {d3[]}, [r2], r1 vld1.8 {d3[]}, [r2], r1
vld1.8 {d4[]}, [r2], r1 vld1.8 {d4[]}, [r2], r1
vld1.8 {d5[]}, [r2], r1 vld1.8 {d5[]}, [r2], r1
vld1.8 {d6[]}, [r2], r1 vld1.8 {d6[]}, [r2], r1
vld1.8 {d7[]}, [r2] vld1.8 {d7[]}, [r2]
//Set the chroma MB using left column data //Set the chroma MB using left column data
vst1.8 {d0}, [r0], r1 vst1.8 {d0}, [r0], r1
vst1.8 {d1}, [r0], r1 vst1.8 {d1}, [r0], r1
vst1.8 {d2}, [r0], r1 vst1.8 {d2}, [r0], r1
vst1.8 {d3}, [r0], r1 vst1.8 {d3}, [r0], r1
vst1.8 {d4}, [r0], r1 vst1.8 {d4}, [r0], r1
vst1.8 {d5}, [r0], r1 vst1.8 {d5}, [r0], r1
vst1.8 {d6}, [r0], r1 vst1.8 {d6}, [r0], r1
vst1.8 {d7}, [r0] vst1.8 {d7}, [r0]
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
@ -576,73 +576,73 @@ CONST0_GET_I_CHROMA_PRED_PLANE: .long 0x44332211, 0x44332211//0x140f0a05, 0x2823
CONST1_GET_I_CHROMA_PRED_PLANE: .long 0xfffefffd, 0x0000ffff,0x00020001,0x00040003 CONST1_GET_I_CHROMA_PRED_PLANE: .long 0xfffefffd, 0x0000ffff,0x00020001,0x00040003
WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredPlane_neon WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredPlane_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
//Load the top row data //Load the top row data
sub r2, r0, #1 sub r2, r0, #1
sub r2, r1 sub r2, r1
vld1.32 {d1[0]}, [r2] vld1.32 {d1[0]}, [r2]
add r2, #5 add r2, #5
vld1.32 {d0[0]}, [r2] vld1.32 {d0[0]}, [r2]
//Load the left column data //Load the left column data
sub r2, #5 sub r2, #5
vld1.8 {d1[4]}, [r2], r1 vld1.8 {d1[4]}, [r2], r1
vld1.8 {d1[5]}, [r2], r1 vld1.8 {d1[5]}, [r2], r1
vld1.8 {d1[6]}, [r2], r1 vld1.8 {d1[6]}, [r2], r1
vld1.8 {d1[7]}, [r2], r1 //d1:{LT,T0,T1,T2,LT,L0,L1,L2} vld1.8 {d1[7]}, [r2], r1 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
add r2, r1 add r2, r1
vld1.8 {d0[4]}, [r2], r1 vld1.8 {d0[4]}, [r2], r1
vld1.8 {d0[5]}, [r2], r1 vld1.8 {d0[5]}, [r2], r1
vld1.8 {d0[6]}, [r2], r1 vld1.8 {d0[6]}, [r2], r1
vld1.8 {d0[7]}, [r2] //d0:{T4,T5,T6,T7,L4,L5,L6,L7} vld1.8 {d0[7]}, [r2] //d0:{T4,T5,T6,T7,L4,L5,L6,L7}
//Save T7 to d3 for next step //Save T7 to d3 for next step
vdup.u8 d3, d0[3] vdup.u8 d3, d0[3]
//Save L7 to d4 for next step //Save L7 to d4 for next step
vdup.u8 d4, d0[7] vdup.u8 d4, d0[7]
//Calculate the value of 'a' and save to q2 //Calculate the value of 'a' and save to q2
vaddl.u8 q2, d3, d4 vaddl.u8 q2, d3, d4
vshl.u16 q2, #4 vshl.u16 q2, #4
//Load the table {{1,2,3,4,1,2,3,4}*17} //Load the table {{1,2,3,4,1,2,3,4}*17}
adr r2, CONST0_GET_I_CHROMA_PRED_PLANE adr r2, CONST0_GET_I_CHROMA_PRED_PLANE
vld1.32 {d2}, [r2] vld1.32 {d2}, [r2]
//Calculate the 'b','c', and save to q0 //Calculate the 'b','c', and save to q0
vrev32.8 d1, d1 vrev32.8 d1, d1
vsubl.u8 q0, d0, d1 vsubl.u8 q0, d0, d1
vmovl.u8 q1, d2 vmovl.u8 q1, d2
vmul.s16 q0, q1 vmul.s16 q0, q1
vpaddl.s16 q0, q0 vpaddl.s16 q0, q0
vpaddl.s32 q0, q0 vpaddl.s32 q0, q0
vrshr.s64 q0, #5 vrshr.s64 q0, #5
//Load the table {-3,-2,-1,0,1,2,3,4} to q3 //Load the table {-3,-2,-1,0,1,2,3,4} to q3
adr r2, CONST1_GET_I_CHROMA_PRED_PLANE adr r2, CONST1_GET_I_CHROMA_PRED_PLANE
vld1.32 {d6, d7}, [r2] vld1.32 {d6, d7}, [r2]
//Duplicate the 'b','c' to q0, q1 for SIMD instruction //Duplicate the 'b','c' to q0, q1 for SIMD instruction
vdup.s16 q1, d1[0] vdup.s16 q1, d1[0]
vdup.s16 q0, d0[0] vdup.s16 q0, d0[0]
//Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;" //Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;"
vmla.s16 q2, q0, q3 vmla.s16 q2, q0, q3
vmla.s16 q2, q1, d6[0] vmla.s16 q2, q1, d6[0]
vqrshrun.s16 d0, q2, #5 vqrshrun.s16 d0, q2, #5
//Set a line of chroma MB //Set a line of chroma MB
vst1.u32 {d0}, [r0], r1 vst1.u32 {d0}, [r0], r1
//Do the same processing for each line. //Do the same processing for each line.
mov r2, #7 mov r2, #7
loop_0_get_i_chroma_pred_plane: loop_0_get_i_chroma_pred_plane:
vadd.s16 q2, q1 vadd.s16 q2, q1
vqrshrun.s16 d0, q2, #5 vqrshrun.s16 d0, q2, #5
vst1.u32 {d0}, [r0], r1 vst1.u32 {d0}, [r0], r1
subs r2, #1 subs r2, #1
bne loop_0_get_i_chroma_pred_plane bne loop_0_get_i_chroma_pred_plane
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
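For reference, the formula quoted in the comments above, "(a + b * (j - 3) + c * (- 3) + 16) >> 5", is the standard 8x8 chroma plane prediction. A minimal C sketch of it, assuming top[0] and left[0] hold the above-left corner sample and top[1..8]/left[1..8] the eight above/left neighbours (the helper name and layout are illustrative, not part of the source):

    #include <stdint.h>

    /* Plain C sketch of 8x8 chroma plane prediction.
     * top[0] = above-left corner sample, top[1..8] = the eight above samples;
     * left uses the same layout for the left column. */
    static void chroma_pred_plane_ref(const uint8_t top[9], const uint8_t left[9],
                                      uint8_t *pred, int stride) {
        int H = 0, V = 0;
        for (int i = 0; i < 4; i++) {
            H += (i + 1) * (top[5 + i]  - top[3 - i]);   /* (T4..T7) - (T2,T1,T0,LT) */
            V += (i + 1) * (left[5 + i] - left[3 - i]);  /* (L4..L7) - (L2,L1,L0,LT) */
        }
        int a = 16 * (top[8] + left[8]);                 /* 16 * (T7 + L7) */
        int b = (17 * H + 16) >> 5;
        int c = (17 * V + 16) >> 5;
        for (int y = 0; y < 8; y++)
            for (int x = 0; x < 8; x++) {
                int v = (a + b * (x - 3) + c * (y - 3) + 16) >> 5;
                pred[y * stride + x] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
            }
    }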
@ -54,7 +54,7 @@
%endmacro %endmacro
%macro MMX_SumSub 3 %macro MMX_SumSub 3
movq %3, %2 movq %3, %2
psubw %2, %1 psubw %2, %1
paddw %1, %3 paddw %1, %3
%endmacro %endmacro
@ -62,8 +62,8 @@
%macro MMX_IDCT 6 %macro MMX_IDCT 6
MMX_SumSub %4, %5, %6 MMX_SumSub %4, %5, %6
MMX_SumSubDiv2 %3, %2, %1 MMX_SumSubDiv2 %3, %2, %1
MMX_SumSub %1, %4, %6 MMX_SumSub %1, %4, %6
MMX_SumSub %3, %5, %6 MMX_SumSub %3, %5, %6
%endmacro %endmacro
@ -96,13 +96,13 @@ WELS_EXTERN IdctResAddPred_mmx
movq mm2, [r2+16] movq mm2, [r2+16]
movq mm3, [r2+24] movq mm3, [r2+24]
MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4 MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4
MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6 MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6
MMX_Trans4x4W mm1, mm3, mm0, mm4, mm2 MMX_Trans4x4W mm1, mm3, mm0, mm4, mm2
MMX_IDCT mm3, mm0, mm4, mm2, mm1, mm6 MMX_IDCT mm3, mm0, mm4, mm2, mm1, mm6
WELS_Zero mm7 WELS_Zero mm7
WELS_DW32 mm6 WELS_DW32 mm6
MMX_StoreDiff4P mm3, mm0, mm6, mm7, [r0] MMX_StoreDiff4P mm3, mm0, mm6, mm7, [r0]
MMX_StoreDiff4P mm4, mm0, mm6, mm7, [r0+r1] MMX_StoreDiff4P mm4, mm0, mm6, mm7, [r0+r1]
@ -111,5 +111,5 @@ WELS_EXTERN IdctResAddPred_mmx
MMX_StoreDiff4P mm2, mm0, mm6, mm7, [r0+r1] MMX_StoreDiff4P mm2, mm0, mm6, mm7, [r0+r1]
emms emms
ret ret
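For orientation, the MMX routine above implements the 4x4 inverse integer transform followed by adding the residual onto the prediction. A hedged C sketch of that step (standard row/column butterflies with the +32 and >>6 rounding that WELS_DW32 suggests; the function name is an assumption):

    #include <stdint.h>

    /* C sketch of the 4x4 inverse transform plus residual add: rows, then columns,
     * then (x + 32) >> 6 and clamp while adding onto the prediction block. */
    static void idct_res_add_pred_ref(uint8_t *pred, int stride, const int16_t rs[16]) {
        int t[16];
        for (int i = 0; i < 4; i++) {                    /* horizontal butterflies */
            const int16_t *r = rs + 4 * i;
            int e0 = r[0] + r[2], e1 = r[0] - r[2];
            int e2 = (r[1] >> 1) - r[3], e3 = r[1] + (r[3] >> 1);
            t[4 * i + 0] = e0 + e3;
            t[4 * i + 1] = e1 + e2;
            t[4 * i + 2] = e1 - e2;
            t[4 * i + 3] = e0 - e3;
        }
        for (int j = 0; j < 4; j++) {                    /* vertical butterflies */
            int e0 = t[j] + t[j + 8], e1 = t[j] - t[j + 8];
            int e2 = (t[j + 4] >> 1) - t[j + 12], e3 = t[j + 4] + (t[j + 12] >> 1);
            int f[4] = { e0 + e3, e1 + e2, e1 - e2, e0 - e3 };
            for (int i = 0; i < 4; i++) {
                int v = pred[i * stride + j] + ((f[i] + 32) >> 6);
                pred[i * stride + j] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
            }
        }
    }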
File diff suppressed because it is too large
@ -38,107 +38,107 @@
#ifdef __APPLE__ #ifdef __APPLE__
//Global macro //Global macro
.macro GET_8BYTE_DATA .macro GET_8BYTE_DATA
vld1.8 {$0[0]}, [$1], $2 vld1.8 {$0[0]}, [$1], $2
vld1.8 {$0[1]}, [$1], $2 vld1.8 {$0[1]}, [$1], $2
vld1.8 {$0[2]}, [$1], $2 vld1.8 {$0[2]}, [$1], $2
vld1.8 {$0[3]}, [$1], $2 vld1.8 {$0[3]}, [$1], $2
vld1.8 {$0[4]}, [$1], $2 vld1.8 {$0[4]}, [$1], $2
vld1.8 {$0[5]}, [$1], $2 vld1.8 {$0[5]}, [$1], $2
vld1.8 {$0[6]}, [$1], $2 vld1.8 {$0[6]}, [$1], $2
vld1.8 {$0[7]}, [$1], $2 vld1.8 {$0[7]}, [$1], $2
.endm .endm
#else #else
//Global macro //Global macro
.macro GET_8BYTE_DATA arg0, arg1, arg2 .macro GET_8BYTE_DATA arg0, arg1, arg2
vld1.8 {\arg0[0]}, [\arg1], \arg2 vld1.8 {\arg0[0]}, [\arg1], \arg2
vld1.8 {\arg0[1]}, [\arg1], \arg2 vld1.8 {\arg0[1]}, [\arg1], \arg2
vld1.8 {\arg0[2]}, [\arg1], \arg2 vld1.8 {\arg0[2]}, [\arg1], \arg2
vld1.8 {\arg0[3]}, [\arg1], \arg2 vld1.8 {\arg0[3]}, [\arg1], \arg2
vld1.8 {\arg0[4]}, [\arg1], \arg2 vld1.8 {\arg0[4]}, [\arg1], \arg2
vld1.8 {\arg0[5]}, [\arg1], \arg2 vld1.8 {\arg0[5]}, [\arg1], \arg2
vld1.8 {\arg0[6]}, [\arg1], \arg2 vld1.8 {\arg0[6]}, [\arg1], \arg2
vld1.8 {\arg0[7]}, [\arg1], \arg2 vld1.8 {\arg0[7]}, [\arg1], \arg2
.endm .endm
#endif #endif
WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredV_neon WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredV_neon
//Get the top line data to 'q0' //Get the top line data to 'q0'
sub r3, r1, r2 sub r3, r1, r2
vldm r3, {d0, d1} vldm r3, {d0, d1}
//mov r2, #16 //mov r2, #16
mov r3, #4 mov r3, #4
//Set the top line to the each line of MB(16*16) //Set the top line to the each line of MB(16*16)
loop_0_get_i16x16_luma_pred_v: loop_0_get_i16x16_luma_pred_v:
vst1.8 {d0,d1}, [r0]! vst1.8 {d0,d1}, [r0]!
vst1.8 {d0,d1}, [r0]! vst1.8 {d0,d1}, [r0]!
vst1.8 {d0,d1}, [r0]! vst1.8 {d0,d1}, [r0]!
vst1.8 {d0,d1}, [r0]! vst1.8 {d0,d1}, [r0]!
subs r3, #1 subs r3, #1
bne loop_0_get_i16x16_luma_pred_v bne loop_0_get_i16x16_luma_pred_v
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredH_neon WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredH_neon
//stmdb sp!, {r4, lr} //stmdb sp!, {r4, lr}
sub r1, r1, #1 sub r1, r1, #1
mov r3, #4 mov r3, #4
loop_0_get_i16x16_luma_pred_h: loop_0_get_i16x16_luma_pred_h:
//Get one byte data from left side //Get one byte data from left side
vld1.8 {d0[],d1[]}, [r1], r2 vld1.8 {d0[],d1[]}, [r1], r2
vld1.8 {d2[],d3[]}, [r1], r2 vld1.8 {d2[],d3[]}, [r1], r2
vld1.8 {d4[],d5[]}, [r1], r2 vld1.8 {d4[],d5[]}, [r1], r2
vld1.8 {d6[],d7[]}, [r1], r2 vld1.8 {d6[],d7[]}, [r1], r2
//Set the line of MB using the left side byte data //Set the line of MB using the left side byte data
vst1.8 {d0,d1}, [r0]! vst1.8 {d0,d1}, [r0]!
//add r0, #16 //add r0, #16
vst1.8 {d2,d3}, [r0]! vst1.8 {d2,d3}, [r0]!
//add r0, #16 //add r0, #16
vst1.8 {d4,d5}, [r0]! vst1.8 {d4,d5}, [r0]!
//add r0, #16 //add r0, #16
vst1.8 {d6,d7}, [r0]! vst1.8 {d6,d7}, [r0]!
//add r0, #16 //add r0, #16
subs r3, #1 subs r3, #1
bne loop_0_get_i16x16_luma_pred_h bne loop_0_get_i16x16_luma_pred_h
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredDc_neon WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredDc_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
//Get the left vertical line data //Get the left vertical line data
sub r3, r1, #1 sub r3, r1, #1
GET_8BYTE_DATA d0, r3, r2 GET_8BYTE_DATA d0, r3, r2
GET_8BYTE_DATA d1, r3, r2 GET_8BYTE_DATA d1, r3, r2
//Get the top horizontal line data //Get the top horizontal line data
sub r3, r1, r2 sub r3, r1, r2
vldm r3, {d2, d3} vldm r3, {d2, d3}
//Calculate the sum of top horizontal line data and vertical line data //Calculate the sum of top horizontal line data and vertical line data
vpaddl.u8 q0, q0 vpaddl.u8 q0, q0
vpaddl.u8 q1, q1 vpaddl.u8 q1, q1
vadd.u16 q0, q0, q1 vadd.u16 q0, q0, q1
vadd.u16 d0, d0, d1 vadd.u16 d0, d0, d1
vpaddl.u16 d0, d0 vpaddl.u16 d0, d0
vpaddl.u32 d0, d0 vpaddl.u32 d0, d0
//Calculate the mean value //Calculate the mean value
vrshr.u16 d0, d0, #5 vrshr.u16 d0, d0, #5
vdup.8 q0, d0[0] vdup.8 q0, d0[0]
//Set the mean value to the all of member of MB //Set the mean value to the all of member of MB
mov r3, #4 mov r3, #4
loop_0_get_i16x16_luma_pred_dc_both: loop_0_get_i16x16_luma_pred_dc_both:
vst1.8 {d0,d1}, [r0]! vst1.8 {d0,d1}, [r0]!
vst1.8 {d0,d1}, [r0]! vst1.8 {d0,d1}, [r0]!
vst1.8 {d0,d1}, [r0]! vst1.8 {d0,d1}, [r0]!
vst1.8 {d0,d1}, [r0]! vst1.8 {d0,d1}, [r0]!
subs r3, #1 subs r3, #1
bne loop_0_get_i16x16_luma_pred_dc_both bne loop_0_get_i16x16_luma_pred_dc_both
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
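WelsI16x16LumaPredDc_neon above averages the 16 top and 16 left neighbours with rounding ((sum + 16) >> 5) and fills the block with that value. A small C sketch of this both-neighbours case (the NEON encoder variant writes packed 16-byte rows; the stride is a parameter here, and the name is illustrative):

    #include <stdint.h>

    /* C sketch of 16x16 DC prediction when both the top row and left column exist. */
    static void i16x16_pred_dc_ref(const uint8_t top[16], const uint8_t left[16],
                                   uint8_t *pred, int stride) {
        int sum = 0;
        for (int i = 0; i < 16; i++)
            sum += top[i] + left[i];
        uint8_t dc = (uint8_t)((sum + 16) >> 5);         /* rounded mean of 32 samples */
        for (int y = 0; y < 16; y++)
            for (int x = 0; x < 16; x++)
                pred[y * stride + x] = dc;
    }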
@ -151,383 +151,383 @@ CONST1_GET_I16X16_LUMA_PRED_PLANE: .long 0xfcfbfaf9, 0x00fffefd
WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredPlane_neon WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredPlane_neon
//stmdb sp!, { r4, lr} //stmdb sp!, { r4, lr}
//Load the table {(8,7,6,5,4,3,2,1) * 5} //Load the table {(8,7,6,5,4,3,2,1) * 5}
adr r3, CONST0_GET_I16X16_LUMA_PRED_PLANE adr r3, CONST0_GET_I16X16_LUMA_PRED_PLANE
vldr d0, [r3] vldr d0, [r3]
//Pack the top[-1] ~ top[6] to d1 //Pack the top[-1] ~ top[6] to d1
sub r3, r1, r2 sub r3, r1, r2
sub r1, r3, #1 sub r1, r3, #1
vld1.8 d1, [r1] vld1.8 d1, [r1]
//Pack the top[8] ~ top[15] to d2 //Pack the top[8] ~ top[15] to d2
add r1, #9 add r1, #9
vld1.8 d2, [r1] vld1.8 d2, [r1]
//Save the top[15] to d6 for next step //Save the top[15] to d6 for next step
vdup.u8 d6, d2[7] vdup.u8 d6, d2[7]
//Get and pack left[-1] ~ left[6] to d4 //Get and pack left[-1] ~ left[6] to d4
sub r1, r3, #1 sub r1, r3, #1
GET_8BYTE_DATA d4, r1, r2 GET_8BYTE_DATA d4, r1, r2
//Get and pack left[8] ~ left[15] to d3 //Get and pack left[8] ~ left[15] to d3
add r1, r2 add r1, r2
GET_8BYTE_DATA d3, r1, r2 GET_8BYTE_DATA d3, r1, r2
//Save the left[15] to d7 for next step //Save the left[15] to d7 for next step
vdup.u8 d7, d3[7] vdup.u8 d7, d3[7]
//reverse the sequence of d2,d3 //reverse the sequence of d2,d3
vrev64.8 q1, q1 vrev64.8 q1, q1
vsubl.u8 q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...} vsubl.u8 q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...}
vsubl.u8 q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...} vsubl.u8 q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...}
vmovl.u8 q0, d0 vmovl.u8 q0, d0
vmul.s16 q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5} vmul.s16 q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5}
vmul.s16 q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5} vmul.s16 q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5}
//Calculate the sum of items of q1, q2 //Calculate the sum of items of q1, q2
vpadd.s16 d0, d2, d3 vpadd.s16 d0, d2, d3
vpadd.s16 d1, d4, d5 vpadd.s16 d1, d4, d5
vpaddl.s16 q0, q0 vpaddl.s16 q0, q0
vpaddl.s32 q0, q0 vpaddl.s32 q0, q0
//Get the value of 'b', 'c' and extend to q1, q2. //Get the value of 'b', 'c' and extend to q1, q2.
vrshr.s64 q0, #6 vrshr.s64 q0, #6
vdup.s16 q1, d0[0] vdup.s16 q1, d0[0]
vdup.s16 q2, d1[0] vdup.s16 q2, d1[0]
//Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0 //Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0
adr r3, CONST1_GET_I16X16_LUMA_PRED_PLANE adr r3, CONST1_GET_I16X16_LUMA_PRED_PLANE
vld1.32 {d0}, [r3] vld1.32 {d0}, [r3]
//Get the value of 'a' and save to q3 //Get the value of 'a' and save to q3
vaddl.u8 q3, d6, d7 vaddl.u8 q3, d6, d7
vshl.u16 q3, #4 vshl.u16 q3, #4
//calculate a+'b'*{-7,-6,-5,-4,-3,-2,-1,0} + c*{-7} //calculate a+'b'*{-7,-6,-5,-4,-3,-2,-1,0} + c*{-7}
vmovl.s8 q0, d0 vmovl.s8 q0, d0
vmla.s16 q3, q0, q1 vmla.s16 q3, q0, q1
vmla.s16 q3, q2, d0[0] vmla.s16 q3, q2, d0[0]
//Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7} //Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7}
vshl.s16 q8, q1, #3 vshl.s16 q8, q1, #3
vadd.s16 q8, q3 vadd.s16 q8, q3
//right shift 5 bits and rounding //right shift 5 bits and rounding
vqrshrun.s16 d0, q3, #5 vqrshrun.s16 d0, q3, #5
vqrshrun.s16 d1, q8, #5 vqrshrun.s16 d1, q8, #5
//Set the line of MB //Set the line of MB
vst1.u32 {d0,d1}, [r0]! vst1.u32 {d0,d1}, [r0]!
//Do the same processing for setting other lines //Do the same processing for setting other lines
mov r3, #15 mov r3, #15
loop_0_get_i16x16_luma_pred_plane: loop_0_get_i16x16_luma_pred_plane:
vadd.s16 q3, q2 vadd.s16 q3, q2
vadd.s16 q8, q2 vadd.s16 q8, q2
vqrshrun.s16 d0, q3, #5 vqrshrun.s16 d0, q3, #5
vqrshrun.s16 d1, q8, #5 vqrshrun.s16 d1, q8, #5
vst1.u32 {d0,d1}, [r0]! vst1.u32 {d0,d1}, [r0]!
subs r3, #1 subs r3, #1
bne loop_0_get_i16x16_luma_pred_plane bne loop_0_get_i16x16_luma_pred_plane
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredV_neon WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredV_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
//Load the top row (4 bytes) //Load the top row (4 bytes)
sub r3, r1, r2 sub r3, r1, r2
ldr r3, [r3] ldr r3, [r3]
//Set the luma MB using top line //Set the luma MB using top line
str r3, [r0], #4 str r3, [r0], #4
str r3, [r0], #4 str r3, [r0], #4
str r3, [r0], #4 str r3, [r0], #4
str r3, [r0] str r3, [r0]
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredH_neon WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredH_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
//Load the left column (4 bytes) //Load the left column (4 bytes)
sub r3, r1, #1 sub r3, r1, #1
vld1.8 {d0[]}, [r3], r2 vld1.8 {d0[]}, [r3], r2
vld1.8 {d1[]}, [r3], r2 vld1.8 {d1[]}, [r3], r2
vld1.8 {d2[]}, [r3], r2 vld1.8 {d2[]}, [r3], r2
vld1.8 {d3[]}, [r3] vld1.8 {d3[]}, [r3]
//Set the luma MB using the left side byte //Set the luma MB using the left side byte
vst1.32 {d0[0]}, [r0]! vst1.32 {d0[0]}, [r0]!
vst1.32 {d1[0]}, [r0]! vst1.32 {d1[0]}, [r0]!
vst1.32 {d2[0]}, [r0]! vst1.32 {d2[0]}, [r0]!
vst1.32 {d3[0]}, [r0] vst1.32 {d3[0]}, [r0]
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredDDL_neon WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredDDL_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
//Load the top row data(8 bytes) //Load the top row data(8 bytes)
sub r3, r1, r2 sub r3, r1, r2
vld1.32 {d0}, [r3] vld1.32 {d0}, [r3]
//For "t7 + (t7<<1)" //For "t7 + (t7<<1)"
vdup.8 d1, d0[7] vdup.8 d1, d0[7]
//calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7" //calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7"
vext.8 d1, d0, d1, #1 vext.8 d1, d0, d1, #1
vaddl.u8 q1, d1, d0 vaddl.u8 q1, d1, d0
//calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7" //calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7"
vext.8 q2, q1, q1, #14 vext.8 q2, q1, q1, #14
vadd.u16 q0, q1, q2 vadd.u16 q0, q1, q2
//right shift 2 bits and rounding //right shift 2 bits and rounding
vqrshrn.u16 d0, q0, #2 vqrshrn.u16 d0, q0, #2
//Save "ddl0, ddl1, ddl2, ddl3" //Save "ddl0, ddl1, ddl2, ddl3"
vext.8 d1, d0, d0, #1 vext.8 d1, d0, d0, #1
vst1.32 d1[0], [r0]! vst1.32 d1[0], [r0]!
//Save "ddl1, ddl2, ddl3, ddl4" //Save "ddl1, ddl2, ddl3, ddl4"
vext.8 d1, d0, d0, #2 vext.8 d1, d0, d0, #2
vst1.32 d1[0], [r0]! vst1.32 d1[0], [r0]!
//Save "ddl2, ddl3, ddl4, ddl5" //Save "ddl2, ddl3, ddl4, ddl5"
vext.8 d1, d0, d0, #3 vext.8 d1, d0, d0, #3
vst1.32 d1[0], [r0]! vst1.32 d1[0], [r0]!
//Save "ddl3, ddl4, ddl5, ddl6" //Save "ddl3, ddl4, ddl5, ddl6"
vst1.32 d0[1], [r0] vst1.32 d0[1], [r0]
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
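The DDL routine above filters the eight top samples with a (1,2,1) kernel, padding with t7 so the last pixel becomes (t6 + 3*t7 + 2) >> 2. A C sketch under that reading (helper name assumed):

    #include <stdint.h>

    /* C sketch of 4x4 diagonal-down-left prediction from the eight above samples. */
    static void i4x4_pred_ddl_ref(const uint8_t top[8], uint8_t *pred, int stride) {
        uint8_t t[9];
        for (int i = 0; i < 8; i++)
            t[i] = top[i];
        t[8] = top[7];                                   /* pad so one formula covers x + y == 6 */
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++) {
                int k = x + y;
                pred[y * stride + x] = (uint8_t)((t[k] + 2 * t[k + 1] + t[k + 2] + 2) >> 2);
            }
    }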
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredDDR_neon WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredDDR_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
//Load the top row (4 bytes) //Load the top row (4 bytes)
sub r3, r1, r2 sub r3, r1, r2
vld1.32 {d0[1]}, [r3] vld1.32 {d0[1]}, [r3]
//Load the left column (5 bytes) //Load the left column (5 bytes)
sub r3, #1 sub r3, #1
vld1.8 {d0[3]}, [r3], r2 vld1.8 {d0[3]}, [r3], r2
vld1.8 {d0[2]}, [r3], r2 vld1.8 {d0[2]}, [r3], r2
vld1.8 {d0[1]}, [r3], r2 vld1.8 {d0[1]}, [r3], r2
vld1.8 {d0[0]}, [r3], r2 vld1.8 {d0[0]}, [r3], r2
vld1.8 {d1[7]}, [r3] //For packing the right sequence to do SIMD processing vld1.8 {d1[7]}, [r3] //For packing the right sequence to do SIMD processing
vext.8 d2, d1, d0, #7 //d0:{L2,L1,L0,LT,T0,T1,T2,T3} vext.8 d2, d1, d0, #7 //d0:{L2,L1,L0,LT,T0,T1,T2,T3}
//d2:{L3,L2,L1,L0,LT,T0,T1,T2} //d2:{L3,L2,L1,L0,LT,T0,T1,T2}
//q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3} //q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3}
vaddl.u8 q2, d2, d0 vaddl.u8 q2, d2, d0
//q1:{TL0+LT0,LT0+T01,...L12+L23} //q1:{TL0+LT0,LT0+T01,...L12+L23}
vext.8 q3, q3, q2, #14 vext.8 q3, q3, q2, #14
vadd.u16 q1, q2, q3 vadd.u16 q1, q2, q3
//right shift 2 bits and rounding //right shift 2 bits and rounding
vqrshrn.u16 d0, q1, #2 vqrshrn.u16 d0, q1, #2
//Adjust the data sequence for setting luma MB of 'pred' //Adjust the data sequence for setting luma MB of 'pred'
vst1.32 d0[1], [r0]! vst1.32 d0[1], [r0]!
vext.8 d0, d0, d0, #7 vext.8 d0, d0, d0, #7
vst1.32 d0[1], [r0]! vst1.32 d0[1], [r0]!
vext.8 d0, d0, d0, #7 vext.8 d0, d0, d0, #7
vst1.32 d0[1], [r0]! vst1.32 d0[1], [r0]!
vext.8 d0, d0, d0, #7 vext.8 d0, d0, d0, #7
vst1.32 d0[1], [r0] vst1.32 d0[1], [r0]
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
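The DDR routine packs {L3,L2,L1,L0,LT,T0,T1,T2,T3} into one vector, applies the same (1,2,1) filter, and then rotates the result so each row shifts one step down-right. A C sketch of that arrangement (array layout as in the comments above, name assumed):

    #include <stdint.h>

    /* C sketch of 4x4 diagonal-down-right prediction.
     * s[0..8] = {L3, L2, L1, L0, LT, T0, T1, T2, T3}. */
    static void i4x4_pred_ddr_ref(const uint8_t s[9], uint8_t *pred, int stride) {
        uint8_t f[9];
        for (int k = 1; k <= 7; k++)                     /* (1,2,1) filter over the packed samples */
            f[k] = (uint8_t)((s[k - 1] + 2 * s[k] + s[k + 1] + 2) >> 2);
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                pred[y * stride + x] = f[4 + x - y];     /* each row slides one step along the diagonal */
    }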
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredVL_neon WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredVL_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
//Load the top row (8 bytes) //Load the top row (8 bytes)
sub r3, r1, r2 sub r3, r1, r2
vld1.32 {d0}, [r3] vld1.32 {d0}, [r3]
vext.8 d1, d0, d0, #1 vext.8 d1, d0, d0, #1
vaddl.u8 q1, d1, d0 //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x} vaddl.u8 q1, d1, d0 //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x}
vext.8 q2, q1, q1, #2 vext.8 q2, q1, q1, #2
vadd.u16 q2, q1, q2 //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x} vadd.u16 q2, q1, q2 //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x}
//calculate the "vl0,vl1,vl2,vl3,vl4" //calculate the "vl0,vl1,vl2,vl3,vl4"
vqrshrn.u16 d0, q1, #1 vqrshrn.u16 d0, q1, #1
//calculate the "vl5,vl6,vl7,vl8,vl9" //calculate the "vl5,vl6,vl7,vl8,vl9"
vqrshrn.u16 d1, q2, #2 vqrshrn.u16 d1, q2, #2
//Adjust the data sequence for setting the luma MB //Adjust the data sequence for setting the luma MB
vst1.32 d0[0], [r0]! vst1.32 d0[0], [r0]!
vst1.32 d1[0], [r0]! vst1.32 d1[0], [r0]!
vext.8 d0, d0, d0, #1 vext.8 d0, d0, d0, #1
vext.8 d1, d1, d1, #1 vext.8 d1, d1, d1, #1
vst1.32 d0[0], [r0]! vst1.32 d0[0], [r0]!
vst1.32 d1[0], [r0] vst1.32 d1[0], [r0]
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredVR_neon WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredVR_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
//Load the top row (4 bytes) //Load the top row (4 bytes)
sub r3, r1, r2 sub r3, r1, r2
vld1.32 {d0[1]}, [r3] vld1.32 {d0[1]}, [r3]
//Load the left column (4 bytes) //Load the left column (4 bytes)
sub r3, #1 sub r3, #1
vld1.8 {d0[3]}, [r3], r2 vld1.8 {d0[3]}, [r3], r2
vld1.8 {d0[2]}, [r3], r2 vld1.8 {d0[2]}, [r3], r2
vld1.8 {d0[1]}, [r3], r2 vld1.8 {d0[1]}, [r3], r2
vld1.8 {d0[0]}, [r3] vld1.8 {d0[0]}, [r3]
vext.8 d1, d0, d0, #7 vext.8 d1, d0, d0, #7
vaddl.u8 q1, d0, d1 //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3} vaddl.u8 q1, d0, d1 //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}
vext.u8 q2, q1, q1, #14 vext.u8 q2, q1, q1, #14
vadd.u16 q2, q2, q1 //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3} vadd.u16 q2, q2, q1 //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3}
//Calculate the vr0 ~ vr9 //Calculate the vr0 ~ vr9
vqrshrn.u16 d1, q2, #2 vqrshrn.u16 d1, q2, #2
vqrshrn.u16 d0, q1, #1 vqrshrn.u16 d0, q1, #1
//Adjust the data sequence for setting the luma MB //Adjust the data sequence for setting the luma MB
vst1.32 d0[1], [r0]! vst1.32 d0[1], [r0]!
vst1.32 d1[1], [r0]! vst1.32 d1[1], [r0]!
//add r2, r0, r1 //add r2, r0, r1
vst1.8 d1[3], [r0]! vst1.8 d1[3], [r0]!
vst1.16 d0[2], [r0]! vst1.16 d0[2], [r0]!
vst1.8 d0[6], [r0]! vst1.8 d0[6], [r0]!
vst1.8 d1[2], [r0]! vst1.8 d1[2], [r0]!
vst1.16 d1[2], [r0]! vst1.16 d1[2], [r0]!
vst1.8 d1[6], [r0] vst1.8 d1[6], [r0]
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredHU_neon WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredHU_neon
//stmdb sp!, { r4, lr} //stmdb sp!, { r4, lr}
//Load the left column data //Load the left column data
sub r3, r1, #1 sub r3, r1, #1
mov r1, #3 mov r1, #3
mul r1, r2 mul r1, r2
add r1, r3 add r1, r3
vld1.8 {d0[]}, [r1] vld1.8 {d0[]}, [r1]
vld1.8 {d0[4]}, [r3], r2 vld1.8 {d0[4]}, [r3], r2
vld1.8 {d0[5]}, [r3], r2 vld1.8 {d0[5]}, [r3], r2
vld1.8 {d0[6]}, [r3], r2 //d0:{L3,L3,L3,L3,L0,L1,L2,L3} vld1.8 {d0[6]}, [r3], r2 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}
vext.8 d1, d0, d0, #1 vext.8 d1, d0, d0, #1
vaddl.u8 q2, d0, d1 //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3} vaddl.u8 q2, d0, d1 //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
vext.u8 d2, d5, d4, #2 vext.u8 d2, d5, d4, #2
vadd.u16 d3, d2, d5 //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3} vadd.u16 d3, d2, d5 //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}
//Calculate the hu0 ~ hu5 //Calculate the hu0 ~ hu5
vqrshrn.u16 d2, q2, #1 vqrshrn.u16 d2, q2, #1
vqrshrn.u16 d1, q1, #2 vqrshrn.u16 d1, q1, #2
//Adjust the data sequence for setting the luma MB //Adjust the data sequence for setting the luma MB
vzip.8 d2, d1 vzip.8 d2, d1
vst1.32 d1[0], [r0]! vst1.32 d1[0], [r0]!
vext.8 d2, d1, d1, #2 vext.8 d2, d1, d1, #2
vst1.32 d2[0], [r0]! vst1.32 d2[0], [r0]!
vst1.32 d1[1], [r0]! vst1.32 d1[1], [r0]!
vst1.32 d0[0], [r0] vst1.32 d0[0], [r0]
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredHD_neon WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredHD_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
//Load the data //Load the data
sub r3, r1, r2 sub r3, r1, r2
sub r3, #1 sub r3, #1
vld1.32 {d0[1]}, [r3], r2 vld1.32 {d0[1]}, [r3], r2
vld1.8 {d0[3]}, [r3], r2 vld1.8 {d0[3]}, [r3], r2
vld1.8 {d0[2]}, [r3], r2 vld1.8 {d0[2]}, [r3], r2
vld1.8 {d0[1]}, [r3], r2 vld1.8 {d0[1]}, [r3], r2
vld1.8 {d0[0]}, [r3] //d0:{L3,L2,L1,L0,LT,T0,T1,T2} vld1.8 {d0[0]}, [r3] //d0:{L3,L2,L1,L0,LT,T0,T1,T2}
vext.8 d1, d0, d0, #7 vext.8 d1, d0, d0, #7
vaddl.u8 q1, d0, d1 //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2} vaddl.u8 q1, d0, d1 //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2}
vext.u8 q2, q1, q1, #14 //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1} vext.u8 q2, q1, q1, #14 //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1}
vadd.u16 q3, q2, q1 //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2} vadd.u16 q3, q2, q1 //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2}
//Calculate the hd0~hd9 //Calculate the hd0~hd9
vqrshrn.u16 d1, q3, #2 vqrshrn.u16 d1, q3, #2
vqrshrn.u16 d0, q2, #1 vqrshrn.u16 d0, q2, #1
//Adjust the data sequence for setting the luma MB //Adjust the data sequence for setting the luma MB
vmov d3, d1 vmov d3, d1
vtrn.8 d0, d1 vtrn.8 d0, d1
vext.u8 d2, d1, d1, #6 vext.u8 d2, d1, d1, #6
vst2.16 {d2[3], d3[3]}, [r0]! vst2.16 {d2[3], d3[3]}, [r0]!
vst2.16 {d0[2], d1[2]}, [r0]! vst2.16 {d0[2], d1[2]}, [r0]!
vmov d3, d0 vmov d3, d0
vst2.16 {d2[2], d3[2]}, [r0]! vst2.16 {d2[2], d3[2]}, [r0]!
vst2.16 {d0[1], d1[1]}, [r0] vst2.16 {d0[1], d1[1]}, [r0]
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsIChromaPredV_neon WELS_ASM_FUNC_BEGIN WelsIChromaPredV_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
//Get the top row (8 byte) //Get the top row (8 byte)
sub r3, r1, r2 sub r3, r1, r2
vldr d0, [r3] vldr d0, [r3]
//Set the chroma MB using top row data //Set the chroma MB using top row data
vst1.8 {d0}, [r0]! vst1.8 {d0}, [r0]!
vst1.8 {d0}, [r0]! vst1.8 {d0}, [r0]!
vst1.8 {d0}, [r0]! vst1.8 {d0}, [r0]!
vst1.8 {d0}, [r0]! vst1.8 {d0}, [r0]!
vst1.8 {d0}, [r0]! vst1.8 {d0}, [r0]!
vst1.8 {d0}, [r0]! vst1.8 {d0}, [r0]!
vst1.8 {d0}, [r0]! vst1.8 {d0}, [r0]!
vst1.8 {d0}, [r0] vst1.8 {d0}, [r0]
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsIChromaPredH_neon WELS_ASM_FUNC_BEGIN WelsIChromaPredH_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
////Get the left column (8 byte) ////Get the left column (8 byte)
sub r3, r1, #1 sub r3, r1, #1
vld1.8 {d0[]}, [r3], r2 vld1.8 {d0[]}, [r3], r2
vld1.8 {d1[]}, [r3], r2 vld1.8 {d1[]}, [r3], r2
vld1.8 {d2[]}, [r3], r2 vld1.8 {d2[]}, [r3], r2
vld1.8 {d3[]}, [r3], r2 vld1.8 {d3[]}, [r3], r2
vld1.8 {d4[]}, [r3], r2 vld1.8 {d4[]}, [r3], r2
vld1.8 {d5[]}, [r3], r2 vld1.8 {d5[]}, [r3], r2
vld1.8 {d6[]}, [r3], r2 vld1.8 {d6[]}, [r3], r2
vld1.8 {d7[]}, [r3] vld1.8 {d7[]}, [r3]
//Set the chroma MB using left column data //Set the chroma MB using left column data
vst1.8 {d0}, [r0]! vst1.8 {d0}, [r0]!
vst1.8 {d1}, [r0]! vst1.8 {d1}, [r0]!
vst1.8 {d2}, [r0]! vst1.8 {d2}, [r0]!
vst1.8 {d3}, [r0]! vst1.8 {d3}, [r0]!
vst1.8 {d4}, [r0]! vst1.8 {d4}, [r0]!
vst1.8 {d5}, [r0]! vst1.8 {d5}, [r0]!
vst1.8 {d6}, [r0]! vst1.8 {d6}, [r0]!
vst1.8 {d7}, [r0] vst1.8 {d7}, [r0]
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
@ -575,73 +575,73 @@ CONST0_GET_I_CHROMA_PRED_PLANE: .long 0x44332211, 0x44332211//0x140f0a05, 0x2823
CONST1_GET_I_CHROMA_PRED_PLANE: .long 0xfffefffd, 0x0000ffff,0x00020001,0x00040003 CONST1_GET_I_CHROMA_PRED_PLANE: .long 0xfffefffd, 0x0000ffff,0x00020001,0x00040003
WELS_ASM_FUNC_BEGIN WelsIChromaPredPlane_neon WELS_ASM_FUNC_BEGIN WelsIChromaPredPlane_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
//Load the top row data //Load the top row data
sub r3, r1, #1 sub r3, r1, #1
sub r3, r2 sub r3, r2
vld1.32 {d1[0]}, [r3] vld1.32 {d1[0]}, [r3]
add r3, #5 add r3, #5
vld1.32 {d0[0]}, [r3] vld1.32 {d0[0]}, [r3]
//Load the left column data //Load the left column data
sub r3, #5 sub r3, #5
vld1.8 {d1[4]}, [r3], r2 vld1.8 {d1[4]}, [r3], r2
vld1.8 {d1[5]}, [r3], r2 vld1.8 {d1[5]}, [r3], r2
vld1.8 {d1[6]}, [r3], r2 vld1.8 {d1[6]}, [r3], r2
vld1.8 {d1[7]}, [r3], r2 //d1:{LT,T0,T1,T2,LT,L0,L1,L2} vld1.8 {d1[7]}, [r3], r2 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
add r3, r2 add r3, r2
vld1.8 {d0[4]}, [r3], r2 vld1.8 {d0[4]}, [r3], r2
vld1.8 {d0[5]}, [r3], r2 vld1.8 {d0[5]}, [r3], r2
vld1.8 {d0[6]}, [r3], r2 vld1.8 {d0[6]}, [r3], r2
vld1.8 {d0[7]}, [r3] //d0:{T4,T5,T6,T7,L4,L5,L6,L7} vld1.8 {d0[7]}, [r3] //d0:{T4,T5,T6,T7,L4,L5,L6,L7}
//Save T7 to d3 for next step //Save T7 to d3 for next step
vdup.u8 d3, d0[3] vdup.u8 d3, d0[3]
//Save L7 to d4 for next step //Save L7 to d4 for next step
vdup.u8 d4, d0[7] vdup.u8 d4, d0[7]
//Calculate the value of 'a' and save to q2 //Calculate the value of 'a' and save to q2
vaddl.u8 q2, d3, d4 vaddl.u8 q2, d3, d4
vshl.u16 q2, #4 vshl.u16 q2, #4
//Load the table {{1,2,3,4,1,2,3,4}*17} //Load the table {{1,2,3,4,1,2,3,4}*17}
adr r3, CONST0_GET_I_CHROMA_PRED_PLANE adr r3, CONST0_GET_I_CHROMA_PRED_PLANE
vld1.32 {d2}, [r3] vld1.32 {d2}, [r3]
//Calculate the 'b','c', and save to q0 //Calculate the 'b','c', and save to q0
vrev32.8 d1, d1 vrev32.8 d1, d1
vsubl.u8 q0, d0, d1 vsubl.u8 q0, d0, d1
vmovl.u8 q1, d2 vmovl.u8 q1, d2
vmul.s16 q0, q1 vmul.s16 q0, q1
vpaddl.s16 q0, q0 vpaddl.s16 q0, q0
vpaddl.s32 q0, q0 vpaddl.s32 q0, q0
vrshr.s64 q0, #5 vrshr.s64 q0, #5
//Load the table {-3,-2,-1,0,1,2,3,4} to q3 //Load the table {-3,-2,-1,0,1,2,3,4} to q3
adr r3, CONST1_GET_I_CHROMA_PRED_PLANE adr r3, CONST1_GET_I_CHROMA_PRED_PLANE
vld1.32 {d6, d7}, [r3] vld1.32 {d6, d7}, [r3]
//Duplicate the 'b','c' to q0, q1 for SIMD instruction //Duplicate the 'b','c' to q0, q1 for SIMD instruction
vdup.s16 q1, d1[0] vdup.s16 q1, d1[0]
vdup.s16 q0, d0[0] vdup.s16 q0, d0[0]
//Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;" //Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;"
vmla.s16 q2, q0, q3 vmla.s16 q2, q0, q3
vmla.s16 q2, q1, d6[0] vmla.s16 q2, q1, d6[0]
vqrshrun.s16 d0, q2, #5 vqrshrun.s16 d0, q2, #5
//Set a line of chroma MB //Set a line of chroma MB
vst1.u32 {d0}, [r0]! vst1.u32 {d0}, [r0]!
//Do the same processing for each line. //Do the same processing for each line.
mov r3, #7 mov r3, #7
loop_0_get_i_chroma_pred_plane: loop_0_get_i_chroma_pred_plane:
vadd.s16 q2, q1 vadd.s16 q2, q1
vqrshrun.s16 d0, q2, #5 vqrshrun.s16 d0, q2, #5
vst1.u32 {d0}, [r0]! vst1.u32 {d0}, [r0]!
subs r3, #1 subs r3, #1
bne loop_0_get_i_chroma_pred_plane bne loop_0_get_i_chroma_pred_plane
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
File diff suppressed because it is too large
@ -66,10 +66,10 @@
vsub.s16 q3, q12, q13 vsub.s16 q3, q12, q13
vadd.s16 q8, q10, q11 vadd.s16 q8, q10, q11
vsub.s16 q9, q10, q11 vsub.s16 q9, q10, q11
vadd.s16 q10, q14, q15 vadd.s16 q10, q14, q15
vsub.s16 q11, q14, q15 vsub.s16 q11, q14, q15
vadd.s16 q12, q0, q2 vadd.s16 q12, q0, q2
vsub.s16 q14, q0, q2 vsub.s16 q14, q0, q2
@ -372,28 +372,28 @@ WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsSampleSad4x4_neon WELS_ASM_FUNC_BEGIN WelsSampleSad4x4_neon
stmdb sp!, {r4-r5, lr} stmdb sp!, {r4-r5, lr}
//Loading a horizontal line data (4 bytes) //Loading a horizontal line data (4 bytes)
//line 0 //line 0
ldr r4, [r0], r1 ldr r4, [r0], r1
ldr r5, [r2], r3 ldr r5, [r2], r3
usad8 lr, r4, r5 usad8 lr, r4, r5
//line 1 //line 1
ldr r4, [r0], r1 ldr r4, [r0], r1
ldr r5, [r2], r3 ldr r5, [r2], r3
usada8 lr, r4, r5, lr usada8 lr, r4, r5, lr
//line 2 //line 2
ldr r4, [r0], r1 ldr r4, [r0], r1
ldr r5, [r2], r3 ldr r5, [r2], r3
usada8 lr, r4, r5, lr usada8 lr, r4, r5, lr
//line 3 //line 3
ldr r4, [r0] ldr r4, [r0]
ldr r5, [r2] ldr r5, [r2]
usada8 r0, r4, r5, lr usada8 r0, r4, r5, lr
ldmia sp!, {r4-r5, lr} ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
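WelsSampleSad4x4_neon uses usad8/usada8 to accumulate one 4-byte row per instruction; the plain C metric it corresponds to is just a 4x4 sum of absolute differences (sketch, names assumed):

    #include <stdint.h>

    /* C sketch of a 4x4 SAD between two blocks with independent strides. */
    static int sad_4x4_ref(const uint8_t *pix1, int stride1,
                           const uint8_t *pix2, int stride2) {
        int sad = 0;
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++) {
                int d = pix1[y * stride1 + x] - pix2[y * stride2 + x];
                sad += d < 0 ? -d : d;
            }
        return sad;
    }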
@ -401,340 +401,340 @@ WELS_ASM_FUNC_BEGIN WelsSampleSadFour16x16_neon
stmdb sp!, {r4-r5, lr} stmdb sp!, {r4-r5, lr}
//Generate the pix2 start addr //Generate the pix2 start addr
sub r4, r2, #1 sub r4, r2, #1
add r5, r2, #1 add r5, r2, #1
sub r2, r3 sub r2, r3
//Loading a horizontal line data (16 bytes) //Loading a horizontal line data (16 bytes)
vld1.8 {q0}, [r0], r1 //save pix1 vld1.8 {q0}, [r0], r1 //save pix1
vld1.8 {q1}, [r2], r3 //save pix2 - stride vld1.8 {q1}, [r2], r3 //save pix2 - stride
vld1.8 {q10}, [r2], r3 //save pix2 vld1.8 {q10}, [r2], r3 //save pix2
vld1.8 {q2}, [r2], r3 //save pix2 + stride vld1.8 {q2}, [r2], r3 //save pix2 + stride
vld1.8 {q3}, [r4], r3 //save pix2 - 1 vld1.8 {q3}, [r4], r3 //save pix2 - 1
vld1.8 {q8}, [r5], r3 //save pix2 + 1 vld1.8 {q8}, [r5], r3 //save pix2 + 1
//Do the SAD for 16 bytes //Do the SAD for 16 bytes
vabdl.u8 q15, d0, d2 vabdl.u8 q15, d0, d2
vabal.u8 q15, d1, d3 vabal.u8 q15, d1, d3
vabdl.u8 q13, d0, d4 vabdl.u8 q13, d0, d4
vabal.u8 q13, d1, d5 vabal.u8 q13, d1, d5
vabdl.u8 q11, d0, d6 vabdl.u8 q11, d0, d6
vabal.u8 q11, d1, d7 vabal.u8 q11, d1, d7
vabdl.u8 q9, d0, d16 vabdl.u8 q9, d0, d16
vabal.u8 q9, d1, d17 vabal.u8 q9, d1, d17
mov lr, #15 mov lr, #15
pixel_sad_4_16x16_loop_0: pixel_sad_4_16x16_loop_0:
//Loading a horizontal line data (16 bytes) //Loading a horizontal line data (16 bytes)
vld1.8 {q0}, [r0], r1 //save pix1 vld1.8 {q0}, [r0], r1 //save pix1
vmov.8 q1, q10 //save pix2 - stride vmov.8 q1, q10 //save pix2 - stride
vmov.8 q10, q2 vmov.8 q10, q2
vabal.u8 q15, d0, d2 vabal.u8 q15, d0, d2
vld1.8 {q2}, [r2], r3 //save pix2 + stride vld1.8 {q2}, [r2], r3 //save pix2 + stride
vabal.u8 q15, d1, d3 vabal.u8 q15, d1, d3
vld1.8 {q3}, [r4], r3 //save pix2 - 1 vld1.8 {q3}, [r4], r3 //save pix2 - 1
vabal.u8 q13, d0, d4 vabal.u8 q13, d0, d4
vld1.8 {q8}, [r5], r3 //save pix2 + 1 vld1.8 {q8}, [r5], r3 //save pix2 + 1
vabal.u8 q13, d1, d5 vabal.u8 q13, d1, d5
subs lr, #1 subs lr, #1
vabal.u8 q11, d0, d6 vabal.u8 q11, d0, d6
vabal.u8 q11, d1, d7 vabal.u8 q11, d1, d7
vabal.u8 q9, d0, d16 vabal.u8 q9, d0, d16
vabal.u8 q9, d1, d17 vabal.u8 q9, d1, d17
bne pixel_sad_4_16x16_loop_0 bne pixel_sad_4_16x16_loop_0
//Save SAD to 'r0' //Save SAD to 'r0'
ldr r0, [sp, #12] ldr r0, [sp, #12]
vadd.u16 d0, d30, d31 vadd.u16 d0, d30, d31
vadd.u16 d1, d26, d27 vadd.u16 d1, d26, d27
vadd.u16 d2, d22, d23 vadd.u16 d2, d22, d23
vadd.u16 d3, d18, d19 vadd.u16 d3, d18, d19
vpaddl.u16 q0, q0 vpaddl.u16 q0, q0
vpaddl.u16 q1, q1 vpaddl.u16 q1, q1
vpaddl.u32 q0, q0 vpaddl.u32 q0, q0
vpaddl.u32 q1, q1 vpaddl.u32 q1, q1
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0] vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
ldmia sp!, {r4-r5, lr} ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
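The four accumulators in WelsSampleSadFour16x16_neon (q15, q13, q11, q9) hold SADs of pix1 against the candidates at pix2 - stride, pix2 + stride, pix2 - 1 and pix2 + 1, stored as four 32-bit results. A rough C sketch of that idea with the block size as a parameter (function and parameter names are assumptions):

    #include <stdint.h>

    /* C sketch of the "SAD four" pattern: SADs against the candidates one pixel
     * above, below, left and right of pix2, written to sad[0..3] in that order. */
    static void sample_sad_four_ref(const uint8_t *pix1, int stride1,
                                    const uint8_t *pix2, int stride2,
                                    int w, int h, int32_t sad[4]) {
        const uint8_t *cand[4] = { pix2 - stride2, pix2 + stride2, pix2 - 1, pix2 + 1 };
        for (int c = 0; c < 4; c++) {
            int s = 0;
            for (int y = 0; y < h; y++)
                for (int x = 0; x < w; x++) {
                    int d = pix1[y * stride1 + x] - cand[c][y * stride2 + x];
                    s += d < 0 ? -d : d;
                }
            sad[c] = s;
        }
    }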
WELS_ASM_FUNC_BEGIN WelsSampleSadFour16x8_neon WELS_ASM_FUNC_BEGIN WelsSampleSadFour16x8_neon
stmdb sp!, {r4-r5, lr} stmdb sp!, {r4-r5, lr}
//Generate the pix2 start addr //Generate the pix2 start addr
sub r4, r2, #1 sub r4, r2, #1
add r5, r2, #1 add r5, r2, #1
sub r2, r3 sub r2, r3
//Loading a horizontal line data (16 bytes) //Loading a horizontal line data (16 bytes)
vld1.8 {q0}, [r0], r1 //save pix1 vld1.8 {q0}, [r0], r1 //save pix1
vld1.8 {q1}, [r2], r3 //save pix2 - stride vld1.8 {q1}, [r2], r3 //save pix2 - stride
vld1.8 {q10}, [r2], r3 //save pix2 vld1.8 {q10}, [r2], r3 //save pix2
vld1.8 {q2}, [r2], r3 //save pix2 + stride vld1.8 {q2}, [r2], r3 //save pix2 + stride
vld1.8 {q3}, [r4], r3 //save pix2 - 1 vld1.8 {q3}, [r4], r3 //save pix2 - 1
vld1.8 {q8}, [r5], r3 //save pix2 + 1 vld1.8 {q8}, [r5], r3 //save pix2 + 1
//Do the SAD for 16 bytes //Do the SAD for 16 bytes
vabdl.u8 q15, d0, d2 vabdl.u8 q15, d0, d2
vabal.u8 q15, d1, d3 vabal.u8 q15, d1, d3
vabdl.u8 q13, d0, d4 vabdl.u8 q13, d0, d4
vabal.u8 q13, d1, d5 vabal.u8 q13, d1, d5
vabdl.u8 q11, d0, d6 vabdl.u8 q11, d0, d6
vabal.u8 q11, d1, d7 vabal.u8 q11, d1, d7
vabdl.u8 q9, d0, d16 vabdl.u8 q9, d0, d16
vabal.u8 q9, d1, d17 vabal.u8 q9, d1, d17
mov lr, #7 mov lr, #7
pixel_sad_4_16x8_loop_0: pixel_sad_4_16x8_loop_0:
//Loading a horizontal line data (16 bytes) //Loading a horizontal line data (16 bytes)
vld1.8 {q0}, [r0], r1 //save pix1 vld1.8 {q0}, [r0], r1 //save pix1
vmov.8 q1, q10 //save pix2 - stride vmov.8 q1, q10 //save pix2 - stride
vmov.8 q10, q2 vmov.8 q10, q2
vabal.u8 q15, d0, d2 vabal.u8 q15, d0, d2
vld1.8 {q2}, [r2], r3 //save pix2 + stride vld1.8 {q2}, [r2], r3 //save pix2 + stride
vabal.u8 q15, d1, d3 vabal.u8 q15, d1, d3
vld1.8 {q3}, [r4], r3 //save pix2 - 1 vld1.8 {q3}, [r4], r3 //save pix2 - 1
vabal.u8 q13, d0, d4 vabal.u8 q13, d0, d4
vld1.8 {q8}, [r5], r3 //save pix2 + 1 vld1.8 {q8}, [r5], r3 //save pix2 + 1
vabal.u8 q13, d1, d5 vabal.u8 q13, d1, d5
subs lr, #1 subs lr, #1
vabal.u8 q11, d0, d6 vabal.u8 q11, d0, d6
vabal.u8 q11, d1, d7 vabal.u8 q11, d1, d7
vabal.u8 q9, d0, d16 vabal.u8 q9, d0, d16
vabal.u8 q9, d1, d17 vabal.u8 q9, d1, d17
bne pixel_sad_4_16x8_loop_0 bne pixel_sad_4_16x8_loop_0
//Save SAD to 'r0' //Save SAD to 'r0'
ldr r0, [sp, #12] ldr r0, [sp, #12]
vadd.u16 d0, d30, d31 vadd.u16 d0, d30, d31
vadd.u16 d1, d26, d27 vadd.u16 d1, d26, d27
vadd.u16 d2, d22, d23 vadd.u16 d2, d22, d23
vadd.u16 d3, d18, d19 vadd.u16 d3, d18, d19
vpaddl.u16 q0, q0 vpaddl.u16 q0, q0
vpaddl.u16 q1, q1 vpaddl.u16 q1, q1
vpaddl.u32 q0, q0 vpaddl.u32 q0, q0
vpaddl.u32 q1, q1 vpaddl.u32 q1, q1
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0] vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
ldmia sp!, {r4-r5, lr} ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsSampleSadFour8x16_neon WELS_ASM_FUNC_BEGIN WelsSampleSadFour8x16_neon
stmdb sp!, {r4-r5, lr} stmdb sp!, {r4-r5, lr}
//Generate the pix2 start addr //Generate the pix2 start addr
sub r4, r2, #1 sub r4, r2, #1
add r5, r2, #1 add r5, r2, #1
sub r2, r3 sub r2, r3
//Loading a horizontal line data (8 bytes) //Loading a horizontal line data (8 bytes)
vld1.8 {d0}, [r0], r1 //save pix1 vld1.8 {d0}, [r0], r1 //save pix1
vld1.8 {d1}, [r2], r3 //save pix2 - stride vld1.8 {d1}, [r2], r3 //save pix2 - stride
vld1.8 {d6}, [r2], r3 //save pix2 vld1.8 {d6}, [r2], r3 //save pix2
vld1.8 {d2}, [r2], r3 //save pix2 + stride vld1.8 {d2}, [r2], r3 //save pix2 + stride
vld1.8 {d3}, [r4], r3 //save pix2 - 1 vld1.8 {d3}, [r4], r3 //save pix2 - 1
vld1.8 {d4}, [r5], r3 //save pix2 + 1 vld1.8 {d4}, [r5], r3 //save pix2 + 1
//Do the SAD for 8 bytes //Do the SAD for 8 bytes
vabdl.u8 q15, d0, d1 vabdl.u8 q15, d0, d1
vabdl.u8 q14, d0, d2 vabdl.u8 q14, d0, d2
vabdl.u8 q13, d0, d3 vabdl.u8 q13, d0, d3
vabdl.u8 q12, d0, d4 vabdl.u8 q12, d0, d4
mov lr, #15 mov lr, #15
pixel_sad_4_8x16_loop_0: pixel_sad_4_8x16_loop_0:
//Loading a horizontal line data (8 bytes) //Loading a horizontal line data (8 bytes)
vld1.8 {d0}, [r0], r1 //save pix1 vld1.8 {d0}, [r0], r1 //save pix1
vmov.8 d1, d6 //save pix2 - stride vmov.8 d1, d6 //save pix2 - stride
vmov.8 d6, d2 vmov.8 d6, d2
vld1.8 {d2}, [r2], r3 //save pix2 + stride vld1.8 {d2}, [r2], r3 //save pix2 + stride
vld1.8 {d3}, [r4], r3 //save pix2 - 1 vld1.8 {d3}, [r4], r3 //save pix2 - 1
vabal.u8 q15, d0, d1 vabal.u8 q15, d0, d1
vld1.8 {d4}, [r5], r3 //save pix2 + 1 vld1.8 {d4}, [r5], r3 //save pix2 + 1
//Do the SAD for 8 bytes //Do the SAD for 8 bytes
vabal.u8 q14, d0, d2 vabal.u8 q14, d0, d2
vabal.u8 q13, d0, d3 vabal.u8 q13, d0, d3
vabal.u8 q12, d0, d4 vabal.u8 q12, d0, d4
subs lr, #1 subs lr, #1
bne pixel_sad_4_8x16_loop_0 bne pixel_sad_4_8x16_loop_0
//Save SAD to 'r0' //Save SAD to 'r0'
ldr r0, [sp, #12] ldr r0, [sp, #12]
vadd.u16 d0, d30, d31 vadd.u16 d0, d30, d31
vadd.u16 d1, d28, d29 vadd.u16 d1, d28, d29
vadd.u16 d2, d26, d27 vadd.u16 d2, d26, d27
vadd.u16 d3, d24, d25 vadd.u16 d3, d24, d25
vpaddl.u16 q0, q0 vpaddl.u16 q0, q0
vpaddl.u16 q1, q1 vpaddl.u16 q1, q1
vpaddl.u32 q0, q0 vpaddl.u32 q0, q0
vpaddl.u32 q1, q1 vpaddl.u32 q1, q1
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0] vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
ldmia sp!, {r4-r5, lr} ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsSampleSadFour8x8_neon WELS_ASM_FUNC_BEGIN WelsSampleSadFour8x8_neon
stmdb sp!, {r4-r5, lr} stmdb sp!, {r4-r5, lr}
//Generate the pix2 start addr //Generate the pix2 start addr
sub r4, r2, #1 sub r4, r2, #1
add r5, r2, #1 add r5, r2, #1
sub r2, r3 sub r2, r3
//Loading a horizontal line data (8 bytes) //Loading a horizontal line data (8 bytes)
vld1.8 {d0}, [r0], r1 //save pix1 vld1.8 {d0}, [r0], r1 //save pix1
vld1.8 {d1}, [r2], r3 //save pix2 - stride vld1.8 {d1}, [r2], r3 //save pix2 - stride
vld1.8 {d6}, [r2], r3 //save pix2 vld1.8 {d6}, [r2], r3 //save pix2
vld1.8 {d2}, [r2], r3 //save pix2 + stride vld1.8 {d2}, [r2], r3 //save pix2 + stride
vld1.8 {d3}, [r4], r3 //save pix2 - 1 vld1.8 {d3}, [r4], r3 //save pix2 - 1
vld1.8 {d4}, [r5], r3 //save pix2 + 1 vld1.8 {d4}, [r5], r3 //save pix2 + 1
//Do the SAD for 8 bytes //Do the SAD for 8 bytes
vabdl.u8 q15, d0, d1 vabdl.u8 q15, d0, d1
vabdl.u8 q14, d0, d2 vabdl.u8 q14, d0, d2
vabdl.u8 q13, d0, d3 vabdl.u8 q13, d0, d3
vabdl.u8 q12, d0, d4 vabdl.u8 q12, d0, d4
mov lr, #7 mov lr, #7
pixel_sad_4_8x8_loop_0: pixel_sad_4_8x8_loop_0:
//Loading a horizontal line data (8 bytes) //Loading a horizontal line data (8 bytes)
vld1.8 {d0}, [r0], r1 //save pix1 vld1.8 {d0}, [r0], r1 //save pix1
vmov.8 d1, d6 //save pix2 - stride vmov.8 d1, d6 //save pix2 - stride
vmov.8 d6, d2 vmov.8 d6, d2
vld1.8 {d2}, [r2], r3 //save pix2 + stride vld1.8 {d2}, [r2], r3 //save pix2 + stride
vld1.8 {d3}, [r4], r3 //save pix2 - 1 vld1.8 {d3}, [r4], r3 //save pix2 - 1
vabal.u8 q15, d0, d1 vabal.u8 q15, d0, d1
vld1.8 {d4}, [r5], r3 //save pix2 + 1 vld1.8 {d4}, [r5], r3 //save pix2 + 1
//Do the SAD for 8 bytes //Do the SAD for 8 bytes
vabal.u8 q14, d0, d2 vabal.u8 q14, d0, d2
vabal.u8 q13, d0, d3 vabal.u8 q13, d0, d3
vabal.u8 q12, d0, d4 vabal.u8 q12, d0, d4
subs lr, #1 subs lr, #1
bne pixel_sad_4_8x8_loop_0 bne pixel_sad_4_8x8_loop_0
//Save SAD to 'r0' //Save SAD to 'r0'
ldr r0, [sp, #12] ldr r0, [sp, #12]
vadd.u16 d0, d30, d31 vadd.u16 d0, d30, d31
vadd.u16 d1, d28, d29 vadd.u16 d1, d28, d29
vadd.u16 d2, d26, d27 vadd.u16 d2, d26, d27
vadd.u16 d3, d24, d25 vadd.u16 d3, d24, d25
vpaddl.u16 q0, q0 vpaddl.u16 q0, q0
vpaddl.u16 q1, q1 vpaddl.u16 q1, q1
vpaddl.u32 q0, q0 vpaddl.u32 q0, q0
vpaddl.u32 q1, q1 vpaddl.u32 q1, q1
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0] vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
ldmia sp!, {r4-r5, lr} ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsSampleSadFour4x4_neon WELS_ASM_FUNC_BEGIN WelsSampleSadFour4x4_neon
vld1.32 {d0[0]}, [r0], r1 vld1.32 {d0[0]}, [r0], r1
vld1.32 {d0[1]}, [r0], r1 vld1.32 {d0[1]}, [r0], r1
vld1.32 {d1[0]}, [r0], r1 vld1.32 {d1[0]}, [r0], r1
vld1.32 {d1[1]}, [r0] vld1.32 {d1[1]}, [r0]
sub r0, r2, r3 sub r0, r2, r3
vld1.32 {d2[0]}, [r0], r3 vld1.32 {d2[0]}, [r0], r3
vld1.32 {d2[1]}, [r0], r3 vld1.32 {d2[1]}, [r0], r3
vld1.32 {d3[0]}, [r0], r3 vld1.32 {d3[0]}, [r0], r3
vld1.32 {d3[1]}, [r0], r3 vld1.32 {d3[1]}, [r0], r3
vld1.32 {d4[0]}, [r0], r3 vld1.32 {d4[0]}, [r0], r3
vld1.32 {d4[1]}, [r0] vld1.32 {d4[1]}, [r0]
sub r0, r2, #1 sub r0, r2, #1
vld1.32 {d5[0]}, [r0], r3 vld1.32 {d5[0]}, [r0], r3
vld1.32 {d5[1]}, [r0], r3 vld1.32 {d5[1]}, [r0], r3
vld1.32 {d6[0]}, [r0], r3 vld1.32 {d6[0]}, [r0], r3
vld1.32 {d6[1]}, [r0] vld1.32 {d6[1]}, [r0]
add r0, r2, #1 add r0, r2, #1
vld1.32 {d7[0]}, [r0], r3 vld1.32 {d7[0]}, [r0], r3
vld1.32 {d7[1]}, [r0], r3 vld1.32 {d7[1]}, [r0], r3
vld1.32 {d8[0]}, [r0], r3 vld1.32 {d8[0]}, [r0], r3
vld1.32 {d8[1]}, [r0] vld1.32 {d8[1]}, [r0]
vabdl.u8 q15, d0, d2 vabdl.u8 q15, d0, d2
vabdl.u8 q14, d1, d3 vabdl.u8 q14, d1, d3
vabdl.u8 q13, d0, d3 vabdl.u8 q13, d0, d3
vabdl.u8 q12, d1, d4 vabdl.u8 q12, d1, d4
vabdl.u8 q11, d0, d5 vabdl.u8 q11, d0, d5
vabdl.u8 q10, d1, d6 vabdl.u8 q10, d1, d6
vabdl.u8 q9, d0, d7 vabdl.u8 q9, d0, d7
vabdl.u8 q8, d1, d8 vabdl.u8 q8, d1, d8
//Save SAD to 'r4' //Save SAD to 'r4'
ldr r0, [sp] ldr r0, [sp]
vadd.u16 q0, q14, q15 vadd.u16 q0, q14, q15
vadd.u16 q1, q12, q13 vadd.u16 q1, q12, q13
vadd.u16 q2, q10, q11 vadd.u16 q2, q10, q11
vadd.u16 q3, q8 , q9 vadd.u16 q3, q8 , q9
vadd.u16 d0, d1 vadd.u16 d0, d1
vadd.u16 d1, d2, d3 vadd.u16 d1, d2, d3
vadd.u16 d2, d4, d5 vadd.u16 d2, d4, d5
vadd.u16 d3, d6, d7 vadd.u16 d3, d6, d7
vpaddl.u16 q0, q0 vpaddl.u16 q0, q0
vpaddl.u16 q1, q1 vpaddl.u16 q1, q1
vpaddl.u32 q0, q0 vpaddl.u32 q0, q0
vpaddl.u32 q1, q1 vpaddl.u32 q1, q1
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0] vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
@ -834,16 +834,16 @@ WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsSampleSatd4x4_neon WELS_ASM_FUNC_BEGIN WelsSampleSatd4x4_neon
//Load the pix1 data --- 16 bytes //Load the pix1 data --- 16 bytes
vld1.32 {d0[0]}, [r0], r1 vld1.32 {d0[0]}, [r0], r1
vld1.32 {d0[1]}, [r0], r1 vld1.32 {d0[1]}, [r0], r1
vld1.32 {d1[0]}, [r0], r1 vld1.32 {d1[0]}, [r0], r1
vld1.32 {d1[1]}, [r0] vld1.32 {d1[1]}, [r0]
//Load the pix2 data --- 16 bytes //Load the pix2 data --- 16 bytes
vld1.32 {d2[0]}, [r2], r3 vld1.32 {d2[0]}, [r2], r3
vld1.32 {d2[1]}, [r2], r3 vld1.32 {d2[1]}, [r2], r3
vld1.32 {d3[0]}, [r2], r3 vld1.32 {d3[0]}, [r2], r3
vld1.32 {d3[1]}, [r2] vld1.32 {d3[1]}, [r2]
//Get the difference //Get the difference
vsubl.u8 q15, d0, d2 //{0,1,2,3,4,5,6,7} vsubl.u8 q15, d0, d2 //{0,1,2,3,4,5,6,7}
@ -874,7 +874,7 @@ WELS_ASM_FUNC_BEGIN WelsSampleSatd4x4_neon
vpaddl.u16 d0, d0 vpaddl.u16 d0, d0
vpaddl.u32 d0, d0 vpaddl.u32 d0, d0
vmov.u32 r0, d0[0] vmov.u32 r0, d0[0]
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
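WelsSampleSatd4x4_neon computes a 4x4 SATD: a difference block, a 4x4 Hadamard transform, then the sum of absolute transform coefficients. A C sketch of that metric (the final (sum + 1) >> 1 normalization is an assumption and may differ from the NEON routine):

    #include <stdint.h>

    /* C sketch of a 4x4 SATD (Hadamard-transformed SAD). */
    static int satd_4x4_ref(const uint8_t *pix1, int stride1,
                            const uint8_t *pix2, int stride2) {
        int d[16], t[16], sum = 0;
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                d[4 * y + x] = pix1[y * stride1 + x] - pix2[y * stride2 + x];
        for (int i = 0; i < 4; i++) {                    /* horizontal Hadamard */
            int s0 = d[4 * i + 0] + d[4 * i + 2], s1 = d[4 * i + 0] - d[4 * i + 2];
            int s2 = d[4 * i + 1] + d[4 * i + 3], s3 = d[4 * i + 1] - d[4 * i + 3];
            t[4 * i + 0] = s0 + s2;
            t[4 * i + 1] = s0 - s2;
            t[4 * i + 2] = s1 + s3;
            t[4 * i + 3] = s1 - s3;
        }
        for (int j = 0; j < 4; j++) {                    /* vertical Hadamard + abs sum */
            int s0 = t[j] + t[j + 8], s1 = t[j] - t[j + 8];
            int s2 = t[j + 4] + t[j + 12], s3 = t[j + 4] - t[j + 12];
            int f0 = s0 + s2, f1 = s0 - s2, f2 = s1 + s3, f3 = s1 - s3;
            sum += (f0 < 0 ? -f0 : f0) + (f1 < 0 ? -f1 : f1)
                 + (f2 < 0 ? -f2 : f2) + (f3 < 0 ? -f3 : f3);
        }
        return (sum + 1) >> 1;
    }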
File diff suppressed because it is too large
@ -55,262 +55,262 @@ sse2_b_1 db -1, -1, -1, -1, -1, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1, -1
align 16 align 16
byte_1pos_table: byte_1pos_table:
db 0,0,0,0,0,0,0,0, ;0 db 0,0,0,0,0,0,0,0, ;0
db 0,0,0,0,0,0,0,1, ;1 db 0,0,0,0,0,0,0,1, ;1
db 1,0,0,0,0,0,0,1, ;2 db 1,0,0,0,0,0,0,1, ;2
db 1,0,0,0,0,0,0,2, ;3 db 1,0,0,0,0,0,0,2, ;3
db 2,0,0,0,0,0,0,1, ;4 db 2,0,0,0,0,0,0,1, ;4
db 2,0,0,0,0,0,0,2, ;5 db 2,0,0,0,0,0,0,2, ;5
db 2,1,0,0,0,0,0,2, ;6 db 2,1,0,0,0,0,0,2, ;6
db 2,1,0,0,0,0,0,3, ;7 db 2,1,0,0,0,0,0,3, ;7
db 3,0,0,0,0,0,0,1, ;8 db 3,0,0,0,0,0,0,1, ;8
db 3,0,0,0,0,0,0,2, ;9 db 3,0,0,0,0,0,0,2, ;9
db 3,1,0,0,0,0,0,2, ;10 db 3,1,0,0,0,0,0,2, ;10
db 3,1,0,0,0,0,0,3, ;11 db 3,1,0,0,0,0,0,3, ;11
db 3,2,0,0,0,0,0,2, ;12 db 3,2,0,0,0,0,0,2, ;12
db 3,2,0,0,0,0,0,3, ;13 db 3,2,0,0,0,0,0,3, ;13
db 3,2,1,0,0,0,0,3, ;14 db 3,2,1,0,0,0,0,3, ;14
db 3,2,1,0,0,0,0,4, ;15 db 3,2,1,0,0,0,0,4, ;15
db 4,0,0,0,0,0,0,1, ;16 db 4,0,0,0,0,0,0,1, ;16
db 4,0,0,0,0,0,0,2, ;17 db 4,0,0,0,0,0,0,2, ;17
db 4,1,0,0,0,0,0,2, ;18 db 4,1,0,0,0,0,0,2, ;18
db 4,1,0,0,0,0,0,3, ;19 db 4,1,0,0,0,0,0,3, ;19
db 4,2,0,0,0,0,0,2, ;20 db 4,2,0,0,0,0,0,2, ;20
db 4,2,0,0,0,0,0,3, ;21 db 4,2,0,0,0,0,0,3, ;21
db 4,2,1,0,0,0,0,3, ;22 db 4,2,1,0,0,0,0,3, ;22
db 4,2,1,0,0,0,0,4, ;23 db 4,2,1,0,0,0,0,4, ;23
db 4,3,0,0,0,0,0,2, ;24 db 4,3,0,0,0,0,0,2, ;24
db 4,3,0,0,0,0,0,3, ;25 db 4,3,0,0,0,0,0,3, ;25
db 4,3,1,0,0,0,0,3, ;26 db 4,3,1,0,0,0,0,3, ;26
db 4,3,1,0,0,0,0,4, ;27 db 4,3,1,0,0,0,0,4, ;27
db 4,3,2,0,0,0,0,3, ;28 db 4,3,2,0,0,0,0,3, ;28
db 4,3,2,0,0,0,0,4, ;29 db 4,3,2,0,0,0,0,4, ;29
db 4,3,2,1,0,0,0,4, ;30 db 4,3,2,1,0,0,0,4, ;30
db 4,3,2,1,0,0,0,5, ;31 db 4,3,2,1,0,0,0,5, ;31
db 5,0,0,0,0,0,0,1, ;32 db 5,0,0,0,0,0,0,1, ;32
db 5,0,0,0,0,0,0,2, ;33 db 5,0,0,0,0,0,0,2, ;33
db 5,1,0,0,0,0,0,2, ;34 db 5,1,0,0,0,0,0,2, ;34
db 5,1,0,0,0,0,0,3, ;35 db 5,1,0,0,0,0,0,3, ;35
db 5,2,0,0,0,0,0,2, ;36 db 5,2,0,0,0,0,0,2, ;36
db 5,2,0,0,0,0,0,3, ;37 db 5,2,0,0,0,0,0,3, ;37
db 5,2,1,0,0,0,0,3, ;38 db 5,2,1,0,0,0,0,3, ;38
db 5,2,1,0,0,0,0,4, ;39 db 5,2,1,0,0,0,0,4, ;39
db 5,3,0,0,0,0,0,2, ;40 db 5,3,0,0,0,0,0,2, ;40
db 5,3,0,0,0,0,0,3, ;41 db 5,3,0,0,0,0,0,3, ;41
db 5,3,1,0,0,0,0,3, ;42 db 5,3,1,0,0,0,0,3, ;42
db 5,3,1,0,0,0,0,4, ;43 db 5,3,1,0,0,0,0,4, ;43
db 5,3,2,0,0,0,0,3, ;44 db 5,3,2,0,0,0,0,3, ;44
db 5,3,2,0,0,0,0,4, ;45 db 5,3,2,0,0,0,0,4, ;45
db 5,3,2,1,0,0,0,4, ;46 db 5,3,2,1,0,0,0,4, ;46
db 5,3,2,1,0,0,0,5, ;47 db 5,3,2,1,0,0,0,5, ;47
db 5,4,0,0,0,0,0,2, ;48 db 5,4,0,0,0,0,0,2, ;48
db 5,4,0,0,0,0,0,3, ;49 db 5,4,0,0,0,0,0,3, ;49
db 5,4,1,0,0,0,0,3, ;50 db 5,4,1,0,0,0,0,3, ;50
db 5,4,1,0,0,0,0,4, ;51 db 5,4,1,0,0,0,0,4, ;51
db 5,4,2,0,0,0,0,3, ;52 db 5,4,2,0,0,0,0,3, ;52
db 5,4,2,0,0,0,0,4, ;53 db 5,4,2,0,0,0,0,4, ;53
db 5,4,2,1,0,0,0,4, ;54 db 5,4,2,1,0,0,0,4, ;54
db 5,4,2,1,0,0,0,5, ;55 db 5,4,2,1,0,0,0,5, ;55
db 5,4,3,0,0,0,0,3, ;56 db 5,4,3,0,0,0,0,3, ;56
db 5,4,3,0,0,0,0,4, ;57 db 5,4,3,0,0,0,0,4, ;57
db 5,4,3,1,0,0,0,4, ;58 db 5,4,3,1,0,0,0,4, ;58
db 5,4,3,1,0,0,0,5, ;59 db 5,4,3,1,0,0,0,5, ;59
db 5,4,3,2,0,0,0,4, ;60 db 5,4,3,2,0,0,0,4, ;60
db 5,4,3,2,0,0,0,5, ;61 db 5,4,3,2,0,0,0,5, ;61
db 5,4,3,2,1,0,0,5, ;62 db 5,4,3,2,1,0,0,5, ;62
db 5,4,3,2,1,0,0,6, ;63 db 5,4,3,2,1,0,0,6, ;63
db 6,0,0,0,0,0,0,1, ;64 db 6,0,0,0,0,0,0,1, ;64
db 6,0,0,0,0,0,0,2, ;65 db 6,0,0,0,0,0,0,2, ;65
db 6,1,0,0,0,0,0,2, ;66 db 6,1,0,0,0,0,0,2, ;66
db 6,1,0,0,0,0,0,3, ;67 db 6,1,0,0,0,0,0,3, ;67
db 6,2,0,0,0,0,0,2, ;68 db 6,2,0,0,0,0,0,2, ;68
db 6,2,0,0,0,0,0,3, ;69 db 6,2,0,0,0,0,0,3, ;69
db 6,2,1,0,0,0,0,3, ;70 db 6,2,1,0,0,0,0,3, ;70
db 6,2,1,0,0,0,0,4, ;71 db 6,2,1,0,0,0,0,4, ;71
db 6,3,0,0,0,0,0,2, ;72 db 6,3,0,0,0,0,0,2, ;72
db 6,3,0,0,0,0,0,3, ;73 db 6,3,0,0,0,0,0,3, ;73
db 6,3,1,0,0,0,0,3, ;74 db 6,3,1,0,0,0,0,3, ;74
db 6,3,1,0,0,0,0,4, ;75 db 6,3,1,0,0,0,0,4, ;75
db 6,3,2,0,0,0,0,3, ;76 db 6,3,2,0,0,0,0,3, ;76
db 6,3,2,0,0,0,0,4, ;77 db 6,3,2,0,0,0,0,4, ;77
db 6,3,2,1,0,0,0,4, ;78 db 6,3,2,1,0,0,0,4, ;78
db 6,3,2,1,0,0,0,5, ;79 db 6,3,2,1,0,0,0,5, ;79
db 6,4,0,0,0,0,0,2, ;80 db 6,4,0,0,0,0,0,2, ;80
db 6,4,0,0,0,0,0,3, ;81 db 6,4,0,0,0,0,0,3, ;81
db 6,4,1,0,0,0,0,3, ;82 db 6,4,1,0,0,0,0,3, ;82
db 6,4,1,0,0,0,0,4, ;83 db 6,4,1,0,0,0,0,4, ;83
db 6,4,2,0,0,0,0,3, ;84 db 6,4,2,0,0,0,0,3, ;84
db 6,4,2,0,0,0,0,4, ;85 db 6,4,2,0,0,0,0,4, ;85
db 6,4,2,1,0,0,0,4, ;86 db 6,4,2,1,0,0,0,4, ;86
db 6,4,2,1,0,0,0,5, ;87 db 6,4,2,1,0,0,0,5, ;87
db 6,4,3,0,0,0,0,3, ;88 db 6,4,3,0,0,0,0,3, ;88
db 6,4,3,0,0,0,0,4, ;89 db 6,4,3,0,0,0,0,4, ;89
db 6,4,3,1,0,0,0,4, ;90 db 6,4,3,1,0,0,0,4, ;90
db 6,4,3,1,0,0,0,5, ;91 db 6,4,3,1,0,0,0,5, ;91
db 6,4,3,2,0,0,0,4, ;92 db 6,4,3,2,0,0,0,4, ;92
db 6,4,3,2,0,0,0,5, ;93 db 6,4,3,2,0,0,0,5, ;93
db 6,4,3,2,1,0,0,5, ;94 db 6,4,3,2,1,0,0,5, ;94
db 6,4,3,2,1,0,0,6, ;95 db 6,4,3,2,1,0,0,6, ;95
db 6,5,0,0,0,0,0,2, ;96 db 6,5,0,0,0,0,0,2, ;96
db 6,5,0,0,0,0,0,3, ;97 db 6,5,0,0,0,0,0,3, ;97
db 6,5,1,0,0,0,0,3, ;98 db 6,5,1,0,0,0,0,3, ;98
db 6,5,1,0,0,0,0,4, ;99 db 6,5,1,0,0,0,0,4, ;99
db 6,5,2,0,0,0,0,3, ;100 db 6,5,2,0,0,0,0,3, ;100
db 6,5,2,0,0,0,0,4, ;101 db 6,5,2,0,0,0,0,4, ;101
db 6,5,2,1,0,0,0,4, ;102 db 6,5,2,1,0,0,0,4, ;102
db 6,5,2,1,0,0,0,5, ;103 db 6,5,2,1,0,0,0,5, ;103
db 6,5,3,0,0,0,0,3, ;104 db 6,5,3,0,0,0,0,3, ;104
db 6,5,3,0,0,0,0,4, ;105 db 6,5,3,0,0,0,0,4, ;105
db 6,5,3,1,0,0,0,4, ;106 db 6,5,3,1,0,0,0,4, ;106
db 6,5,3,1,0,0,0,5, ;107 db 6,5,3,1,0,0,0,5, ;107
db 6,5,3,2,0,0,0,4, ;108 db 6,5,3,2,0,0,0,4, ;108
db 6,5,3,2,0,0,0,5, ;109 db 6,5,3,2,0,0,0,5, ;109
db 6,5,3,2,1,0,0,5, ;110 db 6,5,3,2,1,0,0,5, ;110
db 6,5,3,2,1,0,0,6, ;111 db 6,5,3,2,1,0,0,6, ;111
db 6,5,4,0,0,0,0,3, ;112 db 6,5,4,0,0,0,0,3, ;112
db 6,5,4,0,0,0,0,4, ;113 db 6,5,4,0,0,0,0,4, ;113
db 6,5,4,1,0,0,0,4, ;114 db 6,5,4,1,0,0,0,4, ;114
db 6,5,4,1,0,0,0,5, ;115 db 6,5,4,1,0,0,0,5, ;115
db 6,5,4,2,0,0,0,4, ;116 db 6,5,4,2,0,0,0,4, ;116
db 6,5,4,2,0,0,0,5, ;117 db 6,5,4,2,0,0,0,5, ;117
db 6,5,4,2,1,0,0,5, ;118 db 6,5,4,2,1,0,0,5, ;118
db 6,5,4,2,1,0,0,6, ;119 db 6,5,4,2,1,0,0,6, ;119
db 6,5,4,3,0,0,0,4, ;120 db 6,5,4,3,0,0,0,4, ;120
db 6,5,4,3,0,0,0,5, ;121 db 6,5,4,3,0,0,0,5, ;121
db 6,5,4,3,1,0,0,5, ;122 db 6,5,4,3,1,0,0,5, ;122
db 6,5,4,3,1,0,0,6, ;123 db 6,5,4,3,1,0,0,6, ;123
db 6,5,4,3,2,0,0,5, ;124 db 6,5,4,3,2,0,0,5, ;124
db 6,5,4,3,2,0,0,6, ;125 db 6,5,4,3,2,0,0,6, ;125
db 6,5,4,3,2,1,0,6, ;126 db 6,5,4,3,2,1,0,6, ;126
db 6,5,4,3,2,1,0,7, ;127 db 6,5,4,3,2,1,0,7, ;127
db 7,0,0,0,0,0,0,1, ;128 db 7,0,0,0,0,0,0,1, ;128
db 7,0,0,0,0,0,0,2, ;129 db 7,0,0,0,0,0,0,2, ;129
db 7,1,0,0,0,0,0,2, ;130 db 7,1,0,0,0,0,0,2, ;130
db 7,1,0,0,0,0,0,3, ;131 db 7,1,0,0,0,0,0,3, ;131
db 7,2,0,0,0,0,0,2, ;132 db 7,2,0,0,0,0,0,2, ;132
db 7,2,0,0,0,0,0,3, ;133 db 7,2,0,0,0,0,0,3, ;133
db 7,2,1,0,0,0,0,3, ;134 db 7,2,1,0,0,0,0,3, ;134
db 7,2,1,0,0,0,0,4, ;135 db 7,2,1,0,0,0,0,4, ;135
db 7,3,0,0,0,0,0,2, ;136 db 7,3,0,0,0,0,0,2, ;136
db 7,3,0,0,0,0,0,3, ;137 db 7,3,0,0,0,0,0,3, ;137
db 7,3,1,0,0,0,0,3, ;138 db 7,3,1,0,0,0,0,3, ;138
db 7,3,1,0,0,0,0,4, ;139 db 7,3,1,0,0,0,0,4, ;139
db 7,3,2,0,0,0,0,3, ;140 db 7,3,2,0,0,0,0,3, ;140
db 7,3,2,0,0,0,0,4, ;141 db 7,3,2,0,0,0,0,4, ;141
db 7,3,2,1,0,0,0,4, ;142 db 7,3,2,1,0,0,0,4, ;142
db 7,3,2,1,0,0,0,5, ;143 db 7,3,2,1,0,0,0,5, ;143
db 7,4,0,0,0,0,0,2, ;144 db 7,4,0,0,0,0,0,2, ;144
db 7,4,0,0,0,0,0,3, ;145 db 7,4,0,0,0,0,0,3, ;145
db 7,4,1,0,0,0,0,3, ;146 db 7,4,1,0,0,0,0,3, ;146
db 7,4,1,0,0,0,0,4, ;147 db 7,4,1,0,0,0,0,4, ;147
db 7,4,2,0,0,0,0,3, ;148 db 7,4,2,0,0,0,0,3, ;148
db 7,4,2,0,0,0,0,4, ;149 db 7,4,2,0,0,0,0,4, ;149
db 7,4,2,1,0,0,0,4, ;150 db 7,4,2,1,0,0,0,4, ;150
db 7,4,2,1,0,0,0,5, ;151 db 7,4,2,1,0,0,0,5, ;151
db 7,4,3,0,0,0,0,3, ;152 db 7,4,3,0,0,0,0,3, ;152
db 7,4,3,0,0,0,0,4, ;153 db 7,4,3,0,0,0,0,4, ;153
db 7,4,3,1,0,0,0,4, ;154 db 7,4,3,1,0,0,0,4, ;154
db 7,4,3,1,0,0,0,5, ;155 db 7,4,3,1,0,0,0,5, ;155
db 7,4,3,2,0,0,0,4, ;156 db 7,4,3,2,0,0,0,4, ;156
db 7,4,3,2,0,0,0,5, ;157 db 7,4,3,2,0,0,0,5, ;157
db 7,4,3,2,1,0,0,5, ;158 db 7,4,3,2,1,0,0,5, ;158
db 7,4,3,2,1,0,0,6, ;159 db 7,4,3,2,1,0,0,6, ;159
db 7,5,0,0,0,0,0,2, ;160 db 7,5,0,0,0,0,0,2, ;160
db 7,5,0,0,0,0,0,3, ;161 db 7,5,0,0,0,0,0,3, ;161
db 7,5,1,0,0,0,0,3, ;162 db 7,5,1,0,0,0,0,3, ;162
db 7,5,1,0,0,0,0,4, ;163 db 7,5,1,0,0,0,0,4, ;163
db 7,5,2,0,0,0,0,3, ;164 db 7,5,2,0,0,0,0,3, ;164
db 7,5,2,0,0,0,0,4, ;165 db 7,5,2,0,0,0,0,4, ;165
db 7,5,2,1,0,0,0,4, ;166 db 7,5,2,1,0,0,0,4, ;166
db 7,5,2,1,0,0,0,5, ;167 db 7,5,2,1,0,0,0,5, ;167
db 7,5,3,0,0,0,0,3, ;168 db 7,5,3,0,0,0,0,3, ;168
db 7,5,3,0,0,0,0,4, ;169 db 7,5,3,0,0,0,0,4, ;169
db 7,5,3,1,0,0,0,4, ;170 db 7,5,3,1,0,0,0,4, ;170
db 7,5,3,1,0,0,0,5, ;171 db 7,5,3,1,0,0,0,5, ;171
db 7,5,3,2,0,0,0,4, ;172 db 7,5,3,2,0,0,0,4, ;172
db 7,5,3,2,0,0,0,5, ;173 db 7,5,3,2,0,0,0,5, ;173
db 7,5,3,2,1,0,0,5, ;174 db 7,5,3,2,1,0,0,5, ;174
db 7,5,3,2,1,0,0,6, ;175 db 7,5,3,2,1,0,0,6, ;175
db 7,5,4,0,0,0,0,3, ;176 db 7,5,4,0,0,0,0,3, ;176
db 7,5,4,0,0,0,0,4, ;177 db 7,5,4,0,0,0,0,4, ;177
db 7,5,4,1,0,0,0,4, ;178 db 7,5,4,1,0,0,0,4, ;178
db 7,5,4,1,0,0,0,5, ;179 db 7,5,4,1,0,0,0,5, ;179
db 7,5,4,2,0,0,0,4, ;180 db 7,5,4,2,0,0,0,4, ;180
db 7,5,4,2,0,0,0,5, ;181 db 7,5,4,2,0,0,0,5, ;181
db 7,5,4,2,1,0,0,5, ;182 db 7,5,4,2,1,0,0,5, ;182
db 7,5,4,2,1,0,0,6, ;183 db 7,5,4,2,1,0,0,6, ;183
db 7,5,4,3,0,0,0,4, ;184 db 7,5,4,3,0,0,0,4, ;184
db 7,5,4,3,0,0,0,5, ;185 db 7,5,4,3,0,0,0,5, ;185
db 7,5,4,3,1,0,0,5, ;186 db 7,5,4,3,1,0,0,5, ;186
db 7,5,4,3,1,0,0,6, ;187 db 7,5,4,3,1,0,0,6, ;187
db 7,5,4,3,2,0,0,5, ;188 db 7,5,4,3,2,0,0,5, ;188
db 7,5,4,3,2,0,0,6, ;189 db 7,5,4,3,2,0,0,6, ;189
db 7,5,4,3,2,1,0,6, ;190 db 7,5,4,3,2,1,0,6, ;190
db 7,5,4,3,2,1,0,7, ;191 db 7,5,4,3,2,1,0,7, ;191
db 7,6,0,0,0,0,0,2, ;192 db 7,6,0,0,0,0,0,2, ;192
db 7,6,0,0,0,0,0,3, ;193 db 7,6,0,0,0,0,0,3, ;193
db 7,6,1,0,0,0,0,3, ;194 db 7,6,1,0,0,0,0,3, ;194
db 7,6,1,0,0,0,0,4, ;195 db 7,6,1,0,0,0,0,4, ;195
db 7,6,2,0,0,0,0,3, ;196 db 7,6,2,0,0,0,0,3, ;196
db 7,6,2,0,0,0,0,4, ;197 db 7,6,2,0,0,0,0,4, ;197
db 7,6,2,1,0,0,0,4, ;198 db 7,6,2,1,0,0,0,4, ;198
db 7,6,2,1,0,0,0,5, ;199 db 7,6,2,1,0,0,0,5, ;199
db 7,6,3,0,0,0,0,3, ;200 db 7,6,3,0,0,0,0,3, ;200
db 7,6,3,0,0,0,0,4, ;201 db 7,6,3,0,0,0,0,4, ;201
db 7,6,3,1,0,0,0,4, ;202 db 7,6,3,1,0,0,0,4, ;202
db 7,6,3,1,0,0,0,5, ;203 db 7,6,3,1,0,0,0,5, ;203
db 7,6,3,2,0,0,0,4, ;204 db 7,6,3,2,0,0,0,4, ;204
db 7,6,3,2,0,0,0,5, ;205 db 7,6,3,2,0,0,0,5, ;205
db 7,6,3,2,1,0,0,5, ;206 db 7,6,3,2,1,0,0,5, ;206
db 7,6,3,2,1,0,0,6, ;207 db 7,6,3,2,1,0,0,6, ;207
db 7,6,4,0,0,0,0,3, ;208 db 7,6,4,0,0,0,0,3, ;208
db 7,6,4,0,0,0,0,4, ;209 db 7,6,4,0,0,0,0,4, ;209
db 7,6,4,1,0,0,0,4, ;210 db 7,6,4,1,0,0,0,4, ;210
db 7,6,4,1,0,0,0,5, ;211 db 7,6,4,1,0,0,0,5, ;211
db 7,6,4,2,0,0,0,4, ;212 db 7,6,4,2,0,0,0,4, ;212
db 7,6,4,2,0,0,0,5, ;213 db 7,6,4,2,0,0,0,5, ;213
db 7,6,4,2,1,0,0,5, ;214 db 7,6,4,2,1,0,0,5, ;214
db 7,6,4,2,1,0,0,6, ;215 db 7,6,4,2,1,0,0,6, ;215
db 7,6,4,3,0,0,0,4, ;216 db 7,6,4,3,0,0,0,4, ;216
db 7,6,4,3,0,0,0,5, ;217 db 7,6,4,3,0,0,0,5, ;217
db 7,6,4,3,1,0,0,5, ;218 db 7,6,4,3,1,0,0,5, ;218
db 7,6,4,3,1,0,0,6, ;219 db 7,6,4,3,1,0,0,6, ;219
db 7,6,4,3,2,0,0,5, ;220 db 7,6,4,3,2,0,0,5, ;220
db 7,6,4,3,2,0,0,6, ;221 db 7,6,4,3,2,0,0,6, ;221
db 7,6,4,3,2,1,0,6, ;222 db 7,6,4,3,2,1,0,6, ;222
db 7,6,4,3,2,1,0,7, ;223 db 7,6,4,3,2,1,0,7, ;223
db 7,6,5,0,0,0,0,3, ;224 db 7,6,5,0,0,0,0,3, ;224
db 7,6,5,0,0,0,0,4, ;225 db 7,6,5,0,0,0,0,4, ;225
db 7,6,5,1,0,0,0,4, ;226 db 7,6,5,1,0,0,0,4, ;226
db 7,6,5,1,0,0,0,5, ;227 db 7,6,5,1,0,0,0,5, ;227
db 7,6,5,2,0,0,0,4, ;228 db 7,6,5,2,0,0,0,4, ;228
db 7,6,5,2,0,0,0,5, ;229 db 7,6,5,2,0,0,0,5, ;229
db 7,6,5,2,1,0,0,5, ;230 db 7,6,5,2,1,0,0,5, ;230
db 7,6,5,2,1,0,0,6, ;231 db 7,6,5,2,1,0,0,6, ;231
db 7,6,5,3,0,0,0,4, ;232 db 7,6,5,3,0,0,0,4, ;232
db 7,6,5,3,0,0,0,5, ;233 db 7,6,5,3,0,0,0,5, ;233
db 7,6,5,3,1,0,0,5, ;234 db 7,6,5,3,1,0,0,5, ;234
db 7,6,5,3,1,0,0,6, ;235 db 7,6,5,3,1,0,0,6, ;235
db 7,6,5,3,2,0,0,5, ;236 db 7,6,5,3,2,0,0,5, ;236
db 7,6,5,3,2,0,0,6, ;237 db 7,6,5,3,2,0,0,6, ;237
db 7,6,5,3,2,1,0,6, ;238 db 7,6,5,3,2,1,0,6, ;238
db 7,6,5,3,2,1,0,7, ;239 db 7,6,5,3,2,1,0,7, ;239
db 7,6,5,4,0,0,0,4, ;240 db 7,6,5,4,0,0,0,4, ;240
db 7,6,5,4,0,0,0,5, ;241 db 7,6,5,4,0,0,0,5, ;241
db 7,6,5,4,1,0,0,5, ;242 db 7,6,5,4,1,0,0,5, ;242
db 7,6,5,4,1,0,0,6, ;243 db 7,6,5,4,1,0,0,6, ;243
db 7,6,5,4,2,0,0,5, ;244 db 7,6,5,4,2,0,0,5, ;244
db 7,6,5,4,2,0,0,6, ;245 db 7,6,5,4,2,0,0,6, ;245
db 7,6,5,4,2,1,0,6, ;246 db 7,6,5,4,2,1,0,6, ;246
db 7,6,5,4,2,1,0,7, ;247 db 7,6,5,4,2,1,0,7, ;247
db 7,6,5,4,3,0,0,5, ;248 db 7,6,5,4,3,0,0,5, ;248
db 7,6,5,4,3,0,0,6, ;249 db 7,6,5,4,3,0,0,6, ;249
db 7,6,5,4,3,1,0,6, ;250 db 7,6,5,4,3,1,0,6, ;250
db 7,6,5,4,3,1,0,7, ;251 db 7,6,5,4,3,1,0,7, ;251
db 7,6,5,4,3,2,0,6, ;252 db 7,6,5,4,3,2,0,6, ;252
db 7,6,5,4,3,2,0,7, ;253 db 7,6,5,4,3,2,0,7, ;253
db 7,6,5,4,3,2,1,7, ;254 db 7,6,5,4,3,2,1,7, ;254
db 7,6,5,4,3,2,1,8, ;255 db 7,6,5,4,3,2,1,8, ;255
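The byte_1pos_table above appears to store, for every possible byte value, the positions of its set bits in descending order (seven slots, where a set bit 0 is indistinguishable from the zero padding) followed by the popcount in the eighth byte; CavlcParamCal_sse2 below indexes it eight bytes per entry. A hypothetical C generator, shown only to document the layout (it is not part of the source tree):

    #include <stdio.h>

    int main(void) {
        for (int b = 0; b < 256; ++b) {
            unsigned char row[8] = {0};
            int n = 0;
            for (int bit = 7; bit >= 0; --bit) {
                if (b & (1 << bit)) {
                    if (n < 7) row[n] = (unsigned char)bit; /* bit 0 stays 0 anyway */
                    ++n;
                }
            }
            row[7] = (unsigned char)n;                      /* popcount in the last slot */
            printf("db %d,%d,%d,%d,%d,%d,%d,%d, ;%d\n",
                   row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7], b);
        }
        return 0;
    }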
;*********************************************************************** ;***********************************************************************
; Code ; Code
@ -323,43 +323,43 @@ SECTION .text
;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx); ;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
;*********************************************************************** ;***********************************************************************
WELS_EXTERN CavlcParamCal_sse2 WELS_EXTERN CavlcParamCal_sse2
push ebx push ebx
push edi push edi
push esi push esi
mov eax, [esp+16] ;coffLevel mov eax, [esp+16] ;coffLevel
mov edi, [esp+24] ;Level mov edi, [esp+24] ;Level
mov ebx, [esp+32] ;endIdx mov ebx, [esp+32] ;endIdx
cmp ebx, 3 cmp ebx, 3
jne .Level16 jne .Level16
pxor xmm1, xmm1 pxor xmm1, xmm1
movq xmm0, [eax] ; removed QWORD movq xmm0, [eax] ; removed QWORD
jmp .Cal_begin jmp .Cal_begin
.Level16: .Level16:
movdqa xmm0, [eax] movdqa xmm0, [eax]
movdqa xmm1, [eax+16] movdqa xmm1, [eax+16]
.Cal_begin: .Cal_begin:
movdqa xmm2, xmm0 movdqa xmm2, xmm0
packsswb xmm0, xmm1 packsswb xmm0, xmm1
movdqa xmm4, xmm0 movdqa xmm4, xmm0
pxor xmm3, xmm3 pxor xmm3, xmm3
pcmpgtb xmm0, xmm3 pcmpgtb xmm0, xmm3
pcmpgtb xmm3, xmm4 pcmpgtb xmm3, xmm4
por xmm0, xmm3 por xmm0, xmm3
pmovmskb edx, xmm0 pmovmskb edx, xmm0
cmp edx, 0 cmp edx, 0
je near .return je near .return
movdqa xmm6, [sse2_b_1] movdqa xmm6, [sse2_b_1]
pcmpeqw xmm7, xmm7 ;generate -1 pcmpeqw xmm7, xmm7 ;generate -1
mov ebx, 0xff mov ebx, 0xff
;pinsrw xmm6, ebx, 3 ;pinsrw xmm6, ebx, 3
mov bl, dh mov bl, dh
lea ebx, [byte_1pos_table+8*ebx] lea ebx, [byte_1pos_table+8*ebx]
movq xmm0, [ebx] movq xmm0, [ebx]
pextrw ecx, xmm0, 3 pextrw ecx, xmm0, 3
shr ecx, 8 shr ecx, 8
mov dh, cl mov dh, cl
.loopHighFind0: .loopHighFind0:
@ -367,19 +367,19 @@ WELS_EXTERN CavlcParamCal_sse2
je .loopHighFind0End je .loopHighFind0End
;mov esi, [ebx] ;mov esi, [ebx]
;and esi, 0xff ;and esi, 0xff
movzx esi, byte [ebx] movzx esi, byte [ebx]
add esi, 8 add esi, 8
mov esi, [eax+2*esi] mov esi, [eax+2*esi]
mov [edi], si mov [edi], si
add edi, 2 add edi, 2
;add ebx, 1 ;add ebx, 1
inc ebx inc ebx
dec ecx dec ecx
jmp .loopHighFind0 jmp .loopHighFind0
.loopHighFind0End: .loopHighFind0End:
mov cl, dh mov cl, dh
cmp cl, 8 cmp cl, 8
pand xmm0, xmm6 pand xmm0, xmm6
jne .LowByteFind0 jne .LowByteFind0
sub edi, 2 sub edi, 2
mov esi, [eax+16] mov esi, [eax+16]
@ -387,8 +387,8 @@ WELS_EXTERN CavlcParamCal_sse2
add edi, 2 add edi, 2
.LowByteFind0: .LowByteFind0:
and edx, 0xff and edx, 0xff
lea ebx, [byte_1pos_table+8*edx] lea ebx, [byte_1pos_table+8*edx]
movq xmm1, [ebx] movq xmm1, [ebx]
pextrw esi, xmm1, 3 pextrw esi, xmm1, 3
or esi, 0xff or esi, 0xff
or ecx, 0xff00 or ecx, 0xff00
@ -398,16 +398,16 @@ WELS_EXTERN CavlcParamCal_sse2
.loopLowFind0: .loopLowFind0:
cmp esi, 0 cmp esi, 0
je .loopLowFind0End je .loopLowFind0End
;mov edx, [ebx] ;mov edx, [ebx]
;and edx, 0xff ;and edx, 0xff
movzx edx, byte [ebx] movzx edx, byte [ebx]
mov edx, [eax+2*edx] mov edx, [eax+2*edx]
mov [edi], dx mov [edi], dx
add edi, 2 add edi, 2
;add ebx, 1 ;add ebx, 1
inc ebx inc ebx
dec esi dec esi
jmp .loopLowFind0 jmp .loopLowFind0
.loopLowFind0End: .loopLowFind0End:
cmp ch, 8 cmp ch, 8
jne .getLevelEnd jne .getLevelEnd
@ -415,12 +415,12 @@ WELS_EXTERN CavlcParamCal_sse2
mov edx, [eax] mov edx, [eax]
mov [edi], dx mov [edi], dx
.getLevelEnd: .getLevelEnd:
mov edx, [esp+28] ;total_coeffs mov edx, [esp+28] ;total_coeffs
;mov ebx, ecx ;mov ebx, ecx
;and ebx, 0xff ;and ebx, 0xff
movzx ebx, byte cl movzx ebx, byte cl
add cl, ch add cl, ch
mov [edx], cl mov [edx], cl
;getRun ;getRun
movq xmm5, [sse2_b8] movq xmm5, [sse2_b8]
paddb xmm0, xmm5 paddb xmm0, xmm5
@ -430,7 +430,7 @@ WELS_EXTERN CavlcParamCal_sse2
sub eax, ebx sub eax, ebx
shl eax, 3 shl eax, 3
shl ebx, 3 shl ebx, 3
pinsrw xmm2, ebx, 0 pinsrw xmm2, ebx, 0
pinsrw xmm3, eax, 0 pinsrw xmm3, eax, 0
psllq xmm0, xmm3 psllq xmm0, xmm3
psrlq xmm0, xmm3 psrlq xmm0, xmm3
@ -441,19 +441,19 @@ WELS_EXTERN CavlcParamCal_sse2
por xmm0, xmm1 por xmm0, xmm1
pextrw eax, xmm0, 0 pextrw eax, xmm0, 0
and eax, 0xff and eax, 0xff
inc eax inc eax
sub al, cl sub al, cl
movdqa xmm1, xmm0 movdqa xmm1, xmm0
paddb xmm1, xmm7 paddb xmm1, xmm7
psrldq xmm0, 1 psrldq xmm0, 1
psubb xmm1, xmm0 psubb xmm1, xmm0
mov ecx, [esp+20] ;run mov ecx, [esp+20] ;run
movdqa [ecx], xmm1 movdqa [ecx], xmm1
;getRunEnd ;getRunEnd
.return: .return:
pop esi pop esi
pop edi pop edi
pop ebx pop ebx
ret ret
%endif %endif
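For reference, the routine above extracts the non-zero levels and the zero runs that CAVLC coding needs from a coefficient block. A rough scalar sketch of that contract, inferred from the prototype and the code above (it is not the project's actual C fallback, and the return value is omitted):

    /* Illustrative only: scalar view of the CavlcParamCal contract (inferred). */
    static void CavlcParamCal_ref(const short *coffLevel, unsigned char *run,
                                  short *Level, int *total_coeffs, int endIdx) {
        int n = 0;
        int i = endIdx;
        while (i >= 0 && coffLevel[i] == 0)       /* skip trailing zeros */
            --i;
        while (i >= 0) {
            Level[n] = coffLevel[i--];            /* next non-zero, reverse scan order */
            int zeros = 0;
            while (i >= 0 && coffLevel[i] == 0) { /* zeros until the next non-zero */
                ++zeros;
                --i;
            }
            run[n++] = (unsigned char)zeros;
        }
        *total_coeffs = n;
    }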

View File

@ -50,17 +50,17 @@ SECTION .rodata align=16
align 16 align 16
SSE2_DeQuant8 dw 10, 13, 10, 13, 13, 16, 13, 16, SSE2_DeQuant8 dw 10, 13, 10, 13, 13, 16, 13, 16,
dw 10, 13, 10, 13, 13, 16, 13, 16, dw 10, 13, 10, 13, 13, 16, 13, 16,
dw 11, 14, 11, 14, 14, 18, 14, 18, dw 11, 14, 11, 14, 14, 18, 14, 18,
dw 11, 14, 11, 14, 14, 18, 14, 18, dw 11, 14, 11, 14, 14, 18, 14, 18,
dw 13, 16, 13, 16, 16, 20, 16, 20, dw 13, 16, 13, 16, 16, 20, 16, 20,
dw 13, 16, 13, 16, 16, 20, 16, 20, dw 13, 16, 13, 16, 16, 20, 16, 20,
dw 14, 18, 14, 18, 18, 23, 18, 23, dw 14, 18, 14, 18, 18, 23, 18, 23,
dw 14, 18, 14, 18, 18, 23, 18, 23, dw 14, 18, 14, 18, 18, 23, 18, 23,
dw 16, 20, 16, 20, 20, 25, 20, 25, dw 16, 20, 16, 20, 20, 25, 20, 25,
dw 16, 20, 16, 20, 20, 25, 20, 25, dw 16, 20, 16, 20, 20, 25, 20, 25,
dw 18, 23, 18, 23, 23, 29, 23, 29, dw 18, 23, 18, 23, 23, 29, 23, 29,
dw 18, 23, 18, 23, 23, 29, 23, 29 dw 18, 23, 18, 23, 23, 29, 23, 29
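The two 16-word rows per group above match the standard H.264 dequantisation scales V for QP mod 6 = 0..5, duplicated across the 4x4 position classes (10/13/16, 11/14/18, and so on). A hedged sketch of how one such row is applied per coefficient; any QP/6 shift is assumed to be handled by the caller:

    /* One pmullw lane of the SSE2 dequant code, written out in scalar form. */
    static int dequant_coeff(int c, int qp_mod6, int pos, const short V[6][16]) {
        return c * V[qp_mod6][pos];
    }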
;*********************************************************************** ;***********************************************************************
@ -68,27 +68,27 @@ SSE2_DeQuant8 dw 10, 13, 10, 13, 13, 16, 13, 16,
;*********************************************************************** ;***********************************************************************
%macro MMX_LoadDiff4P 5 %macro MMX_LoadDiff4P 5
movd %1, [%3] movd %1, [%3]
movd %2, [%4] movd %2, [%4]
punpcklbw %1, %5 punpcklbw %1, %5
punpcklbw %2, %5 punpcklbw %2, %5
psubw %1, %2 psubw %1, %2
%endmacro %endmacro
%macro MMX_LoadDiff4x4P 10 ;d0, d1, d2, d3, pix1address, pix1stride, pix2address, pix2stride, tmp(mm), 0(mm) %macro MMX_LoadDiff4x4P 10 ;d0, d1, d2, d3, pix1address, pix1stride, pix2address, pix2stride, tmp(mm), 0(mm)
MMX_LoadDiff4P %1, %9, %5, %7, %10 MMX_LoadDiff4P %1, %9, %5, %7, %10
MMX_LoadDiff4P %2, %9, %5+%6, %7+%8, %10 MMX_LoadDiff4P %2, %9, %5+%6, %7+%8, %10
lea %5, [%5+2*%6] lea %5, [%5+2*%6]
lea %7, [%7+2*%8] lea %7, [%7+2*%8]
MMX_LoadDiff4P %3, %9, %5, %7, %10 MMX_LoadDiff4P %3, %9, %5, %7, %10
MMX_LoadDiff4P %4, %9, %5+%6, %7+%8, %10 MMX_LoadDiff4P %4, %9, %5+%6, %7+%8, %10
%endmacro %endmacro
%macro MMX_SumSubMul2 3 %macro MMX_SumSubMul2 3
movq %3, %1 movq %3, %1
psllw %1, $01 psllw %1, $01
paddw %1, %2 paddw %1, %2
psllw %2, $01 psllw %2, $01
psubw %3, %2 psubw %3, %2
%endmacro %endmacro
@ -101,23 +101,23 @@ SSE2_DeQuant8 dw 10, 13, 10, 13, 13, 16, 13, 16,
%endmacro %endmacro
%macro MMX_SumSub 3 %macro MMX_SumSub 3
movq %3, %2 movq %3, %2
psubw %2, %1 psubw %2, %1
paddw %1, %3 paddw %1, %3
%endmacro %endmacro
%macro MMX_DCT 6 %macro MMX_DCT 6
MMX_SumSub %4, %1, %6 MMX_SumSub %4, %1, %6
MMX_SumSub %3, %2, %6 MMX_SumSub %3, %2, %6
MMX_SumSub %3, %4, %6 MMX_SumSub %3, %4, %6
MMX_SumSubMul2 %1, %2, %5 MMX_SumSubMul2 %1, %2, %5
%endmacro %endmacro
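MMX_SumSub turns a register pair into (a+b, b-a) and MMX_SumSubMul2 into the (2a+b, a-2b) pair, so MMX_DCT amounts to one 1-D pass of the H.264 4x4 forward integer transform; the mapping of macro arguments to rows or columns follows the callers. A scalar sketch of that 1-D pass:

    /* One 1-D pass of the H.264 4x4 forward integer transform,
       which the MMX_SumSub / MMX_SumSubMul2 butterflies appear to implement. */
    static void dct4_1d(const short x[4], short y[4]) {
        short s03 = x[0] + x[3], d03 = x[0] - x[3];
        short s12 = x[1] + x[2], d12 = x[1] - x[2];
        y[0] = s03 + s12;
        y[2] = s03 - s12;
        y[1] = (short)(2 * d03 + d12);
        y[3] = (short)(d03 - 2 * d12);
    }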
%macro MMX_IDCT 6 %macro MMX_IDCT 6
MMX_SumSub %4, %5, %6 MMX_SumSub %4, %5, %6
MMX_SumSubDiv2 %3, %2, %1 MMX_SumSubDiv2 %3, %2, %1
MMX_SumSub %1, %4, %6 MMX_SumSub %1, %4, %6
MMX_SumSub %3, %5, %6 MMX_SumSub %3, %5, %6
%endmacro %endmacro
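MMX_IDCT is the matching 1-D inverse pass; with MMX_SumSubDiv2 supplying the half-weighted terms it appears to implement the standard H.264 4x4 inverse core transform, sketched below (argument-to-row mapping again per the callers):

    static void idct4_1d(const short x[4], short y[4]) {
        short e0 = x[0] + x[2];
        short e1 = x[0] - x[2];
        short e2 = (short)((x[1] >> 1) - x[3]);
        short e3 = (short)(x[1] + (x[3] >> 1));
        y[0] = e0 + e3;
        y[1] = e1 + e2;
        y[2] = e1 - e2;
        y[3] = e0 - e3;
    }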
%macro MMX_StoreDiff4P 6 %macro MMX_StoreDiff4P 6
@ -142,11 +142,11 @@ WELS_EXTERN WelsDctT4_mmx
MMX_LoadDiff4x4P mm1, mm2, mm3, mm4, r1, r2, r3, r4, mm0, mm7 MMX_LoadDiff4x4P mm1, mm2, mm3, mm4, r1, r2, r3, r4, mm0, mm7
MMX_DCT mm1, mm2, mm3 ,mm4, mm5, mm6 MMX_DCT mm1, mm2, mm3 ,mm4, mm5, mm6
MMX_Trans4x4W mm3, mm1, mm4, mm5, mm2 MMX_Trans4x4W mm3, mm1, mm4, mm5, mm2
MMX_DCT mm3, mm5, mm2 ,mm4, mm1, mm6 MMX_DCT mm3, mm5, mm2 ,mm4, mm1, mm6
MMX_Trans4x4W mm2, mm3, mm4, mm1, mm5 MMX_Trans4x4W mm2, mm3, mm4, mm1, mm5
movq [r0+ 0], mm2 movq [r0+ 0], mm2
movq [r0+ 8], mm1 movq [r0+ 8], mm1
@ -170,22 +170,22 @@ WELS_EXTERN WelsIDctT4Rec_mmx
movq mm2, [r4+16] movq mm2, [r4+16]
movq mm3, [r4+24] movq mm3, [r4+24]
MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4 MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4
MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6 MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6
MMX_Trans4x4W mm1, mm3, mm0, mm4, mm2 MMX_Trans4x4W mm1, mm3, mm0, mm4, mm2
MMX_IDCT mm3, mm0, mm4, mm2, mm1, mm6 MMX_IDCT mm3, mm0, mm4, mm2, mm1, mm6
WELS_Zero mm7 WELS_Zero mm7
WELS_DW32 mm6 WELS_DW32 mm6
MMX_StoreDiff4P mm3, mm0, mm6, mm7, [r0], [r2] MMX_StoreDiff4P mm3, mm0, mm6, mm7, [r0], [r2]
MMX_StoreDiff4P mm4, mm0, mm6, mm7, [r0+r1], [r2+r3] MMX_StoreDiff4P mm4, mm0, mm6, mm7, [r0+r1], [r2+r3]
lea r0, [r0+2*r1] lea r0, [r0+2*r1]
lea r2, [r2+2*r3] lea r2, [r2+2*r3]
MMX_StoreDiff4P mm1, mm0, mm6, mm7, [r0], [r2] MMX_StoreDiff4P mm1, mm0, mm6, mm7, [r0], [r2]
MMX_StoreDiff4P mm2, mm0, mm6, mm7, [r0+r1], [r2+r3] MMX_StoreDiff4P mm2, mm0, mm6, mm7, [r0+r1], [r2+r3]
WELSEMMS WELSEMMS
LOAD_5_PARA_POP LOAD_5_PARA_POP
ret ret
@ -194,21 +194,21 @@ WELS_EXTERN WelsIDctT4Rec_mmx
; SSE2 functions ; SSE2 functions
;*********************************************************************** ;***********************************************************************
%macro SSE2_Store4x8p 6 %macro SSE2_Store4x8p 6
SSE2_XSawp qdq, %2, %3, %6 SSE2_XSawp qdq, %2, %3, %6
SSE2_XSawp qdq, %4, %5, %3 SSE2_XSawp qdq, %4, %5, %3
MOVDQ [%1+0x00], %2 MOVDQ [%1+0x00], %2
MOVDQ [%1+0x10], %4 MOVDQ [%1+0x10], %4
MOVDQ [%1+0x20], %6 MOVDQ [%1+0x20], %6
MOVDQ [%1+0x30], %3 MOVDQ [%1+0x30], %3
%endmacro %endmacro
%macro SSE2_Load4x8p 6 %macro SSE2_Load4x8p 6
MOVDQ %2, [%1+0x00] MOVDQ %2, [%1+0x00]
MOVDQ %4, [%1+0x10] MOVDQ %4, [%1+0x10]
MOVDQ %6, [%1+0x20] MOVDQ %6, [%1+0x20]
MOVDQ %3, [%1+0x30] MOVDQ %3, [%1+0x30]
SSE2_XSawp qdq, %4, %3, %5 SSE2_XSawp qdq, %4, %3, %5
SSE2_XSawp qdq, %2, %6, %3 SSE2_XSawp qdq, %2, %6, %3
%endmacro %endmacro
%macro SSE2_SumSubMul2 3 %macro SSE2_SumSubMul2 3
@ -231,57 +231,57 @@ WELS_EXTERN WelsIDctT4Rec_mmx
%macro SSE2_StoreDiff8p 6 %macro SSE2_StoreDiff8p 6
paddw %1, %3 paddw %1, %3
psraw %1, $06 psraw %1, $06
movq %2, %6 movq %2, %6
punpcklbw %2, %4 punpcklbw %2, %4
paddsw %2, %1 paddsw %2, %1
packuswb %2, %2 packuswb %2, %2
movq %5, %2 movq %5, %2
%endmacro %endmacro
%macro SSE2_StoreDiff8p 5 %macro SSE2_StoreDiff8p 5
movq %2, %5 movq %2, %5
punpcklbw %2, %3 punpcklbw %2, %3
paddsw %2, %1 paddsw %2, %1
packuswb %2, %2 packuswb %2, %2
movq %4, %2 movq %4, %2
%endmacro %endmacro
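Both SSE2_StoreDiff8p variants finish the reconstruction: round the inverse-transform residual, add the prediction and clip to a byte (packuswb). A scalar view of one sample; the 5-argument form assumes the (x + 32) >> 6 rounding was already applied:

    static unsigned char store_diff(short resid, unsigned char pred) {
        int v = ((resid + 32) >> 6) + pred;     /* paddw DW32, psraw 6, paddsw pred */
        return (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
    }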
%macro SSE2_Load8DC 6 %macro SSE2_Load8DC 6
movdqa %1, %6 ; %1 = dc0 dc1 movdqa %1, %6 ; %1 = dc0 dc1
paddw %1, %5 paddw %1, %5
psraw %1, $06 ; (dc + 32) >> 6 psraw %1, $06 ; (dc + 32) >> 6
movdqa %2, %1 movdqa %2, %1
psrldq %2, 4 psrldq %2, 4
punpcklwd %2, %2 punpcklwd %2, %2
punpckldq %2, %2 ; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3 punpckldq %2, %2 ; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
movdqa %3, %1 movdqa %3, %1
psrldq %3, 8 psrldq %3, 8
punpcklwd %3, %3 punpcklwd %3, %3
punpckldq %3, %3 ; %3 = dc4 dc4 dc4 dc4 dc5 dc5 dc5 dc5 punpckldq %3, %3 ; %3 = dc4 dc4 dc4 dc4 dc5 dc5 dc5 dc5
movdqa %4, %1 movdqa %4, %1
psrldq %4, 12 psrldq %4, 12
punpcklwd %4, %4 punpcklwd %4, %4
punpckldq %4, %4 ; %4 = dc6 dc6 dc6 dc6 dc7 dc7 dc7 dc7 punpckldq %4, %4 ; %4 = dc6 dc6 dc6 dc6 dc7 dc7 dc7 dc7
punpcklwd %1, %1 punpcklwd %1, %1
punpckldq %1, %1 ; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1 punpckldq %1, %1 ; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
%endmacro %endmacro
%macro SSE2_DCT 6 %macro SSE2_DCT 6
SSE2_SumSub %6, %3, %5 SSE2_SumSub %6, %3, %5
SSE2_SumSub %1, %2, %5 SSE2_SumSub %1, %2, %5
SSE2_SumSub %3, %2, %5 SSE2_SumSub %3, %2, %5
SSE2_SumSubMul2 %6, %1, %4 SSE2_SumSubMul2 %6, %1, %4
%endmacro %endmacro
%macro SSE2_IDCT 7 %macro SSE2_IDCT 7
SSE2_SumSub %7, %2, %6 SSE2_SumSub %7, %2, %6
SSE2_SumSubDiv2 %1, %3, %5, %4 SSE2_SumSubDiv2 %1, %3, %5, %4
SSE2_SumSub %2, %1, %5 SSE2_SumSub %2, %1, %5
SSE2_SumSub %7, %4, %5 SSE2_SumSub %7, %4, %5
%endmacro %endmacro
;*********************************************************************** ;***********************************************************************
@ -294,42 +294,42 @@ WELS_EXTERN WelsDctFourT4_sse2
SIGN_EXTENSION r2, r2d SIGN_EXTENSION r2, r2d
SIGN_EXTENSION r4, r4d SIGN_EXTENSION r4, r4d
pxor xmm7, xmm7 pxor xmm7, xmm7
;Load 4x8 ;Load 4x8
SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1], [r3] SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1], [r3]
SSE2_LoadDiff8P xmm1, xmm6, xmm7, [r1+r2], [r3+r4] SSE2_LoadDiff8P xmm1, xmm6, xmm7, [r1+r2], [r3+r4]
lea r1, [r1 + 2 * r2] lea r1, [r1 + 2 * r2]
lea r3, [r3 + 2 * r4] lea r3, [r3 + 2 * r4]
SSE2_LoadDiff8P xmm2, xmm6, xmm7, [r1], [r3]
SSE2_LoadDiff8P xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1
SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0
SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
lea r1, [r1 + 2 * r2]
lea r3, [r3 + 2 * r4]
;Load 4x8
SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1 ], [r3 ]
SSE2_LoadDiff8P xmm1, xmm6, xmm7, [r1+r2 ], [r3+r4]
lea r1, [r1 + 2 * r2]
lea r3, [r3 + 2 * r4]
SSE2_LoadDiff8P xmm2, xmm6, xmm7, [r1], [r3] SSE2_LoadDiff8P xmm2, xmm6, xmm7, [r1], [r3]
SSE2_LoadDiff8P xmm3, xmm6, xmm7, [r1+r2], [r3+r4] SSE2_LoadDiff8P xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0 SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1 SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1
SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2 SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0 SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0
lea r0, [r0+64] SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
POP_XMM lea r1, [r1 + 2 * r2]
LOAD_5_PARA_POP lea r3, [r3 + 2 * r4]
;Load 4x8
SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1 ], [r3 ]
SSE2_LoadDiff8P xmm1, xmm6, xmm7, [r1+r2 ], [r3+r4]
lea r1, [r1 + 2 * r2]
lea r3, [r3 + 2 * r4]
SSE2_LoadDiff8P xmm2, xmm6, xmm7, [r1], [r3]
SSE2_LoadDiff8P xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1
SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0
lea r0, [r0+64]
SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
POP_XMM
LOAD_5_PARA_POP
ret ret
@ -337,168 +337,168 @@ WELS_EXTERN WelsDctFourT4_sse2
; void WelsIDctFourT4Rec_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs); ; void WelsIDctFourT4Rec_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs);
;*********************************************************************** ;***********************************************************************
WELS_EXTERN WelsIDctFourT4Rec_sse2 WELS_EXTERN WelsIDctFourT4Rec_sse2
%assign push_num 0 %assign push_num 0
LOAD_5_PARA LOAD_5_PARA
PUSH_XMM 8 PUSH_XMM 8
SIGN_EXTENSION r1, r1d SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d SIGN_EXTENSION r3, r3d
;Load 4x8 ;Load 4x8
SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5 SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5
SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3 SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0 SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
SSE2_TransTwo4x4W xmm1, xmm4, xmm0, xmm2, xmm3
SSE2_IDCT xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
WELS_Zero xmm7
WELS_DW32 xmm6
SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [r0 ], [r2]
SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2]
SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
add r4, 64
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5
SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
SSE2_TransTwo4x4W xmm1, xmm4, xmm0, xmm2, xmm3 SSE2_TransTwo4x4W xmm1, xmm4, xmm0, xmm2, xmm3
SSE2_IDCT xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1 SSE2_IDCT xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
WELS_Zero xmm7 WELS_Zero xmm7
WELS_DW32 xmm6 WELS_DW32 xmm6
SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [r0 ], [r2] SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [r0 ], [r2]
SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3] SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
lea r0, [r0 + 2 * r1] lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3] lea r2, [r2 + 2 * r3]
SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2] SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2]
SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1], [r2 + r3] SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
POP_XMM
LOAD_5_PARA_POP add r4, 64
; pop esi lea r0, [r0 + 2 * r1]
; pop ebx lea r2, [r2 + 2 * r3]
SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5
SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
SSE2_TransTwo4x4W xmm1, xmm4, xmm0, xmm2, xmm3
SSE2_IDCT xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
WELS_Zero xmm7
WELS_DW32 xmm6
SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [r0 ], [r2]
SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2]
SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1], [r2 + r3]
POP_XMM
LOAD_5_PARA_POP
; pop esi
; pop ebx
ret ret
%macro SSE2_StoreDiff4x8p 8 %macro SSE2_StoreDiff4x8p 8
SSE2_StoreDiff8p %1, %3, %4, [%5], [%6] SSE2_StoreDiff8p %1, %3, %4, [%5], [%6]
SSE2_StoreDiff8p %1, %3, %4, [%5 + %7], [%6 + %8] SSE2_StoreDiff8p %1, %3, %4, [%5 + %7], [%6 + %8]
SSE2_StoreDiff8p %2, %3, %4, [%5 + 8], [%6 + 8] SSE2_StoreDiff8p %2, %3, %4, [%5 + 8], [%6 + 8]
SSE2_StoreDiff8p %2, %3, %4, [%5 + %7 + 8], [%6 + %8 + 8] SSE2_StoreDiff8p %2, %3, %4, [%5 + %7 + 8], [%6 + %8 + 8]
%endmacro %endmacro
;*********************************************************************** ;***********************************************************************
; void WelsIDctRecI16x16Dc_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *dct_dc) ; void WelsIDctRecI16x16Dc_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *dct_dc)
;*********************************************************************** ;***********************************************************************
WELS_EXTERN WelsIDctRecI16x16Dc_sse2 WELS_EXTERN WelsIDctRecI16x16Dc_sse2
%assign push_num 0 %assign push_num 0
LOAD_5_PARA LOAD_5_PARA
PUSH_XMM 8 PUSH_XMM 8
SIGN_EXTENSION r1, r1d SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d SIGN_EXTENSION r3, r3d
pxor xmm7, xmm7 pxor xmm7, xmm7
WELS_DW32 xmm6 WELS_DW32 xmm6
SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [r4] SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [r4]
SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3 SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
lea r0, [r0 + 2 * r1] lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3] lea r2, [r2 + 2 * r3]
SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3 SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
lea r0, [r0 + 2 * r1] lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3] lea r2, [r2 + 2 * r3]
SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3 SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
lea r0, [r0 + 2 * r1] lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3] lea r2, [r2 + 2 * r3]
SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3 SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [r4 + 16] SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [r4 + 16]
lea r0, [r0 + 2 * r1] lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3] lea r2, [r2 + 2 * r3]
SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3 SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
lea r0, [r0 + 2 * r1] lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3] lea r2, [r2 + 2 * r3]
SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3 SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
lea r0, [r0 + 2 * r1] lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3] lea r2, [r2 + 2 * r3]
SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3 SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
lea r0, [r0 + 2 * r1] lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3] lea r2, [r2 + 2 * r3]
SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3 SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
POP_XMM POP_XMM
LOAD_5_PARA_POP LOAD_5_PARA_POP
ret ret
%macro SSE2_SumSubD 3 %macro SSE2_SumSubD 3
movdqa %3, %2 movdqa %3, %2
paddd %2, %1 paddd %2, %1
psubd %1, %3 psubd %1, %3
%endmacro %endmacro
%macro SSE2_SumSubDiv2D 4 %macro SSE2_SumSubDiv2D 4
paddd %1, %2 paddd %1, %2
paddd %1, %3 paddd %1, %3
psrad %1, 1 psrad %1, 1
movdqa %4, %1 movdqa %4, %1
psubd %4, %2 psubd %4, %2
%endmacro %endmacro
%macro SSE2_Load4Col 5 %macro SSE2_Load4Col 5
movsx r2, WORD[%5] movsx r2, WORD[%5]
movd %1, r2d movd %1, r2d
movsx r2, WORD[%5 + 0x20] movsx r2, WORD[%5 + 0x20]
movd %2, r2d movd %2, r2d
punpckldq %1, %2 punpckldq %1, %2
movsx r2, WORD[%5 + 0x80] movsx r2, WORD[%5 + 0x80]
movd %3, r2d movd %3, r2d
movsx r2, WORD[%5 + 0xa0] movsx r2, WORD[%5 + 0xa0]
movd %4, r2d movd %4, r2d
punpckldq %3, %4 punpckldq %3, %4
punpcklqdq %1, %3 punpcklqdq %1, %3
%endmacro %endmacro
;*********************************************************************** ;***********************************************************************
;void WelsHadamardT4Dc_sse2( int16_t *luma_dc, int16_t *pDct) ;void WelsHadamardT4Dc_sse2( int16_t *luma_dc, int16_t *pDct)
;*********************************************************************** ;***********************************************************************
WELS_EXTERN WelsHadamardT4Dc_sse2 WELS_EXTERN WelsHadamardT4Dc_sse2
%assign push_num 0 %assign push_num 0
LOAD_2_PARA LOAD_2_PARA
PUSH_XMM 8 PUSH_XMM 8
SSE2_Load4Col xmm1, xmm5, xmm6, xmm0, r1 SSE2_Load4Col xmm1, xmm5, xmm6, xmm0, r1
SSE2_Load4Col xmm2, xmm5, xmm6, xmm0, r1 + 0x40 SSE2_Load4Col xmm2, xmm5, xmm6, xmm0, r1 + 0x40
SSE2_Load4Col xmm3, xmm5, xmm6, xmm0, r1 + 0x100 SSE2_Load4Col xmm3, xmm5, xmm6, xmm0, r1 + 0x100
SSE2_Load4Col xmm4, xmm5, xmm6, xmm0, r1 + 0x140 SSE2_Load4Col xmm4, xmm5, xmm6, xmm0, r1 + 0x140
SSE2_SumSubD xmm1, xmm2, xmm7 SSE2_SumSubD xmm1, xmm2, xmm7
SSE2_SumSubD xmm3, xmm4, xmm7 SSE2_SumSubD xmm3, xmm4, xmm7
SSE2_SumSubD xmm2, xmm4, xmm7 SSE2_SumSubD xmm2, xmm4, xmm7
SSE2_SumSubD xmm1, xmm3, xmm7 SSE2_SumSubD xmm1, xmm3, xmm7
SSE2_Trans4x4D xmm4, xmm2, xmm1, xmm3, xmm5 ; pOut: xmm4,xmm3,xmm5,xmm1 SSE2_Trans4x4D xmm4, xmm2, xmm1, xmm3, xmm5 ; pOut: xmm4,xmm3,xmm5,xmm1
SSE2_SumSubD xmm4, xmm3, xmm7 SSE2_SumSubD xmm4, xmm3, xmm7
SSE2_SumSubD xmm5, xmm1, xmm7 SSE2_SumSubD xmm5, xmm1, xmm7
WELS_DD1 xmm6 WELS_DD1 xmm6
SSE2_SumSubDiv2D xmm3, xmm1, xmm6, xmm0 ; pOut: xmm3 = (xmm3+xmm1+1)/2, xmm0 = (xmm3-xmm1+1)/2 SSE2_SumSubDiv2D xmm3, xmm1, xmm6, xmm0 ; pOut: xmm3 = (xmm3+xmm1+1)/2, xmm0 = (xmm3-xmm1+1)/2
SSE2_SumSubDiv2D xmm4, xmm5, xmm6, xmm1 ; pOut: xmm4 = (xmm4+xmm5+1)/2, xmm1 = (xmm4-xmm5+1)/2 SSE2_SumSubDiv2D xmm4, xmm5, xmm6, xmm1 ; pOut: xmm4 = (xmm4+xmm5+1)/2, xmm1 = (xmm4-xmm5+1)/2
SSE2_Trans4x4D xmm3, xmm0, xmm1, xmm4, xmm2 ; pOut: xmm3,xmm4,xmm2,xmm1 SSE2_Trans4x4D xmm3, xmm0, xmm1, xmm4, xmm2 ; pOut: xmm3,xmm4,xmm2,xmm1
packssdw xmm3, xmm4 packssdw xmm3, xmm4
packssdw xmm2, xmm1 packssdw xmm2, xmm1
movdqa [r0+ 0], xmm3 movdqa [r0+ 0], xmm3
movdqa [r0+16], xmm2 movdqa [r0+16], xmm2
POP_XMM POP_XMM
ret ret

File diff suppressed because it is too large

View File

@ -34,362 +34,362 @@
;in: m0, m1, m2, m3, m4, m5, m6, m7 ;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m0, m3, m5, m2, m7, m1, m6, m4 ;out: m0, m3, m5, m2, m7, m1, m6, m4
%macro TRANSPOSE_8x8B_MMX 10 %macro TRANSPOSE_8x8B_MMX 10
MMX_XSwap bw, %1, %2, %8 MMX_XSwap bw, %1, %2, %8
MMX_XSwap bw, %3, %4, %2 MMX_XSwap bw, %3, %4, %2
MMX_XSwap bw, %5, %6, %4 MMX_XSwap bw, %5, %6, %4
movq %6, %9 movq %6, %9
movq %10, %4 movq %10, %4
MMX_XSwap bw, %7, %6, %4 MMX_XSwap bw, %7, %6, %4
MMX_XSwap wd, %1, %3, %6 MMX_XSwap wd, %1, %3, %6
MMX_XSwap wd, %8, %2, %3 MMX_XSwap wd, %8, %2, %3
MMX_XSwap wd, %5, %7, %2 MMX_XSwap wd, %5, %7, %2
movq %7, %10 movq %7, %10
movq %10, %3 movq %10, %3
MMX_XSwap wd, %7, %4, %3 MMX_XSwap wd, %7, %4, %3
MMX_XSwap dq, %1, %5, %4 MMX_XSwap dq, %1, %5, %4
MMX_XSwap dq, %6, %2, %5 MMX_XSwap dq, %6, %2, %5
MMX_XSwap dq, %8, %7, %2 MMX_XSwap dq, %8, %7, %2
movq %7, %10 movq %7, %10
movq %10, %5 movq %10, %5
MMX_XSwap dq, %7, %3, %5 MMX_XSwap dq, %7, %3, %5
movq %3, %10 movq %3, %10
%endmacro %endmacro
;in: m0, m3, m5, m2, m7, m1, m6, m4 ;in: m0, m3, m5, m2, m7, m1, m6, m4
%macro TRANSPOSE8x8_WRITE_MMX 2 ; dst, dst_stride %macro TRANSPOSE8x8_WRITE_MMX 2 ; dst, dst_stride
movq [%1], mm0 ; result of line 1, x8 bytes movq [%1], mm0 ; result of line 1, x8 bytes
movq [%1+%2], mm3 ; result of line 2 movq [%1+%2], mm3 ; result of line 2
lea %1, [%1+2*%2] lea %1, [%1+2*%2]
movq [%1], mm5 ; result of line 3 movq [%1], mm5 ; result of line 3
movq [%1+%2], mm2 ; result of line 4 movq [%1+%2], mm2 ; result of line 4
lea %1, [%1+2*%2] lea %1, [%1+2*%2]
movq [%1], mm7 ; result of line 5 movq [%1], mm7 ; result of line 5
movq [%1+%2], mm1 ; result of line 6 movq [%1+%2], mm1 ; result of line 6
lea %1, [%1+2*%2] lea %1, [%1+2*%2]
movq [%1], mm6 ; result of line 7 movq [%1], mm6 ; result of line 7
movq [%1+%2], mm4 ; result of line 8 movq [%1+%2], mm4 ; result of line 8
%endmacro %endmacro
;in: m0, m3, m5, m2, m7, m1, m6, m4 ;in: m0, m3, m5, m2, m7, m1, m6, m4
%macro TRANSPOSE8x8_WRITE_ALT_MMX 3 ; dst, dst_stride, reg32 %macro TRANSPOSE8x8_WRITE_ALT_MMX 3 ; dst, dst_stride, reg32
movq [%1], mm0 ; result of line 1, x8 bytes movq [%1], mm0 ; result of line 1, x8 bytes
movq [%1+%2], mm3 ; result of line 2 movq [%1+%2], mm3 ; result of line 2
lea %3, [%1+2*%2] lea %3, [%1+2*%2]
movq [%3], mm5 ; result of line 3 movq [%3], mm5 ; result of line 3
movq [%3+%2], mm2 ; result of line 4 movq [%3+%2], mm2 ; result of line 4
lea %3, [%3+2*%2] lea %3, [%3+2*%2]
movq [%3], mm7 ; result of line 5 movq [%3], mm7 ; result of line 5
movq [%3+%2], mm1 ; result of line 6 movq [%3+%2], mm1 ; result of line 6
lea %3, [%3+2*%2] lea %3, [%3+2*%2]
movq [%3], mm6 ; result of line 7 movq [%3], mm6 ; result of line 7
movq [%3+%2], mm4 ; result of line 8 movq [%3+%2], mm4 ; result of line 8
%endmacro ; end of TRANSPOSE8x8_WRITE_ALT_MMX %endmacro ; end of TRANSPOSE8x8_WRITE_ALT_MMX
; for transpose 16x8 ; for transpose 16x8
;in: m0, m1, m2, m3, m4, m5, m6, m7 ;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m4, m2, m3, m7, m5, m1, m6, m0 ;out: m4, m2, m3, m7, m5, m1, m6, m0
%macro TRANSPOSE_8x16B_SSE2 10 %macro TRANSPOSE_8x16B_SSE2 10
SSE2_XSawp bw, %1, %2, %8 SSE2_XSawp bw, %1, %2, %8
SSE2_XSawp bw, %3, %4, %2 SSE2_XSawp bw, %3, %4, %2
SSE2_XSawp bw, %5, %6, %4 SSE2_XSawp bw, %5, %6, %4
movdqa %6, %9 movdqa %6, %9
movdqa %10, %4 movdqa %10, %4
SSE2_XSawp bw, %7, %6, %4 SSE2_XSawp bw, %7, %6, %4
SSE2_XSawp wd, %1, %3, %6 SSE2_XSawp wd, %1, %3, %6
SSE2_XSawp wd, %8, %2, %3 SSE2_XSawp wd, %8, %2, %3
SSE2_XSawp wd, %5, %7, %2 SSE2_XSawp wd, %5, %7, %2
movdqa %7, %10 movdqa %7, %10
movdqa %10, %3 movdqa %10, %3
SSE2_XSawp wd, %7, %4, %3 SSE2_XSawp wd, %7, %4, %3
SSE2_XSawp dq, %1, %5, %4 SSE2_XSawp dq, %1, %5, %4
SSE2_XSawp dq, %6, %2, %5 SSE2_XSawp dq, %6, %2, %5
SSE2_XSawp dq, %8, %7, %2 SSE2_XSawp dq, %8, %7, %2
movdqa %7, %10 movdqa %7, %10
movdqa %10, %5 movdqa %10, %5
SSE2_XSawp dq, %7, %3, %5 SSE2_XSawp dq, %7, %3, %5
SSE2_XSawp qdq, %1, %8, %3 SSE2_XSawp qdq, %1, %8, %3
SSE2_XSawp qdq, %4, %2, %8 SSE2_XSawp qdq, %4, %2, %8
SSE2_XSawp qdq, %6, %7, %2 SSE2_XSawp qdq, %6, %7, %2
movdqa %7, %10 movdqa %7, %10
movdqa %10, %1 movdqa %10, %1
SSE2_XSawp qdq, %7, %5, %1 SSE2_XSawp qdq, %7, %5, %1
movdqa %5, %10 movdqa %5, %10
%endmacro ; end of TRANSPOSE_8x16B_SSE2 %endmacro ; end of TRANSPOSE_8x16B_SSE2
%macro TRANSPOSE8x16_WRITE_SSE2 2 ; dst, dst_stride %macro TRANSPOSE8x16_WRITE_SSE2 2 ; dst, dst_stride
movq [%1], xmm4 ; result of line 1, x8 bytes movq [%1], xmm4 ; result of line 1, x8 bytes
movq [%1+%2], xmm2 ; result of line 2 movq [%1+%2], xmm2 ; result of line 2
lea %1, [%1+2*%2] lea %1, [%1+2*%2]
movq [%1], xmm3 ; result of line 3 movq [%1], xmm3 ; result of line 3
movq [%1+%2], xmm7 ; result of line 4 movq [%1+%2], xmm7 ; result of line 4
lea %1, [%1+2*%2] lea %1, [%1+2*%2]
movq [%1], xmm5 ; result of line 5 movq [%1], xmm5 ; result of line 5
movq [%1+%2], xmm1 ; result of line 6 movq [%1+%2], xmm1 ; result of line 6
lea %1, [%1+2*%2] lea %1, [%1+2*%2]
movq [%1], xmm6 ; result of line 7 movq [%1], xmm6 ; result of line 7
movq [%1+%2], xmm0 ; result of line 8 movq [%1+%2], xmm0 ; result of line 8
lea %1, [%1+2*%2] lea %1, [%1+2*%2]
movhpd [%1], xmm4 ; result of line 9 movhpd [%1], xmm4 ; result of line 9
movhpd [%1+%2], xmm2 ; result of line 10 movhpd [%1+%2], xmm2 ; result of line 10
lea %1, [%1+2*%2] lea %1, [%1+2*%2]
movhpd [%1], xmm3 ; result of line 11 movhpd [%1], xmm3 ; result of line 11
movhpd [%1+%2], xmm7 ; result of line 12 movhpd [%1+%2], xmm7 ; result of line 12
lea %1, [%1+2*%2] lea %1, [%1+2*%2]
movhpd [%1], xmm5 ; result of line 13 movhpd [%1], xmm5 ; result of line 13
movhpd [%1+%2], xmm1 ; result of line 14 movhpd [%1+%2], xmm1 ; result of line 14
lea %1, [%1+2*%2] lea %1, [%1+2*%2]
movhpd [%1], xmm6 ; result of line 15 movhpd [%1], xmm6 ; result of line 15
movhpd [%1+%2], xmm0 ; result of line 16 movhpd [%1+%2], xmm0 ; result of line 16
%endmacro ; end of TRANSPOSE_WRITE_RESULT_SSE2 %endmacro ; end of TRANSPOSE_WRITE_RESULT_SSE2
%macro TRANSPOSE8x16_WRITE_ALT_SSE2 3 ; dst, dst_stride, reg32 %macro TRANSPOSE8x16_WRITE_ALT_SSE2 3 ; dst, dst_stride, reg32
movq [%1], xmm4 ; result of line 1, x8 bytes movq [%1], xmm4 ; result of line 1, x8 bytes
movq [%1+%2], xmm2 ; result of line 2 movq [%1+%2], xmm2 ; result of line 2
lea %3, [%1+2*%2] lea %3, [%1+2*%2]
movq [%3], xmm3 ; result of line 3 movq [%3], xmm3 ; result of line 3
movq [%3+%2], xmm7 ; result of line 4 movq [%3+%2], xmm7 ; result of line 4
lea %3, [%3+2*%2] lea %3, [%3+2*%2]
movq [%3], xmm5 ; result of line 5 movq [%3], xmm5 ; result of line 5
movq [%3+%2], xmm1 ; result of line 6 movq [%3+%2], xmm1 ; result of line 6
lea %3, [%3+2*%2] lea %3, [%3+2*%2]
movq [%3], xmm6 ; result of line 7 movq [%3], xmm6 ; result of line 7
movq [%3+%2], xmm0 ; result of line 8 movq [%3+%2], xmm0 ; result of line 8
lea %3, [%3+2*%2] lea %3, [%3+2*%2]
movhpd [%3], xmm4 ; result of line 9 movhpd [%3], xmm4 ; result of line 9
movhpd [%3+%2], xmm2 ; result of line 10 movhpd [%3+%2], xmm2 ; result of line 10
lea %3, [%3+2*%2] lea %3, [%3+2*%2]
movhpd [%3], xmm3 ; result of line 11 movhpd [%3], xmm3 ; result of line 11
movhpd [%3+%2], xmm7 ; result of line 12 movhpd [%3+%2], xmm7 ; result of line 12
lea %3, [%3+2*%2] lea %3, [%3+2*%2]
movhpd [%3], xmm5 ; result of line 13 movhpd [%3], xmm5 ; result of line 13
movhpd [%3+%2], xmm1 ; result of line 14 movhpd [%3+%2], xmm1 ; result of line 14
lea %3, [%3+2*%2] lea %3, [%3+2*%2]
movhpd [%3], xmm6 ; result of line 15 movhpd [%3], xmm6 ; result of line 15
movhpd [%3+%2], xmm0 ; result of line 16 movhpd [%3+%2], xmm0 ; result of line 16
%endmacro ; end of TRANSPOSE8x16_WRITE_ALT_SSE2 %endmacro ; end of TRANSPOSE8x16_WRITE_ALT_SSE2
SECTION .text SECTION .text
WELS_EXTERN TransposeMatrixBlock16x16_sse2 WELS_EXTERN TransposeMatrixBlock16x16_sse2
; void TransposeMatrixBlock16x16_sse2( void *dst/*16x16*/, const int32_t dst_stride, void *src/*16x16*/, const int32_t src_stride ); ; void TransposeMatrixBlock16x16_sse2( void *dst/*16x16*/, const int32_t dst_stride, void *src/*16x16*/, const int32_t src_stride );
push r4 push r4
push r5 push r5
%assign push_num 2 %assign push_num 2
LOAD_4_PARA LOAD_4_PARA
PUSH_XMM 8 PUSH_XMM 8
SIGN_EXTENSION r1, r1d SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d SIGN_EXTENSION r3, r3d
mov r4, r7 mov r4, r7
and r4, 0Fh and r4, 0Fh
sub r7, 10h sub r7, 10h
sub r7, r4 sub r7, r4
lea r5, [r3+r3*2] lea r5, [r3+r3*2]
; top 8x16 block ; top 8x16 block
movdqa xmm0, [r2] movdqa xmm0, [r2]
movdqa xmm1, [r2+r3] movdqa xmm1, [r2+r3]
movdqa xmm2, [r2+r3*2] movdqa xmm2, [r2+r3*2]
movdqa xmm3, [r2+r5] movdqa xmm3, [r2+r5]
lea r2, [r2+r3*4] lea r2, [r2+r3*4]
movdqa xmm4, [r2] movdqa xmm4, [r2]
movdqa xmm5, [r2+r3] movdqa xmm5, [r2+r3]
movdqa xmm6, [r2+r3*2] movdqa xmm6, [r2+r3*2]
;in: m0, m1, m2, m3, m4, m5, m6, m7 ;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m4, m2, m3, m7, m5, m1, m6, m0 ;out: m4, m2, m3, m7, m5, m1, m6, m0
TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7] TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]
TRANSPOSE8x16_WRITE_SSE2 r0, r1 TRANSPOSE8x16_WRITE_SSE2 r0, r1
; bottom 8x16 block ; bottom 8x16 block
lea r2, [r2+r3*4] lea r2, [r2+r3*4]
movdqa xmm0, [r2] movdqa xmm0, [r2]
movdqa xmm1, [r2+r3] movdqa xmm1, [r2+r3]
movdqa xmm2, [r2+r3*2] movdqa xmm2, [r2+r3*2]
movdqa xmm3, [r2+r5] movdqa xmm3, [r2+r5]
lea r2, [r2+r3*4] lea r2, [r2+r3*4]
movdqa xmm4, [r2] movdqa xmm4, [r2]
movdqa xmm5, [r2+r3] movdqa xmm5, [r2+r3]
movdqa xmm6, [r2+r3*2] movdqa xmm6, [r2+r3*2]
;in: m0, m1, m2, m3, m4, m5, m6, m7 ;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m4, m2, m3, m7, m5, m1, m6, m0 ;out: m4, m2, m3, m7, m5, m1, m6, m0
TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7] TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]
mov r5, r1 mov r5, r1
sal r5, 4 sal r5, 4
sub r0, r5 sub r0, r5
lea r0, [r0+r1*2+8] lea r0, [r0+r1*2+8]
TRANSPOSE8x16_WRITE_SSE2 r0, r1 TRANSPOSE8x16_WRITE_SSE2 r0, r1
add r7, r4 add r7, r4
add r7, 10h add r7, 10h
POP_XMM POP_XMM
LOAD_4_PARA_POP LOAD_4_PARA_POP
pop r5 pop r5
pop r4 pop r4
ret ret
WELS_EXTERN TransposeMatrixBlocksx16_sse2 WELS_EXTERN TransposeMatrixBlocksx16_sse2
; void TransposeMatrixBlocksx16_sse2( void *dst/*W16x16*/, const int32_t dst_stride, void *src/*16xW16*/, const int32_t src_stride, const int32_t num_blocks ); ; void TransposeMatrixBlocksx16_sse2( void *dst/*W16x16*/, const int32_t dst_stride, void *src/*16xW16*/, const int32_t src_stride, const int32_t num_blocks );
push r5 push r5
push r6 push r6
%assign push_num 2 %assign push_num 2
LOAD_5_PARA LOAD_5_PARA
PUSH_XMM 8 PUSH_XMM 8
SIGN_EXTENSION r1, r1d SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d SIGN_EXTENSION r4, r4d
mov r5, r7 mov r5, r7
and r5, 0Fh and r5, 0Fh
sub r7, 10h sub r7, 10h
sub r7, r5 sub r7, r5
TRANSPOSE_LOOP_SSE2: TRANSPOSE_LOOP_SSE2:
; explicitly loading next loop data ; explicitly loading next loop data
lea r6, [r2+r3*8] lea r6, [r2+r3*8]
push r4 push r4
%rep 8 %rep 8
mov r4, [r6] mov r4, [r6]
mov r4, [r6+r3] mov r4, [r6+r3]
lea r6, [r6+r3*2] lea r6, [r6+r3*2]
%endrep %endrep
pop r4 pop r4
; top 8x16 block ; top 8x16 block
movdqa xmm0, [r2] movdqa xmm0, [r2]
movdqa xmm1, [r2+r3] movdqa xmm1, [r2+r3]
lea r2, [r2+r3*2] lea r2, [r2+r3*2]
movdqa xmm2, [r2] movdqa xmm2, [r2]
movdqa xmm3, [r2+r3] movdqa xmm3, [r2+r3]
lea r2, [r2+r3*2] lea r2, [r2+r3*2]
movdqa xmm4, [r2] movdqa xmm4, [r2]
movdqa xmm5, [r2+r3] movdqa xmm5, [r2+r3]
lea r2, [r2+r3*2] lea r2, [r2+r3*2]
movdqa xmm6, [r2] movdqa xmm6, [r2]
;in: m0, m1, m2, m3, m4, m5, m6, m7 ;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m4, m2, m3, m7, m5, m1, m6, m0 ;out: m4, m2, m3, m7, m5, m1, m6, m0
TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7] TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
TRANSPOSE8x16_WRITE_ALT_SSE2 r0, r1, r6 TRANSPOSE8x16_WRITE_ALT_SSE2 r0, r1, r6
lea r2, [r2+r3*2] lea r2, [r2+r3*2]
; bottom 8x16 block ; bottom 8x16 block
movdqa xmm0, [r2] movdqa xmm0, [r2]
movdqa xmm1, [r2+r3] movdqa xmm1, [r2+r3]
lea r2, [r2+r3*2] lea r2, [r2+r3*2]
movdqa xmm2, [r2] movdqa xmm2, [r2]
movdqa xmm3, [r2+r3] movdqa xmm3, [r2+r3]
lea r2, [r2+r3*2] lea r2, [r2+r3*2]
movdqa xmm4, [r2] movdqa xmm4, [r2]
movdqa xmm5, [r2+r3] movdqa xmm5, [r2+r3]
lea r2, [r2+r3*2] lea r2, [r2+r3*2]
movdqa xmm6, [r2] movdqa xmm6, [r2]
;in: m0, m1, m2, m3, m4, m5, m6, m7 ;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m4, m2, m3, m7, m5, m1, m6, m0 ;out: m4, m2, m3, m7, m5, m1, m6, m0
TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7] TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
TRANSPOSE8x16_WRITE_ALT_SSE2 r0+8, r1, r6 TRANSPOSE8x16_WRITE_ALT_SSE2 r0+8, r1, r6
lea r2, [r2+r3*2] lea r2, [r2+r3*2]
lea r0, [r0+16] lea r0, [r0+16]
dec r4 dec r4
jg near TRANSPOSE_LOOP_SSE2 jg near TRANSPOSE_LOOP_SSE2
add r7, r5 add r7, r5
add r7, 10h add r7, 10h
POP_XMM POP_XMM
LOAD_5_PARA_POP LOAD_5_PARA_POP
pop r6 pop r6
pop r5 pop r5
ret ret
WELS_EXTERN TransposeMatrixBlock8x8_mmx WELS_EXTERN TransposeMatrixBlock8x8_mmx
; void TransposeMatrixBlock8x8_mmx( void *dst/*8x8*/, const int32_t dst_stride, void *src/*8x8*/, const int32_t src_stride ); ; void TransposeMatrixBlock8x8_mmx( void *dst/*8x8*/, const int32_t dst_stride, void *src/*8x8*/, const int32_t src_stride );
%assign push_num 0 %assign push_num 0
LOAD_4_PARA LOAD_4_PARA
SIGN_EXTENSION r1, r1d SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d SIGN_EXTENSION r3, r3d
sub r7, 8 sub r7, 8
movq mm0, [r2] movq mm0, [r2]
movq mm1, [r2+r3] movq mm1, [r2+r3]
lea r2, [r2+2*r3] lea r2, [r2+2*r3]
movq mm2, [r2] movq mm2, [r2]
movq mm3, [r2+r3] movq mm3, [r2+r3]
lea r2, [r2+2*r3] lea r2, [r2+2*r3]
movq mm4, [r2] movq mm4, [r2]
movq mm5, [r2+r3] movq mm5, [r2+r3]
lea r2, [r2+2*r3] lea r2, [r2+2*r3]
movq mm6, [r2] movq mm6, [r2]
;in: m0, m1, m2, m3, m4, m5, m6, m7 ;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m0, m3, m5, m2, m7, m1, m6, m4 ;out: m0, m3, m5, m2, m7, m1, m6, m4
TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7] TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
TRANSPOSE8x8_WRITE_MMX r0, r1 TRANSPOSE8x8_WRITE_MMX r0, r1
emms emms
add r7, 8 add r7, 8
LOAD_4_PARA_POP LOAD_4_PARA_POP
ret ret
WELS_EXTERN TransposeMatrixBlocksx8_mmx WELS_EXTERN TransposeMatrixBlocksx8_mmx
; void TransposeMatrixBlocksx8_mmx( void *dst/*8xW8*/, const int32_t dst_stride, void *src/*W8x8*/, const int32_t src_stride, const int32_t num_blocks ); ; void TransposeMatrixBlocksx8_mmx( void *dst/*8xW8*/, const int32_t dst_stride, void *src/*W8x8*/, const int32_t src_stride, const int32_t num_blocks );
push r5 push r5
push r6 push r6
%assign push_num 2 %assign push_num 2
LOAD_5_PARA LOAD_5_PARA
SIGN_EXTENSION r1, r1d SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d SIGN_EXTENSION r4, r4d
sub r7, 8 sub r7, 8
lea r5, [r2+r3*8] lea r5, [r2+r3*8]
TRANSPOSE_BLOCKS_X8_LOOP_MMX: TRANSPOSE_BLOCKS_X8_LOOP_MMX:
; explicitly loading next loop data ; explicitly loading next loop data
%rep 4 %rep 4
mov r6, [r5] mov r6, [r5]
mov r6, [r5+r3] mov r6, [r5+r3]
lea r5, [r5+r3*2] lea r5, [r5+r3*2]
%endrep %endrep
movq mm0, [r2] movq mm0, [r2]
movq mm1, [r2+r3] movq mm1, [r2+r3]
lea r2, [r2+2*r3] lea r2, [r2+2*r3]
movq mm2, [r2] movq mm2, [r2]
movq mm3, [r2+r3] movq mm3, [r2+r3]
lea r2, [r2+2*r3] lea r2, [r2+2*r3]
movq mm4, [r2] movq mm4, [r2]
movq mm5, [r2+r3] movq mm5, [r2+r3]
lea r2, [r2+2*r3] lea r2, [r2+2*r3]
movq mm6, [r2] movq mm6, [r2]
;in: m0, m1, m2, m3, m4, m5, m6, m7 ;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m0, m3, m5, m2, m7, m1, m6, m4 ;out: m0, m3, m5, m2, m7, m1, m6, m4
TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7] TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
TRANSPOSE8x8_WRITE_ALT_MMX r0, r1, r6 TRANSPOSE8x8_WRITE_ALT_MMX r0, r1, r6
lea r0, [r0+8] lea r0, [r0+8]
lea r2, [r2+2*r3] lea r2, [r2+2*r3]
dec r4 dec r4
jg near TRANSPOSE_BLOCKS_X8_LOOP_MMX jg near TRANSPOSE_BLOCKS_X8_LOOP_MMX
emms emms
add r7, 8 add r7, 8
LOAD_5_PARA_POP LOAD_5_PARA_POP
pop r6 pop r6
pop r5 pop r5
ret ret

View File

@ -51,10 +51,10 @@ SECTION .text
;void WelsPrefetchZero_mmx(int8_t const*_A); ;void WelsPrefetchZero_mmx(int8_t const*_A);
;*********************************************************************** ;***********************************************************************
WELS_EXTERN WelsPrefetchZero_mmx WELS_EXTERN WelsPrefetchZero_mmx
%assign push_num 0 %assign push_num 0
LOAD_1_PARA LOAD_1_PARA
prefetchnta [r0] prefetchnta [r0]
ret ret
;*********************************************************************** ;***********************************************************************
@ -62,71 +62,71 @@ WELS_EXTERN WelsPrefetchZero_mmx
;*********************************************************************** ;***********************************************************************
WELS_EXTERN WelsSetMemZeroAligned64_sse2 WELS_EXTERN WelsSetMemZeroAligned64_sse2
%assign push_num 0 %assign push_num 0
LOAD_2_PARA LOAD_2_PARA
SIGN_EXTENSION r1, r1d SIGN_EXTENSION r1, r1d
neg r1 neg r1
pxor xmm0, xmm0 pxor xmm0, xmm0
.memzeroa64_sse2_loops: .memzeroa64_sse2_loops:
movdqa [r0], xmm0 movdqa [r0], xmm0
movdqa [r0+16], xmm0 movdqa [r0+16], xmm0
movdqa [r0+32], xmm0 movdqa [r0+32], xmm0
movdqa [r0+48], xmm0 movdqa [r0+48], xmm0
add r0, 0x40 add r0, 0x40
add r1, 0x40 add r1, 0x40
jnz near .memzeroa64_sse2_loops jnz near .memzeroa64_sse2_loops
ret ret
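A rough C equivalent of WelsSetMemZeroAligned64_sse2 above: dst is assumed to be 16-byte aligned and size a positive multiple of 64, since the loop negates size and counts back up to zero in 64-byte steps.

    static void SetMemZeroAligned64_ref(void *dst, int size) {
        unsigned char *p = (unsigned char *)dst;
        for (int i = 0; i < size; i += 64)      /* one iteration per 64-byte chunk */
            for (int j = 0; j < 64; ++j)
                p[i + j] = 0;                   /* the SSE2 loop uses four movdqa stores */
    }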
;*********************************************************************** ;***********************************************************************
; void WelsSetMemZeroSize64_mmx(void *dst, int32_t size) ; void WelsSetMemZeroSize64_mmx(void *dst, int32_t size)
;*********************************************************************** ;***********************************************************************
WELS_EXTERN WelsSetMemZeroSize64_mmx WELS_EXTERN WelsSetMemZeroSize64_mmx
%assign push_num 0 %assign push_num 0
LOAD_2_PARA LOAD_2_PARA
SIGN_EXTENSION r1, r1d SIGN_EXTENSION r1, r1d
neg r1 neg r1
pxor mm0, mm0 pxor mm0, mm0
.memzero64_mmx_loops: .memzero64_mmx_loops:
movq [r0], mm0 movq [r0], mm0
movq [r0+8], mm0 movq [r0+8], mm0
movq [r0+16], mm0 movq [r0+16], mm0
movq [r0+24], mm0 movq [r0+24], mm0
movq [r0+32], mm0 movq [r0+32], mm0
movq [r0+40], mm0 movq [r0+40], mm0
movq [r0+48], mm0 movq [r0+48], mm0
movq [r0+56], mm0 movq [r0+56], mm0
add r0, 0x40 add r0, 0x40
add r1, 0x40 add r1, 0x40
jnz near .memzero64_mmx_loops jnz near .memzero64_mmx_loops
WELSEMMS WELSEMMS
ret ret
;*********************************************************************** ;***********************************************************************
; void WelsSetMemZeroSize8_mmx(void *dst, int32_t size) ; void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
;*********************************************************************** ;***********************************************************************
WELS_EXTERN WelsSetMemZeroSize8_mmx WELS_EXTERN WelsSetMemZeroSize8_mmx
%assign push_num 0 %assign push_num 0
LOAD_2_PARA LOAD_2_PARA
SIGN_EXTENSION r1, r1d SIGN_EXTENSION r1, r1d
neg r1 neg r1
pxor mm0, mm0 pxor mm0, mm0
.memzero8_mmx_loops: .memzero8_mmx_loops:
movq [r0], mm0 movq [r0], mm0
add r0, 0x08 add r0, 0x08
add r1, 0x08 add r1, 0x08
jnz near .memzero8_mmx_loops jnz near .memzero8_mmx_loops
WELSEMMS WELSEMMS
ret ret

View File

@ -49,241 +49,241 @@ SECTION .text
;************************************************ ;************************************************
%macro SSE2_Quant8 5 %macro SSE2_Quant8 5
MOVDQ %1, %5 MOVDQ %1, %5
pxor %2, %2 pxor %2, %2
pcmpgtw %2, %1 pcmpgtw %2, %1
pxor %1, %2 pxor %1, %2
psubw %1, %2 psubw %1, %2
paddusw %1, %3 paddusw %1, %3
pmulhuw %1, %4 pmulhuw %1, %4
pxor %1, %2 pxor %1, %2
psubw %1, %2 psubw %1, %2
MOVDQ %5, %1 MOVDQ %5, %1
%endmacro %endmacro
%macro SSE2_QuantMax8 6 %macro SSE2_QuantMax8 6
MOVDQ %1, %5 MOVDQ %1, %5
pxor %2, %2 pxor %2, %2
pcmpgtw %2, %1 pcmpgtw %2, %1
pxor %1, %2 pxor %1, %2
psubw %1, %2 psubw %1, %2
paddusw %1, %3 paddusw %1, %3
pmulhuw %1, %4 pmulhuw %1, %4
pmaxsw %6, %1 pmaxsw %6, %1
pxor %1, %2 pxor %1, %2
psubw %1, %2 psubw %1, %2
MOVDQ %5, %1 MOVDQ %5, %1
%endmacro %endmacro
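SSE2_Quant8 and SSE2_QuantMax8 quantise eight coefficients at a time; the Max variant additionally keeps a running pmaxsw of the quantised magnitudes. Per coefficient the arithmetic reduces to the sketch below (the SIMD version uses a saturating add for |x| + ff):

    static short quant_coeff(short x, unsigned short ff, unsigned short mf) {
        int a = x < 0 ? -x : x;                   /* |x| via pxor/psubw with the sign mask */
        int q = ((unsigned)(a + ff) * mf) >> 16;  /* paddusw + pmulhuw                     */
        return (short)(x < 0 ? -q : q);           /* restore the sign                      */
    }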
%define pDct esp + 4 %define pDct esp + 4
%define ff esp + 8 %define ff esp + 8
%define mf esp + 12 %define mf esp + 12
%define max esp + 16 %define max esp + 16
;*********************************************************************** ;***********************************************************************
; void WelsQuant4x4_sse2(int16_t *pDct, int16_t* ff, int16_t *mf); ; void WelsQuant4x4_sse2(int16_t *pDct, int16_t* ff, int16_t *mf);
;*********************************************************************** ;***********************************************************************
WELS_EXTERN WelsQuant4x4_sse2 WELS_EXTERN WelsQuant4x4_sse2
%assign push_num 0 %assign push_num 0
LOAD_3_PARA LOAD_3_PARA
movdqa xmm2, [r1] movdqa xmm2, [r1]
movdqa xmm3, [r2] movdqa xmm3, [r2]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0] SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10] SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
ret ret
;*********************************************************************** ;***********************************************************************
;void WelsQuant4x4Dc_sse2(int16_t *pDct, const int16_t ff, int16_t mf); ;void WelsQuant4x4Dc_sse2(int16_t *pDct, const int16_t ff, int16_t mf);
;*********************************************************************** ;***********************************************************************
WELS_EXTERN WelsQuant4x4Dc_sse2 WELS_EXTERN WelsQuant4x4Dc_sse2
%assign push_num 0 %assign push_num 0
LOAD_3_PARA LOAD_3_PARA
SIGN_EXTENSIONW r1, r1w SIGN_EXTENSIONW r1, r1w
SIGN_EXTENSIONW r2, r2w SIGN_EXTENSIONW r2, r2w
SSE2_Copy8Times xmm3, r2d SSE2_Copy8Times xmm3, r2d
SSE2_Copy8Times xmm2, r1d SSE2_Copy8Times xmm2, r1d
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0] SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10] SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
ret ret
;*********************************************************************** ;***********************************************************************
; void WelsQuantFour4x4_sse2(int16_t *pDct, int16_t* ff, int16_t *mf); ; void WelsQuantFour4x4_sse2(int16_t *pDct, int16_t* ff, int16_t *mf);
;*********************************************************************** ;***********************************************************************
WELS_EXTERN WelsQuantFour4x4_sse2 WELS_EXTERN WelsQuantFour4x4_sse2
%assign push_num 0 %assign push_num 0
LOAD_3_PARA LOAD_3_PARA
MOVDQ xmm2, [r1] MOVDQ xmm2, [r1]
MOVDQ xmm3, [r2] MOVDQ xmm3, [r2]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0] SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10] SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x20] SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x20]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x30] SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x30]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x40] SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x40]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x50] SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x50]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x60] SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x60]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x70] SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x70]
ret ret
;*********************************************************************** ;***********************************************************************
; void WelsQuantFour4x4Max_sse2(int16_t *pDct, int32_t* f, int16_t *mf, int16_t *max); ; void WelsQuantFour4x4Max_sse2(int16_t *pDct, int32_t* f, int16_t *mf, int16_t *max);
;*********************************************************************** ;***********************************************************************
WELS_EXTERN WelsQuantFour4x4Max_sse2 WELS_EXTERN WelsQuantFour4x4Max_sse2
%assign push_num 0 %assign push_num 0
LOAD_4_PARA LOAD_4_PARA
PUSH_XMM 8 PUSH_XMM 8
MOVDQ xmm2, [r1] MOVDQ xmm2, [r1]
MOVDQ xmm3, [r2] MOVDQ xmm3, [r2]
pxor xmm4, xmm4 pxor xmm4, xmm4
pxor xmm5, xmm5 pxor xmm5, xmm5
pxor xmm6, xmm6 pxor xmm6, xmm6
pxor xmm7, xmm7 pxor xmm7, xmm7
SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 ], xmm4 SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 ], xmm4
SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10], xmm4 SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10], xmm4
SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x20], xmm5 SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x20], xmm5
SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x30], xmm5 SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x30], xmm5
SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x40], xmm6 SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x40], xmm6
SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x50], xmm6 SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x50], xmm6
SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x60], xmm7 SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x60], xmm7
SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x70], xmm7 SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x70], xmm7
SSE2_TransTwo4x4W xmm4, xmm5, xmm6, xmm7, xmm0 SSE2_TransTwo4x4W xmm4, xmm5, xmm6, xmm7, xmm0
pmaxsw xmm0, xmm4 pmaxsw xmm0, xmm4
pmaxsw xmm0, xmm5 pmaxsw xmm0, xmm5
pmaxsw xmm0, xmm7 pmaxsw xmm0, xmm7
movdqa xmm1, xmm0 movdqa xmm1, xmm0
punpckhqdq xmm0, xmm1 punpckhqdq xmm0, xmm1
pmaxsw xmm0, xmm1 pmaxsw xmm0, xmm1
movq [r3], xmm0 movq [r3], xmm0
POP_XMM POP_XMM
LOAD_4_PARA_POP LOAD_4_PARA_POP
ret ret
%macro MMX_Copy4Times 2 %macro MMX_Copy4Times 2
movd %1, %2 movd %1, %2
punpcklwd %1, %1 punpcklwd %1, %1
punpckldq %1, %1 punpckldq %1, %1
%endmacro %endmacro
SECTION .text SECTION .text
%macro MMX_Quant4 4 %macro MMX_Quant4 4
pxor %2, %2 pxor %2, %2
pcmpgtw %2, %1 pcmpgtw %2, %1
pxor %1, %2 pxor %1, %2
psubw %1, %2 psubw %1, %2
paddusw %1, %3 paddusw %1, %3
pmulhuw %1, %4 pmulhuw %1, %4
pxor %1, %2 pxor %1, %2
psubw %1, %2 psubw %1, %2
%endmacro %endmacro
;*********************************************************************** ;***********************************************************************
;int32_t WelsHadamardQuant2x2_mmx(int16_t *rs, const int16_t ff, int16_t mf, int16_t * pDct, int16_t * block); ;int32_t WelsHadamardQuant2x2_mmx(int16_t *rs, const int16_t ff, int16_t mf, int16_t * pDct, int16_t * block);
;*********************************************************************** ;***********************************************************************
WELS_EXTERN WelsHadamardQuant2x2_mmx WELS_EXTERN WelsHadamardQuant2x2_mmx
%assign push_num 0 %assign push_num 0
LOAD_5_PARA LOAD_5_PARA
SIGN_EXTENSIONW r1, r1w SIGN_EXTENSIONW r1, r1w
SIGN_EXTENSIONW r2, r2w SIGN_EXTENSIONW r2, r2w
movd mm0, [r0] movd mm0, [r0]
movd mm1, [r0 + 0x20] movd mm1, [r0 + 0x20]
punpcklwd mm0, mm1 punpcklwd mm0, mm1
movd mm3, [r0 + 0x40] movd mm3, [r0 + 0x40]
movd mm1, [r0 + 0x60] movd mm1, [r0 + 0x60]
punpcklwd mm3, mm1 punpcklwd mm3, mm1
;hdm_2x2, mm0 = dct0 dct1, mm3 = dct2 dct3 ;hdm_2x2, mm0 = dct0 dct1, mm3 = dct2 dct3
movq mm5, mm3 movq mm5, mm3
paddw mm3, mm0 paddw mm3, mm0
psubw mm0, mm5 psubw mm0, mm5
punpcklwd mm3, mm0 punpcklwd mm3, mm0
movq mm1, mm3 movq mm1, mm3
psrlq mm1, 32 psrlq mm1, 32
movq mm5, mm1 movq mm5, mm1
paddw mm1, mm3 paddw mm1, mm3
psubw mm3, mm5 psubw mm3, mm5
punpcklwd mm1, mm3 punpcklwd mm1, mm3
;quant_2x2_dc ;quant_2x2_dc
MMX_Copy4Times mm3, r2d MMX_Copy4Times mm3, r2d
MMX_Copy4Times mm2, r1d MMX_Copy4Times mm2, r1d
MMX_Quant4 mm1, mm0, mm2, mm3 MMX_Quant4 mm1, mm0, mm2, mm3
; store dct_2x2 ; store dct_2x2
movq [r3], mm1 movq [r3], mm1
movq [r4], mm1 movq [r4], mm1
; pNonZeroCount of dct_2x2 ; pNonZeroCount of dct_2x2
pcmpeqb mm2, mm2 ; mm2 = FF pcmpeqb mm2, mm2 ; mm2 = FF
pxor mm3, mm3 pxor mm3, mm3
packsswb mm1, mm3 packsswb mm1, mm3
pcmpeqb mm1, mm3 ; set FF if equal, 0 if not equal pcmpeqb mm1, mm3 ; set FF if equal, 0 if not equal
psubsb mm1, mm2 ; set 0 if equal, 1 if not equal psubsb mm1, mm2 ; set 0 if equal, 1 if not equal
psadbw mm1, mm3 ; psadbw mm1, mm3 ;
mov r1w, 0 mov r1w, 0
mov [r0], r1w mov [r0], r1w
mov [r0 + 0x20], r1w mov [r0 + 0x20], r1w
mov [r0 + 0x40], r1w mov [r0 + 0x40], r1w
mov [r0 + 0x60], r1w mov [r0 + 0x60], r1w
movd retrd, mm1 movd retrd, mm1
WELSEMMS WELSEMMS
LOAD_5_PARA_POP LOAD_5_PARA_POP
ret ret
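A hedged scalar sketch of WelsHadamardQuant2x2_mmx above: the four inputs appear to be the DC terms of four 4x4 blocks (rs[0], rs[16], rs[32], rs[48]); they get a 2x2 Hadamard transform, are quantised with the same sign-magnitude arithmetic as the sketch after SSE2_QuantMax8, written to both pDct and block, the source slots are cleared, and the count of non-zero quantised values is returned. The exact output ordering of the MMX shuffles is not reproduced here.

    static int HadamardQuant2x2_ref(short *rs, unsigned short ff, unsigned short mf,
                                    short *pDct, short *block) {
        short d0 = rs[0], d1 = rs[16], d2 = rs[32], d3 = rs[48];
        short h[4] = {
            (short)(d0 + d1 + d2 + d3), (short)(d0 + d1 - d2 - d3),
            (short)(d0 - d1 + d2 - d3), (short)(d0 - d1 - d2 + d3)
        };
        int nnz = 0;
        for (int i = 0; i < 4; ++i) {
            int a = h[i] < 0 ? -h[i] : h[i];
            int q = ((unsigned)(a + ff) * mf) >> 16;   /* same quant as above */
            if (h[i] < 0) q = -q;
            pDct[i] = block[i] = (short)q;
            nnz += (q != 0);
        }
        rs[0] = rs[16] = rs[32] = rs[48] = 0;          /* the asm stores zero words back */
        return nnz;                                    /* psadbw-based non-zero count    */
    }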
;*********************************************************************** ;***********************************************************************
;int32_t WelsHadamardQuant2x2Skip_mmx(int16_t *pDct, int16_t ff, int16_t mf); ;int32_t WelsHadamardQuant2x2Skip_mmx(int16_t *pDct, int16_t ff, int16_t mf);
;*********************************************************************** ;***********************************************************************
WELS_EXTERN WelsHadamardQuant2x2Skip_mmx WELS_EXTERN WelsHadamardQuant2x2Skip_mmx
%assign push_num 0 %assign push_num 0
LOAD_3_PARA LOAD_3_PARA
SIGN_EXTENSIONW r1, r1w SIGN_EXTENSIONW r1, r1w
SIGN_EXTENSIONW r2, r2w SIGN_EXTENSIONW r2, r2w
movd mm0, [r0] movd mm0, [r0]
movd mm1, [r0 + 0x20] movd mm1, [r0 + 0x20]
punpcklwd mm0, mm1 punpcklwd mm0, mm1
movd mm3, [r0 + 0x40] movd mm3, [r0 + 0x40]
movd mm1, [r0 + 0x60] movd mm1, [r0 + 0x60]
punpcklwd mm3, mm1 punpcklwd mm3, mm1
;hdm_2x2, mm0 = dct0 dct1, mm3 = dct2 dct3 ;hdm_2x2, mm0 = dct0 dct1, mm3 = dct2 dct3
movq mm5, mm3 movq mm5, mm3
paddw mm3, mm0 paddw mm3, mm0
psubw mm0, mm5 psubw mm0, mm5
punpcklwd mm3, mm0 punpcklwd mm3, mm0
movq mm1, mm3 movq mm1, mm3
psrlq mm1, 32 psrlq mm1, 32
movq mm5, mm1 movq mm5, mm1
paddw mm1, mm3 paddw mm1, mm3
psubw mm3, mm5 psubw mm3, mm5
punpcklwd mm1, mm3 punpcklwd mm1, mm3
;quant_2x2_dc ;quant_2x2_dc
MMX_Copy4Times mm3, r2d MMX_Copy4Times mm3, r2d
MMX_Copy4Times mm2, r1d MMX_Copy4Times mm2, r1d
MMX_Quant4 mm1, mm0, mm2, mm3 MMX_Quant4 mm1, mm0, mm2, mm3
; pNonZeroCount of dct_2x2 ; pNonZeroCount of dct_2x2
pcmpeqb mm2, mm2 ; mm2 = FF pcmpeqb mm2, mm2 ; mm2 = FF
pxor mm3, mm3 pxor mm3, mm3
packsswb mm1, mm3 packsswb mm1, mm3
pcmpeqb mm1, mm3 ; set FF if equal, 0 if not equal pcmpeqb mm1, mm3 ; set FF if equal, 0 if not equal
psubsb mm1, mm2 ; set 0 if equal, 1 if not equal psubsb mm1, mm2 ; set 0 if equal, 1 if not equal
psadbw mm1, mm3 ; psadbw mm1, mm3 ;
movd retrd, mm1 movd retrd, mm1
WELSEMMS WELSEMMS
ret ret
%macro SSE2_DeQuant8 3 %macro SSE2_DeQuant8 3
@ -297,12 +297,12 @@ WELS_EXTERN WelsHadamardQuant2x2Skip_mmx
; void WelsDequant4x4_sse2(int16_t *pDct, const uint16_t* mf); ; void WelsDequant4x4_sse2(int16_t *pDct, const uint16_t* mf);
;*********************************************************************** ;***********************************************************************
WELS_EXTERN WelsDequant4x4_sse2 WELS_EXTERN WelsDequant4x4_sse2
%assign push_num 0 %assign push_num 0
LOAD_2_PARA LOAD_2_PARA
movdqa xmm1, [r1] movdqa xmm1, [r1]
SSE2_DeQuant8 [r0 ], xmm0, xmm1 SSE2_DeQuant8 [r0 ], xmm0, xmm1
SSE2_DeQuant8 [r0 + 0x10], xmm0, xmm1 SSE2_DeQuant8 [r0 + 0x10], xmm0, xmm1
ret ret
@ -311,18 +311,18 @@ WELS_EXTERN WelsDequant4x4_sse2
;***********************************************************************==== ;***********************************************************************====
WELS_EXTERN WelsDequantFour4x4_sse2 WELS_EXTERN WelsDequantFour4x4_sse2
%assign push_num 0 %assign push_num 0
LOAD_2_PARA LOAD_2_PARA
movdqa xmm1, [r1] movdqa xmm1, [r1]
SSE2_DeQuant8 [r0 ], xmm0, xmm1 SSE2_DeQuant8 [r0 ], xmm0, xmm1
SSE2_DeQuant8 [r0+0x10 ], xmm0, xmm1 SSE2_DeQuant8 [r0+0x10 ], xmm0, xmm1
SSE2_DeQuant8 [r0+0x20 ], xmm0, xmm1 SSE2_DeQuant8 [r0+0x20 ], xmm0, xmm1
SSE2_DeQuant8 [r0+0x30 ], xmm0, xmm1 SSE2_DeQuant8 [r0+0x30 ], xmm0, xmm1
SSE2_DeQuant8 [r0+0x40 ], xmm0, xmm1 SSE2_DeQuant8 [r0+0x40 ], xmm0, xmm1
SSE2_DeQuant8 [r0+0x50 ], xmm0, xmm1 SSE2_DeQuant8 [r0+0x50 ], xmm0, xmm1
SSE2_DeQuant8 [r0+0x60 ], xmm0, xmm1 SSE2_DeQuant8 [r0+0x60 ], xmm0, xmm1
SSE2_DeQuant8 [r0+0x70 ], xmm0, xmm1 SSE2_DeQuant8 [r0+0x70 ], xmm0, xmm1
ret ret
@@ -330,41 +330,41 @@ WELS_EXTERN WelsDequantFour4x4_sse2
;void WelsDequantIHadamard4x4_sse2(int16_t *rs, const uint16_t mf); ;void WelsDequantIHadamard4x4_sse2(int16_t *rs, const uint16_t mf);
;*********************************************************************** ;***********************************************************************
WELS_EXTERN WelsDequantIHadamard4x4_sse2 WELS_EXTERN WelsDequantIHadamard4x4_sse2
%assign push_num 0 %assign push_num 0
LOAD_2_PARA LOAD_2_PARA
%ifndef X86_32 %ifndef X86_32
movzx r1, r1w movzx r1, r1w
%endif %endif
; WelsDequantLumaDc4x4 ; WelsDequantLumaDc4x4
SSE2_Copy8Times xmm1, r1d SSE2_Copy8Times xmm1, r1d
;psrlw xmm1, 2 ; for the (>>2) in ihdm ;psrlw xmm1, 2 ; for the (>>2) in ihdm
MOVDQ xmm0, [r0] MOVDQ xmm0, [r0]
MOVDQ xmm2, [r0+0x10] MOVDQ xmm2, [r0+0x10]
pmullw xmm0, xmm1 pmullw xmm0, xmm1
pmullw xmm2, xmm1 pmullw xmm2, xmm1
; ihdm_4x4 ; ihdm_4x4
movdqa xmm1, xmm0 movdqa xmm1, xmm0
psrldq xmm1, 8 psrldq xmm1, 8
movdqa xmm3, xmm2 movdqa xmm3, xmm2
psrldq xmm3, 8 psrldq xmm3, 8
SSE2_SumSub xmm0, xmm3, xmm5 ; xmm0 = xmm0 - xmm3, xmm3 = xmm0 + xmm3 SSE2_SumSub xmm0, xmm3, xmm5 ; xmm0 = xmm0 - xmm3, xmm3 = xmm0 + xmm3
SSE2_SumSub xmm1, xmm2, xmm5 ; xmm1 = xmm1 - xmm2, xmm2 = xmm1 + xmm2 SSE2_SumSub xmm1, xmm2, xmm5 ; xmm1 = xmm1 - xmm2, xmm2 = xmm1 + xmm2
SSE2_SumSub xmm3, xmm2, xmm5 ; xmm3 = xmm3 - xmm2, xmm2 = xmm3 + xmm2 SSE2_SumSub xmm3, xmm2, xmm5 ; xmm3 = xmm3 - xmm2, xmm2 = xmm3 + xmm2
SSE2_SumSub xmm0, xmm1, xmm5 ; xmm0 = xmm0 - xmm1, xmm1 = xmm0 + xmm1 SSE2_SumSub xmm0, xmm1, xmm5 ; xmm0 = xmm0 - xmm1, xmm1 = xmm0 + xmm1
SSE2_TransTwo4x4W xmm2, xmm1, xmm3, xmm0, xmm4 SSE2_TransTwo4x4W xmm2, xmm1, xmm3, xmm0, xmm4
SSE2_SumSub xmm2, xmm4, xmm5 SSE2_SumSub xmm2, xmm4, xmm5
SSE2_SumSub xmm1, xmm0, xmm5 SSE2_SumSub xmm1, xmm0, xmm5
SSE2_SumSub xmm4, xmm0, xmm5 SSE2_SumSub xmm4, xmm0, xmm5
SSE2_SumSub xmm2, xmm1, xmm5 SSE2_SumSub xmm2, xmm1, xmm5
SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3 SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
punpcklqdq xmm0, xmm1 punpcklqdq xmm0, xmm1
MOVDQ [r0], xmm0 MOVDQ [r0], xmm0
punpcklqdq xmm2, xmm3 punpcklqdq xmm2, xmm3
MOVDQ [r0+16], xmm2 MOVDQ [r0+16], xmm2
ret ret
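A hedged scalar sketch of the combined dequant + inverse Hadamard above: all 16 values are scaled by mf, then 4-point Hadamard butterflies are applied to rows and columns. The SSE2 code produces an equivalent set of sums and differences via SSE2_SumSub and transposes; the exact intermediate ordering is not reproduced here.

    #include <stdint.h>

    /* One common 4-point Hadamard butterfly (illustrative ordering). */
    static void Hadamard4(int16_t *v, int stride) {
        int16_t a = v[0], b = v[stride], c = v[2 * stride], d = v[3 * stride];
        v[0]          = a + b + c + d;
        v[stride]     = a - b + c - d;
        v[2 * stride] = a + b - c - d;
        v[3 * stride] = a - b - c + d;
    }

    static void DequantIHadamard4x4_c(int16_t *rs, uint16_t mf) {
        for (int i = 0; i < 16; i++)
            rs[i] = (int16_t)(rs[i] * mf);
        for (int r = 0; r < 4; r++) Hadamard4(rs + 4 * r, 1); /* rows    */
        for (int c = 0; c < 4; c++) Hadamard4(rs + c, 4);     /* columns */
    }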
@@ -35,189 +35,189 @@ SECTION .text
;********************************************************************************************************************************** ;**********************************************************************************************************************************
; ;
; uint32_t SampleSad16x16Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost ) ; uint32_t SampleSad16x16Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
; ;
; \note: ; \note:
;   src needs to be 16-byte aligned, alignment of ref is optional ;   src needs to be 16-byte aligned, alignment of ref is optional
; \return value: ; \return value:
;   returns the minimal SAD cost, with the corresponding index written to index_min_cost ;   returns the minimal SAD cost, with the corresponding index written to index_min_cost
;********************************************************************************************************************************** ;**********************************************************************************************************************************
; try 8 mv via offset ; try 8 mv via offset
; xmm7 store sad costs ; xmm7 store sad costs
%macro SAD_16x16_LINE_SSE41 4 ; src, ref, stride_src, stride_ref %macro SAD_16x16_LINE_SSE41 4 ; src, ref, stride_src, stride_ref
movdqa xmm0, [%1] movdqa xmm0, [%1]
movdqu xmm1, [%2] movdqu xmm1, [%2]
movdqu xmm2, [%2+8h] movdqu xmm2, [%2+8h]
movdqa xmm3, xmm1 movdqa xmm3, xmm1
movdqa xmm4, xmm2 movdqa xmm4, xmm2
mpsadbw xmm1, xmm0, 0 ; 000 B mpsadbw xmm1, xmm0, 0 ; 000 B
paddw xmm7, xmm1 ; accumulate cost paddw xmm7, xmm1 ; accumulate cost
mpsadbw xmm3, xmm0, 5 ; 101 B mpsadbw xmm3, xmm0, 5 ; 101 B
paddw xmm7, xmm3 ; accumulate cost paddw xmm7, xmm3 ; accumulate cost
mpsadbw xmm2, xmm0, 2 ; 010 B mpsadbw xmm2, xmm0, 2 ; 010 B
paddw xmm7, xmm2 ; accumulate cost paddw xmm7, xmm2 ; accumulate cost
mpsadbw xmm4, xmm0, 7 ; 111 B mpsadbw xmm4, xmm0, 7 ; 111 B
paddw xmm7, xmm4 ; accumulate cost paddw xmm7, xmm4 ; accumulate cost
add %1, %3 add %1, %3
add %2, %4 add %2, %4
%endmacro ; end of SAD_16x16_LINE_SSE41 %endmacro ; end of SAD_16x16_LINE_SSE41
%macro SAD_16x16_LINE_SSE41E 4 ; src, ref, stride_src, stride_ref %macro SAD_16x16_LINE_SSE41E 4 ; src, ref, stride_src, stride_ref
movdqa xmm0, [%1] movdqa xmm0, [%1]
movdqu xmm1, [%2] movdqu xmm1, [%2]
movdqu xmm2, [%2+8h] movdqu xmm2, [%2+8h]
movdqa xmm3, xmm1 movdqa xmm3, xmm1
movdqa xmm4, xmm2 movdqa xmm4, xmm2
mpsadbw xmm1, xmm0, 0 ; 000 B mpsadbw xmm1, xmm0, 0 ; 000 B
paddw xmm7, xmm1 ; accumulate cost paddw xmm7, xmm1 ; accumulate cost
mpsadbw xmm3, xmm0, 5 ; 101 B mpsadbw xmm3, xmm0, 5 ; 101 B
paddw xmm7, xmm3 ; accumulate cost paddw xmm7, xmm3 ; accumulate cost
mpsadbw xmm2, xmm0, 2 ; 010 B mpsadbw xmm2, xmm0, 2 ; 010 B
paddw xmm7, xmm2 ; accumulate cost paddw xmm7, xmm2 ; accumulate cost
mpsadbw xmm4, xmm0, 7 ; 111 B mpsadbw xmm4, xmm0, 7 ; 111 B
paddw xmm7, xmm4 ; accumulate cost paddw xmm7, xmm4 ; accumulate cost
%endmacro ; end of SAD_16x16_LINE_SSE41E %endmacro ; end of SAD_16x16_LINE_SSE41E
WELS_EXTERN SampleSad16x16Hor8_sse41 WELS_EXTERN SampleSad16x16Hor8_sse41
;push ebx ;push ebx
;push esi ;push esi
;mov eax, [esp+12] ; src ;mov eax, [esp+12] ; src
;mov ecx, [esp+16] ; stride_src ;mov ecx, [esp+16] ; stride_src
;mov ebx, [esp+20] ; ref ;mov ebx, [esp+20] ; ref
;mov edx, [esp+24] ; stride_ref ;mov edx, [esp+24] ; stride_ref
;mov esi, [esp+28] ; base_cost ;mov esi, [esp+28] ; base_cost
%assign push_num 0 %assign push_num 0
LOAD_6_PARA LOAD_6_PARA
PUSH_XMM 8 PUSH_XMM 8
SIGN_EXTENSION r1, r1d SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d SIGN_EXTENSION r3, r3d
pxor xmm7, xmm7 pxor xmm7, xmm7
SAD_16x16_LINE_SSE41 r0, r2, r1, r3 SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3 SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3 SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3 SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3 SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3 SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3 SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3 SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3 SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3 SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3 SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3 SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3 SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3 SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3 SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41E r0, r2, r1, r3 SAD_16x16_LINE_SSE41E r0, r2, r1, r3
pxor xmm0, xmm0 pxor xmm0, xmm0
movdqa xmm6, xmm7 movdqa xmm6, xmm7
punpcklwd xmm6, xmm0 punpcklwd xmm6, xmm0
punpckhwd xmm7, xmm0 punpckhwd xmm7, xmm0
movdqa xmm5, [r4] movdqa xmm5, [r4]
movdqa xmm4, xmm5 movdqa xmm4, xmm5
punpcklwd xmm4, xmm0 punpcklwd xmm4, xmm0
punpckhwd xmm5, xmm0 punpckhwd xmm5, xmm0
paddd xmm4, xmm6 paddd xmm4, xmm6
paddd xmm5, xmm7 paddd xmm5, xmm7
movdqa xmm3, xmm4 movdqa xmm3, xmm4
pminud xmm3, xmm5 pminud xmm3, xmm5
pshufd xmm2, xmm3, 01001110B pshufd xmm2, xmm3, 01001110B
pminud xmm2, xmm3 pminud xmm2, xmm3
pshufd xmm3, xmm2, 10110001B pshufd xmm3, xmm2, 10110001B
pminud xmm2, xmm3 pminud xmm2, xmm3
movd retrd, xmm2 movd retrd, xmm2
pcmpeqd xmm4, xmm2 pcmpeqd xmm4, xmm2
movmskps r2d, xmm4 movmskps r2d, xmm4
bsf r1d, r2d bsf r1d, r2d
jnz near WRITE_INDEX jnz near WRITE_INDEX
pcmpeqd xmm5, xmm2 pcmpeqd xmm5, xmm2
movmskps r2d, xmm5 movmskps r2d, xmm5
bsf r1d, r2d bsf r1d, r2d
add r1d, 4 add r1d, 4
WRITE_INDEX: WRITE_INDEX:
mov [r5], r1d mov [r5], r1d
POP_XMM POP_XMM
LOAD_6_PARA_POP LOAD_6_PARA_POP
ret ret
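To make the mpsadbw sequence easier to follow: each mpsadbw compares one aligned 4-byte group of src against 8 sliding byte offsets of ref, so the four calls per line cover a whole 16-byte row against horizontal offsets 0..7. A plain-C model of the routine (illustrative name, tie-breaking not guaranteed to be bit-identical):

    #include <stdint.h>
    #include <stdlib.h>

    static uint32_t SampleSad16x16Hor8_c(const uint8_t *src, int32_t stride_src,
                                         const uint8_t *ref, int32_t stride_ref,
                                         const uint16_t base_cost[8],
                                         int32_t *index_min_cost) {
        uint32_t cost[8];
        for (int off = 0; off < 8; off++)
            cost[off] = base_cost[off];
        for (int y = 0; y < 16; y++)
            for (int off = 0; off < 8; off++)
                for (int x = 0; x < 16; x++)
                    cost[off] += (uint32_t)abs((int)src[y * stride_src + x] -
                                               (int)ref[y * stride_ref + x + off]);
        uint32_t best = cost[0];
        int32_t best_idx = 0;
        for (int off = 1; off < 8; off++)
            if (cost[off] < best) { best = cost[off]; best_idx = off; }
        *index_min_cost = best_idx;
        return best;
    }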
;********************************************************************************************************************************** ;**********************************************************************************************************************************
; ;
; uint32_t SampleSad8x8Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost ) ; uint32_t SampleSad8x8Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
; ;
; \note: ; \note:
;   16-byte alignment of src and ref is optional, since this is used for inter 8x8 blocks ;   16-byte alignment of src and ref is optional, since this is used for inter 8x8 blocks
; \return value: ; \return value:
;   returns the minimal SAD cost, with the corresponding index written to index_min_cost ;   returns the minimal SAD cost, with the corresponding index written to index_min_cost
; ;
;********************************************************************************************************************************** ;**********************************************************************************************************************************
; try 8 mv via offset ; try 8 mv via offset
; xmm7 store sad costs ; xmm7 store sad costs
%macro SAD_8x8_LINE_SSE41 4 ; src, ref, stride_src, stride_ref %macro SAD_8x8_LINE_SSE41 4 ; src, ref, stride_src, stride_ref
movdqu xmm0, [%1] movdqu xmm0, [%1]
movdqu xmm1, [%2] movdqu xmm1, [%2]
movdqa xmm2, xmm1 movdqa xmm2, xmm1
mpsadbw xmm1, xmm0, 0 ; 000 B mpsadbw xmm1, xmm0, 0 ; 000 B
paddw xmm7, xmm1 ; accumulate cost paddw xmm7, xmm1 ; accumulate cost
mpsadbw xmm2, xmm0, 5 ; 101 B mpsadbw xmm2, xmm0, 5 ; 101 B
paddw xmm7, xmm2 ; accumulate cost paddw xmm7, xmm2 ; accumulate cost
add %1, %3 add %1, %3
add %2, %4 add %2, %4
%endmacro ; end of SAD_8x8_LINE_SSE41 %endmacro ; end of SAD_8x8_LINE_SSE41
%macro SAD_8x8_LINE_SSE41E 4 ; src, ref, stride_src, stride_ref %macro SAD_8x8_LINE_SSE41E 4 ; src, ref, stride_src, stride_ref
movdqu xmm0, [%1] movdqu xmm0, [%1]
movdqu xmm1, [%2] movdqu xmm1, [%2]
movdqa xmm2, xmm1 movdqa xmm2, xmm1
mpsadbw xmm1, xmm0, 0 ; 000 B mpsadbw xmm1, xmm0, 0 ; 000 B
paddw xmm7, xmm1 ; accumulate cost paddw xmm7, xmm1 ; accumulate cost
mpsadbw xmm2, xmm0, 5 ; 101 B mpsadbw xmm2, xmm0, 5 ; 101 B
paddw xmm7, xmm2 ; accumulate cost paddw xmm7, xmm2 ; accumulate cost
%endmacro ; end of SAD_8x8_LINE_SSE41E %endmacro ; end of SAD_8x8_LINE_SSE41E
WELS_EXTERN SampleSad8x8Hor8_sse41 WELS_EXTERN SampleSad8x8Hor8_sse41
%assign push_num 0 %assign push_num 0
LOAD_6_PARA LOAD_6_PARA
PUSH_XMM 8 PUSH_XMM 8
SIGN_EXTENSION r1, r1d SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d SIGN_EXTENSION r3, r3d
movdqa xmm7, [r4] ; load base cost list movdqa xmm7, [r4] ; load base cost list
SAD_8x8_LINE_SSE41 r0, r2, r1, r3 SAD_8x8_LINE_SSE41 r0, r2, r1, r3
SAD_8x8_LINE_SSE41 r0, r2, r1, r3 SAD_8x8_LINE_SSE41 r0, r2, r1, r3
SAD_8x8_LINE_SSE41 r0, r2, r1, r3 SAD_8x8_LINE_SSE41 r0, r2, r1, r3
SAD_8x8_LINE_SSE41 r0, r2, r1, r3 SAD_8x8_LINE_SSE41 r0, r2, r1, r3
SAD_8x8_LINE_SSE41 r0, r2, r1, r3 SAD_8x8_LINE_SSE41 r0, r2, r1, r3
SAD_8x8_LINE_SSE41 r0, r2, r1, r3 SAD_8x8_LINE_SSE41 r0, r2, r1, r3
SAD_8x8_LINE_SSE41 r0, r2, r1, r3 SAD_8x8_LINE_SSE41 r0, r2, r1, r3
SAD_8x8_LINE_SSE41E r0, r2, r1, r3 SAD_8x8_LINE_SSE41E r0, r2, r1, r3
    phminposuw xmm0, xmm7 ; horizontally search for the minimal sad cost and its index     phminposuw xmm0, xmm7 ; horizontally search for the minimal sad cost and its index
movd retrd, xmm0 ; for return: DEST[15:0] <- MIN, DEST[31:16] <- INDEX movd retrd, xmm0 ; for return: DEST[15:0] <- MIN, DEST[31:16] <- INDEX
mov r1d, retrd mov r1d, retrd
and retrd, 0xFFFF and retrd, 0xFFFF
sar r1d, 16 sar r1d, 16
mov [r5], r1d mov [r5], r1d
POP_XMM POP_XMM
LOAD_6_PARA_POP LOAD_6_PARA_POP
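The 8x8 variant follows the same pattern but keeps the eight costs as 16-bit words and lets phminposuw pick the winner: that instruction returns the minimum in bits 15:0 and its index in bits 18:16, which is what the "and 0xFFFF" / "sar 16" pair above unpacks. A rough scalar model (illustrative name, 16-bit accumulation assumed to match the paddw chain):

    #include <stdint.h>
    #include <stdlib.h>

    static uint32_t SampleSad8x8Hor8_c(const uint8_t *src, int32_t stride_src,
                                       const uint8_t *ref, int32_t stride_ref,
                                       const uint16_t base_cost[8],
                                       int32_t *index_min_cost) {
        uint16_t cost[8];
        for (int off = 0; off < 8; off++)
            cost[off] = base_cost[off];
        for (int y = 0; y < 8; y++)
            for (int off = 0; off < 8; off++)
                for (int x = 0; x < 8; x++)
                    cost[off] = (uint16_t)(cost[off] + abs((int)src[y * stride_src + x] -
                                                           (int)ref[y * stride_ref + x + off]));
        int32_t best_idx = 0;
        for (int off = 1; off < 8; off++)
            if (cost[off] < cost[best_idx]) best_idx = off;
        *index_min_cost = best_idx;
        return cost[best_idx];
    }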
@@ -104,32 +104,32 @@ db 6,7,6,7,7,8
align 16 align 16
high_mask_table: high_mask_table:
db 0, 0, 0, 3, 0, 2, 3, 6, 0, 2 db 0, 0, 0, 3, 0, 2, 3, 6, 0, 2
db 2, 5, 3, 5, 6, 9, 0, 1, 2, 5 db 2, 5, 3, 5, 6, 9, 0, 1, 2, 5
db 2, 4, 5, 8, 3, 5, 5, 8, 6, 8 db 2, 4, 5, 8, 3, 5, 5, 8, 6, 8
db 9,12, 0, 1, 1, 4, 2, 4, 5, 8 db 9,12, 0, 1, 1, 4, 2, 4, 5, 8
db 2, 4, 4, 7, 5, 7, 8,11, 3, 4 db 2, 4, 4, 7, 5, 7, 8,11, 3, 4
db 5, 8, 5, 7, 8,11, 6, 8, 8,11 db 5, 8, 5, 7, 8,11, 6, 8, 8,11
db 9,11,12,15, 0, 1, 1, 4, 1, 3 db 9,11,12,15, 0, 1, 1, 4, 1, 3
db 4, 7, 2, 4, 4, 7, 5, 7, 8,11 db 4, 7, 2, 4, 4, 7, 5, 7, 8,11
db 2, 3, 4, 7, 4, 6, 7,10, 5, 7 db 2, 3, 4, 7, 4, 6, 7,10, 5, 7
db 7,10, 8,10,11,14, 3, 4, 4, 7 db 7,10, 8,10,11,14, 3, 4, 4, 7
db 5, 7, 8,11, 5, 7, 7,10, 8,10 db 5, 7, 8,11, 5, 7, 7,10, 8,10
db 11,14, 6, 7, 8,11, 8,10,11,14 db 11,14, 6, 7, 8,11, 8,10,11,14
db 9,11,11,14,12,14,15,18, 0, 0 db 9,11,11,14,12,14,15,18, 0, 0
db 1, 4, 1, 3, 4, 7, 1, 3, 3, 6 db 1, 4, 1, 3, 4, 7, 1, 3, 3, 6
db 4, 6, 7,10, 2, 3, 4, 7, 4, 6 db 4, 6, 7,10, 2, 3, 4, 7, 4, 6
db 7,10, 5, 7, 7,10, 8,10,11,14 db 7,10, 5, 7, 7,10, 8,10,11,14
db 2, 3, 3, 6, 4, 6, 7,10, 4, 6 db 2, 3, 3, 6, 4, 6, 7,10, 4, 6
db 6, 9, 7, 9,10,13, 5, 6, 7,10 db 6, 9, 7, 9,10,13, 5, 6, 7,10
db 7, 9,10,13, 8,10,10,13,11,13 db 7, 9,10,13, 8,10,10,13,11,13
db 14,17, 3, 4, 4, 7, 4, 6, 7,10 db 14,17, 3, 4, 4, 7, 4, 6, 7,10
db 5, 7, 7,10, 8,10,11,14, 5, 6 db 5, 7, 7,10, 8,10,11,14, 5, 6
db 7,10, 7, 9,10,13, 8,10,10,13 db 7,10, 7, 9,10,13, 8,10,10,13
db 11,13,14,17, 6, 7, 7,10, 8,10 db 11,13,14,17, 6, 7, 7,10, 8,10
db 11,14, 8,10,10,13,11,13,14,17 db 11,14, 8,10,10,13,11,13,14,17
db 9,10,11,14,11,13,14,17,12,14 db 9,10,11,14,11,13,14,17,12,14
db 14,17,15,17,18,21 db 14,17,15,17,18,21
align 16 align 16
low_mask_table: low_mask_table:
@@ -167,173 +167,173 @@ SECTION .text
;void WelsScan4x4DcAc_sse2( int16_t level[16], int16_t *pDct ) ;void WelsScan4x4DcAc_sse2( int16_t level[16], int16_t *pDct )
;*********************************************************************** ;***********************************************************************
WELS_EXTERN WelsScan4x4DcAc_sse2 WELS_EXTERN WelsScan4x4DcAc_sse2
%ifdef X86_32 %ifdef X86_32
push r3 push r3
%assign push_num 1 %assign push_num 1
%else %else
%assign push_num 0 %assign push_num 0
%endif %endif
LOAD_2_PARA LOAD_2_PARA
movdqa xmm0, [r1] ; 7 6 5 4 3 2 1 0 movdqa xmm0, [r1] ; 7 6 5 4 3 2 1 0
movdqa xmm1, [r1+16] ; f e d c b a 9 8 movdqa xmm1, [r1+16] ; f e d c b a 9 8
pextrw r2d, xmm0, 7 ; ecx = 7 pextrw r2d, xmm0, 7 ; ecx = 7
pextrw r3d, xmm1, 2 ; edx = a pextrw r3d, xmm1, 2 ; edx = a
pextrw r1d, xmm0, 5 ; eax = 5 pextrw r1d, xmm0, 5 ; eax = 5
pinsrw xmm1, r2d, 2 ; f e d c b 7 9 8 pinsrw xmm1, r2d, 2 ; f e d c b 7 9 8
pinsrw xmm0, r1d, 7 ; 5 6 5 4 3 2 1 0 pinsrw xmm0, r1d, 7 ; 5 6 5 4 3 2 1 0
pextrw r2d, xmm1, 0 ; ecx = 8 pextrw r2d, xmm1, 0 ; ecx = 8
pinsrw xmm0, r2d, 5 ; 5 6 8 4 3 2 1 0 pinsrw xmm0, r2d, 5 ; 5 6 8 4 3 2 1 0
pinsrw xmm1, r3d, 0 ; f e d c b 7 9 a pinsrw xmm1, r3d, 0 ; f e d c b 7 9 a
pshufd xmm2, xmm0, 0xd8 ; 5 6 3 2 8 4 1 0 pshufd xmm2, xmm0, 0xd8 ; 5 6 3 2 8 4 1 0
pshufd xmm3, xmm1, 0xd8 ; f e b 7 d c 9 a pshufd xmm3, xmm1, 0xd8 ; f e b 7 d c 9 a
pshufhw xmm0, xmm2, 0x93 ; 6 3 2 5 8 4 1 0 pshufhw xmm0, xmm2, 0x93 ; 6 3 2 5 8 4 1 0
pshuflw xmm1, xmm3, 0x39 ; f e b 7 a d c 9 pshuflw xmm1, xmm3, 0x39 ; f e b 7 a d c 9
movdqa [r0],xmm0 movdqa [r0],xmm0
movdqa [r0+16], xmm1 movdqa [r0+16], xmm1
%ifdef X86_32 %ifdef X86_32
pop r3 pop r3
%endif %endif
ret ret
;*********************************************************************** ;***********************************************************************
;void WelsScan4x4DcAc_ssse3( int16_t level[16], int16_t *pDct ) ;void WelsScan4x4DcAc_ssse3( int16_t level[16], int16_t *pDct )
;*********************************************************************** ;***********************************************************************
WELS_EXTERN WelsScan4x4DcAc_ssse3 WELS_EXTERN WelsScan4x4DcAc_ssse3
%assign push_num 0 %assign push_num 0
LOAD_2_PARA LOAD_2_PARA
movdqa xmm0, [r1] movdqa xmm0, [r1]
movdqa xmm1, [r1+16] movdqa xmm1, [r1+16]
pextrw r2d, xmm0, 7 ; ecx = [7] pextrw r2d, xmm0, 7 ; ecx = [7]
pextrw r1d, xmm1, 0 ; eax = [8] pextrw r1d, xmm1, 0 ; eax = [8]
pinsrw xmm0, r1d, 7 ; xmm0[7] = [8] pinsrw xmm0, r1d, 7 ; xmm0[7] = [8]
pinsrw xmm1, r2d, 0 ; xmm1[0] = [7] pinsrw xmm1, r2d, 0 ; xmm1[0] = [7]
pshufb xmm1, [pb_scanacdc_maskb] pshufb xmm1, [pb_scanacdc_maskb]
pshufb xmm0, [pb_scanacdc_maska] pshufb xmm0, [pb_scanacdc_maska]
movdqa [r0],xmm0 movdqa [r0],xmm0
movdqa [r0+16], xmm1 movdqa [r0+16], xmm1
ret ret
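Both scan variants above reorder the 16 transform coefficients into zig-zag order; the pextrw/pinsrw swaps and the shuffles implement that permutation in-register. A scalar sketch, assuming the standard H.264 4x4 zig-zag table:

    #include <stdint.h>

    static void Scan4x4DcAc_c(int16_t level[16], const int16_t *pDct) {
        static const uint8_t zigzag[16] = {
            0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
        };
        for (int i = 0; i < 16; i++)
            level[i] = pDct[zigzag[i]];
    }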
;*********************************************************************** ;***********************************************************************
;void WelsScan4x4Ac_sse2( int16_t* zig_value, int16_t* pDct ) ;void WelsScan4x4Ac_sse2( int16_t* zig_value, int16_t* pDct )
;*********************************************************************** ;***********************************************************************
WELS_EXTERN WelsScan4x4Ac_sse2 WELS_EXTERN WelsScan4x4Ac_sse2
%assign push_num 0 %assign push_num 0
LOAD_2_PARA LOAD_2_PARA
movdqa xmm0, [r1] movdqa xmm0, [r1]
movdqa xmm1, [r1+16] movdqa xmm1, [r1+16]
movdqa xmm2, xmm0 movdqa xmm2, xmm0
punpcklqdq xmm0, xmm1 punpcklqdq xmm0, xmm1
punpckhqdq xmm2, xmm1 punpckhqdq xmm2, xmm1
movdqa xmm3, xmm0 movdqa xmm3, xmm0
punpckldq xmm0, xmm2 punpckldq xmm0, xmm2
punpckhdq xmm3, xmm2 punpckhdq xmm3, xmm2
pextrw r1d , xmm0, 3 pextrw r1d , xmm0, 3
pextrw r2d , xmm0, 7 pextrw r2d , xmm0, 7
pinsrw xmm0, r1d, 7 pinsrw xmm0, r1d, 7
pextrw r1d, xmm3, 4 pextrw r1d, xmm3, 4
pinsrw xmm3, r2d, 4 pinsrw xmm3, r2d, 4
pextrw r2d, xmm3, 0 pextrw r2d, xmm3, 0
pinsrw xmm3, r1d, 0 pinsrw xmm3, r1d, 0
pinsrw xmm0, r2d, 3 pinsrw xmm0, r2d, 3
pshufhw xmm1, xmm0, 0x93 pshufhw xmm1, xmm0, 0x93
pshuflw xmm2, xmm3, 0x39 pshuflw xmm2, xmm3, 0x39
movdqa xmm3, xmm2 movdqa xmm3, xmm2
psrldq xmm1, 2 psrldq xmm1, 2
pslldq xmm3, 14 pslldq xmm3, 14
por xmm1, xmm3 por xmm1, xmm3
psrldq xmm2, 2 psrldq xmm2, 2
movdqa [r0],xmm1 movdqa [r0],xmm1
movdqa [r0+16], xmm2 movdqa [r0+16], xmm2
ret ret
;*********************************************************************** ;***********************************************************************
;int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct ); ;int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct );
;*********************************************************************** ;***********************************************************************
WELS_EXTERN WelsCalculateSingleCtr4x4_sse2 WELS_EXTERN WelsCalculateSingleCtr4x4_sse2
%ifdef X86_32 %ifdef X86_32
push r3 push r3
%assign push_num 1 %assign push_num 1
%else %else
%assign push_num 0 %assign push_num 0
%endif %endif
LOAD_1_PARA LOAD_1_PARA
movdqa xmm0, [r0] movdqa xmm0, [r0]
movdqa xmm1, [r0+16] movdqa xmm1, [r0+16]
packsswb xmm0, xmm1 packsswb xmm0, xmm1
; below is the register map: r0 - eax, r1 - ebx, r2 - ecx, r3 - edx ; below is the register map: r0 - eax, r1 - ebx, r2 - ecx, r3 - edx
xor r3, r3 xor r3, r3
pxor xmm3, xmm3 pxor xmm3, xmm3
pcmpeqb xmm0, xmm3 pcmpeqb xmm0, xmm3
pmovmskb r3d, xmm0 pmovmskb r3d, xmm0
xor r3, 0xffff xor r3, 0xffff
xor r0, r0 xor r0, r0
mov r2, 7 mov r2, 7
mov r1, 8 mov r1, 8
.loop_low8_find1: .loop_low8_find1:
bt r3, r2 bt r3, r2
jc .loop_high8_find1 jc .loop_high8_find1
dec r2 dec r2
jnz .loop_low8_find1 jnz .loop_low8_find1
.loop_high8_find1: .loop_high8_find1:
bt r3, r1 bt r3, r1
jc .find1end jc .find1end
inc r1 inc r1
cmp r1,16 cmp r1,16
jb .loop_high8_find1 jb .loop_high8_find1
.find1end: .find1end:
sub r1, r2 sub r1, r2
sub r1, 1 sub r1, 1
lea r2, [i_ds_table] lea r2, [i_ds_table]
add r0b, [r2+r1] add r0b, [r2+r1]
mov r1, r3 mov r1, r3
and r3, 0xff and r3, 0xff
shr r1, 8 shr r1, 8
and r1, 0xff and r1, 0xff
lea r2 , [low_mask_table] lea r2 , [low_mask_table]
add r0b, [r2 +r3] add r0b, [r2 +r3]
lea r2, [high_mask_table] lea r2, [high_mask_table]
add r0b, [r2+r1] add r0b, [r2+r1]
%ifdef X86_32 %ifdef X86_32
pop r3 pop r3
%else %else
mov retrd, r0d mov retrd, r0d
%endif %endif
ret ret
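The core of the routine above is a 16-bit "coefficient is non-zero" bitmask built with packsswb / pcmpeqb / pmovmskb; the mask is then fed through the i_ds_table, low_mask_table and high_mask_table lookups (table contents not reproduced here). A scalar sketch of just the mask construction:

    #include <stdint.h>

    /* Bit i is set when pDct[i] != 0; saturating packsswb never maps a
     * non-zero 16-bit coefficient to a zero byte, so the byte compare is exact. */
    static uint16_t NonZeroMask4x4(const int16_t *pDct) {
        uint16_t mask = 0;
        for (int i = 0; i < 16; i++)
            if (pDct[i] != 0)
                mask |= (uint16_t)(1u << i);
        return mask;
    }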
;*********************************************************************** ;***********************************************************************
; int32_t WelsGetNoneZeroCount_sse2(int16_t* level); ; int32_t WelsGetNoneZeroCount_sse2(int16_t* level);
;*********************************************************************** ;***********************************************************************
WELS_EXTERN WelsGetNoneZeroCount_sse2 WELS_EXTERN WelsGetNoneZeroCount_sse2
%assign push_num 0 %assign push_num 0
LOAD_1_PARA LOAD_1_PARA
movdqa xmm0, [r0] movdqa xmm0, [r0]
movdqa xmm1, [r0+16] movdqa xmm1, [r0+16]
pxor xmm2, xmm2 pxor xmm2, xmm2
pcmpeqw xmm0, xmm2 pcmpeqw xmm0, xmm2
pcmpeqw xmm1, xmm2 pcmpeqw xmm1, xmm2
packsswb xmm1, xmm0 packsswb xmm1, xmm0
xor r1, r1 xor r1, r1
pmovmskb r1d, xmm1 pmovmskb r1d, xmm1
xor r1d, 0xffff xor r1d, 0xffff
mov r2, r1 mov r2, r1
and r1, 0xff and r1, 0xff
shr r2, 8 shr r2, 8
; and ecx, 0xff ; we do not need this due to high 16bits equal to 0 yet ; and ecx, 0xff ; we do not need this due to high 16bits equal to 0 yet
; xor retr, retr ; xor retr, retr
;add al, [nozero_count_table+r2] ;add al, [nozero_count_table+r2]
lea r0 , [nozero_count_table] lea r0 , [nozero_count_table]
movzx r2, byte [r0+r2] movzx r2, byte [r0+r2]
movzx r1, byte [r0+r1] movzx r1, byte [r0+r1]
mov retrq, r2 mov retrq, r2
add retrq, r1 add retrq, r1
;add al, [nozero_count_table+r1] ;add al, [nozero_count_table+r1]
ret ret
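The same mask idea drives this routine: the per-byte nozero_count_table lookups amount to a split population count of the non-zero mask. Scalar equivalent:

    #include <stdint.h>

    static int32_t GetNoneZeroCount_c(const int16_t *level) {
        int32_t count = 0;
        for (int i = 0; i < 16; i++)
            count += (level[i] != 0);
        return count;
    }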
@@ -36,17 +36,17 @@
#ifdef __APPLE__ #ifdef __APPLE__
.macro SQR_ADD_16BYTES .macro SQR_ADD_16BYTES
vmull.u8 q3, $0, $0 vmull.u8 q3, $0, $0
vmull.u8 q8, $1, $1 vmull.u8 q8, $1, $1
vpadal.u16 $2, q3 vpadal.u16 $2, q3
vpadal.u16 $2, q8 vpadal.u16 $2, q8
.endm .endm
#else #else
.macro SQR_ADD_16BYTES arg0, arg1, arg2 .macro SQR_ADD_16BYTES arg0, arg1, arg2
vmull.u8 q3, \arg0, \arg0 vmull.u8 q3, \arg0, \arg0
vmull.u8 q8, \arg1, \arg1 vmull.u8 q8, \arg1, \arg1
vpadal.u16 \arg2, q3 vpadal.u16 \arg2, q3
vpadal.u16 \arg2, q8 vpadal.u16 \arg2, q8
.endm .endm
#endif #endif
@@ -54,66 +54,66 @@
WELS_ASM_FUNC_BEGIN SampleVariance16x16_neon WELS_ASM_FUNC_BEGIN SampleVariance16x16_neon
stmdb sp!, {r4} stmdb sp!, {r4}
vld1.8 {q15}, [r0], r1 //save the ref data (16bytes) vld1.8 {q15}, [r0], r1 //save the ref data (16bytes)
vld1.8 {q14}, [r2], r3 //save the src data (16bytes) vld1.8 {q14}, [r2], r3 //save the src data (16bytes)
vabd.u8 q13, q14, q15 vabd.u8 q13, q14, q15
vmull.u8 q12, d27, d27 vmull.u8 q12, d27, d27
vmull.u8 q11, d26, d26 vmull.u8 q11, d26, d26
vaddl.u16 q12, d24, d25 vaddl.u16 q12, d24, d25
vpadal.u16 q12, q11 //sqr vpadal.u16 q12, q11 //sqr
vaddl.u8 q13, d26, d27 //sum vaddl.u8 q13, d26, d27 //sum
vaddl.u8 q10, d28, d29 //sum_cur vaddl.u8 q10, d28, d29 //sum_cur
vmull.u8 q9, d29, d29 vmull.u8 q9, d29, d29
vmull.u8 q8, d28, d28 vmull.u8 q8, d28, d28
vaddl.u16 q9, d18, d19 //sqr_cur vaddl.u16 q9, d18, d19 //sqr_cur
vpadal.u16 q9, q8 vpadal.u16 q9, q8
mov r4, #15 mov r4, #15
pixel_var_16x16_loop0: pixel_var_16x16_loop0:
vld1.8 {q0}, [r0], r1 //save the ref data (16bytes) vld1.8 {q0}, [r0], r1 //save the ref data (16bytes)
vld1.8 {q1}, [r2], r3 //save the src data (16bytes) vld1.8 {q1}, [r2], r3 //save the src data (16bytes)
vabd.u8 q2, q0, q1 vabd.u8 q2, q0, q1
//q10 save sum_cur //q10 save sum_cur
vpadal.u8 q10, q1 vpadal.u8 q10, q1
//q12 save sqr //q12 save sqr
SQR_ADD_16BYTES d4, d5, q12 SQR_ADD_16BYTES d4, d5, q12
//q13 save sum //q13 save sum
vpadal.u8 q13, q2 vpadal.u8 q13, q2
subs r4, #1 subs r4, #1
//q9 save sqr_cur //q9 save sqr_cur
SQR_ADD_16BYTES d2, d3, q9 SQR_ADD_16BYTES d2, d3, q9
bne pixel_var_16x16_loop0 bne pixel_var_16x16_loop0
vadd.u16 d0, d26, d27 //sum vadd.u16 d0, d26, d27 //sum
vadd.u16 d1, d20, d21 //sum_cur vadd.u16 d1, d20, d21 //sum_cur
vpaddl.u16 q0, q0 vpaddl.u16 q0, q0
vadd.u32 d2, d24, d25 //sqr vadd.u32 d2, d24, d25 //sqr
vadd.u32 d3, d18, d19 //sqr_cur vadd.u32 d3, d18, d19 //sqr_cur
vpadd.u32 d0, d0, d1 vpadd.u32 d0, d0, d1
vpadd.u32 d1, d2, d3 vpadd.u32 d1, d2, d3
ldr r4, [sp, #4] ldr r4, [sp, #4]
vshr.u32 q0, q0, #8 vshr.u32 q0, q0, #8
vmul.u32 d0, d0 vmul.u32 d0, d0
vsub.u32 d0, d1, d0 vsub.u32 d0, d1, d0
vmovl.u32 q0, d0 vmovl.u32 q0, d0
vst2.16 {d0[0], d1[0]}, [r4] vst2.16 {d0[0], d1[0]}, [r4]
ldmia sp!, {r4} ldmia sp!, {r4}
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
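A rough scalar model of the variance routine above. The fifth parameter (taken from the stack and used by the final vst2.16) is assumed to point at two 16-bit result slots; because the whole q0 register is shifted right by 8, both the sum and the sum-of-squares are divided by 256 before the subtraction.

    #include <stdint.h>
    #include <stdlib.h>

    static void SampleVariance16x16_c(const uint8_t *ref, int32_t ref_stride,
                                      const uint8_t *src, int32_t src_stride,
                                      uint16_t result[2]) {
        uint32_t sum_d = 0, sqr_d = 0, sum_s = 0, sqr_s = 0;
        for (int y = 0; y < 16; y++) {
            for (int x = 0; x < 16; x++) {
                uint32_t s = src[y * src_stride + x];
                uint32_t d = (uint32_t)abs((int)ref[y * ref_stride + x] - (int)s);
                sum_d += d;  sqr_d += d * d;
                sum_s += s;  sqr_s += s * s;
            }
        }
        result[0] = (uint16_t)((sqr_d >> 8) - (sum_d >> 8) * (sum_d >> 8)); /* diff texture */
        result[1] = (uint16_t)((sqr_s >> 8) - (sum_s >> 8) * (sum_s >> 8)); /* source texture */
    }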
@@ -30,313 +30,313 @@
* *
*/ */
#ifdef HAVE_NEON #ifdef HAVE_NEON
.text .text
#include "arm_arch_common_macro.S" #include "arm_arch_common_macro.S"
WELS_ASM_FUNC_BEGIN DyadicBilinearDownsampler_neon WELS_ASM_FUNC_BEGIN DyadicBilinearDownsampler_neon
stmdb sp!, {r4-r8, lr} stmdb sp!, {r4-r8, lr}
//Get the width and height //Get the width and height
ldr r4, [sp, #24] //src_width ldr r4, [sp, #24] //src_width
ldr r5, [sp, #28] //src_height ldr r5, [sp, #28] //src_height
//Initialize the register //Initialize the register
mov r6, r2 mov r6, r2
mov r8, r0 mov r8, r0
mov lr, #0 mov lr, #0
lsr r5, #1 lsr r5, #1
    //Save the tail for the unaligned size     //Save the tail for the unaligned size
mla r7, r1, r5, r0 mla r7, r1, r5, r0
vld1.32 {q15}, [r7] vld1.32 {q15}, [r7]
add r7, r2, r3 add r7, r2, r3
//processing a column of data //processing a column of data
comp_ds_bilinear_loop0: comp_ds_bilinear_loop0:
vld1.8 {q0,q1}, [r2]! vld1.8 {q0,q1}, [r2]!
vld1.8 {q2,q3}, [r7]! vld1.8 {q2,q3}, [r7]!
vpaddl.u8 q0, q0 vpaddl.u8 q0, q0
vpaddl.u8 q1, q1 vpaddl.u8 q1, q1
vpaddl.u8 q2, q2 vpaddl.u8 q2, q2
vpaddl.u8 q3, q3 vpaddl.u8 q3, q3
vrshr.u16 q0, #1 vrshr.u16 q0, #1
vrshr.u16 q1, #1 vrshr.u16 q1, #1
vrshr.u16 q2, #1 vrshr.u16 q2, #1
vrshr.u16 q3, #1 vrshr.u16 q3, #1
vrhadd.u16 q0, q2 vrhadd.u16 q0, q2
vrhadd.u16 q1, q3 vrhadd.u16 q1, q3
vmovn.u16 d0, q0 vmovn.u16 d0, q0
vmovn.u16 d1, q1 vmovn.u16 d1, q1
vst1.32 {q0}, [r0]! vst1.32 {q0}, [r0]!
add lr, #32 add lr, #32
cmp lr, r4 cmp lr, r4
movcs lr, #0 movcs lr, #0
addcs r6, r6, r3, lsl #1 addcs r6, r6, r3, lsl #1
movcs r2, r6 movcs r2, r6
addcs r7, r2, r3 addcs r7, r2, r3
addcs r8, r1 addcs r8, r1
movcs r0, r8 movcs r0, r8
subscs r5, #1 subscs r5, #1
bne comp_ds_bilinear_loop0 bne comp_ds_bilinear_loop0
    //restore the tail for the unaligned size     //restore the tail for the unaligned size
vst1.32 {q15}, [r0] vst1.32 {q15}, [r0]
ldmia sp!, {r4-r8,lr} ldmia sp!, {r4-r8,lr}
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
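All of the dyadic downsamplers in this file share the same per-pixel math; the only differences are the chunk width and the handling of leftover columns. A scalar sketch (the NEON code averages pairwise with two rounding half-adds, which can differ from a single (a+b+c+d+2)>>2 by one rounding step):

    #include <stdint.h>

    /* Each destination pixel is (approximately) the rounded average of the
     * corresponding 2x2 block in the source. */
    static void DyadicBilinearDownsample_c(uint8_t *dst, int32_t dst_stride,
                                           const uint8_t *src, int32_t src_stride,
                                           int32_t src_width, int32_t src_height) {
        for (int y = 0; y < src_height / 2; y++) {
            const uint8_t *r0 = src + 2 * y * src_stride;
            const uint8_t *r1 = r0 + src_stride;
            for (int x = 0; x < src_width / 2; x++) {
                int top = (r0[2 * x] + r0[2 * x + 1] + 1) >> 1; /* vpaddl + vrshr #1 */
                int bot = (r1[2 * x] + r1[2 * x + 1] + 1) >> 1;
                dst[y * dst_stride + x] = (uint8_t)((top + bot + 1) >> 1); /* vrhadd */
            }
        }
    }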
WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x8_neon WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x8_neon
stmdb sp!, {r4-r7, lr} stmdb sp!, {r4-r7, lr}
//Get the width and height //Get the width and height
ldr r4, [sp, #20] //src_width ldr r4, [sp, #20] //src_width
ldr r5, [sp, #24] //src_height ldr r5, [sp, #24] //src_height
//Get the difference //Get the difference
sub lr, r3, r4 sub lr, r3, r4
sub r1, r1, r4, lsr #1 sub r1, r1, r4, lsr #1
lsr r5, #1 lsr r5, #1
//processing a column of data //processing a column of data
comp_ds_bilinear_w_x8_loop0: comp_ds_bilinear_w_x8_loop0:
lsr r6, r4, #3 lsr r6, r4, #3
add r7, r2, r3 add r7, r2, r3
//processing a line of data //processing a line of data
comp_ds_bilinear_w_x8_loop1: comp_ds_bilinear_w_x8_loop1:
vld1.8 {d0}, [r2]! vld1.8 {d0}, [r2]!
vld1.8 {d1}, [r7]! vld1.8 {d1}, [r7]!
vpaddl.u8 q0, q0 vpaddl.u8 q0, q0
vrshr.u16 q0, #1 vrshr.u16 q0, #1
vrhadd.u16 d0, d1 vrhadd.u16 d0, d1
vmovn.u16 d0, q0 vmovn.u16 d0, q0
vst1.32 {d0[0]}, [r0]! vst1.32 {d0[0]}, [r0]!
subs r6, #1 subs r6, #1
bne comp_ds_bilinear_w_x8_loop1 bne comp_ds_bilinear_w_x8_loop1
add r2, r7, lr add r2, r7, lr
add r0, r1 add r0, r1
subs r5, #1 subs r5, #1
bne comp_ds_bilinear_w_x8_loop0 bne comp_ds_bilinear_w_x8_loop0
ldmia sp!, {r4-r7,lr} ldmia sp!, {r4-r7,lr}
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x16_neon WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x16_neon
stmdb sp!, {r4-r7, lr} stmdb sp!, {r4-r7, lr}
//Get the width and height //Get the width and height
ldr r4, [sp, #20] //src_width ldr r4, [sp, #20] //src_width
ldr r5, [sp, #24] //src_height ldr r5, [sp, #24] //src_height
//Get the difference //Get the difference
sub lr, r3, r4 sub lr, r3, r4
sub r1, r1, r4, lsr #1 sub r1, r1, r4, lsr #1
lsr r5, #1 lsr r5, #1
//processing a column of data //processing a column of data
comp_ds_bilinear_w_x16_loop0: comp_ds_bilinear_w_x16_loop0:
lsr r6, r4, #4 lsr r6, r4, #4
add r7, r2, r3 add r7, r2, r3
//processing a line of data //processing a line of data
comp_ds_bilinear_w_x16_loop1: comp_ds_bilinear_w_x16_loop1:
vld1.8 {q0}, [r2]! vld1.8 {q0}, [r2]!
vld1.8 {q1}, [r7]! vld1.8 {q1}, [r7]!
vpaddl.u8 q0, q0 vpaddl.u8 q0, q0
vpaddl.u8 q1, q1 vpaddl.u8 q1, q1
vrshr.u16 q0, #1 vrshr.u16 q0, #1
vrshr.u16 q1, #1 vrshr.u16 q1, #1
vrhadd.u16 q0, q1 vrhadd.u16 q0, q1
vmovn.u16 d0, q0 vmovn.u16 d0, q0
vst1.32 {d0}, [r0]! vst1.32 {d0}, [r0]!
subs r6, #1 subs r6, #1
bne comp_ds_bilinear_w_x16_loop1 bne comp_ds_bilinear_w_x16_loop1
add r2, r7, lr add r2, r7, lr
add r0, r1 add r0, r1
subs r5, #1 subs r5, #1
bne comp_ds_bilinear_w_x16_loop0 bne comp_ds_bilinear_w_x16_loop0
ldmia sp!, {r4-r7,lr} ldmia sp!, {r4-r7,lr}
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN DyadicBilinearDownsamplerWidthx32_neon WELS_ASM_FUNC_BEGIN DyadicBilinearDownsamplerWidthx32_neon
stmdb sp!, {r4-r7, lr} stmdb sp!, {r4-r7, lr}
//Get the width and height //Get the width and height
ldr r4, [sp, #20] //src_width ldr r4, [sp, #20] //src_width
ldr r5, [sp, #24] //src_height ldr r5, [sp, #24] //src_height
//Get the difference //Get the difference
sub lr, r3, r4 sub lr, r3, r4
sub r1, r1, r4, lsr #1 sub r1, r1, r4, lsr #1
lsr r5, #1 lsr r5, #1
//processing a column of data //processing a column of data
comp_ds_bilinear_w_x32_loop0: comp_ds_bilinear_w_x32_loop0:
lsr r6, r4, #5 lsr r6, r4, #5
add r7, r2, r3 add r7, r2, r3
//processing a line of data //processing a line of data
comp_ds_bilinear_w_x32_loop1: comp_ds_bilinear_w_x32_loop1:
vld1.8 {q0,q1}, [r2]! vld1.8 {q0,q1}, [r2]!
vld1.8 {q2,q3}, [r7]! vld1.8 {q2,q3}, [r7]!
vpaddl.u8 q0, q0 vpaddl.u8 q0, q0
vpaddl.u8 q1, q1 vpaddl.u8 q1, q1
vpaddl.u8 q2, q2 vpaddl.u8 q2, q2
vpaddl.u8 q3, q3 vpaddl.u8 q3, q3
vrshr.u16 q0, #1 vrshr.u16 q0, #1
vrshr.u16 q1, #1 vrshr.u16 q1, #1
vrshr.u16 q2, #1 vrshr.u16 q2, #1
vrshr.u16 q3, #1 vrshr.u16 q3, #1
vrhadd.u16 q0, q2 vrhadd.u16 q0, q2
vrhadd.u16 q1, q3 vrhadd.u16 q1, q3
vmovn.u16 d0, q0 vmovn.u16 d0, q0
vmovn.u16 d1, q1 vmovn.u16 d1, q1
vst1.32 {q0}, [r0]! vst1.32 {q0}, [r0]!
subs r6, #1 subs r6, #1
bne comp_ds_bilinear_w_x32_loop1 bne comp_ds_bilinear_w_x32_loop1
add r2, r7, lr add r2, r7, lr
add r0, r1 add r0, r1
subs r5, #1 subs r5, #1
bne comp_ds_bilinear_w_x32_loop0 bne comp_ds_bilinear_w_x32_loop0
ldmia sp!, {r4-r7,lr} ldmia sp!, {r4-r7,lr}
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN GeneralBilinearAccurateDownsampler_neon WELS_ASM_FUNC_BEGIN GeneralBilinearAccurateDownsampler_neon
stmdb sp!, {r4-r12, lr} stmdb sp!, {r4-r12, lr}
//Get the data from stack //Get the data from stack
ldr r4, [sp, #40] //the addr of src ldr r4, [sp, #40] //the addr of src
ldr r5, [sp, #44] //the value of src_stride ldr r5, [sp, #44] //the value of src_stride
ldr r6, [sp, #48] //the value of scaleX ldr r6, [sp, #48] //the value of scaleX
ldr r7, [sp, #52] //the value of scaleY ldr r7, [sp, #52] //the value of scaleY
mov r10, #32768 mov r10, #32768
sub r10, #1 sub r10, #1
and r8, r6, r10 // r8 uinc(scaleX mod 32767) and r8, r6, r10 // r8 uinc(scaleX mod 32767)
mov r11, #-1 mov r11, #-1
mul r11, r8 // r11 -uinc mul r11, r8 // r11 -uinc
vdup.s16 d2, r8 vdup.s16 d2, r8
vdup.s16 d0, r11 vdup.s16 d0, r11
vzip.s16 d0, d2 // uinc -uinc uinc -uinc vzip.s16 d0, d2 // uinc -uinc uinc -uinc
and r9, r7, r10 // r9 vinc(scaleY mod 32767) and r9, r7, r10 // r9 vinc(scaleY mod 32767)
mov r11, #-1 mov r11, #-1
mul r11, r9 // r11 -vinc mul r11, r9 // r11 -vinc
vdup.s16 d2, r9 vdup.s16 d2, r9
vdup.s16 d3, r11 vdup.s16 d3, r11
vext.8 d5, d3, d2, #4 // vinc vinc -vinc -vinc vext.8 d5, d3, d2, #4 // vinc vinc -vinc -vinc
mov r11, #0x40000000 mov r11, #0x40000000
mov r12, #0x4000 mov r12, #0x4000
sub r12, #1 sub r12, #1
add r11, r12 add r11, r12
vdup.s32 d1, r11; //init u 16384 16383 16384 16383 vdup.s32 d1, r11; //init u 16384 16383 16384 16383
mov r11, #16384 mov r11, #16384
vdup.s16 d16, r11 vdup.s16 d16, r11
sub r11, #1 sub r11, #1
vdup.s16 d17, r11 vdup.s16 d17, r11
vext.8 d7, d17, d16, #4 //init v 16384 16384 16383 16383 vext.8 d7, d17, d16, #4 //init v 16384 16384 16383 16383
veor q14, q14 veor q14, q14
sub r1, r2 // stride - width sub r1, r2 // stride - width
mov r8, #16384 // yInverse mov r8, #16384 // yInverse
sub r3, #1 sub r3, #1
_HEIGHT: _HEIGHT:
ldr r4, [sp, #40] //the addr of src ldr r4, [sp, #40] //the addr of src
mov r11, r8 mov r11, r8
lsr r11, #15 lsr r11, #15
mul r11, r5 mul r11, r5
add r11, r4 // get current row address add r11, r4 // get current row address
mov r12, r11 mov r12, r11
add r12, r5 add r12, r5
mov r9, #16384 // xInverse mov r9, #16384 // xInverse
sub r10, r2, #1 sub r10, r2, #1
vmov.s16 d6, d1 vmov.s16 d6, d1
_WIDTH: _WIDTH:
mov lr, r9 mov lr, r9
lsr lr, #15 lsr lr, #15
add r4, r11,lr add r4, r11,lr
vld2.8 {d28[0],d29[0]}, [r4] //q14: 0000000b0000000a; vld2.8 {d28[0],d29[0]}, [r4] //q14: 0000000b0000000a;
add r4, r12,lr add r4, r12,lr
vld2.8 {d28[4],d29[4]}, [r4] //q14: 000d000b000c000a; vld2.8 {d28[4],d29[4]}, [r4] //q14: 000d000b000c000a;
vzip.32 d28, d29 //q14: 000d000c000b000a; vzip.32 d28, d29 //q14: 000d000c000b000a;
vmull.u16 q13, d6, d7 //q13: init u * init v vmull.u16 q13, d6, d7 //q13: init u * init v
vmull.u32 q12, d26,d28 vmull.u32 q12, d26,d28
vmlal.u32 q12, d27,d29 vmlal.u32 q12, d27,d29
vqadd.u64 d24, d24,d25 vqadd.u64 d24, d24,d25
vrshr.u64 d24, #30 vrshr.u64 d24, #30
vst1.8 {d24[0]}, [r0]! vst1.8 {d24[0]}, [r0]!
add r9, r6 add r9, r6
vadd.u16 d6, d0 // inc u vadd.u16 d6, d0 // inc u
vshl.u16 d6, #1 vshl.u16 d6, #1
vshr.u16 d6, #1 vshr.u16 d6, #1
subs r10, #1 subs r10, #1
bne _WIDTH bne _WIDTH
WIDTH_END: WIDTH_END:
lsr r9, #15 lsr r9, #15
add r4,r11,r9 add r4,r11,r9
vld1.8 {d24[0]}, [r4] vld1.8 {d24[0]}, [r4]
vst1.8 {d24[0]}, [r0] vst1.8 {d24[0]}, [r0]
add r0, #1 add r0, #1
add r8, r7 add r8, r7
add r0, r1 add r0, r1
vadd.s16 d7, d5 // inc v vadd.s16 d7, d5 // inc v
vshl.u16 d7, #1 vshl.u16 d7, #1
vshr.u16 d7, #1 vshr.u16 d7, #1
subs r3, #1 subs r3, #1
bne _HEIGHT bne _HEIGHT
LAST_ROW: LAST_ROW:
ldr r4, [sp, #40] //the addr of src ldr r4, [sp, #40] //the addr of src
lsr r8, #15 lsr r8, #15
mul r8, r5 mul r8, r5
add r4, r8 // get current row address add r4, r8 // get current row address
mov r9, #16384 mov r9, #16384
_LAST_ROW_WIDTH: _LAST_ROW_WIDTH:
mov r11, r9 mov r11, r9
lsr r11, #15 lsr r11, #15
add r3, r4,r11 add r3, r4,r11
vld1.8 {d0[0]}, [r3] vld1.8 {d0[0]}, [r3]
vst1.8 {d0[0]}, [r0] vst1.8 {d0[0]}, [r0]
add r0, #1 add r0, #1
add r9, r6 add r9, r6
subs r2, #1 subs r2, #1
bne _LAST_ROW_WIDTH bne _LAST_ROW_WIDTH
ldmia sp!, {r4-r12, lr} ldmia sp!, {r4-r12, lr}
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
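A rough scalar model (not bit-exact) of the fixed-point bilinear scaler above, assuming the (dst, dst_stride, dst_width, dst_height, src, src_stride, scaleX, scaleY) argument order implied by the register and stack loads: scaleX/scaleY are Q15 steps through the source grid, both accumulators start half a step in (16384), and the last column and last row fall back to nearest-neighbour copies, as in the WIDTH_END / LAST_ROW paths.

    #include <stdint.h>

    static void GeneralBilinearDownsampler_c(uint8_t *dst, int32_t dst_stride,
                                             int32_t dst_width, int32_t dst_height,
                                             const uint8_t *src, int32_t src_stride,
                                             uint32_t scaleX, uint32_t scaleY) {
        uint32_t yInverse = 1u << 14;
        for (int j = 0; j < dst_height - 1; j++) {
            const uint8_t *row0 = src + (yInverse >> 15) * src_stride;
            const uint8_t *row1 = row0 + src_stride;
            uint32_t v = yInverse & 0x7fff;
            uint32_t xInverse = 1u << 14;
            int i;
            for (i = 0; i < dst_width - 1; i++) {
                uint32_t x = xInverse >> 15;
                uint32_t u = xInverse & 0x7fff;
                uint64_t acc = (uint64_t)row0[x]     * (0x8000 - u) * (0x8000 - v)
                             + (uint64_t)row0[x + 1] * u            * (0x8000 - v)
                             + (uint64_t)row1[x]     * (0x8000 - u) * v
                             + (uint64_t)row1[x + 1] * u            * v;
                dst[j * dst_stride + i] = (uint8_t)((acc + (1u << 29)) >> 30);
                xInverse += scaleX;
            }
            dst[j * dst_stride + i] = row0[xInverse >> 15]; /* last column: nearest */
            yInverse += scaleY;
        }
        const uint8_t *last_row = src + (yInverse >> 15) * src_stride;
        uint32_t xInverse = 1u << 14;
        for (int i = 0; i < dst_width; i++) {               /* last row: nearest */
            dst[(dst_height - 1) * dst_stride + i] = last_row[xInverse >> 15];
            xInverse += scaleX;
        }
    }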
#endif #endif
@@ -37,32 +37,32 @@
WELS_ASM_FUNC_BEGIN WelsProcessingSampleSad8x8_neon WELS_ASM_FUNC_BEGIN WelsProcessingSampleSad8x8_neon
stmdb sp!, {lr} stmdb sp!, {lr}
//Loading a horizontal line data (8 bytes) //Loading a horizontal line data (8 bytes)
vld1.8 {d0}, [r0], r1 vld1.8 {d0}, [r0], r1
vld1.8 {d1}, [r2], r3 vld1.8 {d1}, [r2], r3
//Do the SAD for 8 bytes //Do the SAD for 8 bytes
vabdl.u8 q1, d0, d1 vabdl.u8 q1, d0, d1
mov lr, #7 mov lr, #7
pixel_sad_8x8_loop0: pixel_sad_8x8_loop0:
//Loading a horizontal line data (8 bytes) //Loading a horizontal line data (8 bytes)
vld1.8 {d0}, [r0], r1 vld1.8 {d0}, [r0], r1
vld1.8 {d1}, [r2], r3 vld1.8 {d1}, [r2], r3
subs lr, #1 subs lr, #1
//Do the SAD for 8 bytes //Do the SAD for 8 bytes
vabal.u8 q1, d0, d1 vabal.u8 q1, d0, d1
bne pixel_sad_8x8_loop0 bne pixel_sad_8x8_loop0
vadd.u16 d2, d3 vadd.u16 d2, d3
vpaddl.u16 d2, d2 vpaddl.u16 d2, d2
vpaddl.u32 d2, d2 vpaddl.u32 d2, d2
vmov.u32 r0, d2[0]//TBO... vmov.u32 r0, d2[0]//TBO...
ldmia sp!, {lr} ldmia sp!, {lr}
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
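For reference, the scalar equivalent of the NEON 8x8 SAD above:

    #include <stdint.h>
    #include <stdlib.h>

    static uint32_t Sad8x8_c(const uint8_t *a, int32_t stride_a,
                             const uint8_t *b, int32_t stride_b) {
        uint32_t sad = 0;
        for (int y = 0; y < 8; y++)
            for (int x = 0; x < 8; x++)
                sad += (uint32_t)abs((int)a[y * stride_a + x] - (int)b[y * stride_b + x]);
        return sad;
    }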
#endif #endif
File diff suppressed because it is too large
@@ -56,217 +56,217 @@ sse2_20 times 8 dw 20
;*********************************************************************** ;***********************************************************************
SECTION .text SECTION .text
%macro WEIGHT_LINE 9 %macro WEIGHT_LINE 9
movq %2, %9 movq %2, %9
punpcklbw %2, %7 punpcklbw %2, %7
movdqa %8, %2 movdqa %8, %2
movdqa %1, %6 movdqa %1, %6
psubusb %1, %8 psubusb %1, %8
psubusb %8, %6 psubusb %8, %6
por %8, %1 ; ABS(curPixel - centerPixel); por %8, %1 ; ABS(curPixel - centerPixel);
movdqa %1, %3 movdqa %1, %3
psubusb %1, %8 psubusb %1, %8
pmullw %1, %1 pmullw %1, %1
psrlw %1, 5 psrlw %1, 5
pmullw %2, %1 pmullw %2, %1
paddusw %4, %1 paddusw %4, %1
paddusw %5, %2 paddusw %5, %2
%endmacro %endmacro
%macro WEIGHT_LINE1_UV 4 %macro WEIGHT_LINE1_UV 4
movdqa %2, %1 movdqa %2, %1
punpcklbw %2, %4 punpcklbw %2, %4
paddw %3, %2 paddw %3, %2
movdqa %2, %1 movdqa %2, %1
psrldq %2, 1 psrldq %2, 1
punpcklbw %2, %4 punpcklbw %2, %4
paddw %3, %2 paddw %3, %2
movdqa %2, %1 movdqa %2, %1
psrldq %2, 2 psrldq %2, 2
punpcklbw %2, %4 punpcklbw %2, %4
psllw %2, 1 psllw %2, 1
paddw %3, %2 paddw %3, %2
movdqa %2, %1 movdqa %2, %1
psrldq %2, 3 psrldq %2, 3
punpcklbw %2, %4 punpcklbw %2, %4
paddw %3, %2 paddw %3, %2
movdqa %2, %1 movdqa %2, %1
psrldq %2, 4 psrldq %2, 4
punpcklbw %2, %4 punpcklbw %2, %4
paddw %3, %2 paddw %3, %2
%endmacro %endmacro
%macro WEIGHT_LINE2_UV 4 %macro WEIGHT_LINE2_UV 4
movdqa %2, %1 movdqa %2, %1
punpcklbw %2, %4 punpcklbw %2, %4
paddw %3, %2 paddw %3, %2
movdqa %2, %1 movdqa %2, %1
psrldq %2, 1 psrldq %2, 1
punpcklbw %2, %4 punpcklbw %2, %4
psllw %2, 1 psllw %2, 1
paddw %3, %2 paddw %3, %2
movdqa %2, %1 movdqa %2, %1
psrldq %2, 2 psrldq %2, 2
punpcklbw %2, %4 punpcklbw %2, %4
psllw %2, 2 psllw %2, 2
paddw %3, %2 paddw %3, %2
movdqa %2, %1 movdqa %2, %1
psrldq %2, 3 psrldq %2, 3
punpcklbw %2, %4 punpcklbw %2, %4
psllw %2, 1 psllw %2, 1
paddw %3, %2 paddw %3, %2
movdqa %2, %1 movdqa %2, %1
psrldq %2, 4 psrldq %2, 4
punpcklbw %2, %4 punpcklbw %2, %4
paddw %3, %2 paddw %3, %2
%endmacro %endmacro
%macro WEIGHT_LINE3_UV 4 %macro WEIGHT_LINE3_UV 4
movdqa %2, %1 movdqa %2, %1
punpcklbw %2, %4 punpcklbw %2, %4
psllw %2, 1 psllw %2, 1
paddw %3, %2 paddw %3, %2
movdqa %2, %1 movdqa %2, %1
psrldq %2, 1 psrldq %2, 1
punpcklbw %2, %4 punpcklbw %2, %4
psllw %2, 2 psllw %2, 2
paddw %3, %2 paddw %3, %2
movdqa %2, %1 movdqa %2, %1
psrldq %2, 2 psrldq %2, 2
punpcklbw %2, %4 punpcklbw %2, %4
pmullw %2, [sse2_20] pmullw %2, [sse2_20]
paddw %3, %2 paddw %3, %2
movdqa %2, %1 movdqa %2, %1
psrldq %2, 3 psrldq %2, 3
punpcklbw %2, %4 punpcklbw %2, %4
psllw %2, 2 psllw %2, 2
paddw %3, %2 paddw %3, %2
movdqa %2, %1 movdqa %2, %1
psrldq %2, 4 psrldq %2, 4
punpcklbw %2, %4 punpcklbw %2, %4
psllw %2, 1 psllw %2, 1
paddw %3, %2 paddw %3, %2
%endmacro %endmacro
;*********************************************************************** ;***********************************************************************
; BilateralLumaFilter8_sse2(uint8_t *pixels, int stride); ; BilateralLumaFilter8_sse2(uint8_t *pixels, int stride);
;*********************************************************************** ;***********************************************************************
; 1 2 3 ; 1 2 3
; 4 0 5 ; 4 0 5
; 6 7 8 ; 6 7 8
; 0: the center point ; 0: the center point
WELS_EXTERN BilateralLumaFilter8_sse2 WELS_EXTERN BilateralLumaFilter8_sse2
push r3 push r3
%assign push_num 1 %assign push_num 1
LOAD_2_PARA LOAD_2_PARA
PUSH_XMM 8 PUSH_XMM 8
pxor xmm7, xmm7 pxor xmm7, xmm7
mov r3, r0 mov r3, r0
movq xmm6, [r0] movq xmm6, [r0]
punpcklbw xmm6, xmm7 punpcklbw xmm6, xmm7
movdqa xmm3, [sse2_32] movdqa xmm3, [sse2_32]
pxor xmm4, xmm4 ; nTotWeight pxor xmm4, xmm4 ; nTotWeight
pxor xmm5, xmm5 ; nSum pxor xmm5, xmm5 ; nSum
dec r0 dec r0
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 4 WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 4
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 5 WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 5
sub r0, r1 sub r0, r1
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 1 WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 1
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 1] ; pixel 2 WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 1] ; pixel 2
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 3 WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 3
lea r0, [r0 + r1 * 2] lea r0, [r0 + r1 * 2]
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 6 WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 6
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 1] ; pixel 7 WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 1] ; pixel 7
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 8 WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 8
pcmpeqw xmm0, xmm0 pcmpeqw xmm0, xmm0
psrlw xmm0, 15 psrlw xmm0, 15
psllw xmm0, 8 psllw xmm0, 8
psubusw xmm0, xmm4 psubusw xmm0, xmm4
pmullw xmm0, xmm6 pmullw xmm0, xmm6
paddusw xmm5, xmm0 paddusw xmm5, xmm0
psrlw xmm5, 8 psrlw xmm5, 8
packuswb xmm5, xmm5 packuswb xmm5, xmm5
movq [r3], xmm5 movq [r3], xmm5
POP_XMM POP_XMM
pop r3 pop r3
%assign push_num 0 %assign push_num 0
ret ret
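A scalar sketch of the per-pixel math in BilateralLumaFilter8_sse2, read off the WEIGHT_LINE macro: each of the eight neighbours gets a weight that falls off quadratically with its absolute difference from the centre pixel, the centre receives whatever weight is left of 256, and the result is renormalised by >>8. The SSE2 routine applies this to 8 adjacent centre pixels at once; the helper below is illustrative and per-pixel only.

    #include <stdint.h>

    static uint8_t BilateralPixel_c(const uint8_t *p, int32_t stride) {
        static const int off[8][2] = { {-1,-1},{0,-1},{1,-1},{-1,0},{1,0},{-1,1},{0,1},{1,1} };
        int center = p[0];
        uint32_t sum = 0, total_weight = 0;
        for (int k = 0; k < 8; k++) {
            int pix = p[off[k][1] * stride + off[k][0]];
            int diff = pix > center ? pix - center : center - pix;
            int w = 32 - diff;
            if (w < 0) w = 0;            /* saturating 32 - |diff| */
            w = (w * w) >> 5;            /* quadratic fall-off, at most 32 */
            sum += (uint32_t)(w * pix);
            total_weight += (uint32_t)w;
        }
        sum += (256 - total_weight) * (uint32_t)center; /* centre keeps the rest */
        return (uint8_t)(sum >> 8);
    }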
;*********************************************************************** ;***********************************************************************
; void WaverageChromaFilter8_sse2(uint8_t *pixels, int stride); ; void WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
;*********************************************************************** ;***********************************************************************
;5x5 filter: ;5x5 filter:
;1 1 2 1 1 ;1 1 2 1 1
;1 2 4 2 1 ;1 2 4 2 1
;2 4 20 4 2 ;2 4 20 4 2
;1 2 4 2 1 ;1 2 4 2 1
;1 1 2 1 1 ;1 1 2 1 1
WELS_EXTERN WaverageChromaFilter8_sse2 WELS_EXTERN WaverageChromaFilter8_sse2
push r3 push r3
%assign push_num 1 %assign push_num 1
LOAD_2_PARA LOAD_2_PARA
mov r3, r1 mov r3, r1
add r3, r3 add r3, r3
sub r0, r3 ; pixels - 2 * stride sub r0, r3 ; pixels - 2 * stride
sub r0, 2 sub r0, 2
pxor xmm0, xmm0 pxor xmm0, xmm0
pxor xmm3, xmm3 pxor xmm3, xmm3
movdqu xmm1, [r0] movdqu xmm1, [r0]
WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0 WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0
movdqu xmm1, [r0 + r1] movdqu xmm1, [r0 + r1]
WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0 WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0
add r0, r3 add r0, r3
movdqu xmm1, [r0] movdqu xmm1, [r0]
WEIGHT_LINE3_UV xmm1, xmm2, xmm3, xmm0 WEIGHT_LINE3_UV xmm1, xmm2, xmm3, xmm0
movdqu xmm1, [r0 + r1] movdqu xmm1, [r0 + r1]
WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0 WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0
movdqu xmm1, [r0 + r1 * 2] movdqu xmm1, [r0 + r1 * 2]
WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0 WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0
psrlw xmm3, 6 psrlw xmm3, 6
packuswb xmm3, xmm3 packuswb xmm3, xmm3
movq [r0 + 2], xmm3 movq [r0 + 2], xmm3
pop r3 pop r3
%assign push_num 0 %assign push_num 0
ret ret
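The 5x5 kernel from the comment above sums to 64, which matches the final psrlw 6. A per-pixel scalar sketch (the SSE2 routine filters 8 adjacent chroma samples at a time and writes the centre row back at [r0 + 2]):

    #include <stdint.h>

    static uint8_t WaverageChromaPixel_c(const uint8_t *p, int32_t stride) {
        static const int k[5][5] = {
            { 1, 1,  2, 1, 1 },
            { 1, 2,  4, 2, 1 },
            { 2, 4, 20, 4, 2 },
            { 1, 2,  4, 2, 1 },
            { 1, 1,  2, 1, 1 },
        };
        int sum = 0;
        for (int dy = -2; dy <= 2; dy++)
            for (int dx = -2; dx <= 2; dx++)
                sum += k[dy + 2][dx + 2] * p[dy * stride + dx];
        return (uint8_t)(sum >> 6);
    }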
File diff suppressed because it is too large
File diff suppressed because it is too large