Convert all tabs to spaces in assembly sources, unify indentation

Previously the assembly sources had mixed indentation consisting
of both spaces and tabs, making them quite hard to read unless
the editor was set to the right tab size.

Tabs have been interpreted as 4 spaces in most cases, matching
the surrounding code.
Martin Storsjö 2014-05-31 14:13:34 +03:00
parent faaf62afad
commit 57f6bcc4b0
38 changed files with 19904 additions and 19904 deletions
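As a rough illustration of the conversion described in the commit message, a
script along the following lines would perform it. This is only a sketch, not
the tool actually used for this commit; the source directory, the file
extensions and the uniform tab width of 4 are assumptions taken from the
message above.

import pathlib

# Hypothetical sketch: expand tabs to spaces in the assembly sources.
for pattern in ("*.S", "*.asm"):
    for path in pathlib.Path("codec").rglob(pattern):  # directory is an assumption
        lines = path.read_text().splitlines()
        # expandtabs(4) advances each tab to the next multiple of 4 columns,
        # i.e. a tab becomes up to 4 spaces, matching the surrounding code.
        path.write_text("\n".join(line.expandtabs(4) for line in lines) + "\n")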


@@ -36,75 +36,75 @@
#ifdef __APPLE__
.macro LOAD_ALIGNED_DATA_WITH_STRIDE
// { // input: $0~$3, src*, src_stride
vld1.64 {$0}, [$4,:128], $5
vld1.64 {$1}, [$4,:128], $5
vld1.64 {$2}, [$4,:128], $5
vld1.64 {$3}, [$4,:128], $5
// }
// { // input: $0~$3, src*, src_stride
vld1.64 {$0}, [$4,:128], $5
vld1.64 {$1}, [$4,:128], $5
vld1.64 {$2}, [$4,:128], $5
vld1.64 {$3}, [$4,:128], $5
// }
.endm
.macro STORE_ALIGNED_DATA_WITH_STRIDE
// { // input: $0~$3, dst*, dst_stride
vst1.64 {$0}, [$4,:128], $5
vst1.64 {$1}, [$4,:128], $5
vst1.64 {$2}, [$4,:128], $5
vst1.64 {$3}, [$4,:128], $5
// }
// { // input: $0~$3, dst*, dst_stride
vst1.64 {$0}, [$4,:128], $5
vst1.64 {$1}, [$4,:128], $5
vst1.64 {$2}, [$4,:128], $5
vst1.64 {$3}, [$4,:128], $5
// }
.endm
.macro LOAD_UNALIGNED_DATA_WITH_STRIDE
// { // input: $0~$3, src*, src_stride
vld1.64 {$0}, [$4], $5
vld1.64 {$1}, [$4], $5
vld1.64 {$2}, [$4], $5
vld1.64 {$3}, [$4], $5
// }
// { // input: $0~$3, src*, src_stride
vld1.64 {$0}, [$4], $5
vld1.64 {$1}, [$4], $5
vld1.64 {$2}, [$4], $5
vld1.64 {$3}, [$4], $5
// }
.endm
.macro STORE_UNALIGNED_DATA_WITH_STRIDE
// { // input: $0~$3, dst*, dst_stride
vst1.64 {$0}, [$4], $5
vst1.64 {$1}, [$4], $5
vst1.64 {$2}, [$4], $5
vst1.64 {$3}, [$4], $5
// }
// { // input: $0~$3, dst*, dst_stride
vst1.64 {$0}, [$4], $5
vst1.64 {$1}, [$4], $5
vst1.64 {$2}, [$4], $5
vst1.64 {$3}, [$4], $5
// }
.endm
#else
.macro LOAD_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
// { // input: \arg0~\arg3, src*, src_stride
vld1.64 {\arg0}, [\arg4,:128], \arg5
vld1.64 {\arg1}, [\arg4,:128], \arg5
vld1.64 {\arg2}, [\arg4,:128], \arg5
vld1.64 {\arg3}, [\arg4,:128], \arg5
// }
// { // input: \arg0~\arg3, src*, src_stride
vld1.64 {\arg0}, [\arg4,:128], \arg5
vld1.64 {\arg1}, [\arg4,:128], \arg5
vld1.64 {\arg2}, [\arg4,:128], \arg5
vld1.64 {\arg3}, [\arg4,:128], \arg5
// }
.endm
.macro STORE_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
// { // input: \arg0~\arg3, dst*, dst_stride
vst1.64 {\arg0}, [\arg4,:128], \arg5
vst1.64 {\arg1}, [\arg4,:128], \arg5
vst1.64 {\arg2}, [\arg4,:128], \arg5
vst1.64 {\arg3}, [\arg4,:128], \arg5
// }
// { // input: \arg0~\arg3, dst*, dst_stride
vst1.64 {\arg0}, [\arg4,:128], \arg5
vst1.64 {\arg1}, [\arg4,:128], \arg5
vst1.64 {\arg2}, [\arg4,:128], \arg5
vst1.64 {\arg3}, [\arg4,:128], \arg5
// }
.endm
.macro LOAD_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
// { // input: \arg0~\arg3, src*, src_stride
vld1.64 {\arg0}, [\arg4], \arg5
vld1.64 {\arg1}, [\arg4], \arg5
vld1.64 {\arg2}, [\arg4], \arg5
vld1.64 {\arg3}, [\arg4], \arg5
// }
// { // input: \arg0~\arg3, src*, src_stride
vld1.64 {\arg0}, [\arg4], \arg5
vld1.64 {\arg1}, [\arg4], \arg5
vld1.64 {\arg2}, [\arg4], \arg5
vld1.64 {\arg3}, [\arg4], \arg5
// }
.endm
.macro STORE_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
// { // input: \arg0~\arg3, dst*, dst_stride
vst1.64 {\arg0}, [\arg4], \arg5
vst1.64 {\arg1}, [\arg4], \arg5
vst1.64 {\arg2}, [\arg4], \arg5
vst1.64 {\arg3}, [\arg4], \arg5
// }
// { // input: \arg0~\arg3, dst*, dst_stride
vst1.64 {\arg0}, [\arg4], \arg5
vst1.64 {\arg1}, [\arg4], \arg5
vst1.64 {\arg2}, [\arg4], \arg5
vst1.64 {\arg3}, [\arg4], \arg5
// }
.endm
#endif
@@ -112,89 +112,89 @@
WELS_ASM_FUNC_BEGIN WelsCopy8x8_neon
LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3
LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3
STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1
STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1
LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3
LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3
STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1
STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsCopy16x16_neon
LOAD_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
LOAD_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
STORE_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
STORE_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
LOAD_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
LOAD_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
STORE_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
STORE_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
LOAD_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
LOAD_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
STORE_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
STORE_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
LOAD_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
LOAD_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
STORE_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
STORE_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsCopy16x16NotAligned_neon
LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
STORE_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
STORE_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
STORE_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
STORE_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsCopy16x8NotAligned_neon
LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
STORE_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
STORE_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsCopy8x16_neon
LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3
LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3
STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1
STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1
LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3
LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3
STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1
STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1
LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3
LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3
STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1
STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1
LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3
LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3
STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1
STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1
WELS_ASM_FUNC_END

File diff suppressed because it is too large


@@ -37,119 +37,119 @@
WELS_ASM_FUNC_BEGIN ExpandPictureLuma_neon
stmdb sp!, {r4-r8}
//Save the dst
mov r7, r0
mov r8, r3
//Save the dst
mov r7, r0
mov r8, r3
add r4, r7, r2
sub r4, #1
add r4, r7, r2
sub r4, #1
//For the left and right expand
_expand_picture_luma_loop2:
sub r5, r7, #32
add r6, r4, #1
sub r5, r7, #32
add r6, r4, #1
vld1.8 {d0[], d1[]}, [r7], r1
vld1.8 {d2[], d3[]}, [r4], r1
vld1.8 {d0[], d1[]}, [r7], r1
vld1.8 {d2[], d3[]}, [r4], r1
vst1.8 {q0}, [r5]!
vst1.8 {q0}, [r5]
vst1.8 {q1}, [r6]!
vst1.8 {q1}, [r6]
subs r8, #1
bne _expand_picture_luma_loop2
vst1.8 {q0}, [r5]!
vst1.8 {q0}, [r5]
vst1.8 {q1}, [r6]!
vst1.8 {q1}, [r6]
subs r8, #1
bne _expand_picture_luma_loop2
//for the top and bottom expand
add r2, #64
sub r0, #32
mla r4, r1, r3, r0
sub r4, r1
//for the top and bottom expand
add r2, #64
sub r0, #32
mla r4, r1, r3, r0
sub r4, r1
_expand_picture_luma_loop0:
mov r5, #32
mov r5, #32
mls r5, r5, r1, r0
add r6, r4, r1
vld1.8 {q0}, [r0]!
vld1.8 {q1}, [r4]!
add r6, r4, r1
vld1.8 {q0}, [r0]!
vld1.8 {q1}, [r4]!
mov r8, #32
mov r8, #32
_expand_picture_luma_loop1:
vst1.8 {q0}, [r5], r1
vst1.8 {q1}, [r6], r1
subs r8, #1
vst1.8 {q0}, [r5], r1
vst1.8 {q1}, [r6], r1
subs r8, #1
bne _expand_picture_luma_loop1
subs r2, #16
bne _expand_picture_luma_loop0
subs r2, #16
bne _expand_picture_luma_loop0
//vldreq.32 d0, [r0]
ldmia sp!, {r4-r8}
ldmia sp!, {r4-r8}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN ExpandPictureChroma_neon
stmdb sp!, {r4-r9}
//Save the dst
mov r7, r0
mov r8, r3
//Save the dst
mov r7, r0
mov r8, r3
add r4, r7, r2
sub r4, #1
add r4, r7, r2
sub r4, #1
//For the left and right expand
_expand_picture_chroma_loop2:
sub r5, r7, #16
add r6, r4, #1
sub r5, r7, #16
add r6, r4, #1
vld1.8 {d0[], d1[]}, [r7], r1
vld1.8 {d2[], d3[]}, [r4], r1
vld1.8 {d0[], d1[]}, [r7], r1
vld1.8 {d2[], d3[]}, [r4], r1
vst1.8 {q0}, [r5]
vst1.8 {q1}, [r6]
subs r8, #1
bne _expand_picture_chroma_loop2
vst1.8 {q0}, [r5]
vst1.8 {q1}, [r6]
subs r8, #1
bne _expand_picture_chroma_loop2
//for the top and bottom expand
add r2, #32
mov r9, r2
bic r2, #15
sub r0, #16
mla r4, r1, r3, r0
sub r4, r1
//for the top and bottom expand
add r2, #32
mov r9, r2
bic r2, #15
sub r0, #16
mla r4, r1, r3, r0
sub r4, r1
_expand_picture_chroma_loop0:
mov r5, #16
mls r5, r5, r1, r0
add r6, r4, r1
vld1.8 {q0}, [r0]!
vld1.8 {q1}, [r4]!
mov r5, #16
mls r5, r5, r1, r0
add r6, r4, r1
vld1.8 {q0}, [r0]!
vld1.8 {q1}, [r4]!
mov r8, #16
mov r8, #16
_expand_picture_chroma_loop1:
vst1.8 {q0}, [r5], r1
vst1.8 {q1}, [r6], r1
subs r8, #1
bne _expand_picture_chroma_loop1
vst1.8 {q0}, [r5], r1
vst1.8 {q1}, [r6], r1
subs r8, #1
bne _expand_picture_chroma_loop1
subs r2, #16
bne _expand_picture_chroma_loop0
subs r2, #16
bne _expand_picture_chroma_loop0
//vldreq.32 d0, [r0]
and r9, #15
cmp r9, #8
bne _expand_picture_chroma_end
mov r5, #16
mls r5, r5, r1, r0
add r6, r4, r1
vld1.8 {d0}, [r0]!
vld1.8 {d2}, [r4]!
mov r8, #16
and r9, #15
cmp r9, #8
bne _expand_picture_chroma_end
mov r5, #16
mls r5, r5, r1, r0
add r6, r4, r1
vld1.8 {d0}, [r0]!
vld1.8 {d2}, [r4]!
mov r8, #16
_expand_picture_chroma_loop3:
vst1.8 {d0}, [r5], r1
vst1.8 {d2}, [r6], r1
subs r8, #1
bne _expand_picture_chroma_loop3
vst1.8 {d0}, [r5], r1
vst1.8 {d2}, [r6], r1
subs r8, #1
bne _expand_picture_chroma_loop3
_expand_picture_chroma_end:
ldmia sp!, {r4-r9}
ldmia sp!, {r4-r9}
WELS_ASM_FUNC_END
#endif

File diff suppressed because it is too large


@@ -53,88 +53,88 @@ _expand_picture_luma_loop2:
sub x8, x8, #1
cbnz x8, _expand_picture_luma_loop2
//for the top and bottom expand
add x2, x2, #64
sub x0, x0, #32
add x2, x2, #64
sub x0, x0, #32
madd x4, x1, x3, x0
sub x4, x4, x1
_expand_picture_luma_loop0:
mov x5, #32
mov x5, #32
msub x5, x5, x1, x0
add x6, x4, x1
add x6, x4, x1
ld1 {v0.16b}, [x0], x10
ld1 {v1.16b}, [x4], x10
mov x8, #32
mov x8, #32
_expand_picture_luma_loop1:
st1 {v0.16b}, [x5], x1
st1 {v1.16b}, [x6], x1
sub x8, x8, #1
st1 {v0.16b}, [x5], x1
st1 {v1.16b}, [x6], x1
sub x8, x8, #1
cbnz x8, _expand_picture_luma_loop1
sub x2, x2, #16
cbnz x2, _expand_picture_luma_loop0
sub x2, x2, #16
cbnz x2, _expand_picture_luma_loop0
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN ExpandPictureChroma_AArch64_neon
//Save the dst
mov x7, x0
mov x8, x3
//Save the dst
mov x7, x0
mov x8, x3
mov x10, #16
add x4, x7, x2
sub x4, x4, #1
add x4, x7, x2
sub x4, x4, #1
//For the left and right expand
_expand_picture_chroma_loop2:
sub x5, x7, #16
add x6, x4, #1
sub x5, x7, #16
add x6, x4, #1
ld1r {v0.16b}, [x7], x1
ld1r {v1.16b}, [x4], x1
ld1r {v0.16b}, [x7], x1
ld1r {v1.16b}, [x4], x1
st1 {v0.16b}, [x5]
st1 {v1.16b}, [x6]
sub x8, x8, #1
cbnz x8, _expand_picture_chroma_loop2
st1 {v0.16b}, [x5]
st1 {v1.16b}, [x6]
sub x8, x8, #1
cbnz x8, _expand_picture_chroma_loop2
//for the top and bottom expand
add x2, x2, #32
//for the top and bottom expand
add x2, x2, #32
//
mov x9, x2
mov x11, #15
bic x2, x2, x11
//
sub x0, x0, #16
madd x4, x1, x3, x0
sub x4, x4, x1
sub x0, x0, #16
madd x4, x1, x3, x0
sub x4, x4, x1
_expand_picture_chroma_loop0:
mov x5, #16
mov x5, #16
msub x5, x5, x1, x0
add x6, x4, x1
ld1 {v0.16b}, [x0], x10
ld1 {v1.16b}, [x4], x10
add x6, x4, x1
ld1 {v0.16b}, [x0], x10
ld1 {v1.16b}, [x4], x10
mov x8, #16
mov x8, #16
_expand_picture_chroma_loop1:
st1 {v0.16b}, [x5], x1
st1 {v1.16b}, [x6], x1
sub x8, x8, #1
st1 {v0.16b}, [x5], x1
st1 {v1.16b}, [x6], x1
sub x8, x8, #1
cbnz x8, _expand_picture_chroma_loop1
sub x2, x2, #16
cbnz x2, _expand_picture_chroma_loop0
sub x2, x2, #16
cbnz x2, _expand_picture_chroma_loop0
and x9, x9, #15
sub x9, x9, #8
cbnz x9, _expand_picture_chroma_end
mov x5, #16
mov x5, #16
msub x5, x5, x1, x0
add x6, x4, x1
ld1 {v0.8b}, [x0]
ld1 {v1.8b}, [x4]
add x6, x4, x1
ld1 {v0.8b}, [x0]
ld1 {v1.8b}, [x4]
mov x8, #16
mov x8, #16
_expand_picture_chroma_loop3:
st1 {v0.8b}, [x5], x1
st1 {v1.8b}, [x6], x1
sub x8, x8, #1
st1 {v0.8b}, [x5], x1
st1 {v1.8b}, [x6], x1
sub x8, x8, #1
cbnz x8, _expand_picture_chroma_loop3
_expand_picture_chroma_end:

File diff suppressed because it is too large


@@ -44,15 +44,15 @@
;***********************************************************************
%if 1
%define MOVDQ movdqa
%define MOVDQ movdqa
%else
%define MOVDQ movdqu
%define MOVDQ movdqu
%endif
%if 1
%define WELSEMMS emms
%define WELSEMMS emms
%else
%define WELSEMMS
%define WELSEMMS
%endif
@@ -220,7 +220,7 @@ BITS 32
%macro LOAD_1_PARA 0
%ifdef X86_32
mov r0, [esp + push_num*4 + 4]
mov r0, [esp + push_num*4 + 4]
%endif
%endmacro
@@ -234,8 +234,8 @@ BITS 32
%macro LOAD_3_PARA 0
%ifdef X86_32
mov r0, [esp + push_num*4 + 4]
mov r1, [esp + push_num*4 + 8]
mov r2, [esp + push_num*4 + 12]
mov r1, [esp + push_num*4 + 8]
mov r2, [esp + push_num*4 + 12]
%endif
%endmacro
@@ -267,7 +267,7 @@ BITS 32
%macro LOAD_6_PARA 0
%ifdef X86_32
push r3
push r3
push r4
push r5
%assign push_num push_num+3
@@ -310,22 +310,22 @@ BITS 32
%macro LOAD_4_PARA_POP 0
%ifdef X86_32
pop r3
pop r3
%endif
%endmacro
%macro LOAD_5_PARA_POP 0
%ifdef X86_32
pop r4
pop r3
pop r3
%endif
%endmacro
%macro LOAD_6_PARA_POP 0
%ifdef X86_32
pop r5
pop r4
pop r3
pop r4
pop r3
%endif
%endmacro
@@ -416,13 +416,13 @@ BITS 32
%macro SIGN_EXTENSION 2
%ifndef X86_32
movsxd %1, %2
movsxd %1, %2
%endif
%endmacro
%macro SIGN_EXTENSIONW 2
%ifndef X86_32
movsx %1, %2
movsx %1, %2
%endif
%endmacro
@@ -438,13 +438,13 @@ BITS 32
%endmacro
%macro WELS_AbsW 2
pxor %2, %2
pxor %2, %2
psubw %2, %1
pmaxsw %1, %2
%endmacro
%macro MMX_XSwap 4
movq %4, %2
movq %4, %2
punpckh%1 %4, %3
punpckl%1 %2, %3
%endmacro
@@ -485,35 +485,35 @@ BITS 32
;in: m1, m2, m3, m4, m5, m6, m7, m8
;pOut: m5, m3, m4, m8, m6, m2, m7, m1
%macro SSE2_TransTwo8x8B 9
movdqa %9, %8
SSE2_XSawp bw, %1, %2, %8
SSE2_XSawp bw, %3, %4, %2
SSE2_XSawp bw, %5, %6, %4
movdqa %6, %9
movdqa %9, %4
SSE2_XSawp bw, %7, %6, %4
movdqa %9, %8
SSE2_XSawp bw, %1, %2, %8
SSE2_XSawp bw, %3, %4, %2
SSE2_XSawp bw, %5, %6, %4
movdqa %6, %9
movdqa %9, %4
SSE2_XSawp bw, %7, %6, %4
SSE2_XSawp wd, %1, %3, %6
SSE2_XSawp wd, %8, %2, %3
SSE2_XSawp wd, %5, %7, %2
movdqa %7, %9
movdqa %9, %3
SSE2_XSawp wd, %7, %4, %3
SSE2_XSawp wd, %1, %3, %6
SSE2_XSawp wd, %8, %2, %3
SSE2_XSawp wd, %5, %7, %2
movdqa %7, %9
movdqa %9, %3
SSE2_XSawp wd, %7, %4, %3
SSE2_XSawp dq, %1, %5, %4
SSE2_XSawp dq, %6, %2, %5
SSE2_XSawp dq, %8, %7, %2
movdqa %7, %9
movdqa %9, %5
SSE2_XSawp dq, %7, %3, %5
SSE2_XSawp dq, %1, %5, %4
SSE2_XSawp dq, %6, %2, %5
SSE2_XSawp dq, %8, %7, %2
movdqa %7, %9
movdqa %9, %5
SSE2_XSawp dq, %7, %3, %5
SSE2_XSawp qdq, %1, %8, %3
SSE2_XSawp qdq, %4, %2, %8
SSE2_XSawp qdq, %6, %7, %2
movdqa %7, %9
movdqa %9, %1
SSE2_XSawp qdq, %7, %5, %1
movdqa %5, %9
SSE2_XSawp qdq, %1, %8, %3
SSE2_XSawp qdq, %4, %2, %8
SSE2_XSawp qdq, %6, %7, %2
movdqa %7, %9
movdqa %9, %1
SSE2_XSawp qdq, %7, %5, %1
movdqa %5, %9
%endmacro
;xmm0, xmm6, xmm7, [eax], [ecx]
@@ -528,32 +528,32 @@ BITS 32
; m2 = m1 + m2, m1 = m1 - m2
%macro SSE2_SumSub 3
movdqa %3, %2
movdqa %3, %2
paddw %2, %1
psubw %1, %3
%endmacro
%macro butterfly_1to16_sse 3 ; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
mov %3h, %3l
movd %1, e%3x ; i.e, 1% = eax (=b0)
pshuflw %2, %1, 00h ; ..., b0 b0 b0 b0 b0 b0 b0 b0
pshufd %1, %2, 00h ; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
%macro butterfly_1to16_sse 3 ; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
mov %3h, %3l
movd %1, e%3x ; i.e, 1% = eax (=b0)
pshuflw %2, %1, 00h ; ..., b0 b0 b0 b0 b0 b0 b0 b0
pshufd %1, %2, 00h ; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
%endmacro
;copy a dw into a xmm for 8 times
%macro SSE2_Copy8Times 2
movd %1, %2
punpcklwd %1, %1
pshufd %1, %1, 0
movd %1, %2
punpcklwd %1, %1
pshufd %1, %1, 0
%endmacro
;copy a db into a xmm for 16 times
%macro SSE2_Copy16Times 2
movd %1, %2
pshuflw %1, %1, 0
punpcklqdq %1, %1
packuswb %1, %1
movd %1, %2
pshuflw %1, %1, 0
punpcklqdq %1, %1
packuswb %1, %1
%endmacro
@@ -564,35 +564,35 @@ BITS 32
;dw 32,32,32,32,32,32,32,32 for xmm
;dw 32,32,32,32 for mm
%macro WELS_DW32 1
pcmpeqw %1,%1
psrlw %1,15
psllw %1,5
pcmpeqw %1,%1
psrlw %1,15
psllw %1,5
%endmacro
;dw 1, 1, 1, 1, 1, 1, 1, 1 for xmm
;dw 1, 1, 1, 1 for mm
%macro WELS_DW1 1
pcmpeqw %1,%1
psrlw %1,15
pcmpeqw %1,%1
psrlw %1,15
%endmacro
;all 0 for xmm and mm
%macro WELS_Zero 1
pxor %1, %1
pxor %1, %1
%endmacro
;dd 1, 1, 1, 1 for xmm
;dd 1, 1 for mm
%macro WELS_DD1 1
pcmpeqw %1,%1
psrld %1,31
pcmpeqw %1,%1
psrld %1,31
%endmacro
;dB 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
%macro WELS_DB1 1
pcmpeqw %1,%1
psrlw %1,15
packuswb %1,%1
pcmpeqw %1,%1
psrlw %1,15
packuswb %1,%1
%endmacro


@@ -29,13 +29,13 @@
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
;* cpu_mmx.asm
;* cpu_mmx.asm
;*
;* Abstract
;* verify cpuid feature support and cpuid detection
;* verify cpuid feature support and cpuid detection
;*
;* History
;* 04/29/2009 Created
;* 04/29/2009 Created
;*
;*************************************************************************/
@@ -115,13 +115,13 @@ WELS_EXTERN WelsCPUId
%elifdef X86_32
WELS_EXTERN WelsCPUId
push ebx
push edi
push ebx
push edi
mov eax, [esp+12] ; operating index
mov eax, [esp+12] ; operating index
mov edi, [esp+24]
mov ecx, [edi]
cpuid ; cpuid
cpuid ; cpuid
; processing various information return
mov edi, [esp+16]
@@ -133,7 +133,7 @@ WELS_EXTERN WelsCPUId
mov edi, [esp+28]
mov [edi], edx
pop edi
pop edi
pop ebx
ret
@@ -145,31 +145,31 @@ WELS_EXTERN WelsCPUId
;****************************************************************************************************
WELS_EXTERN WelsCPUSupportAVX
%ifdef WIN64
mov eax, ecx
mov ecx, edx
mov eax, ecx
mov ecx, edx
%elifdef UNIX64
mov eax, edi
mov ecx, esi
mov eax, edi
mov ecx, esi
%else
mov eax, [esp+4]
mov ecx, [esp+8]
mov eax, [esp+4]
mov ecx, [esp+8]
%endif
; refer to detection of AVX addressed in INTEL AVX manual document
and ecx, 018000000H
cmp ecx, 018000000H ; check both OSXSAVE and AVX feature flags
jne avx_not_supported
; processor supports AVX instructions and XGETBV is enabled by OS
mov ecx, 0 ; specify 0 for XFEATURE_ENABLED_MASK register
XGETBV ; result in EDX:EAX
and eax, 06H
cmp eax, 06H ; check OS has enabled both XMM and YMM state support
jne avx_not_supported
mov eax, 1
ret
; refer to detection of AVX addressed in INTEL AVX manual document
and ecx, 018000000H
cmp ecx, 018000000H ; check both OSXSAVE and AVX feature flags
jne avx_not_supported
; processor supports AVX instructions and XGETBV is enabled by OS
mov ecx, 0 ; specify 0 for XFEATURE_ENABLED_MASK register
XGETBV ; result in EDX:EAX
and eax, 06H
cmp eax, 06H ; check OS has enabled both XMM and YMM state support
jne avx_not_supported
mov eax, 1
ret
avx_not_supported:
mov eax, 0
ret
mov eax, 0
ret
; need call after cpuid=1 and eax, ecx flag got then
@@ -178,35 +178,35 @@ avx_not_supported:
;****************************************************************************************************
WELS_EXTERN WelsCPUSupportFMA
%ifdef WIN64
mov eax, ecx
mov ecx, edx
mov eax, ecx
mov ecx, edx
%elifdef UNIX64
mov eax, edi
mov ecx, esi
mov eax, edi
mov ecx, esi
%else
mov eax, [esp+4]
mov ecx, [esp+8]
mov eax, [esp+4]
mov ecx, [esp+8]
%endif
; refer to detection of FMA addressed in INTEL AVX manual document
and ecx, 018001000H
cmp ecx, 018001000H ; check OSXSAVE, AVX, FMA feature flags
jne fma_not_supported
; processor supports AVX,FMA instructions and XGETBV is enabled by OS
mov ecx, 0 ; specify 0 for XFEATURE_ENABLED_MASK register
XGETBV ; result in EDX:EAX
and eax, 06H
cmp eax, 06H ; check OS has enabled both XMM and YMM state support
jne fma_not_supported
mov eax, 1
ret
; refer to detection of FMA addressed in INTEL AVX manual document
and ecx, 018001000H
cmp ecx, 018001000H ; check OSXSAVE, AVX, FMA feature flags
jne fma_not_supported
; processor supports AVX,FMA instructions and XGETBV is enabled by OS
mov ecx, 0 ; specify 0 for XFEATURE_ENABLED_MASK register
XGETBV ; result in EDX:EAX
and eax, 06H
cmp eax, 06H ; check OS has enabled both XMM and YMM state support
jne fma_not_supported
mov eax, 1
ret
fma_not_supported:
mov eax, 0
ret
mov eax, 0
ret
;******************************************************************************************
; void WelsEmms()
;******************************************************************************************
WELS_EXTERN WelsEmms
emms ; empty mmx technology states
ret
emms ; empty mmx technology states
ret

File diff suppressed because it is too large


@@ -77,280 +77,280 @@ SECTION .text
;cccc|ceeeeeeeeeeeeeeeed|dddd
;cccc|ceeeeeeeeeeeeeeeed|dddd
%macro mov_line_8x4_mmx 3 ; dst, stride, mm?
movq [%1], %3
movq [%1+%2], %3
lea %1, [%1+2*%2]
movq [%1], %3
movq [%1+%2], %3
lea %1, [%1+2*%2]
%macro mov_line_8x4_mmx 3 ; dst, stride, mm?
movq [%1], %3
movq [%1+%2], %3
lea %1, [%1+2*%2]
movq [%1], %3
movq [%1+%2], %3
lea %1, [%1+2*%2]
%endmacro
%macro mov_line_end8x4_mmx 3 ; dst, stride, mm?
movq [%1], %3
movq [%1+%2], %3
lea %1, [%1+2*%2]
movq [%1], %3
movq [%1+%2], %3
lea %1, [%1+%2]
%macro mov_line_end8x4_mmx 3 ; dst, stride, mm?
movq [%1], %3
movq [%1+%2], %3
lea %1, [%1+2*%2]
movq [%1], %3
movq [%1+%2], %3
lea %1, [%1+%2]
%endmacro
%macro mov_line_16x4_sse2 4 ; dst, stride, xmm?, u/a
movdq%4 [%1], %3 ; top(bottom)_0
movdq%4 [%1+%2], %3 ; top(bottom)_1
lea %1, [%1+2*%2]
movdq%4 [%1], %3 ; top(bottom)_2
movdq%4 [%1+%2], %3 ; top(bottom)_3
lea %1, [%1+2*%2]
%macro mov_line_16x4_sse2 4 ; dst, stride, xmm?, u/a
movdq%4 [%1], %3 ; top(bottom)_0
movdq%4 [%1+%2], %3 ; top(bottom)_1
lea %1, [%1+2*%2]
movdq%4 [%1], %3 ; top(bottom)_2
movdq%4 [%1+%2], %3 ; top(bottom)_3
lea %1, [%1+2*%2]
%endmacro
%macro mov_line_end16x4_sse2 4 ; dst, stride, xmm?, u/a
movdq%4 [%1], %3 ; top(bottom)_0
movdq%4 [%1+%2], %3 ; top(bottom)_1
lea %1, [%1+2*%2]
movdq%4 [%1], %3 ; top(bottom)_2
movdq%4 [%1+%2], %3 ; top(bottom)_3
lea %1, [%1+%2]
%macro mov_line_end16x4_sse2 4 ; dst, stride, xmm?, u/a
movdq%4 [%1], %3 ; top(bottom)_0
movdq%4 [%1+%2], %3 ; top(bottom)_1
lea %1, [%1+2*%2]
movdq%4 [%1], %3 ; top(bottom)_2
movdq%4 [%1+%2], %3 ; top(bottom)_3
lea %1, [%1+%2]
%endmacro
%macro mov_line_32x4_sse2 3 ; dst, stride, xmm?
movdqa [%1], %3 ; top(bottom)_0
movdqa [%1+16], %3 ; top(bottom)_0
movdqa [%1+%2], %3 ; top(bottom)_1
movdqa [%1+%2+16], %3 ; top(bottom)_1
lea %1, [%1+2*%2]
movdqa [%1], %3 ; top(bottom)_2
movdqa [%1+16], %3 ; top(bottom)_2
movdqa [%1+%2], %3 ; top(bottom)_3
movdqa [%1+%2+16], %3 ; top(bottom)_3
lea %1, [%1+2*%2]
%macro mov_line_32x4_sse2 3 ; dst, stride, xmm?
movdqa [%1], %3 ; top(bottom)_0
movdqa [%1+16], %3 ; top(bottom)_0
movdqa [%1+%2], %3 ; top(bottom)_1
movdqa [%1+%2+16], %3 ; top(bottom)_1
lea %1, [%1+2*%2]
movdqa [%1], %3 ; top(bottom)_2
movdqa [%1+16], %3 ; top(bottom)_2
movdqa [%1+%2], %3 ; top(bottom)_3
movdqa [%1+%2+16], %3 ; top(bottom)_3
lea %1, [%1+2*%2]
%endmacro
%macro mov_line_end32x4_sse2 3 ; dst, stride, xmm?
movdqa [%1], %3 ; top(bottom)_0
movdqa [%1+16], %3 ; top(bottom)_0
movdqa [%1+%2], %3 ; top(bottom)_1
movdqa [%1+%2+16], %3 ; top(bottom)_1
lea %1, [%1+2*%2]
movdqa [%1], %3 ; top(bottom)_2
movdqa [%1+16], %3 ; top(bottom)_2
movdqa [%1+%2], %3 ; top(bottom)_3
movdqa [%1+%2+16], %3 ; top(bottom)_3
lea %1, [%1+%2]
%macro mov_line_end32x4_sse2 3 ; dst, stride, xmm?
movdqa [%1], %3 ; top(bottom)_0
movdqa [%1+16], %3 ; top(bottom)_0
movdqa [%1+%2], %3 ; top(bottom)_1
movdqa [%1+%2+16], %3 ; top(bottom)_1
lea %1, [%1+2*%2]
movdqa [%1], %3 ; top(bottom)_2
movdqa [%1+16], %3 ; top(bottom)_2
movdqa [%1+%2], %3 ; top(bottom)_3
movdqa [%1+%2+16], %3 ; top(bottom)_3
lea %1, [%1+%2]
%endmacro
%macro exp_top_bottom_sse2 1 ; iPaddingSize [luma(32)/chroma(16)]
%macro exp_top_bottom_sse2 1 ; iPaddingSize [luma(32)/chroma(16)]
;r2 [width/16(8)]
;r0 [pSrc +0], r5 [pSrc -width] r1[-stride], 32(16) ;top
;r3 [pSrc +(h-1)*stride], r4 [pSrc + (h+31)*stride],32(16); bottom
%if %1 == 32 ; for luma
sar r2, 04h ; width / 16(8) pixels
%if %1 == 32 ; for luma
sar r2, 04h ; width / 16(8) pixels
.top_bottom_loops:
; top
movdqa xmm0, [r0] ; first line of picture pData
mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm?
mov_line_16x4_sse2 r5, r1, xmm0, a
mov_line_16x4_sse2 r5, r1, xmm0, a
mov_line_16x4_sse2 r5, r1, xmm0, a
mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm?
mov_line_16x4_sse2 r5, r1, xmm0, a
mov_line_16x4_sse2 r5, r1, xmm0, a
mov_line_end16x4_sse2 r5, r1, xmm0, a
; top
movdqa xmm0, [r0] ; first line of picture pData
mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm?
mov_line_16x4_sse2 r5, r1, xmm0, a
mov_line_16x4_sse2 r5, r1, xmm0, a
mov_line_16x4_sse2 r5, r1, xmm0, a
mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm?
mov_line_16x4_sse2 r5, r1, xmm0, a
mov_line_16x4_sse2 r5, r1, xmm0, a
mov_line_end16x4_sse2 r5, r1, xmm0, a
; bottom
movdqa xmm1, [r3] ; last line of picture pData
mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm?
mov_line_16x4_sse2 r4, r1, xmm1, a
mov_line_16x4_sse2 r4, r1, xmm1, a
mov_line_16x4_sse2 r4, r1, xmm1, a
mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm?
mov_line_16x4_sse2 r4, r1, xmm1, a
mov_line_16x4_sse2 r4, r1, xmm1, a
mov_line_end16x4_sse2 r4, r1, xmm1, a
; bottom
movdqa xmm1, [r3] ; last line of picture pData
mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm?
mov_line_16x4_sse2 r4, r1, xmm1, a
mov_line_16x4_sse2 r4, r1, xmm1, a
mov_line_16x4_sse2 r4, r1, xmm1, a
mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm?
mov_line_16x4_sse2 r4, r1, xmm1, a
mov_line_16x4_sse2 r4, r1, xmm1, a
mov_line_end16x4_sse2 r4, r1, xmm1, a
lea r0, [r0+16] ; top pSrc
lea r5, [r5+16] ; top dst
lea r3, [r3+16] ; bottom pSrc
lea r4, [r4+16] ; bottom dst
neg r1 ; positive/negative stride need for next loop?
lea r0, [r0+16] ; top pSrc
lea r5, [r5+16] ; top dst
lea r3, [r3+16] ; bottom pSrc
lea r4, [r4+16] ; bottom dst
neg r1 ; positive/negative stride need for next loop?
dec r2
jnz near .top_bottom_loops
%elif %1 == 16 ; for chroma ??
mov r6, r2
sar r2, 04h ; (width / 16) pixels
dec r2
jnz near .top_bottom_loops
%elif %1 == 16 ; for chroma ??
mov r6, r2
sar r2, 04h ; (width / 16) pixels
.top_bottom_loops:
; top
movdqa xmm0, [r0] ; first line of picture pData
mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm?
mov_line_16x4_sse2 r5, r1, xmm0, a
mov_line_16x4_sse2 r5, r1, xmm0, a
mov_line_end16x4_sse2 r5, r1, xmm0, a
; top
movdqa xmm0, [r0] ; first line of picture pData
mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm?
mov_line_16x4_sse2 r5, r1, xmm0, a
mov_line_16x4_sse2 r5, r1, xmm0, a
mov_line_end16x4_sse2 r5, r1, xmm0, a
; bottom
movdqa xmm1, [r3] ; last line of picture pData
mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm?
mov_line_16x4_sse2 r4, r1, xmm1, a
mov_line_16x4_sse2 r4, r1, xmm1, a
mov_line_end16x4_sse2 r4, r1, xmm1, a
; bottom
movdqa xmm1, [r3] ; last line of picture pData
mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm?
mov_line_16x4_sse2 r4, r1, xmm1, a
mov_line_16x4_sse2 r4, r1, xmm1, a
mov_line_end16x4_sse2 r4, r1, xmm1, a
lea r0, [r0+16] ; top pSrc
lea r5, [r5+16] ; top dst
lea r3, [r3+16] ; bottom pSrc
lea r4, [r4+16] ; bottom dst
neg r1 ; positive/negative stride need for next loop?
lea r0, [r0+16] ; top pSrc
lea r5, [r5+16] ; top dst
lea r3, [r3+16] ; bottom pSrc
lea r4, [r4+16] ; bottom dst
neg r1 ; positive/negative stride need for next loop?
dec r2
jnz near .top_bottom_loops
dec r2
jnz near .top_bottom_loops
; for remaining 8 bytes
and r6, 0fh ; any 8 bytes left?
test r6, r6
jz near .to_be_continued ; no left to exit here
; for remaining 8 bytes
and r6, 0fh ; any 8 bytes left?
test r6, r6
jz near .to_be_continued ; no left to exit here
; top
movq mm0, [r0] ; remained 8 byte
mov_line_8x4_mmx r5, r1, mm0 ; dst, stride, mm?
mov_line_8x4_mmx r5, r1, mm0 ; dst, stride, mm?
mov_line_8x4_mmx r5, r1, mm0 ; dst, stride, mm?
mov_line_end8x4_mmx r5, r1, mm0 ; dst, stride, mm?
; bottom
movq mm1, [r3]
mov_line_8x4_mmx r4, r1, mm1 ; dst, stride, mm?
mov_line_8x4_mmx r4, r1, mm1 ; dst, stride, mm?
mov_line_8x4_mmx r4, r1, mm1 ; dst, stride, mm?
mov_line_end8x4_mmx r4, r1, mm1 ; dst, stride, mm?
WELSEMMS
; top
movq mm0, [r0] ; remained 8 byte
mov_line_8x4_mmx r5, r1, mm0 ; dst, stride, mm?
mov_line_8x4_mmx r5, r1, mm0 ; dst, stride, mm?
mov_line_8x4_mmx r5, r1, mm0 ; dst, stride, mm?
mov_line_end8x4_mmx r5, r1, mm0 ; dst, stride, mm?
; bottom
movq mm1, [r3]
mov_line_8x4_mmx r4, r1, mm1 ; dst, stride, mm?
mov_line_8x4_mmx r4, r1, mm1 ; dst, stride, mm?
mov_line_8x4_mmx r4, r1, mm1 ; dst, stride, mm?
mov_line_end8x4_mmx r4, r1, mm1 ; dst, stride, mm?
WELSEMMS
.to_be_continued:
%endif
%endmacro
%macro exp_left_right_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
%macro exp_left_right_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
;r6 [height]
;r0 [pSrc+0] r5[pSrc-32] r1[stride]
;r3 [pSrc+(w-1)] r4[pSrc+w]
%if %1 == 32 ; for luma
%if %1 == 32 ; for luma
.left_right_loops:
; left
movzx r2d, byte [r0] ; pixel pData for left border
SSE2_Copy16Times xmm0, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
movdqa [r5], xmm0
movdqa [r5+16], xmm0
; left
movzx r2d, byte [r0] ; pixel pData for left border
SSE2_Copy16Times xmm0, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
movdqa [r5], xmm0
movdqa [r5+16], xmm0
; right
movzx r2d, byte [r3]
SSE2_Copy16Times xmm1, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
movdqa [r4], xmm1
movdqa [r4+16], xmm1
; right
movzx r2d, byte [r3]
SSE2_Copy16Times xmm1, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
movdqa [r4], xmm1
movdqa [r4+16], xmm1
lea r0, [r0+r1] ; left pSrc
lea r5, [r5+r1] ; left dst
lea r3, [r3+r1] ; right pSrc
lea r4, [r4+r1] ; right dst
lea r0, [r0+r1] ; left pSrc
lea r5, [r5+r1] ; left dst
lea r3, [r3+r1] ; right pSrc
lea r4, [r4+r1] ; right dst
dec r6
jnz near .left_right_loops
%elif %1 == 16 ; for chroma ??
dec r6
jnz near .left_right_loops
%elif %1 == 16 ; for chroma ??
.left_right_loops:
; left
movzx r2d, byte [r0] ; pixel pData for left border
SSE2_Copy16Times xmm0, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
movdqa [r5], xmm0
; left
movzx r2d, byte [r0] ; pixel pData for left border
SSE2_Copy16Times xmm0, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
movdqa [r5], xmm0
; right
movzx r2d, byte [r3]
SSE2_Copy16Times xmm1, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
movdq%2 [r4], xmm1 ; might not be aligned 16 bytes in case chroma planes
; right
movzx r2d, byte [r3]
SSE2_Copy16Times xmm1, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
movdq%2 [r4], xmm1 ; might not be aligned 16 bytes in case chroma planes
lea r0, [r0+r1] ; left pSrc
lea r5, [r5+r1] ; left dst
lea r3, [r3+r1] ; right pSrc
lea r4, [r4+r1] ; right dst
lea r0, [r0+r1] ; left pSrc
lea r5, [r5+r1] ; left dst
lea r3, [r3+r1] ; right pSrc
lea r4, [r4+r1] ; right dst
dec r6
jnz near .left_right_loops
dec r6
jnz near .left_right_loops
%endif
%endmacro
%macro exp_cross_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
; top-left: (x)mm3, top-right: (x)mm4, bottom-left: (x)mm5, bottom-right: (x)mm6
; edi: TL, ebp: TR, eax: BL, ebx: BR, ecx, -stride
%macro exp_cross_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
; top-left: (x)mm3, top-right: (x)mm4, bottom-left: (x)mm5, bottom-right: (x)mm6
; edi: TL, ebp: TR, eax: BL, ebx: BR, ecx, -stride
;r3:TL ,r4:TR,r5:BL,r6:BR r1:-stride
%if %1 == 32 ; luma
; TL
mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
mov_line_end32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
%if %1 == 32 ; luma
; TL
mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
mov_line_end32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
; TR
mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
mov_line_end32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
; TR
mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
mov_line_end32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
; BL
mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
mov_line_end32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
; BL
mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
mov_line_end32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
; BR
mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
mov_line_end32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
%elif %1 == 16 ; chroma
; TL
mov_line_16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
mov_line_16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
mov_line_16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
mov_line_end16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
; BR
mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
mov_line_end32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
%elif %1 == 16 ; chroma
; TL
mov_line_16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
mov_line_16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
mov_line_16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
mov_line_end16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
; TR
mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
mov_line_end16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
; TR
mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
mov_line_end16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
; BL
mov_line_16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
mov_line_16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
mov_line_16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
mov_line_end16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
; BL
mov_line_16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
mov_line_16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
mov_line_16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
mov_line_end16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
; BR
mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
mov_line_end16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
; BR
mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
mov_line_end16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
%endif
%endmacro
;***********************************************************************----------------
; void ExpandPictureLuma_sse2( uint8_t *pDst,
; const int32_t iStride,
; const int32_t iWidth,
; const int32_t iHeight );
; void ExpandPictureLuma_sse2( uint8_t *pDst,
; const int32_t iStride,
; const int32_t iWidth,
; const int32_t iHeight );
;***********************************************************************----------------
WELS_EXTERN ExpandPictureLuma_sse2
@@ -403,8 +403,8 @@ WELS_EXTERN ExpandPictureLuma_sse2
exp_top_bottom_sse2 32
; for both left and right border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; for both left and right border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
pop r2
pop r1
@@ -416,8 +416,8 @@ WELS_EXTERN ExpandPictureLuma_sse2
lea r4,[r3+1] ;right border dst
;prepare for cross border data: top-rigth with xmm4
movzx r6d,byte [r3] ;top -rigth
SSE2_Copy16Times xmm4,r6d
movzx r6d,byte [r3] ;top -rigth
SSE2_Copy16Times xmm4,r6d
neg r1 ;r1 = stride
@@ -438,8 +438,8 @@ WELS_EXTERN ExpandPictureLuma_sse2
pop r1
pop r0
; for cross border [top-left, top-right, bottom-left, bottom-right]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; for cross border [top-left, top-right, bottom-left, bottom-right]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
neg r1 ;r1 = -stride
@@ -472,13 +472,13 @@ WELS_EXTERN ExpandPictureLuma_sse2
%assign push_num 0
ret
ret
;***********************************************************************----------------
; void ExpandPictureChromaAlign_sse2( uint8_t *pDst,
; const int32_t iStride,
; const int32_t iWidth,
; const int32_t iHeight );
; void ExpandPictureChromaAlign_sse2( uint8_t *pDst,
; const int32_t iStride,
; const int32_t iWidth,
; const int32_t iHeight );
;***********************************************************************----------------
WELS_EXTERN ExpandPictureChromaAlign_sse2
@@ -531,8 +531,8 @@ WELS_EXTERN ExpandPictureChromaAlign_sse2
exp_top_bottom_sse2 16
; for both left and right border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; for both left and right border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
pop r2
pop r1
@@ -557,7 +557,7 @@ WELS_EXTERN ExpandPictureChromaAlign_sse2
push r0
push r1
push r2
push r6
push r6
exp_left_right_sse2 16,a
pop r6
@@ -565,8 +565,8 @@ WELS_EXTERN ExpandPictureChromaAlign_sse2
pop r1
pop r0
; for cross border [top-left, top-right, bottom-left, bottom-right]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; for cross border [top-left, top-right, bottom-left, bottom-right]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
neg r1 ;r1 = -stride
@@ -599,16 +599,16 @@ WELS_EXTERN ExpandPictureChromaAlign_sse2
%assign push_num 0
ret
ret
;***********************************************************************----------------
; void ExpandPictureChromaUnalign_sse2( uint8_t *pDst,
; const int32_t iStride,
; const int32_t iWidth,
; const int32_t iHeight );
; void ExpandPictureChromaUnalign_sse2( uint8_t *pDst,
; const int32_t iStride,
; const int32_t iWidth,
; const int32_t iHeight );
;***********************************************************************----------------
WELS_EXTERN ExpandPictureChromaUnalign_sse2
push r4
push r4
push r5
push r6
@@ -657,8 +657,8 @@ WELS_EXTERN ExpandPictureChromaUnalign_sse2
exp_top_bottom_sse2 16
; for both left and right border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; for both left and right border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
pop r2
pop r1
@@ -683,7 +683,7 @@ WELS_EXTERN ExpandPictureChromaUnalign_sse2
push r0
push r1
push r2
push r6
push r6
exp_left_right_sse2 16,u
pop r6
@@ -691,8 +691,8 @@ WELS_EXTERN ExpandPictureChromaUnalign_sse2
pop r1
pop r0
; for cross border [top-left, top-right, bottom-left, bottom-right]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; for cross border [top-left, top-right, bottom-left, bottom-right]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
neg r1 ;r1 = -stride
@@ -725,4 +725,4 @@ WELS_EXTERN ExpandPictureChromaUnalign_sse2
%assign push_num 0
ret
ret


@@ -36,9 +36,9 @@
;*
;* History
;* 15/09/2009 Created
;* 12/28/2009 Modified with larger throughput
;* 12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2,
;* WelsCopy16x8_mmx, WelsCopy8x16_mmx etc;
;* 12/28/2009 Modified with larger throughput
;* 12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2,
;* WelsCopy16x8_mmx, WelsCopy8x16_mmx etc;
;*
;*
;*********************************************************************************************/
@@ -56,174 +56,174 @@ SECTION .text
;***********************************************************************
; void WelsCopy16x16_sse2( uint8_t* Dst,
; int32_t iStrideD,
; uint8_t* Src,
; int32_t iStrideS )
; void WelsCopy16x16_sse2( uint8_t* Dst,
; int32_t iStrideD,
; uint8_t* Src,
; int32_t iStrideS )
;***********************************************************************
WELS_EXTERN WelsCopy16x16_sse2
push r4
push r5
%assign push_num 2
push r4
push r5
%assign push_num 2
LOAD_4_PARA
PUSH_XMM 8
lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
movdqa xmm0, [r2]
movdqa xmm1, [r2+r3]
movdqa xmm2, [r2+2*r3]
movdqa xmm3, [r2+r5]
lea r2, [r2+4*r3]
movdqa xmm4, [r2]
movdqa xmm5, [r2+r3]
movdqa xmm6, [r2+2*r3]
movdqa xmm7, [r2+r5]
lea r2, [r2+4*r3]
movdqa xmm0, [r2]
movdqa xmm1, [r2+r3]
movdqa xmm2, [r2+2*r3]
movdqa xmm3, [r2+r5]
lea r2, [r2+4*r3]
movdqa xmm4, [r2]
movdqa xmm5, [r2+r3]
movdqa xmm6, [r2+2*r3]
movdqa xmm7, [r2+r5]
lea r2, [r2+4*r3]
movdqa [r0], xmm0
movdqa [r0+r1], xmm1
movdqa [r0+2*r1], xmm2
movdqa [r0+r4], xmm3
lea r0, [r0+4*r1]
movdqa [r0], xmm4
movdqa [r0+r1], xmm5
movdqa [r0+2*r1], xmm6
movdqa [r0+r4], xmm7
lea r0, [r0+4*r1]
movdqa [r0], xmm0
movdqa [r0+r1], xmm1
movdqa [r0+2*r1], xmm2
movdqa [r0+r4], xmm3
lea r0, [r0+4*r1]
movdqa [r0], xmm4
movdqa [r0+r1], xmm5
movdqa [r0+2*r1], xmm6
movdqa [r0+r4], xmm7
lea r0, [r0+4*r1]
movdqa xmm0, [r2]
movdqa xmm1, [r2+r3]
movdqa xmm2, [r2+2*r3]
movdqa xmm3, [r2+r5]
lea r2, [r2+4*r3]
movdqa xmm4, [r2]
movdqa xmm5, [r2+r3]
movdqa xmm6, [r2+2*r3]
movdqa xmm7, [r2+r5]
movdqa xmm0, [r2]
movdqa xmm1, [r2+r3]
movdqa xmm2, [r2+2*r3]
movdqa xmm3, [r2+r5]
lea r2, [r2+4*r3]
movdqa xmm4, [r2]
movdqa xmm5, [r2+r3]
movdqa xmm6, [r2+2*r3]
movdqa xmm7, [r2+r5]
movdqa [r0], xmm0
movdqa [r0+r1], xmm1
movdqa [r0+2*r1], xmm2
movdqa [r0+r4], xmm3
lea r0, [r0+4*r1]
movdqa [r0], xmm4
movdqa [r0+r1], xmm5
movdqa [r0+2*r1], xmm6
movdqa [r0+r4], xmm7
POP_XMM
LOAD_4_PARA_POP
pop r5
pop r4
ret
movdqa [r0], xmm0
movdqa [r0+r1], xmm1
movdqa [r0+2*r1], xmm2
movdqa [r0+r4], xmm3
lea r0, [r0+4*r1]
movdqa [r0], xmm4
movdqa [r0+r1], xmm5
movdqa [r0+2*r1], xmm6
movdqa [r0+r4], xmm7
POP_XMM
LOAD_4_PARA_POP
pop r5
pop r4
ret
;***********************************************************************
; void WelsCopy16x16NotAligned_sse2( uint8_t* Dst,
; int32_t iStrideD,
; uint8_t* Src,
; int32_t iStrideS )
; void WelsCopy16x16NotAligned_sse2( uint8_t* Dst,
; int32_t iStrideD,
; uint8_t* Src,
; int32_t iStrideS )
;***********************************************************************
; dst can be align with 16 bytes, but not sure about pSrc, 12/29/2011
WELS_EXTERN WelsCopy16x16NotAligned_sse2
push r4
push r5
%assign push_num 2
push r4
push r5
%assign push_num 2
LOAD_4_PARA
PUSH_XMM 8
lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
movdqu xmm0, [r2]
movdqu xmm1, [r2+r3]
movdqu xmm2, [r2+2*r3]
movdqu xmm3, [r2+r5]
lea r2, [r2+4*r3]
movdqu xmm4, [r2]
movdqu xmm5, [r2+r3]
movdqu xmm6, [r2+2*r3]
movdqu xmm7, [r2+r5]
lea r2, [r2+4*r3]
movdqu xmm0, [r2]
movdqu xmm1, [r2+r3]
movdqu xmm2, [r2+2*r3]
movdqu xmm3, [r2+r5]
lea r2, [r2+4*r3]
movdqu xmm4, [r2]
movdqu xmm5, [r2+r3]
movdqu xmm6, [r2+2*r3]
movdqu xmm7, [r2+r5]
lea r2, [r2+4*r3]
movdqa [r0], xmm0
movdqa [r0+r1], xmm1
movdqa [r0+2*r1], xmm2
movdqa [r0+r4], xmm3
lea r0, [r0+4*r1]
movdqa [r0], xmm4
movdqa [r0+r1], xmm5
movdqa [r0+2*r1], xmm6
movdqa [r0+r4], xmm7
lea r0, [r0+4*r1]
movdqa [r0], xmm0
movdqa [r0+r1], xmm1
movdqa [r0+2*r1], xmm2
movdqa [r0+r4], xmm3
lea r0, [r0+4*r1]
movdqa [r0], xmm4
movdqa [r0+r1], xmm5
movdqa [r0+2*r1], xmm6
movdqa [r0+r4], xmm7
lea r0, [r0+4*r1]
movdqu xmm0, [r2]
movdqu xmm1, [r2+r3]
movdqu xmm2, [r2+2*r3]
movdqu xmm3, [r2+r5]
lea r2, [r2+4*r3]
movdqu xmm4, [r2]
movdqu xmm5, [r2+r3]
movdqu xmm6, [r2+2*r3]
movdqu xmm7, [r2+r5]
movdqu xmm0, [r2]
movdqu xmm1, [r2+r3]
movdqu xmm2, [r2+2*r3]
movdqu xmm3, [r2+r5]
lea r2, [r2+4*r3]
movdqu xmm4, [r2]
movdqu xmm5, [r2+r3]
movdqu xmm6, [r2+2*r3]
movdqu xmm7, [r2+r5]
movdqa [r0], xmm0
movdqa [r0+r1], xmm1
movdqa [r0+2*r1], xmm2
movdqa [r0+r4], xmm3
lea r0, [r0+4*r1]
movdqa [r0], xmm4
movdqa [r0+r1], xmm5
movdqa [r0+2*r1], xmm6
movdqa [r0+r4], xmm7
POP_XMM
LOAD_4_PARA_POP
pop r5
pop r4
ret
movdqa [r0], xmm0
movdqa [r0+r1], xmm1
movdqa [r0+2*r1], xmm2
movdqa [r0+r4], xmm3
lea r0, [r0+4*r1]
movdqa [r0], xmm4
movdqa [r0+r1], xmm5
movdqa [r0+2*r1], xmm6
movdqa [r0+r4], xmm7
POP_XMM
LOAD_4_PARA_POP
pop r5
pop r4
ret
; , 12/29/2011
;***********************************************************************
; void WelsCopy16x8NotAligned_sse2(uint8_t* Dst,
; int32_t iStrideD,
; uint8_t* Src,
; int32_t iStrideS )
; int32_t iStrideD,
; uint8_t* Src,
; int32_t iStrideS )
;***********************************************************************
WELS_EXTERN WelsCopy16x8NotAligned_sse2
push r4
push r5
%assign push_num 2
push r4
push r5
%assign push_num 2
LOAD_4_PARA
PUSH_XMM 8
lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
movdqu xmm0, [r2]
movdqu xmm1, [r2+r3]
movdqu xmm2, [r2+2*r3]
movdqu xmm3, [r2+r5]
lea r2, [r2+4*r3]
movdqu xmm4, [r2]
movdqu xmm5, [r2+r3]
movdqu xmm6, [r2+2*r3]
movdqu xmm7, [r2+r5]
movdqu xmm0, [r2]
movdqu xmm1, [r2+r3]
movdqu xmm2, [r2+2*r3]
movdqu xmm3, [r2+r5]
lea r2, [r2+4*r3]
movdqu xmm4, [r2]
movdqu xmm5, [r2+r3]
movdqu xmm6, [r2+2*r3]
movdqu xmm7, [r2+r5]
movdqa [r0], xmm0
movdqa [r0+r1], xmm1
movdqa [r0+2*r1], xmm2
movdqa [r0+r4], xmm3
lea r0, [r0+4*r1]
movdqa [r0], xmm4
movdqa [r0+r1], xmm5
movdqa [r0+2*r1], xmm6
movdqa [r0+r4], xmm7
POP_XMM
LOAD_4_PARA_POP
pop r5
pop r4
ret
movdqa [r0], xmm0
movdqa [r0+r1], xmm1
movdqa [r0+2*r1], xmm2
movdqa [r0+r4], xmm3
lea r0, [r0+4*r1]
movdqa [r0], xmm4
movdqa [r0+r1], xmm5
movdqa [r0+2*r1], xmm6
movdqa [r0+r4], xmm7
POP_XMM
LOAD_4_PARA_POP
pop r5
pop r4
ret
;***********************************************************************
@@ -233,62 +233,62 @@ WELS_EXTERN WelsCopy16x8NotAligned_sse2
; int32_t iStrideS )
;***********************************************************************
WELS_EXTERN WelsCopy8x16_mmx
%assign push_num 0
%assign push_num 0
LOAD_4_PARA
movq mm0, [r2]
movq mm1, [r2+r3]
lea r2, [r2+2*r3]
movq mm2, [r2]
movq mm3, [r2+r3]
lea r2, [r2+2*r3]
movq mm4, [r2]
movq mm5, [r2+r3]
lea r2, [r2+2*r3]
movq mm6, [r2]
movq mm7, [r2+r3]
lea r2, [r2+2*r3]
movq mm0, [r2]
movq mm1, [r2+r3]
lea r2, [r2+2*r3]
movq mm2, [r2]
movq mm3, [r2+r3]
lea r2, [r2+2*r3]
movq mm4, [r2]
movq mm5, [r2+r3]
lea r2, [r2+2*r3]
movq mm6, [r2]
movq mm7, [r2+r3]
lea r2, [r2+2*r3]
movq [r0], mm0
movq [r0+r1], mm1
lea r0, [r0+2*r1]
movq [r0], mm2
movq [r0+r1], mm3
lea r0, [r0+2*r1]
movq [r0], mm4
movq [r0+r1], mm5
lea r0, [r0+2*r1]
movq [r0], mm6
movq [r0+r1], mm7
lea r0, [r0+2*r1]
movq [r0], mm0
movq [r0+r1], mm1
lea r0, [r0+2*r1]
movq [r0], mm2
movq [r0+r1], mm3
lea r0, [r0+2*r1]
movq [r0], mm4
movq [r0+r1], mm5
lea r0, [r0+2*r1]
movq [r0], mm6
movq [r0+r1], mm7
lea r0, [r0+2*r1]
movq mm0, [r2]
movq mm1, [r2+r3]
lea r2, [r2+2*r3]
movq mm2, [r2]
movq mm3, [r2+r3]
lea r2, [r2+2*r3]
movq mm4, [r2]
movq mm5, [r2+r3]
lea r2, [r2+2*r3]
movq mm6, [r2]
movq mm7, [r2+r3]
movq mm0, [r2]
movq mm1, [r2+r3]
lea r2, [r2+2*r3]
movq mm2, [r2]
movq mm3, [r2+r3]
lea r2, [r2+2*r3]
movq mm4, [r2]
movq mm5, [r2+r3]
lea r2, [r2+2*r3]
movq mm6, [r2]
movq mm7, [r2+r3]
movq [r0], mm0
movq [r0+r1], mm1
lea r0, [r0+2*r1]
movq [r0], mm2
movq [r0+r1], mm3
lea r0, [r0+2*r1]
movq [r0], mm4
movq [r0+r1], mm5
lea r0, [r0+2*r1]
movq [r0], mm6
movq [r0+r1], mm7
movq [r0], mm0
movq [r0+r1], mm1
lea r0, [r0+2*r1]
movq [r0], mm2
movq [r0+r1], mm3
lea r0, [r0+2*r1]
movq [r0], mm4
movq [r0+r1], mm5
lea r0, [r0+2*r1]
movq [r0], mm6
movq [r0+r1], mm7
WELSEMMS
LOAD_4_PARA_POP
ret
WELSEMMS
LOAD_4_PARA_POP
ret
;***********************************************************************
; void WelsCopy8x8_mmx( uint8_t* Dst,
@@ -297,48 +297,48 @@ WELS_EXTERN WelsCopy8x16_mmx
; int32_t iStrideS )
;***********************************************************************
WELS_EXTERN WelsCopy8x8_mmx
push r4
%assign push_num 1
push r4
%assign push_num 1
LOAD_4_PARA
lea r4, [r3+2*r3] ;edx, [ebx+2*ebx]
lea r4, [r3+2*r3] ;edx, [ebx+2*ebx]
; to prefetch next loop
prefetchnta [r2+2*r3]
prefetchnta [r2+r4]
movq mm0, [r2]
movq mm1, [r2+r3]
lea r2, [r2+2*r3]
; to prefetch next loop
prefetchnta [r2+2*r3]
prefetchnta [r2+r4]
movq mm2, [r2]
movq mm3, [r2+r3]
lea r2, [r2+2*r3]
; to prefetch next loop
prefetchnta [r2+2*r3]
prefetchnta [r2+r4]
movq mm4, [r2]
movq mm5, [r2+r3]
lea r2, [r2+2*r3]
movq mm6, [r2]
movq mm7, [r2+r3]
; to prefetch next loop
prefetchnta [r2+2*r3]
prefetchnta [r2+r4]
movq mm0, [r2]
movq mm1, [r2+r3]
lea r2, [r2+2*r3]
; to prefetch next loop
prefetchnta [r2+2*r3]
prefetchnta [r2+r4]
movq mm2, [r2]
movq mm3, [r2+r3]
lea r2, [r2+2*r3]
; to prefetch next loop
prefetchnta [r2+2*r3]
prefetchnta [r2+r4]
movq mm4, [r2]
movq mm5, [r2+r3]
lea r2, [r2+2*r3]
movq mm6, [r2]
movq mm7, [r2+r3]
movq [r0], mm0
movq [r0+r1], mm1
lea r0, [r0+2*r1]
movq [r0], mm2
movq [r0+r1], mm3
lea r0, [r0+2*r1]
movq [r0], mm4
movq [r0+r1], mm5
lea r0, [r0+2*r1]
movq [r0], mm6
movq [r0+r1], mm7
movq [r0], mm0
movq [r0+r1], mm1
lea r0, [r0+2*r1]
movq [r0], mm2
movq [r0+r1], mm3
lea r0, [r0+2*r1]
movq [r0], mm4
movq [r0+r1], mm5
lea r0, [r0+2*r1]
movq [r0], mm6
movq [r0+r1], mm7
WELSEMMS
LOAD_4_PARA_POP
pop r4
ret
WELSEMMS
LOAD_4_PARA_POP
pop r4
ret
; (dunhuang@cisco), 12/21/2011
;***********************************************************************
@@ -349,13 +349,13 @@ WELS_EXTERN UpdateMbMv_sse2
%assign push_num 0
LOAD_2_PARA
movd xmm0, r1d ; _mv
pshufd xmm1, xmm0, $00
movdqa [r0 ], xmm1
movdqa [r0+0x10], xmm1
movdqa [r0+0x20], xmm1
movdqa [r0+0x30], xmm1
ret
movd xmm0, r1d ; _mv
pshufd xmm1, xmm0, $00
movdqa [r0 ], xmm1
movdqa [r0+0x10], xmm1
movdqa [r0+0x20], xmm1
movdqa [r0+0x30], xmm1
ret
;*******************************************************************************
; Macros and other preprocessor constants
@@ -381,14 +381,14 @@ WELS_EXTERN PixelAvgWidthEq4_mmx
%assign push_num 0
LOAD_7_PARA
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r5, r5d
SIGN_EXTENSION r6, r6d
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r5, r5d
SIGN_EXTENSION r6, r6d
ALIGN 4
.height_loop:
movd mm0, [r4]
movd mm0, [r4]
pavgb mm0, [r2]
movd [r0], mm0
@@ -398,8 +398,8 @@ ALIGN 4
lea r4, [r4+r5]
jne .height_loop
WELSEMMS
LOAD_7_PARA_POP
WELSEMMS
LOAD_7_PARA_POP
ret
@@ -413,29 +413,29 @@ WELS_EXTERN PixelAvgWidthEq8_mmx
%assign push_num 0
LOAD_7_PARA
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r5, r5d
SIGN_EXTENSION r6, r6d
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r5, r5d
SIGN_EXTENSION r6, r6d
ALIGN 4
.height_loop:
movq mm0, [r2]
movq mm0, [r2]
pavgb mm0, [r4]
movq [r0], mm0
movq mm0, [r2+r3]
pavgb mm0, [r4+r5]
movq [r0+r1], mm0
movq [r0+r1], mm0
lea r2, [r2+2*r3]
lea r4, [r4+2*r5]
lea r0, [r0+2*r1]
lea r2, [r2+2*r3]
lea r4, [r4+2*r5]
lea r0, [r0+2*r1]
sub r6, 2
jnz .height_loop
WELSEMMS
LOAD_7_PARA_POP
WELSEMMS
LOAD_7_PARA_POP
ret
@@ -450,46 +450,46 @@ WELS_EXTERN PixelAvgWidthEq16_sse2
%assign push_num 0
LOAD_7_PARA
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r5, r5d
SIGN_EXTENSION r6, r6d
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r5, r5d
SIGN_EXTENSION r6, r6d
ALIGN 4
.height_loop:
movdqu xmm0, [r2]
movdqu xmm1, [r4]
pavgb xmm0, xmm1
;pavgb xmm0, [r4]
movdqu xmm0, [r2]
movdqu xmm1, [r4]
pavgb xmm0, xmm1
;pavgb xmm0, [r4]
movdqu [r0], xmm0
movdqu xmm0, [r2+r3]
movdqu xmm1, [r4+r5]
pavgb xmm0, xmm1
movdqu xmm0, [r2+r3]
movdqu xmm1, [r4+r5]
pavgb xmm0, xmm1
movdqu [r0+r1], xmm0
movdqu xmm0, [r2+2*r3]
movdqu xmm1, [r4+2*r5]
pavgb xmm0, xmm1
movdqu xmm0, [r2+2*r3]
movdqu xmm1, [r4+2*r5]
pavgb xmm0, xmm1
movdqu [r0+2*r1], xmm0
lea r2, [r2+2*r3]
lea r4, [r4+2*r5]
lea r0, [r0+2*r1]
lea r4, [r4+2*r5]
lea r0, [r0+2*r1]
movdqu xmm0, [r2+r3]
movdqu xmm1, [r4+r5]
pavgb xmm0, xmm1
movdqu xmm0, [r2+r3]
movdqu xmm1, [r4+r5]
pavgb xmm0, xmm1
movdqu [r0+r1], xmm0
lea r2, [r2+2*r3]
lea r4, [r4+2*r5]
lea r0, [r0+2*r1]
lea r4, [r4+2*r5]
lea r0, [r0+2*r1]
sub r6, 4
jne .height_loop
WELSEMMS
LOAD_7_PARA_POP
WELSEMMS
LOAD_7_PARA_POP
ret
;*******************************************************************************
@@ -497,26 +497,26 @@ ALIGN 4
; uint8_t *pDst, int iDstStride, int iHeight )
;*******************************************************************************
WELS_EXTERN McCopyWidthEq4_mmx
push r5
push r5
%assign push_num 1
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
ALIGN 4
.height_loop:
mov r5d, [r0]
mov [r2], r5d
mov r5d, [r0]
mov [r2], r5d
add r0, r1
add r2, r3
dec r4
jnz .height_loop
WELSEMMS
add r0, r1
add r2, r3
dec r4
jnz .height_loop
WELSEMMS
LOAD_5_PARA_POP
pop r5
pop r5
ret
;*******************************************************************************
@@ -527,21 +527,21 @@ WELS_EXTERN McCopyWidthEq8_mmx
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
ALIGN 4
.height_loop:
movq mm0, [r0]
movq [r2], mm0
add r0, r1
add r2, r3
dec r4
jnz .height_loop
movq mm0, [r0]
movq [r2], mm0
add r0, r1
add r2, r3
dec r4
jnz .height_loop
WELSEMMS
LOAD_5_PARA_POP
WELSEMMS
LOAD_5_PARA_POP
ret
@@ -550,32 +550,32 @@ ALIGN 4
;*******************************************************************************
;read unaligned memory
%macro SSE_READ_UNA 2
movq %1, [%2]
movhps %1, [%2+8]
movq %1, [%2]
movhps %1, [%2+8]
%endmacro
;write unaligned memory
%macro SSE_WRITE_UNA 2
movq [%1], %2
movhps [%1+8], %2
movq [%1], %2
movhps [%1+8], %2
%endmacro
WELS_EXTERN McCopyWidthEq16_sse2
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
ALIGN 4
.height_loop:
SSE_READ_UNA xmm0, r0
SSE_READ_UNA xmm1, r0+r1
SSE_WRITE_UNA r2, xmm0
SSE_WRITE_UNA r2+r3, xmm1
SSE_READ_UNA xmm0, r0
SSE_READ_UNA xmm1, r0+r1
SSE_WRITE_UNA r2, xmm0
SSE_WRITE_UNA r2+r3, xmm1
sub r4, 2
sub r4, 2
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
jnz .height_loop
LOAD_5_PARA_POP
LOAD_5_PARA_POP
ret
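
The McCopyWidthEq4/8/16 routines are plain strided copies; a scalar C sketch of the common pattern (names illustrative, width is 4, 8 or 16 in the asm variants):

#include <stdint.h>
#include <string.h>

/* Strided block copy, one row at a time. */
static void mc_copy_c(const uint8_t *src, int src_stride,
                      uint8_t *dst, int dst_stride,
                      int width, int height) {
    for (int y = 0; y < height; y++) {
        memcpy(dst, src, (size_t)width);
        src += src_stride;
        dst += dst_stride;
    }
}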
@ -53,10 +53,10 @@ SECTION .rodata align=16
ALIGN 16
h264_d0x20_sse2:
dw 32,32,32,32,32,32,32,32
dw 32,32,32,32,32,32,32,32
ALIGN 16
h264_d0x20_mmx:
dw 32,32,32,32
dw 32,32,32,32
;=============================================================================
@ -67,171 +67,171 @@ SECTION .text
;*******************************************************************************
; void McChromaWidthEq4_mmx( const uint8_t *src,
; int32_t iSrcStride,
; uint8_t *pDst,
; int32_t iDstStride,
; const uint8_t *pABCD,
; int32_t iHeigh );
; int32_t iSrcStride,
; uint8_t *pDst,
; int32_t iDstStride,
; const uint8_t *pABCD,
; int32_t iHeigh );
;*******************************************************************************
WELS_EXTERN McChromaWidthEq4_mmx
%assign push_num 0
LOAD_6_PARA
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r5, r5d
%assign push_num 0
LOAD_6_PARA
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r5, r5d
movd mm3, [r4]; [eax]
WELS_Zero mm7
punpcklbw mm3, mm3
movq mm4, mm3
punpcklwd mm3, mm3
punpckhwd mm4, mm4
movd mm3, [r4]; [eax]
WELS_Zero mm7
punpcklbw mm3, mm3
movq mm4, mm3
punpcklwd mm3, mm3
punpckhwd mm4, mm4
movq mm5, mm3
punpcklbw mm3, mm7
punpckhbw mm5, mm7
movq mm5, mm3
punpcklbw mm3, mm7
punpckhbw mm5, mm7
movq mm6, mm4
punpcklbw mm4, mm7
punpckhbw mm6, mm7
movq mm6, mm4
punpcklbw mm4, mm7
punpckhbw mm6, mm7
lea r4, [r0 + r1] ;lea ebx, [esi + eax]
movd mm0, [r0]
movd mm1, [r0+1]
punpcklbw mm0, mm7
punpcklbw mm1, mm7
lea r4, [r0 + r1] ;lea ebx, [esi + eax]
movd mm0, [r0]
movd mm1, [r0+1]
punpcklbw mm0, mm7
punpcklbw mm1, mm7
.xloop:
pmullw mm0, mm3
pmullw mm1, mm5
paddw mm0, mm1
pmullw mm0, mm3
pmullw mm1, mm5
paddw mm0, mm1
movd mm1, [r4]
punpcklbw mm1, mm7
movq mm2, mm1
pmullw mm1, mm4
paddw mm0, mm1
movd mm1, [r4]
punpcklbw mm1, mm7
movq mm2, mm1
pmullw mm1, mm4
paddw mm0, mm1
movd mm1, [r4+1]
punpcklbw mm1, mm7
movq mm7, mm1
pmullw mm1,mm6
paddw mm0, mm1
movq mm1,mm7
movd mm1, [r4+1]
punpcklbw mm1, mm7
movq mm7, mm1
pmullw mm1,mm6
paddw mm0, mm1
movq mm1,mm7
paddw mm0, [h264_d0x20_mmx]
psrlw mm0, 6
paddw mm0, [h264_d0x20_mmx]
psrlw mm0, 6
WELS_Zero mm7
packuswb mm0, mm7
movd [r2], mm0
WELS_Zero mm7
packuswb mm0, mm7
movd [r2], mm0
movq mm0, mm2
movq mm0, mm2
lea r2, [r2 + r3]
lea r4, [r4 + r1]
lea r2, [r2 + r3]
lea r4, [r4 + r1]
dec r5
jnz near .xloop
WELSEMMS
LOAD_6_PARA_POP
ret
dec r5
jnz near .xloop
WELSEMMS
LOAD_6_PARA_POP
ret
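
McChromaWidthEq4_mmx implements the standard H.264 bilinear chroma interpolation using the four packed weights in pABCD and the rounding constant 32 (h264_d0x20_mmx). A scalar C sketch of the same computation (the name and the width parameter are illustrative):

#include <stdint.h>

/* dst[x] = (A*s[x] + B*s[x+1] + C*s[x+stride] + D*s[x+stride+1] + 32) >> 6 */
static void mc_chroma_c(const uint8_t *src, int src_stride,
                        uint8_t *dst, int dst_stride,
                        const uint8_t abcd[4], int width, int height) {
    int a = abcd[0], b = abcd[1], c = abcd[2], d = abcd[3];
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            int v = a * src[x] + b * src[x + 1]
                  + c * src[x + src_stride] + d * src[x + src_stride + 1];
            dst[x] = (uint8_t)((v + 32) >> 6);
        }
        src += src_stride;
        dst += dst_stride;
    }
}

Since the four weights sum to 64, the shifted result already fits in a byte; the asm relies on packuswb saturation for the same reason.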
;*******************************************************************************
; void McChromaWidthEq8_sse2( const uint8_t *pSrc,
; int32_t iSrcStride,
; uint8_t *pDst,
; int32_t iDstStride,
; const uint8_t *pABCD,
; int32_t iheigh );
; int32_t iSrcStride,
; uint8_t *pDst,
; int32_t iDstStride,
; const uint8_t *pABCD,
; int32_t iheigh );
;*******************************************************************************
WELS_EXTERN McChromaWidthEq8_sse2
%assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r5, r5d
%assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r5, r5d
movd xmm3, [r4]
WELS_Zero xmm7
punpcklbw xmm3, xmm3
punpcklwd xmm3, xmm3
movd xmm3, [r4]
WELS_Zero xmm7
punpcklbw xmm3, xmm3
punpcklwd xmm3, xmm3
movdqa xmm4, xmm3
punpckldq xmm3, xmm3
punpckhdq xmm4, xmm4
movdqa xmm5, xmm3
movdqa xmm6, xmm4
movdqa xmm4, xmm3
punpckldq xmm3, xmm3
punpckhdq xmm4, xmm4
movdqa xmm5, xmm3
movdqa xmm6, xmm4
punpcklbw xmm3, xmm7
punpckhbw xmm5, xmm7
punpcklbw xmm4, xmm7
punpckhbw xmm6, xmm7
punpcklbw xmm3, xmm7
punpckhbw xmm5, xmm7
punpcklbw xmm4, xmm7
punpckhbw xmm6, xmm7
lea r4, [r0 + r1] ;lea ebx, [esi + eax]
movq xmm0, [r0]
movq xmm1, [r0+1]
punpcklbw xmm0, xmm7
punpcklbw xmm1, xmm7
lea r4, [r0 + r1] ;lea ebx, [esi + eax]
movq xmm0, [r0]
movq xmm1, [r0+1]
punpcklbw xmm0, xmm7
punpcklbw xmm1, xmm7
.xloop:
pmullw xmm0, xmm3
pmullw xmm1, xmm5
paddw xmm0, xmm1
pmullw xmm0, xmm3
pmullw xmm1, xmm5
paddw xmm0, xmm1
movq xmm1, [r4]
punpcklbw xmm1, xmm7
movdqa xmm2, xmm1
pmullw xmm1, xmm4
paddw xmm0, xmm1
movq xmm1, [r4]
punpcklbw xmm1, xmm7
movdqa xmm2, xmm1
pmullw xmm1, xmm4
paddw xmm0, xmm1
movq xmm1, [r4+1]
punpcklbw xmm1, xmm7
movdqa xmm7, xmm1
pmullw xmm1, xmm6
paddw xmm0, xmm1
movdqa xmm1,xmm7
movq xmm1, [r4+1]
punpcklbw xmm1, xmm7
movdqa xmm7, xmm1
pmullw xmm1, xmm6
paddw xmm0, xmm1
movdqa xmm1,xmm7
paddw xmm0, [h264_d0x20_sse2]
psrlw xmm0, 6
paddw xmm0, [h264_d0x20_sse2]
psrlw xmm0, 6
WELS_Zero xmm7
packuswb xmm0, xmm7
movq [r2], xmm0
WELS_Zero xmm7
packuswb xmm0, xmm7
movq [r2], xmm0
movdqa xmm0, xmm2
movdqa xmm0, xmm2
lea r2, [r2 + r3]
lea r4, [r4 + r1]
lea r2, [r2 + r3]
lea r4, [r4 + r1]
dec r5
jnz near .xloop
dec r5
jnz near .xloop
POP_XMM
LOAD_6_PARA_POP
POP_XMM
LOAD_6_PARA_POP
ret
ret
;***********************************************************************
; void McChromaWidthEq8_ssse3( const uint8_t *pSrc,
; int32_t iSrcStride,
; int32_t iSrcStride,
; uint8_t *pDst,
; int32_t iDstStride,
; const uint8_t *pABCD,
; int32_t iHeigh);
; int32_t iHeigh);
;***********************************************************************
WELS_EXTERN McChromaWidthEq8_ssse3
%assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r5, r5d
%assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r5, r5d
pxor xmm7, xmm7
movd xmm5, [r4]
@ -243,27 +243,27 @@ WELS_EXTERN McChromaWidthEq8_ssse3
sub r2, r3 ;sub esi, edi
sub r2, r3
movdqa xmm7, [h264_d0x20_sse2]
movdqa xmm7, [h264_d0x20_sse2]
movdqu xmm0, [r0]
movdqa xmm1, xmm0
psrldq xmm1, 1
punpcklbw xmm0, xmm1
movdqu xmm0, [r0]
movdqa xmm1, xmm0
psrldq xmm1, 1
punpcklbw xmm0, xmm1
.hloop_chroma:
lea r2, [r2+2*r3]
lea r2, [r2+2*r3]
movdqu xmm2, [r0+r1]
movdqa xmm3, xmm2
psrldq xmm3, 1
punpcklbw xmm2, xmm3
movdqa xmm4, xmm2
movdqu xmm2, [r0+r1]
movdqa xmm3, xmm2
psrldq xmm3, 1
punpcklbw xmm2, xmm3
movdqa xmm4, xmm2
pmaddubsw xmm0, xmm5
pmaddubsw xmm2, xmm6
paddw xmm0, xmm2
paddw xmm0, xmm7
psrlw xmm0, 6
psrlw xmm0, 6
packuswb xmm0, xmm0
movq [r2],xmm0
@ -278,16 +278,16 @@ WELS_EXTERN McChromaWidthEq8_ssse3
pmaddubsw xmm2, xmm6
paddw xmm4, xmm2
paddw xmm4, xmm7
psrlw xmm4, 6
psrlw xmm4, 6
packuswb xmm4, xmm4
movq [r2+r3],xmm4
sub r5, 2
jnz .hloop_chroma
sub r5, 2
jnz .hloop_chroma
POP_XMM
LOAD_6_PARA_POP
POP_XMM
LOAD_6_PARA_POP
ret
ret
File diff suppressed because it is too large
File diff suppressed because it is too large
@ -29,16 +29,16 @@
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
;* vaa.asm
;* vaa.asm
;*
;* Abstract
;* Abstract
;* sse2 for pVaa routines
;*
;* History
;* 04/14/2010 Created
;* 06/07/2010 Added AnalysisVaaInfoIntra_sse2(ssse3)
;* 06/10/2010 Tune rc_sad_frame_sse2 and got about 40% improvement
;* 08/11/2010 Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
;* 04/14/2010 Created
;* 06/07/2010 Added AnalysisVaaInfoIntra_sse2(ssse3)
;* 06/10/2010 Tune rc_sad_frame_sse2 and got about 40% improvement
;* 08/11/2010 Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
;*
;*************************************************************************/
%include "asm_inc.asm"
@ -49,87 +49,87 @@
;***********************************************************************
; benchmarking shows this outperforms the phaddw (SSSE3) sequence
%macro SUM_WORD_8x2_SSE2 2 ; dst(pSrc), tmp
; @sum_8x2 begin
pshufd %2, %1, 04Eh ; 01001110 B
paddw %1, %2
pshuflw %2, %1, 04Eh ; 01001110 B
paddw %1, %2
pshuflw %2, %1, 0B1h ; 10110001 B
paddw %1, %2
; end of @sum_8x2
%endmacro ; END of SUM_WORD_8x2_SSE2
%macro SUM_WORD_8x2_SSE2 2 ; dst(pSrc), tmp
; @sum_8x2 begin
pshufd %2, %1, 04Eh ; 01001110 B
paddw %1, %2
pshuflw %2, %1, 04Eh ; 01001110 B
paddw %1, %2
pshuflw %2, %1, 0B1h ; 10110001 B
paddw %1, %2
; end of @sum_8x2
%endmacro ; END of SUM_WORD_8x2_SSE2
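
SUM_WORD_8x2_SSE2 reduces eight 16-bit lanes to one total with three shuffle/add steps. A scalar equivalent (name illustrative; the asm leaves the 16-bit wrapped total in the low word of the register):

#include <stdint.h>

static uint16_t sum_word_8_c(const uint16_t v[8]) {
    uint32_t s = 0;
    for (int i = 0; i < 8; i++)
        s += v[i];
    return (uint16_t)s;
}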
%macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4
movdqa %1, [r0 ] ; line 0
movdqa %2, [r0+r1] ; line 1
movdqa %3, %1
punpcklbw %1, xmm7
punpckhbw %3, xmm7
movdqa %4, %2
punpcklbw %4, xmm7
punpckhbw %2, xmm7
paddw %1, %4
paddw %2, %3
movdqa %3, [r0+r2] ; line 2
movdqa %4, [r0+r3] ; line 3
movdqa %5, %3
punpcklbw %3, xmm7
punpckhbw %5, xmm7
movdqa %6, %4
punpcklbw %6, xmm7
punpckhbw %4, xmm7
paddw %3, %6
paddw %4, %5
paddw %1, %3 ; block 0, 1
paddw %2, %4 ; block 2, 3
pshufd %3, %1, 0B1h
pshufd %4, %2, 0B1h
paddw %1, %3
paddw %2, %4
movdqa %3, %1
movdqa %4, %2
pshuflw %5, %1, 0B1h
pshufhw %6, %3, 0B1h
paddw %1, %5
paddw %3, %6
pshuflw %5, %2, 0B1h
pshufhw %6, %4, 0B1h
paddw %2, %5
paddw %4, %6
punpcklwd %1, %2
punpckhwd %3, %4
punpcklwd %1, %3
psraw %1, $04
movdqa %1, [r0 ] ; line 0
movdqa %2, [r0+r1] ; line 1
movdqa %3, %1
punpcklbw %1, xmm7
punpckhbw %3, xmm7
movdqa %4, %2
punpcklbw %4, xmm7
punpckhbw %2, xmm7
paddw %1, %4
paddw %2, %3
movdqa %3, [r0+r2] ; line 2
movdqa %4, [r0+r3] ; line 3
movdqa %5, %3
punpcklbw %3, xmm7
punpckhbw %5, xmm7
movdqa %6, %4
punpcklbw %6, xmm7
punpckhbw %4, xmm7
paddw %3, %6
paddw %4, %5
paddw %1, %3 ; block 0, 1
paddw %2, %4 ; block 2, 3
pshufd %3, %1, 0B1h
pshufd %4, %2, 0B1h
paddw %1, %3
paddw %2, %4
movdqa %3, %1
movdqa %4, %2
pshuflw %5, %1, 0B1h
pshufhw %6, %3, 0B1h
paddw %1, %5
paddw %3, %6
pshuflw %5, %2, 0B1h
pshufhw %6, %4, 0B1h
paddw %2, %5
paddw %4, %6
punpcklwd %1, %2
punpckhwd %3, %4
punpcklwd %1, %3
psraw %1, $04
%endmacro
%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4
movdqa %1, [r0 ] ; line 0
movdqa %2, [r0+r1] ; line 1
movdqa %3, %1
punpcklbw %1, xmm7
punpckhbw %3, xmm7
movdqa %4, %2
punpcklbw %4, xmm7
punpckhbw %2, xmm7
paddw %1, %4
paddw %2, %3
movdqa %3, [r0+r2] ; line 2
movdqa %4, [r0+r3] ; line 3
movdqa %5, %3
punpcklbw %3, xmm7
punpckhbw %5, xmm7
movdqa %6, %4
punpcklbw %6, xmm7
punpckhbw %4, xmm7
paddw %3, %6
paddw %4, %5
paddw %1, %3 ; block 0, 1
paddw %2, %4 ; block 2, 3
phaddw %1, %2 ; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
phaddw %1, xmm7 ; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
psraw %1, $04
movdqa %1, [r0 ] ; line 0
movdqa %2, [r0+r1] ; line 1
movdqa %3, %1
punpcklbw %1, xmm7
punpckhbw %3, xmm7
movdqa %4, %2
punpcklbw %4, xmm7
punpckhbw %2, xmm7
paddw %1, %4
paddw %2, %3
movdqa %3, [r0+r2] ; line 2
movdqa %4, [r0+r3] ; line 3
movdqa %5, %3
punpcklbw %3, xmm7
punpckhbw %5, xmm7
movdqa %6, %4
punpcklbw %6, xmm7
punpckhbw %4, xmm7
paddw %3, %6
paddw %4, %5
paddw %1, %3 ; block 0, 1
paddw %2, %4 ; block 2, 3
phaddw %1, %2 ; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
phaddw %1, xmm7 ; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
psraw %1, $04
%endmacro
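
Both VAA_AVG_BLOCK_SSE2 and VAA_AVG_BLOCK_SSSE3 produce, for a 16-wide 4-line strip of luma, the four 4x4 block sums shifted right by 4, i.e. the block averages. A scalar sketch under that reading (name is illustrative, and it assumes r1/r2/r3 hold one, two and three line strides as the surrounding setup suggests):

#include <stdint.h>

/* Average of each 4x4 block in a 16x4 strip. */
static void vaa_avg_block_c(const uint8_t *p, int stride, uint16_t avg[4]) {
    for (int b = 0; b < 4; b++) {
        uint32_t sum = 0;
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                sum += p[y * stride + 4 * b + x];
        avg[b] = (uint16_t)(sum >> 4);
    }
}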
@ -143,7 +143,7 @@ SECTION .text
; , 6/7/2010
;***********************************************************************
; int32_t AnalysisVaaInfoIntra_sse2( uint8_t *pDataY, const int32_t iLineSize );
; int32_t AnalysisVaaInfoIntra_sse2( uint8_t *pDataY, const int32_t iLineSize );
;***********************************************************************
WELS_EXTERN AnalysisVaaInfoIntra_sse2
@ -174,71 +174,71 @@ WELS_EXTERN AnalysisVaaInfoIntra_sse2
mov r4,r2
sal r4,$01 ;r4 = 4*iLineSize
pxor xmm7, xmm7
pxor xmm7, xmm7
; loops
VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
movq [r7], xmm0
; loops
VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
movq [r7], xmm0
lea r0, [r0+r4]
VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
movq [r7+8], xmm0
lea r0, [r0+r4]
VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
movq [r7+8], xmm0
lea r0, [r0+r4]
VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
movq [r7+16], xmm0
lea r0, [r0+r4]
VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
movq [r7+16], xmm0
lea r0, [r0+r4]
VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
movq [r7+24], xmm0
lea r0, [r0+r4]
VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
movq [r7+24], xmm0
movdqa xmm0, [r7] ; block 0~7
movdqa xmm1, [r7+16] ; block 8~15
movdqa xmm2, xmm0
paddw xmm0, xmm1
SUM_WORD_8x2_SSE2 xmm0, xmm3
movdqa xmm0, [r7] ; block 0~7
movdqa xmm1, [r7+16] ; block 8~15
movdqa xmm2, xmm0
paddw xmm0, xmm1
SUM_WORD_8x2_SSE2 xmm0, xmm3
pmullw xmm1, xmm1
pmullw xmm2, xmm2
movdqa xmm3, xmm1
movdqa xmm4, xmm2
punpcklwd xmm1, xmm7
punpckhwd xmm3, xmm7
punpcklwd xmm2, xmm7
punpckhwd xmm4, xmm7
paddd xmm1, xmm2
paddd xmm3, xmm4
paddd xmm1, xmm3
pshufd xmm2, xmm1, 01Bh
paddd xmm1, xmm2
pshufd xmm2, xmm1, 0B1h
paddd xmm1, xmm2
pmullw xmm1, xmm1
pmullw xmm2, xmm2
movdqa xmm3, xmm1
movdqa xmm4, xmm2
punpcklwd xmm1, xmm7
punpckhwd xmm3, xmm7
punpcklwd xmm2, xmm7
punpckhwd xmm4, xmm7
paddd xmm1, xmm2
paddd xmm3, xmm4
paddd xmm1, xmm3
pshufd xmm2, xmm1, 01Bh
paddd xmm1, xmm2
pshufd xmm2, xmm1, 0B1h
paddd xmm1, xmm2
movd r2d, xmm0
and r2, 0ffffh ; effective low word truncated
mov r3, r2
imul r2, r3
sar r2, $04
movd retrd, xmm1
sub retrd, r2d
movd r2d, xmm0
and r2, 0ffffh ; effective low word truncated
mov r3, r2
imul r2, r3
sar r2, $04
movd retrd, xmm1
sub retrd, r2d
add r7,32
add r7,r5
add r7,32
add r7,r5
%ifdef X86_32
pop r6
pop r5
pop r4
pop r3
pop r6
pop r5
pop r4
pop r3
%endif
POP_XMM
POP_XMM
ret
ret
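
After the four VAA_AVG_BLOCK calls, AnalysisVaaInfoIntra_sse2 has the sixteen 4x4 block averages of the macroblock on the stack and returns a variance-style figure: the sum of their squares minus the squared total scaled down by 16. A scalar sketch of that tail computation (the helper name and the avg[] input are illustrative):

#include <stdint.h>

static int32_t analysis_vaa_intra_tail_c(const uint16_t avg[16]) {
    uint32_t sum = 0, sum_sq = 0;
    for (int i = 0; i < 16; i++) {
        sum += avg[i];
        sum_sq += (uint32_t)avg[i] * avg[i];
    }
    sum &= 0xffff;                       /* mirrors 'and r2, 0ffffh' above */
    return (int32_t)(sum_sq - ((sum * sum) >> 4));
}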
;***********************************************************************
; int32_t AnalysisVaaInfoIntra_ssse3( uint8_t *pDataY, const int32_t iLineSize );
; int32_t AnalysisVaaInfoIntra_ssse3( uint8_t *pDataY, const int32_t iLineSize );
;***********************************************************************
WELS_EXTERN AnalysisVaaInfoIntra_ssse3
@ -269,47 +269,47 @@ WELS_EXTERN AnalysisVaaInfoIntra_ssse3
mov r4,r2
sal r4,$01 ;r4 = 4*iLineSize
pxor xmm7, xmm7
pxor xmm7, xmm7
; loops
VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
; loops
VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
movq [r7],xmm0
lea r0,[r0+r4]
VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
lea r0,[r0+r4]
VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
movq [r7+8],xmm1
lea r0,[r0+r4]
VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
lea r0,[r0+r4]
VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
movq [r7+16],xmm0
lea r0,[r0+r4]
VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
lea r0,[r0+r4]
VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
movq [r7+24],xmm1
movdqa xmm0,[r7]
movdqa xmm1,[r7+16]
movdqa xmm2, xmm0
paddw xmm0, xmm1
SUM_WORD_8x2_SSE2 xmm0, xmm3 ; better performance than that of phaddw sets
movdqa xmm0,[r7]
movdqa xmm1,[r7+16]
movdqa xmm2, xmm0
paddw xmm0, xmm1
SUM_WORD_8x2_SSE2 xmm0, xmm3 ; better performance than that of phaddw sets
pmullw xmm1, xmm1
pmullw xmm2, xmm2
movdqa xmm3, xmm1
movdqa xmm4, xmm2
punpcklwd xmm1, xmm7
punpckhwd xmm3, xmm7
punpcklwd xmm2, xmm7
punpckhwd xmm4, xmm7
paddd xmm1, xmm2
paddd xmm3, xmm4
paddd xmm1, xmm3
pshufd xmm2, xmm1, 01Bh
paddd xmm1, xmm2
pshufd xmm2, xmm1, 0B1h
paddd xmm1, xmm2
pmullw xmm1, xmm1
pmullw xmm2, xmm2
movdqa xmm3, xmm1
movdqa xmm4, xmm2
punpcklwd xmm1, xmm7
punpckhwd xmm3, xmm7
punpcklwd xmm2, xmm7
punpckhwd xmm4, xmm7
paddd xmm1, xmm2
paddd xmm3, xmm4
paddd xmm1, xmm3
pshufd xmm2, xmm1, 01Bh
paddd xmm1, xmm2
pshufd xmm2, xmm1, 0B1h
paddd xmm1, xmm2
movd r2d, xmm0
@ -318,94 +318,94 @@ WELS_EXTERN AnalysisVaaInfoIntra_ssse3
imul r2, r3
sar r2, $04
movd retrd, xmm1
sub retrd, r2d
sub retrd, r2d
add r7,32
add r7,r5
add r7,32
add r7,r5
%ifdef X86_32
pop r6
pop r5
pop r4
pop r3
pop r6
pop r5
pop r4
pop r3
%endif
POP_XMM
POP_XMM
ret
ret
;***********************************************************************
; uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 )
; uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 )
;***********************************************************************
WELS_EXTERN MdInterAnalysisVaaInfo_sse41
%assign push_num 0
LOAD_1_PARA
movdqa xmm0,[r0]
pshufd xmm1, xmm0, 01Bh
paddd xmm1, xmm0
pshufd xmm2, xmm1, 0B1h
paddd xmm1, xmm2
psrad xmm1, 02h ; iAverageSad
movdqa xmm2, xmm1
psrad xmm2, 06h
movdqa xmm3, xmm0 ; iSadBlock
psrad xmm3, 06h
psubd xmm3, xmm2
pmulld xmm3, xmm3 ; [comment]: pmulld from SSE4.1 instruction sets
pshufd xmm4, xmm3, 01Bh
paddd xmm4, xmm3
pshufd xmm3, xmm4, 0B1h
paddd xmm3, xmm4
movd r0d, xmm3
cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD
%assign push_num 0
LOAD_1_PARA
movdqa xmm0,[r0]
pshufd xmm1, xmm0, 01Bh
paddd xmm1, xmm0
pshufd xmm2, xmm1, 0B1h
paddd xmm1, xmm2
psrad xmm1, 02h ; iAverageSad
movdqa xmm2, xmm1
psrad xmm2, 06h
movdqa xmm3, xmm0 ; iSadBlock
psrad xmm3, 06h
psubd xmm3, xmm2
pmulld xmm3, xmm3 ; [comment]: pmulld from SSE4.1 instruction sets
pshufd xmm4, xmm3, 01Bh
paddd xmm4, xmm3
pshufd xmm3, xmm4, 0B1h
paddd xmm3, xmm4
movd r0d, xmm3
cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD
jb near .threshold_exit
pshufd xmm0, xmm0, 01Bh
pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad
movmskps retrd, xmm0
ret
jb near .threshold_exit
pshufd xmm0, xmm0, 01Bh
pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad
movmskps retrd, xmm0
ret
.threshold_exit:
mov retrd, 15
ret
mov retrd, 15
ret
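
MdInterAnalysisVaaInfo_sse41 classifies the four 8x8 SADs of a macroblock: if the variance of the scaled-down SADs is below the threshold of 20 it returns 15, otherwise a 4-bit mask of the blocks whose SAD exceeds the average. A scalar sketch (name illustrative; note that pshufd 01Bh reverses lane order before movmskps, hence the mirrored bit index):

#include <stdint.h>

static uint8_t md_inter_analysis_vaa_c(const int32_t sad8x8[4]) {
    int32_t avg = (sad8x8[0] + sad8x8[1] + sad8x8[2] + sad8x8[3]) >> 2;
    int32_t var = 0;
    for (int i = 0; i < 4; i++) {
        int32_t d = (sad8x8[i] >> 6) - (avg >> 6);
        var += d * d;
    }
    if (var < 20)                        /* INTER_VARIANCE_SAD_THRESHOLD */
        return 15;
    uint8_t mask = 0;
    for (int i = 0; i < 4; i++)
        if (sad8x8[i] > avg)
            mask |= (uint8_t)(1 << (3 - i));  /* lanes reversed by pshufd 01Bh */
    return mask;
}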
;***********************************************************************
; uint8_t MdInterAnalysisVaaInfo_sse2( int32_t *pSad8x8 )
; uint8_t MdInterAnalysisVaaInfo_sse2( int32_t *pSad8x8 )
;***********************************************************************
WELS_EXTERN MdInterAnalysisVaaInfo_sse2
%assign push_num 0
LOAD_1_PARA
movdqa xmm0, [r0]
pshufd xmm1, xmm0, 01Bh
paddd xmm1, xmm0
pshufd xmm2, xmm1, 0B1h
paddd xmm1, xmm2
psrad xmm1, 02h ; iAverageSad
movdqa xmm2, xmm1
psrad xmm2, 06h
movdqa xmm3, xmm0 ; iSadBlock
psrad xmm3, 06h
psubd xmm3, xmm2
%assign push_num 0
LOAD_1_PARA
movdqa xmm0, [r0]
pshufd xmm1, xmm0, 01Bh
paddd xmm1, xmm0
pshufd xmm2, xmm1, 0B1h
paddd xmm1, xmm2
psrad xmm1, 02h ; iAverageSad
movdqa xmm2, xmm1
psrad xmm2, 06h
movdqa xmm3, xmm0 ; iSadBlock
psrad xmm3, 06h
psubd xmm3, xmm2
; to replace pmulld functionality as below
movdqa xmm2, xmm3
pmuludq xmm2, xmm3
pshufd xmm4, xmm3, 0B1h
pmuludq xmm4, xmm4
movdqa xmm5, xmm2
punpckldq xmm5, xmm4
punpckhdq xmm2, xmm4
punpcklqdq xmm5, xmm2
; to replace pmulld functionality as below
movdqa xmm2, xmm3
pmuludq xmm2, xmm3
pshufd xmm4, xmm3, 0B1h
pmuludq xmm4, xmm4
movdqa xmm5, xmm2
punpckldq xmm5, xmm4
punpckhdq xmm2, xmm4
punpcklqdq xmm5, xmm2
pshufd xmm4, xmm5, 01Bh
paddd xmm4, xmm5
pshufd xmm5, xmm4, 0B1h
paddd xmm5, xmm4
pshufd xmm4, xmm5, 01Bh
paddd xmm4, xmm5
pshufd xmm5, xmm4, 0B1h
paddd xmm5, xmm4
movd r0d, xmm5
cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD
jb near .threshold_exit
pshufd xmm0, xmm0, 01Bh
pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad
movmskps retrd, xmm0
ret
movd r0d, xmm5
cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD
jb near .threshold_exit
pshufd xmm0, xmm0, 01Bh
pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad
movmskps retrd, xmm0
ret
.threshold_exit:
mov retrd, 15
ret
mov retrd, 15
ret
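
The SSE2 variant differs only in how it squares the four 32-bit differences: pmulld is SSE4.1, so it is emulated with two pmuludq passes and a re-interleave. Per lane the effect is simply:

#include <stdint.h>

/* What the pmuludq/punpck sequence reproduces, lane by lane. */
static void square_lanes_c(const int32_t in[4], int32_t out[4]) {
    for (int i = 0; i < 4; i++)
        out[i] = (int32_t)((uint32_t)in[i] * (uint32_t)in[i]);  /* low 32 bits */
}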
@ -36,128 +36,128 @@
#ifdef __APPLE__
.macro ROW_TRANSFORM_1_STEP
// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2];
vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2];
vshr.s16 $8, $1, #1
vshr.s16 $9, $3, #1
vsubl.s16 $6, $8, $3 //int32 e[i][2] = (src[1]>>1)-src[3];
vaddl.s16 $7, $1, $9 //int32 e[i][3] = src[1] + (src[3]>>1);
// }
// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2];
vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2];
vshr.s16 $8, $1, #1
vshr.s16 $9, $3, #1
vsubl.s16 $6, $8, $3 //int32 e[i][2] = (src[1]>>1)-src[3];
vaddl.s16 $7, $1, $9 //int32 e[i][3] = src[1] + (src[3]>>1);
// }
.endm
.macro TRANSFORM_4BYTES // both row & col transform used
// { // output: f_q[0]~[3], input: e_q[0]~[3];
vadd.s32 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3];
vadd.s32 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2];
vsub.s32 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2];
vsub.s32 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3];
// }
.macro TRANSFORM_4BYTES // both row & col transform used
// { // output: f_q[0]~[3], input: e_q[0]~[3];
vadd.s32 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3];
vadd.s32 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2];
vsub.s32 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2];
vsub.s32 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3];
// }
.endm
.macro COL_TRANSFORM_1_STEP
// { // input: src_q[0]~[3], output: e_q[0]~[3];
vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j];
vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j];
vshr.s32 $6, $1, #1
vshr.s32 $7, $3, #1
vsub.s32 $6, $6, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
vadd.s32 $7, $1, $7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// }
// { // input: src_q[0]~[3], output: e_q[0]~[3];
vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j];
vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j];
vshr.s32 $6, $1, #1
vshr.s32 $7, $3, #1
vsub.s32 $6, $6, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
vadd.s32 $7, $1, $7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// }
.endm
#else
.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2];
vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2];
vshr.s16 \arg8, \arg1, #1
vshr.s16 \arg9, \arg3, #1
vsubl.s16 \arg6, \arg8, \arg3 //int32 e[i][2] = (src[1]>>1)-src[3];
vaddl.s16 \arg7, \arg1, \arg9 //int32 e[i][3] = src[1] + (src[3]>>1);
// }
// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2];
vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2];
vshr.s16 \arg8, \arg1, #1
vshr.s16 \arg9, \arg3, #1
vsubl.s16 \arg6, \arg8, \arg3 //int32 e[i][2] = (src[1]>>1)-src[3];
vaddl.s16 \arg7, \arg1, \arg9 //int32 e[i][3] = src[1] + (src[3]>>1);
// }
.endm
.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used
// { // output: f_q[0]~[3], input: e_q[0]~[3];
vadd.s32 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3];
vadd.s32 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2];
vsub.s32 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2];
vsub.s32 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3];
// }
// { // output: f_q[0]~[3], input: e_q[0]~[3];
vadd.s32 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3];
vadd.s32 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2];
vsub.s32 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2];
vsub.s32 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3];
// }
.endm
.macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// { // input: src_q[0]~[3], output: e_q[0]~[3];
vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j];
vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j];
vshr.s32 \arg6, \arg1, #1
vshr.s32 \arg7, \arg3, #1
vsub.s32 \arg6, \arg6, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
vadd.s32 \arg7, \arg1, \arg7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// }
// { // input: src_q[0]~[3], output: e_q[0]~[3];
vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j];
vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j];
vshr.s32 \arg6, \arg1, #1
vshr.s32 \arg7, \arg3, #1
vsub.s32 \arg6, \arg6, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
vadd.s32 \arg7, \arg1, \arg7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// }
.endm
#endif
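
These macros implement the usual H.264 4x4 inverse-transform butterfly that their comments spell out; in scalar C the per-row (or per-column) step is:

#include <stdint.h>

/* One butterfly: src -> e -> f, exactly as the macro comments describe. */
static void idct4_butterfly_c(const int32_t s[4], int32_t f[4]) {
    int32_t e0 = s[0] + s[2];
    int32_t e1 = s[0] - s[2];
    int32_t e2 = (s[1] >> 1) - s[3];
    int32_t e3 = s[1] + (s[3] >> 1);
    f[0] = e0 + e3;
    f[1] = e1 + e2;
    f[2] = e1 - e2;
    f[3] = e0 - e3;
}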
// r0 int16_t* block,
// r1 int8_t* non_zero_count,
WELS_ASM_FUNC_BEGIN SetNonZeroCount_neon
vld1.64 {d0-d2}, [r1]
vld1.64 {d0-d2}, [r1]
vceq.s8 q0, q0, #0
vceq.s8 d2, d2, #0
vmvn q0, q0
vmvn d2, d2
vabs.s8 q0, q0
vabs.s8 d2, d2
vceq.s8 q0, q0, #0
vceq.s8 d2, d2, #0
vmvn q0, q0
vmvn d2, d2
vabs.s8 q0, q0
vabs.s8 d2, d2
vst1.64 {d0-d2}, [r1]
vst1.64 {d0-d2}, [r1]
WELS_ASM_FUNC_END
// uint8_t *pred, const int32_t stride, int16_t *rs
// uint8_t *pred, const int32_t stride, int16_t *rs
WELS_ASM_FUNC_BEGIN IdctResAddPred_neon
vld4.s16 {d0, d1, d2, d3}, [r2] // cost 3 cycles!
vld4.s16 {d0, d1, d2, d3}, [r2] // cost 3 cycles!
ROW_TRANSFORM_1_STEP d0, d1, d2, d3, q8, q9, q10, q11, d4, d5
ROW_TRANSFORM_1_STEP d0, d1, d2, d3, q8, q9, q10, q11, d4, d5
TRANSFORM_4BYTES q0, q1, q2, q3, q8, q9, q10, q11
TRANSFORM_4BYTES q0, q1, q2, q3, q8, q9, q10, q11
// transform element 32bits
vtrn.s32 q0, q1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
vtrn.s32 q2, q3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
vswp d1, d4 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
vswp d3, d6 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
// transform element 32bits
vtrn.s32 q0, q1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
vtrn.s32 q2, q3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
vswp d1, d4 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
vswp d3, d6 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
COL_TRANSFORM_1_STEP q0, q1, q2, q3, q8, q9, q10, q11
COL_TRANSFORM_1_STEP q0, q1, q2, q3, q8, q9, q10, q11
TRANSFORM_4BYTES q0, q1, q2, q3, q8, q9, q10, q11
TRANSFORM_4BYTES q0, q1, q2, q3, q8, q9, q10, q11
//after clip_table[MAX_NEG_CROP] into [0, 255]
mov r2, r0
vld1.32 {d20[0]},[r0],r1
vld1.32 {d20[1]},[r0],r1
vld1.32 {d22[0]},[r0],r1
vld1.32 {d22[1]},[r0]
//after clip_table[MAX_NEG_CROP] into [0, 255]
mov r2, r0
vld1.32 {d20[0]},[r0],r1
vld1.32 {d20[1]},[r0],r1
vld1.32 {d22[0]},[r0],r1
vld1.32 {d22[1]},[r0]
vrshrn.s32 d16, q0, #6
vrshrn.s32 d17, q1, #6
vrshrn.s32 d18, q2, #6
vrshrn.s32 d19, q3, #6
vrshrn.s32 d16, q0, #6
vrshrn.s32 d17, q1, #6
vrshrn.s32 d18, q2, #6
vrshrn.s32 d19, q3, #6
vmovl.u8 q0,d20
vmovl.u8 q1,d22
vadd.s16 q0,q8
vadd.s16 q1,q9
vmovl.u8 q0,d20
vmovl.u8 q1,d22
vadd.s16 q0,q8
vadd.s16 q1,q9
vqmovun.s16 d20,q0
vqmovun.s16 d22,q1
vqmovun.s16 d20,q0
vqmovun.s16 d22,q1
vst1.32 {d20[0]},[r2],r1
vst1.32 {d20[1]},[r2],r1
vst1.32 {d22[0]},[r2],r1
vst1.32 {d22[1]},[r2]
vst1.32 {d20[0]},[r2],r1
vst1.32 {d20[1]},[r2],r1
vst1.32 {d22[0]},[r2],r1
vst1.32 {d22[1]},[r2]
WELS_ASM_FUNC_END
#endif
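
IdctResAddPred_neon finishes by rounding the transformed residual down by 6 bits, adding it to the prediction and saturating to bytes (vrshrn, vadd, vqmovun). A scalar sketch of that final step (names illustrative):

#include <stdint.h>

static uint8_t clip_u8(int v) { return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v)); }

/* pred += (residual + 32) >> 6, saturated to [0, 255]. */
static void idct_res_add_pred_c(uint8_t *pred, int stride, const int32_t res[4][4]) {
    for (int y = 0; y < 4; y++)
        for (int x = 0; x < 4; x++)
            pred[y * stride + x] =
                clip_u8(pred[y * stride + x] + ((res[y][x] + 32) >> 6));
}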
@ -38,104 +38,104 @@
#ifdef __APPLE__
//Global macro
.macro GET_8BYTE_DATA
vld1.8 {$0[0]}, [$1], $2
vld1.8 {$0[1]}, [$1], $2
vld1.8 {$0[2]}, [$1], $2
vld1.8 {$0[3]}, [$1], $2
vld1.8 {$0[4]}, [$1], $2
vld1.8 {$0[5]}, [$1], $2
vld1.8 {$0[6]}, [$1], $2
vld1.8 {$0[7]}, [$1], $2
vld1.8 {$0[0]}, [$1], $2
vld1.8 {$0[1]}, [$1], $2
vld1.8 {$0[2]}, [$1], $2
vld1.8 {$0[3]}, [$1], $2
vld1.8 {$0[4]}, [$1], $2
vld1.8 {$0[5]}, [$1], $2
vld1.8 {$0[6]}, [$1], $2
vld1.8 {$0[7]}, [$1], $2
.endmacro
#else
//Global macro
.macro GET_8BYTE_DATA arg0, arg1, arg2
vld1.8 {\arg0[0]}, [\arg1], \arg2
vld1.8 {\arg0[1]}, [\arg1], \arg2
vld1.8 {\arg0[2]}, [\arg1], \arg2
vld1.8 {\arg0[3]}, [\arg1], \arg2
vld1.8 {\arg0[4]}, [\arg1], \arg2
vld1.8 {\arg0[5]}, [\arg1], \arg2
vld1.8 {\arg0[6]}, [\arg1], \arg2
vld1.8 {\arg0[7]}, [\arg1], \arg2
vld1.8 {\arg0[0]}, [\arg1], \arg2
vld1.8 {\arg0[1]}, [\arg1], \arg2
vld1.8 {\arg0[2]}, [\arg1], \arg2
vld1.8 {\arg0[3]}, [\arg1], \arg2
vld1.8 {\arg0[4]}, [\arg1], \arg2
vld1.8 {\arg0[5]}, [\arg1], \arg2
vld1.8 {\arg0[6]}, [\arg1], \arg2
vld1.8 {\arg0[7]}, [\arg1], \arg2
.endm
#endif
WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredV_neon
//Get the top line data to 'q0'
sub r2, r0, r1
vldm r2, {d0, d1}
//Get the top line data to 'q0'
sub r2, r0, r1
vldm r2, {d0, d1}
mov r2, r0
mov r3, #4
//Set the top line to the each line of MB(16*16)
mov r2, r0
mov r3, #4
//Set the top line to the each line of MB(16*16)
loop_0_get_i16x16_luma_pred_v:
vst1.8 {d0,d1}, [r2], r1
vst1.8 {d0,d1}, [r2], r1
vst1.8 {d0,d1}, [r2], r1
vst1.8 {d0,d1}, [r2], r1
subs r3, #1
bne loop_0_get_i16x16_luma_pred_v
vst1.8 {d0,d1}, [r2], r1
vst1.8 {d0,d1}, [r2], r1
vst1.8 {d0,d1}, [r2], r1
vst1.8 {d0,d1}, [r2], r1
subs r3, #1
bne loop_0_get_i16x16_luma_pred_v
WELS_ASM_FUNC_END
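
WelsDecoderI16x16LumaPredV_neon simply replicates the row above the macroblock into all sixteen rows; a scalar equivalent (name illustrative):

#include <stdint.h>

static void i16x16_pred_v_c(uint8_t *pred, int stride) {
    const uint8_t *top = pred - stride;      /* row above the macroblock */
    for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++)
            pred[y * stride + x] = top[x];
}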
WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredH_neon
sub r2, r0, #1
mov r3, #4
sub r2, r0, #1
mov r3, #4
loop_0_get_i16x16_luma_pred_h:
//Get one byte data from left side
vld1.8 {d0[],d1[]}, [r2], r1
vld1.8 {d2[],d3[]}, [r2], r1
vld1.8 {d4[],d5[]}, [r2], r1
vld1.8 {d6[],d7[]}, [r2], r1
//Get one byte data from left side
vld1.8 {d0[],d1[]}, [r2], r1
vld1.8 {d2[],d3[]}, [r2], r1
vld1.8 {d4[],d5[]}, [r2], r1
vld1.8 {d6[],d7[]}, [r2], r1
//Set the line of MB using the left side byte data
vst1.8 {d0,d1}, [r0], r1
vst1.8 {d2,d3}, [r0], r1
vst1.8 {d4,d5}, [r0], r1
vst1.8 {d6,d7}, [r0], r1
//Set the line of MB using the left side byte data
vst1.8 {d0,d1}, [r0], r1
vst1.8 {d2,d3}, [r0], r1
vst1.8 {d4,d5}, [r0], r1
vst1.8 {d6,d7}, [r0], r1
subs r3, #1
bne loop_0_get_i16x16_luma_pred_h
subs r3, #1
bne loop_0_get_i16x16_luma_pred_h
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredDc_neon
//stmdb sp!, { r2-r5, lr}
//Get the left vertical line data
sub r2, r0, #1
GET_8BYTE_DATA d0, r2, r1
GET_8BYTE_DATA d1, r2, r1
//stmdb sp!, { r2-r5, lr}
//Get the left vertical line data
sub r2, r0, #1
GET_8BYTE_DATA d0, r2, r1
GET_8BYTE_DATA d1, r2, r1
//Get the top horizontal line data
sub r2, r0, r1
vldm r2, {d2, d3}
//Get the top horizontal line data
sub r2, r0, r1
vldm r2, {d2, d3}
//Calculate the sum of top horizontal line data and vertical line data
vpaddl.u8 q0, q0
vpaddl.u8 q1, q1
vadd.u16 q0, q0, q1
vadd.u16 d0, d0, d1
vpaddl.u16 d0, d0
vpaddl.u32 d0, d0
//Calculate the sum of top horizontal line data and vertical line data
vpaddl.u8 q0, q0
vpaddl.u8 q1, q1
vadd.u16 q0, q0, q1
vadd.u16 d0, d0, d1
vpaddl.u16 d0, d0
vpaddl.u32 d0, d0
//Calculate the mean value
vrshr.u16 d0, d0, #5
vdup.8 q0, d0[0]
//Calculate the mean value
vrshr.u16 d0, d0, #5
vdup.8 q0, d0[0]
//Set the mean value to the all of member of MB
mov r2, #4
//Set the mean value to the all of member of MB
mov r2, #4
loop_0_get_i16x16_luma_pred_dc_both:
vst1.8 {d0,d1}, [r0], r1
vst1.8 {d0,d1}, [r0], r1
vst1.8 {d0,d1}, [r0], r1
vst1.8 {d0,d1}, [r0], r1
subs r2, #1
bne loop_0_get_i16x16_luma_pred_dc_both
vst1.8 {d0,d1}, [r0], r1
vst1.8 {d0,d1}, [r0], r1
vst1.8 {d0,d1}, [r0], r1
vst1.8 {d0,d1}, [r0], r1
subs r2, #1
bne loop_0_get_i16x16_luma_pred_dc_both
WELS_ASM_FUNC_END
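
The DC predictor averages the sixteen left and sixteen top neighbours with rounding (vrshr #5) and fills the macroblock with that value; in scalar C (name illustrative, both-neighbours-available case only):

#include <stdint.h>

static void i16x16_pred_dc_c(uint8_t *pred, int stride) {
    int sum = 0;
    for (int i = 0; i < 16; i++)
        sum += pred[i - stride] + pred[i * stride - 1];   /* top + left */
    uint8_t dc = (uint8_t)((sum + 16) >> 5);
    for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++)
            pred[y * stride + x] = dc;
}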
@ -149,386 +149,386 @@ CONST1_GET_I16X16_LUMA_PRED_PLANE: .long 0xfcfbfaf9, 0x00fffefd
WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredPlane_neon
//stmdb sp!, { r2-r5, lr}
//stmdb sp!, { r2-r5, lr}
//Load the table {(8,7,6,5,4,3,2,1) * 5}
adr r2, CONST0_GET_I16X16_LUMA_PRED_PLANE
vldr d0, [r2]
//Load the table {(8,7,6,5,4,3,2,1) * 5}
adr r2, CONST0_GET_I16X16_LUMA_PRED_PLANE
vldr d0, [r2]
//Pack the top[-1] ~ top[6] to d1
sub r2, r0, r1
sub r3, r2, #1
vld1.8 d1, [r3]
//Pack the top[-1] ~ top[6] to d1
sub r2, r0, r1
sub r3, r2, #1
vld1.8 d1, [r3]
//Pack the top[8] ~ top[15] to d2
add r3, #9
vld1.8 d2, [r3]
//Pack the top[8] ~ top[15] to d2
add r3, #9
vld1.8 d2, [r3]
//Save the top[15] to d6 for next step
vdup.u8 d6, d2[7]
//Save the top[15] to d6 for next step
vdup.u8 d6, d2[7]
//Get and pack left[-1] ~ left[6] to d4
sub r3, r2, #1
GET_8BYTE_DATA d4, r3, r1
//Get and pack left[-1] ~ left[6] to d4
sub r3, r2, #1
GET_8BYTE_DATA d4, r3, r1
//Get and pack left[8] ~ left[15] to d3
add r3, r1
GET_8BYTE_DATA d3, r3, r1
//Get and pack left[8] ~ left[15] to d3
add r3, r1
GET_8BYTE_DATA d3, r3, r1
//Save the left[15] to d7 for next step
vdup.u8 d7, d3[7]
//Save the left[15] to d7 for next step
vdup.u8 d7, d3[7]
//reverse the byte order of d2, d3
vrev64.8 q1, q1
//reverse the byte order of d2, d3
vrev64.8 q1, q1
vsubl.u8 q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...}
vsubl.u8 q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...}
vsubl.u8 q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...}
vsubl.u8 q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...}
vmovl.u8 q0, d0
vmul.s16 q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5}
vmul.s16 q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5}
vmovl.u8 q0, d0
vmul.s16 q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5}
vmul.s16 q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5}
//Calculate the sum of items of q1, q2
vpadd.s16 d0, d2, d3
vpadd.s16 d1, d4, d5
vpaddl.s16 q0, q0
vpaddl.s32 q0, q0
//Calculate the sum of items of q1, q2
vpadd.s16 d0, d2, d3
vpadd.s16 d1, d4, d5
vpaddl.s16 q0, q0
vpaddl.s32 q0, q0
//Get the value of 'b', 'c' and extend to q1, q2.
vrshr.s64 q0, #6
vdup.s16 q1, d0[0]
vdup.s16 q2, d1[0]
//Get the value of 'b', 'c' and extend to q1, q2.
vrshr.s64 q0, #6
vdup.s16 q1, d0[0]
vdup.s16 q2, d1[0]
//Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0
adr r2, CONST1_GET_I16X16_LUMA_PRED_PLANE
vld1.32 {d0}, [r2]
//Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0
adr r2, CONST1_GET_I16X16_LUMA_PRED_PLANE
vld1.32 {d0}, [r2]
//Get the value of 'a' and save to q3
vaddl.u8 q3, d6, d7
vshl.u16 q3, #4
//Get the value of 'a' and save to q3
vaddl.u8 q3, d6, d7
vshl.u16 q3, #4
//calculate a+'b'*{-7,-6,-5,-4,-3,-2,-1,0} + c*{-7}
vmovl.s8 q0, d0
vmla.s16 q3, q0, q1
vmla.s16 q3, q2, d0[0]
//calculate a+'b'*{-7,-6,-5,-4,-3,-2,-1,0} + c*{-7}
vmovl.s8 q0, d0
vmla.s16 q3, q0, q1
vmla.s16 q3, q2, d0[0]
//Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7}
vshl.s16 q8, q1, #3
vadd.s16 q8, q3
//Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7}
vshl.s16 q8, q1, #3
vadd.s16 q8, q3
//right shift 5 bits and rounding
vqrshrun.s16 d0, q3, #5
vqrshrun.s16 d1, q8, #5
//right shift 5 bits and rounding
vqrshrun.s16 d0, q3, #5
vqrshrun.s16 d1, q8, #5
//Set the line of MB
vst1.u32 {d0,d1}, [r0], r1
//Set the line of MB
vst1.u32 {d0,d1}, [r0], r1
//Do the same processing for setting other lines
mov r2, #15
//Do the same processing for setting other lines
mov r2, #15
loop_0_get_i16x16_luma_pred_plane:
vadd.s16 q3, q2
vadd.s16 q8, q2
vqrshrun.s16 d0, q3, #5
vqrshrun.s16 d1, q8, #5
vst1.u32 {d0,d1}, [r0], r1
subs r2, #1
bne loop_0_get_i16x16_luma_pred_plane
vadd.s16 q3, q2
vadd.s16 q8, q2
vqrshrun.s16 d0, q3, #5
vqrshrun.s16 d1, q8, #5
vst1.u32 {d0,d1}, [r0], r1
subs r2, #1
bne loop_0_get_i16x16_luma_pred_plane
WELS_ASM_FUNC_END
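
The plane predictor is the standard H.264 16x16 plane mode: the {(8..1)*5} table and the rounding shifts above compute the b and c slopes, and each output sample is (a + b*(x-7) + c*(y-7) + 16) >> 5, clipped. A scalar sketch (name illustrative):

#include <stdint.h>

static void i16x16_pred_plane_c(uint8_t *pred, int stride) {
    const uint8_t *top = pred - stride;
    const uint8_t *left = pred - 1;
    int h = 0, v = 0;
    for (int i = 1; i <= 8; i++) {
        h += i * (top[7 + i] - top[7 - i]);
        v += i * (left[(7 + i) * stride] - left[(7 - i) * stride]);
    }
    int a = 16 * (top[15] + left[15 * stride]);
    int b = (5 * h + 32) >> 6;
    int c = (5 * v + 32) >> 6;
    for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++) {
            int val = (a + b * (x - 7) + c * (y - 7) + 16) >> 5;
            pred[y * stride + x] = (uint8_t)(val < 0 ? 0 : (val > 255 ? 255 : val));
        }
}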
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredV_neon
//stmdb sp!, { r2-r5, lr}
//Load the top row (4 bytes)
sub r2, r0, r1
ldr r2, [r2]
//stmdb sp!, { r2-r5, lr}
//Load the top row (4 bytes)
sub r2, r0, r1
ldr r2, [r2]
//Set the luma MB using top line
str r2, [r0], r1
str r2, [r0], r1
str r2, [r0], r1
str r2, [r0]
//Set the luma MB using top line
str r2, [r0], r1
str r2, [r0], r1
str r2, [r0], r1
str r2, [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredH_neon
//stmdb sp!, { r2-r5, lr}
//Load the left column (4 bytes)
sub r2, r0, #1
vld1.8 {d0[]}, [r2], r1
vld1.8 {d1[]}, [r2], r1
vld1.8 {d2[]}, [r2], r1
vld1.8 {d3[]}, [r2]
//stmdb sp!, { r2-r5, lr}
//Load the left column (4 bytes)
sub r2, r0, #1
vld1.8 {d0[]}, [r2], r1
vld1.8 {d1[]}, [r2], r1
vld1.8 {d2[]}, [r2], r1
vld1.8 {d3[]}, [r2]
//Set the luma MB using the left side byte
vst1.32 {d0[0]}, [r0], r1
vst1.32 {d1[0]}, [r0], r1
vst1.32 {d2[0]}, [r0], r1
vst1.32 {d3[0]}, [r0]
//Set the luma MB using the left side byte
vst1.32 {d0[0]}, [r0], r1
vst1.32 {d1[0]}, [r0], r1
vst1.32 {d2[0]}, [r0], r1
vst1.32 {d3[0]}, [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredDDL_neon
//stmdb sp!, { r2-r5, lr}
//Load the top row data(8 bytes)
sub r2, r0, r1
vld1.32 {d0}, [r2]
//stmdb sp!, { r2-r5, lr}
//Load the top row data(8 bytes)
sub r2, r0, r1
vld1.32 {d0}, [r2]
//For "t7 + (t7<<1)"
vdup.8 d1, d0[7]
//For "t7 + (t7<<1)"
vdup.8 d1, d0[7]
//calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7"
vext.8 d1, d0, d1, #1
vaddl.u8 q1, d1, d0
//calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7"
vext.8 d1, d0, d1, #1
vaddl.u8 q1, d1, d0
//calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7"
vext.8 q2, q1, q1, #14
vadd.u16 q0, q1, q2
//calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7"
vext.8 q2, q1, q1, #14
vadd.u16 q0, q1, q2
//right shift 2 bits and rounding
vqrshrn.u16 d0, q0, #2
//right shift 2 bits and rounding
vqrshrn.u16 d0, q0, #2
//Save "ddl0, ddl1, ddl2, ddl3"
vext.8 d1, d0, d0, #1
vst1.32 d1[0], [r0], r1
//Save "ddl0, ddl1, ddl2, ddl3"
vext.8 d1, d0, d0, #1
vst1.32 d1[0], [r0], r1
//Save "ddl1, ddl2, ddl3, ddl4"
vext.8 d1, d0, d0, #2
vst1.32 d1[0], [r0], r1
//Save "ddl1, ddl2, ddl3, ddl4"
vext.8 d1, d0, d0, #2
vst1.32 d1[0], [r0], r1
//Save "ddl2, ddl3, ddl4, ddl5"
vext.8 d1, d0, d0, #3
vst1.32 d1[0], [r0], r1
//Save "ddl2, ddl3, ddl4, ddl5"
vext.8 d1, d0, d0, #3
vst1.32 d1[0], [r0], r1
//Save "ddl3, ddl4, ddl5, ddl6"
vst1.32 d0[1], [r0]
//Save "ddl3, ddl4, ddl5, ddl6"
vst1.32 d0[1], [r0]
WELS_ASM_FUNC_END
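
Diagonal-down-left builds seven smoothed values from the eight top neighbours, (t[i] + 2*t[i+1] + t[i+2] + 2) >> 2 with the element past the end taken as t[7], and each predicted row is a sliding window of four of them. A scalar sketch (name illustrative):

#include <stdint.h>

static void i4x4_pred_ddl_c(uint8_t *pred, int stride) {
    const uint8_t *t = pred - stride;        /* eight neighbours above */
    uint8_t ddl[7];
    for (int i = 0; i < 7; i++) {
        int t2 = (i + 2 < 8) ? t[i + 2] : t[7];
        ddl[i] = (uint8_t)((t[i] + 2 * t[i + 1] + t2 + 2) >> 2);
    }
    for (int y = 0; y < 4; y++)
        for (int x = 0; x < 4; x++)
            pred[y * stride + x] = ddl[x + y];
}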
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredDDR_neon
//stmdb sp!, { r2-r5, lr}
//Load the top row (4 bytes)
sub r2, r0, r1
vld1.32 {d0[1]}, [r2]
//stmdb sp!, { r2-r5, lr}
//Load the top row (4 bytes)
sub r2, r0, r1
vld1.32 {d0[1]}, [r2]
//Load the left column (5 bytes)
sub r2, #1
vld1.8 {d0[3]}, [r2], r1
vld1.8 {d0[2]}, [r2], r1
vld1.8 {d0[1]}, [r2], r1
vld1.8 {d0[0]}, [r2], r1
vld1.8 {d1[7]}, [r2] //For packing the right sequence to do SIMD processing
//Load the left column (5 bytes)
sub r2, #1
vld1.8 {d0[3]}, [r2], r1
vld1.8 {d0[2]}, [r2], r1
vld1.8 {d0[1]}, [r2], r1
vld1.8 {d0[0]}, [r2], r1
vld1.8 {d1[7]}, [r2] //For packing the right sequence to do SIMD processing
vext.8 d2, d1, d0, #7 //d0:{L2,L1,L0,LT,T0,T1,T2,T3}
//d2:{L3,L2,L1,L0,LT,T0,T1,T2}
vext.8 d2, d1, d0, #7 //d0:{L2,L1,L0,LT,T0,T1,T2,T3}
//d2:{L3,L2,L1,L0,LT,T0,T1,T2}
//q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3}
vaddl.u8 q2, d2, d0
//q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3}
vaddl.u8 q2, d2, d0
//q1:{TL0+LT0,LT0+T01,...L12+L23}
vext.8 q3, q3, q2, #14
vadd.u16 q1, q2, q3
//q1:{TL0+LT0,LT0+T01,...L12+L23}
vext.8 q3, q3, q2, #14
vadd.u16 q1, q2, q3
//right shift 2 bits and rounding
vqrshrn.u16 d0, q1, #2
//right shift 2 bits and rounding
vqrshrn.u16 d0, q1, #2
//Adjust the data sequence for setting luma MB of 'pred'
vst1.32 d0[1], [r0], r1
vext.8 d0, d0, d0, #7
vst1.32 d0[1], [r0], r1
vext.8 d0, d0, d0, #7
vst1.32 d0[1], [r0], r1
vext.8 d0, d0, d0, #7
vst1.32 d0[1], [r0]
//Adjust the data sequence for setting luma MB of 'pred'
vst1.32 d0[1], [r0], r1
vext.8 d0, d0, d0, #7
vst1.32 d0[1], [r0], r1
vext.8 d0, d0, d0, #7
vst1.32 d0[1], [r0], r1
vext.8 d0, d0, d0, #7
vst1.32 d0[1], [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredVL_neon
//stmdb sp!, { r2-r5, lr}
//Load the top row (8 bytes)
sub r2, r0, r1
vld1.32 {d0}, [r2]
//stmdb sp!, { r2-r5, lr}
//Load the top row (8 bytes)
sub r2, r0, r1
vld1.32 {d0}, [r2]
vext.8 d1, d0, d0, #1
vaddl.u8 q1, d1, d0 //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x}
vext.8 d1, d0, d0, #1
vaddl.u8 q1, d1, d0 //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x}
vext.8 q2, q1, q1, #2
vadd.u16 q2, q1, q2 //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x}
vext.8 q2, q1, q1, #2
vadd.u16 q2, q1, q2 //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x}
//calculate the "vl0,vl1,vl2,vl3,vl4"
vqrshrn.u16 d0, q1, #1
//calculate the "vl0,vl1,vl2,vl3,vl4"
vqrshrn.u16 d0, q1, #1
//calculate the "vl5,vl6,vl7,vl8,vl9"
vqrshrn.u16 d1, q2, #2
//calculate the "vl5,vl6,vl7,vl8,vl9"
vqrshrn.u16 d1, q2, #2
//Adjust the data sequence for setting the luma MB
vst1.32 d0[0], [r0], r1
vst1.32 d1[0], [r0], r1
vext.8 d0, d0, d0, #1
vext.8 d1, d1, d1, #1
vst1.32 d0[0], [r0], r1
vst1.32 d1[0], [r0]
//Adjust the data sequence for setting the luma MB
vst1.32 d0[0], [r0], r1
vst1.32 d1[0], [r0], r1
vext.8 d0, d0, d0, #1
vext.8 d1, d1, d1, #1
vst1.32 d0[0], [r0], r1
vst1.32 d1[0], [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredVR_neon
//stmdb sp!, { r2-r5, lr}
//Load the top row (4 bytes)
sub r2, r0, r1
vld1.32 {d0[1]}, [r2]
//stmdb sp!, { r2-r5, lr}
//Load the top row (4 bytes)
sub r2, r0, r1
vld1.32 {d0[1]}, [r2]
//Load the left column (4 bytes)
sub r2, #1
vld1.8 {d0[3]}, [r2], r1
vld1.8 {d0[2]}, [r2], r1
vld1.8 {d0[1]}, [r2], r1
vld1.8 {d0[0]}, [r2]
//Load the left column (4 bytes)
sub r2, #1
vld1.8 {d0[3]}, [r2], r1
vld1.8 {d0[2]}, [r2], r1
vld1.8 {d0[1]}, [r2], r1
vld1.8 {d0[0]}, [r2]
vext.8 d1, d0, d0, #7
vaddl.u8 q1, d0, d1 //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}
vext.8 d1, d0, d0, #7
vaddl.u8 q1, d0, d1 //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}
vext.u8 q2, q1, q1, #14
vadd.u16 q2, q2, q1 //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3}
vext.u8 q2, q1, q1, #14
vadd.u16 q2, q2, q1 //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3}
//Calculate the vr0 ~ vr9
vqrshrn.u16 d1, q2, #2
vqrshrn.u16 d0, q1, #1
//Calculate the vr0 ~ vr9
vqrshrn.u16 d1, q2, #2
vqrshrn.u16 d0, q1, #1
//Adjust the data sequence for setting the luma MB
vst1.32 d0[1], [r0], r1
vst1.32 d1[1], [r0], r1
add r2, r0, r1
vst1.8 d1[3], [r0]!
vst1.16 d0[2], [r0]!
vst1.8 d0[6], [r0]!
vst1.8 d1[2], [r2]!
vst1.16 d1[2], [r2]!
vst1.8 d1[6], [r2]
//Adjust the data sequence for setting the luma MB
vst1.32 d0[1], [r0], r1
vst1.32 d1[1], [r0], r1
add r2, r0, r1
vst1.8 d1[3], [r0]!
vst1.16 d0[2], [r0]!
vst1.8 d0[6], [r0]!
vst1.8 d1[2], [r2]!
vst1.16 d1[2], [r2]!
vst1.8 d1[6], [r2]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredHU_neon
//stmdb sp!, { r2-r5, lr}
//Load the left column data
sub r2, r0, #1
mov r3, #3
mul r3, r1
add r3, r2
vld1.8 {d0[]}, [r3]
vld1.8 {d0[4]}, [r2], r1
vld1.8 {d0[5]}, [r2], r1
vld1.8 {d0[6]}, [r2], r1 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}
//stmdb sp!, { r2-r5, lr}
//Load the left column data
sub r2, r0, #1
mov r3, #3
mul r3, r1
add r3, r2
vld1.8 {d0[]}, [r3]
vld1.8 {d0[4]}, [r2], r1
vld1.8 {d0[5]}, [r2], r1
vld1.8 {d0[6]}, [r2], r1 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}
vext.8 d1, d0, d0, #1
vaddl.u8 q2, d0, d1 //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
vext.8 d1, d0, d0, #1
vaddl.u8 q2, d0, d1 //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
vext.u8 d2, d5, d4, #2
vadd.u16 d3, d2, d5 //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}
vext.u8 d2, d5, d4, #2
vadd.u16 d3, d2, d5 //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}
//Calculate the hu0 ~ hu5
vqrshrn.u16 d2, q2, #1
vqrshrn.u16 d1, q1, #2
//Calculate the hu0 ~ hu5
vqrshrn.u16 d2, q2, #1
vqrshrn.u16 d1, q1, #2
//Adjust the data sequence for setting the luma MB
vzip.8 d2, d1
vst1.32 d1[0], [r0], r1
vext.8 d2, d1, d1, #2
vst1.32 d2[0], [r0], r1
vst1.32 d1[1], [r0], r1
vst1.32 d0[0], [r0]
//Adjust the data sequence for setting the luma MB
vzip.8 d2, d1
vst1.32 d1[0], [r0], r1
vext.8 d2, d1, d1, #2
vst1.32 d2[0], [r0], r1
vst1.32 d1[1], [r0], r1
vst1.32 d0[0], [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredHD_neon
//stmdb sp!, { r2-r5, lr}
//Load the data
sub r2, r0, r1
sub r2, #1
vld1.32 {d0[1]}, [r2], r1
vld1.8 {d0[3]}, [r2], r1
vld1.8 {d0[2]}, [r2], r1
vld1.8 {d0[1]}, [r2], r1
vld1.8 {d0[0]}, [r2] //d0:{L3,L2,L1,L0,LT,T0,T1,T2}
//stmdb sp!, { r2-r5, lr}
//Load the data
sub r2, r0, r1
sub r2, #1
vld1.32 {d0[1]}, [r2], r1
vld1.8 {d0[3]}, [r2], r1
vld1.8 {d0[2]}, [r2], r1
vld1.8 {d0[1]}, [r2], r1
vld1.8 {d0[0]}, [r2] //d0:{L3,L2,L1,L0,LT,T0,T1,T2}
vext.8 d1, d0, d0, #7
vaddl.u8 q1, d0, d1 //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2}
vext.8 d1, d0, d0, #7
vaddl.u8 q1, d0, d1 //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2}
vext.u8 q2, q1, q1, #14 //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1}
vadd.u16 q3, q2, q1 //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2}
vext.u8 q2, q1, q1, #14 //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1}
vadd.u16 q3, q2, q1 //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2}
//Calculate the hd0~hd9
vqrshrn.u16 d1, q3, #2
vqrshrn.u16 d0, q2, #1
//Calculate the hd0~hd9
vqrshrn.u16 d1, q3, #2
vqrshrn.u16 d0, q2, #1
//Adjust the data sequence for setting the luma MB
vmov d3, d1
vtrn.8 d0, d1
vext.u8 d2, d1, d1, #6
vst2.16 {d2[3], d3[3]}, [r0], r1
vst2.16 {d0[2], d1[2]}, [r0], r1
vmov d3, d0
vst2.16 {d2[2], d3[2]}, [r0], r1
vst2.16 {d0[1], d1[1]}, [r0]
//Adjust the data sequence for setting the luma MB
vmov d3, d1
vtrn.8 d0, d1
vext.u8 d2, d1, d1, #6
vst2.16 {d2[3], d3[3]}, [r0], r1
vst2.16 {d0[2], d1[2]}, [r0], r1
vmov d3, d0
vst2.16 {d2[2], d3[2]}, [r0], r1
vst2.16 {d0[1], d1[1]}, [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredV_neon
//stmdb sp!, { r2-r5, lr}
//Get the top row (8 byte)
sub r2, r0, r1
vldr d0, [r2]
//stmdb sp!, { r2-r5, lr}
//Get the top row (8 byte)
sub r2, r0, r1
vldr d0, [r2]
//Set the chroma MB using top row data
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0]
//Set the chroma MB using top row data
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredH_neon
//stmdb sp!, { r2-r5, lr}
////Get the left column (8 byte)
sub r2, r0, #1
vld1.8 {d0[]}, [r2], r1
vld1.8 {d1[]}, [r2], r1
vld1.8 {d2[]}, [r2], r1
vld1.8 {d3[]}, [r2], r1
vld1.8 {d4[]}, [r2], r1
vld1.8 {d5[]}, [r2], r1
vld1.8 {d6[]}, [r2], r1
vld1.8 {d7[]}, [r2]
//stmdb sp!, { r2-r5, lr}
////Get the left column (8 byte)
sub r2, r0, #1
vld1.8 {d0[]}, [r2], r1
vld1.8 {d1[]}, [r2], r1
vld1.8 {d2[]}, [r2], r1
vld1.8 {d3[]}, [r2], r1
vld1.8 {d4[]}, [r2], r1
vld1.8 {d5[]}, [r2], r1
vld1.8 {d6[]}, [r2], r1
vld1.8 {d7[]}, [r2]
//Set the chroma MB using left column data
vst1.8 {d0}, [r0], r1
vst1.8 {d1}, [r0], r1
vst1.8 {d2}, [r0], r1
vst1.8 {d3}, [r0], r1
vst1.8 {d4}, [r0], r1
vst1.8 {d5}, [r0], r1
vst1.8 {d6}, [r0], r1
vst1.8 {d7}, [r0]
//Set the chroma MB using left column data
vst1.8 {d0}, [r0], r1
vst1.8 {d1}, [r0], r1
vst1.8 {d2}, [r0], r1
vst1.8 {d3}, [r0], r1
vst1.8 {d4}, [r0], r1
vst1.8 {d5}, [r0], r1
vst1.8 {d6}, [r0], r1
vst1.8 {d7}, [r0]
WELS_ASM_FUNC_END
@ -576,73 +576,73 @@ CONST0_GET_I_CHROMA_PRED_PLANE: .long 0x44332211, 0x44332211//0x140f0a05, 0x2823
CONST1_GET_I_CHROMA_PRED_PLANE: .long 0xfffefffd, 0x0000ffff,0x00020001,0x00040003
WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredPlane_neon
//stmdb sp!, { r2-r5, lr}
//Load the top row data
sub r2, r0, #1
sub r2, r1
vld1.32 {d1[0]}, [r2]
add r2, #5
vld1.32 {d0[0]}, [r2]
//stmdb sp!, { r2-r5, lr}
//Load the top row data
sub r2, r0, #1
sub r2, r1
vld1.32 {d1[0]}, [r2]
add r2, #5
vld1.32 {d0[0]}, [r2]
//Load the left column data
sub r2, #5
vld1.8 {d1[4]}, [r2], r1
vld1.8 {d1[5]}, [r2], r1
vld1.8 {d1[6]}, [r2], r1
vld1.8 {d1[7]}, [r2], r1 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
add r2, r1
vld1.8 {d0[4]}, [r2], r1
vld1.8 {d0[5]}, [r2], r1
vld1.8 {d0[6]}, [r2], r1
vld1.8 {d0[7]}, [r2] //d0:{T4,T5,T6,T7,L4,L5,L6.L7}
//Load the left column data
sub r2, #5
vld1.8 {d1[4]}, [r2], r1
vld1.8 {d1[5]}, [r2], r1
vld1.8 {d1[6]}, [r2], r1
vld1.8 {d1[7]}, [r2], r1 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
add r2, r1
vld1.8 {d0[4]}, [r2], r1
vld1.8 {d0[5]}, [r2], r1
vld1.8 {d0[6]}, [r2], r1
vld1.8 {d0[7]}, [r2] //d0:{T4,T5,T6,T7,L4,L5,L6.L7}
//Save T7 to d3 for next step
vdup.u8 d3, d0[3]
//Save L7 to d4 for next step
vdup.u8 d4, d0[7]
//Save T7 to d3 for next step
vdup.u8 d3, d0[3]
//Save L7 to d4 for next step
vdup.u8 d4, d0[7]
//Calculate the value of 'a' and save to q2
vaddl.u8 q2, d3, d4
vshl.u16 q2, #4
//Calculate the value of 'a' and save to q2
vaddl.u8 q2, d3, d4
vshl.u16 q2, #4
//Load the table {{1,2,3,4,1,2,3,4}*17}
adr r2, CONST0_GET_I_CHROMA_PRED_PLANE
vld1.32 {d2}, [r2]
//Load the table {{1,2,3,4,1,2,3,4}*17}
adr r2, CONST0_GET_I_CHROMA_PRED_PLANE
vld1.32 {d2}, [r2]
//Calculate the 'b','c', and save to q0
vrev32.8 d1, d1
vsubl.u8 q0, d0, d1
vmovl.u8 q1, d2
vmul.s16 q0, q1
vpaddl.s16 q0, q0
vpaddl.s32 q0, q0
vrshr.s64 q0, #5
//Calculate the 'b','c', and save to q0
vrev32.8 d1, d1
vsubl.u8 q0, d0, d1
vmovl.u8 q1, d2
vmul.s16 q0, q1
vpaddl.s16 q0, q0
vpaddl.s32 q0, q0
vrshr.s64 q0, #5
//Load the table {-3,-2,-1,0,1,2,3,4} to q3
adr r2, CONST1_GET_I_CHROMA_PRED_PLANE
vld1.32 {d6, d7}, [r2]
//Load the table {-3,-2,-1,0,1,2,3,4} to q3
adr r2, CONST1_GET_I_CHROMA_PRED_PLANE
vld1.32 {d6, d7}, [r2]
//Duplicate the 'b','c' to q0, q1 for SIMD instruction
vdup.s16 q1, d1[0]
vdup.s16 q0, d0[0]
//Duplicate the 'b','c' to q0, q1 for SIMD instruction
vdup.s16 q1, d1[0]
vdup.s16 q0, d0[0]
//Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;"
vmla.s16 q2, q0, q3
vmla.s16 q2, q1, d6[0]
vqrshrun.s16 d0, q2, #5
//Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;"
vmla.s16 q2, q0, q3
vmla.s16 q2, q1, d6[0]
vqrshrun.s16 d0, q2, #5
//Set a line of chroma MB
vst1.u32 {d0}, [r0], r1
//Set a line of chroma MB
vst1.u32 {d0}, [r0], r1
//Do the same processing for each line.
mov r2, #7
//Do the same processing for each line.
mov r2, #7
loop_0_get_i_chroma_pred_plane:
vadd.s16 q2, q1
vqrshrun.s16 d0, q2, #5
vst1.u32 {d0}, [r0], r1
subs r2, #1
bne loop_0_get_i_chroma_pred_plane
vadd.s16 q2, q1
vqrshrun.s16 d0, q2, #5
vst1.u32 {d0}, [r0], r1
subs r2, #1
bne loop_0_get_i_chroma_pred_plane
WELS_ASM_FUNC_END
@ -54,7 +54,7 @@
%endmacro
%macro MMX_SumSub 3
movq %3, %2
movq %3, %2
psubw %2, %1
paddw %1, %3
%endmacro
@ -62,8 +62,8 @@
%macro MMX_IDCT 6
MMX_SumSub %4, %5, %6
MMX_SumSubDiv2 %3, %2, %1
MMX_SumSub %1, %4, %6
MMX_SumSub %3, %5, %6
MMX_SumSub %1, %4, %6
MMX_SumSub %3, %5, %6
%endmacro
@ -96,13 +96,13 @@ WELS_EXTERN IdctResAddPred_mmx
movq mm2, [r2+16]
movq mm3, [r2+24]
MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4
MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6
MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4
MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6
MMX_Trans4x4W mm1, mm3, mm0, mm4, mm2
MMX_IDCT mm3, mm0, mm4, mm2, mm1, mm6
MMX_IDCT mm3, mm0, mm4, mm2, mm1, mm6
WELS_Zero mm7
WELS_DW32 mm6
WELS_Zero mm7
WELS_DW32 mm6
MMX_StoreDiff4P mm3, mm0, mm6, mm7, [r0]
MMX_StoreDiff4P mm4, mm0, mm6, mm7, [r0+r1]
@ -111,5 +111,5 @@ WELS_EXTERN IdctResAddPred_mmx
MMX_StoreDiff4P mm2, mm0, mm6, mm7, [r0+r1]
emms
emms
ret
File diff suppressed because it is too large
@ -38,107 +38,107 @@
#ifdef __APPLE__
//Global macro
.macro GET_8BYTE_DATA
vld1.8 {$0[0]}, [$1], $2
vld1.8 {$0[1]}, [$1], $2
vld1.8 {$0[2]}, [$1], $2
vld1.8 {$0[3]}, [$1], $2
vld1.8 {$0[4]}, [$1], $2
vld1.8 {$0[5]}, [$1], $2
vld1.8 {$0[6]}, [$1], $2
vld1.8 {$0[7]}, [$1], $2
vld1.8 {$0[0]}, [$1], $2
vld1.8 {$0[1]}, [$1], $2
vld1.8 {$0[2]}, [$1], $2
vld1.8 {$0[3]}, [$1], $2
vld1.8 {$0[4]}, [$1], $2
vld1.8 {$0[5]}, [$1], $2
vld1.8 {$0[6]}, [$1], $2
vld1.8 {$0[7]}, [$1], $2
.endm
#else
//Global macro
.macro GET_8BYTE_DATA arg0, arg1, arg2
vld1.8 {\arg0[0]}, [\arg1], \arg2
vld1.8 {\arg0[1]}, [\arg1], \arg2
vld1.8 {\arg0[2]}, [\arg1], \arg2
vld1.8 {\arg0[3]}, [\arg1], \arg2
vld1.8 {\arg0[4]}, [\arg1], \arg2
vld1.8 {\arg0[5]}, [\arg1], \arg2
vld1.8 {\arg0[6]}, [\arg1], \arg2
vld1.8 {\arg0[7]}, [\arg1], \arg2
vld1.8 {\arg0[0]}, [\arg1], \arg2
vld1.8 {\arg0[1]}, [\arg1], \arg2
vld1.8 {\arg0[2]}, [\arg1], \arg2
vld1.8 {\arg0[3]}, [\arg1], \arg2
vld1.8 {\arg0[4]}, [\arg1], \arg2
vld1.8 {\arg0[5]}, [\arg1], \arg2
vld1.8 {\arg0[6]}, [\arg1], \arg2
vld1.8 {\arg0[7]}, [\arg1], \arg2
.endm
#endif
WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredV_neon
//Get the top line data to 'q0'
sub r3, r1, r2
vldm r3, {d0, d1}
//Get the top line data to 'q0'
sub r3, r1, r2
vldm r3, {d0, d1}
//mov r2, #16
mov r3, #4
//Set the top line to the each line of MB(16*16)
//mov r2, #16
mov r3, #4
//Set the top line to the each line of MB(16*16)
loop_0_get_i16x16_luma_pred_v:
vst1.8 {d0,d1}, [r0]!
vst1.8 {d0,d1}, [r0]!
vst1.8 {d0,d1}, [r0]!
vst1.8 {d0,d1}, [r0]!
subs r3, #1
bne loop_0_get_i16x16_luma_pred_v
vst1.8 {d0,d1}, [r0]!
vst1.8 {d0,d1}, [r0]!
vst1.8 {d0,d1}, [r0]!
vst1.8 {d0,d1}, [r0]!
subs r3, #1
bne loop_0_get_i16x16_luma_pred_v
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredH_neon
//stmdb sp!, {r4, lr}
sub r1, r1, #1
mov r3, #4
sub r1, r1, #1
mov r3, #4
loop_0_get_i16x16_luma_pred_h:
//Get one byte data from left side
vld1.8 {d0[],d1[]}, [r1], r2
vld1.8 {d2[],d3[]}, [r1], r2
vld1.8 {d4[],d5[]}, [r1], r2
vld1.8 {d6[],d7[]}, [r1], r2
//Get one byte data from left side
vld1.8 {d0[],d1[]}, [r1], r2
vld1.8 {d2[],d3[]}, [r1], r2
vld1.8 {d4[],d5[]}, [r1], r2
vld1.8 {d6[],d7[]}, [r1], r2
//Set the line of MB using the left side byte data
vst1.8 {d0,d1}, [r0]!
//add r0, #16
vst1.8 {d2,d3}, [r0]!
//add r0, #16
vst1.8 {d4,d5}, [r0]!
//add r0, #16
vst1.8 {d6,d7}, [r0]!
//add r0, #16
//Set the line of MB using the left side byte data
vst1.8 {d0,d1}, [r0]!
//add r0, #16
vst1.8 {d2,d3}, [r0]!
//add r0, #16
vst1.8 {d4,d5}, [r0]!
//add r0, #16
vst1.8 {d6,d7}, [r0]!
//add r0, #16
subs r3, #1
bne loop_0_get_i16x16_luma_pred_h
subs r3, #1
bne loop_0_get_i16x16_luma_pred_h
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredDc_neon
//stmdb sp!, { r2-r5, lr}
//Get the left vertical line data
sub r3, r1, #1
GET_8BYTE_DATA d0, r3, r2
GET_8BYTE_DATA d1, r3, r2
//stmdb sp!, { r2-r5, lr}
//Get the left vertical line data
sub r3, r1, #1
GET_8BYTE_DATA d0, r3, r2
GET_8BYTE_DATA d1, r3, r2
//Get the top horizontal line data
sub r3, r1, r2
vldm r3, {d2, d3}
//Get the top horizontal line data
sub r3, r1, r2
vldm r3, {d2, d3}
//Calculate the sum of top horizontal line data and vertical line data
vpaddl.u8 q0, q0
vpaddl.u8 q1, q1
vadd.u16 q0, q0, q1
vadd.u16 d0, d0, d1
vpaddl.u16 d0, d0
vpaddl.u32 d0, d0
//Calculate the sum of top horizontal line data and vertical line data
vpaddl.u8 q0, q0
vpaddl.u8 q1, q1
vadd.u16 q0, q0, q1
vadd.u16 d0, d0, d1
vpaddl.u16 d0, d0
vpaddl.u32 d0, d0
//Calculate the mean value
vrshr.u16 d0, d0, #5
vdup.8 q0, d0[0]
//Calculate the mean value
vrshr.u16 d0, d0, #5
vdup.8 q0, d0[0]
//Set the mean value to the all of member of MB
mov r3, #4
//Set the mean value to the all of member of MB
mov r3, #4
loop_0_get_i16x16_luma_pred_dc_both:
vst1.8 {d0,d1}, [r0]!
vst1.8 {d0,d1}, [r0]!
vst1.8 {d0,d1}, [r0]!
vst1.8 {d0,d1}, [r0]!
subs r3, #1
bne loop_0_get_i16x16_luma_pred_dc_both
vst1.8 {d0,d1}, [r0]!
vst1.8 {d0,d1}, [r0]!
vst1.8 {d0,d1}, [r0]!
vst1.8 {d0,d1}, [r0]!
subs r3, #1
bne loop_0_get_i16x16_luma_pred_dc_both
WELS_ASM_FUNC_END
@ -151,383 +151,383 @@ CONST1_GET_I16X16_LUMA_PRED_PLANE: .long 0xfcfbfaf9, 0x00fffefd
WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredPlane_neon
//stmdb sp!, { r4, lr}
//stmdb sp!, { r4, lr}
//Load the table {(8,7,6,5,4,3,2,1) * 5}
adr r3, CONST0_GET_I16X16_LUMA_PRED_PLANE
vldr d0, [r3]
//Load the table {(8,7,6,5,4,3,2,1) * 5}
adr r3, CONST0_GET_I16X16_LUMA_PRED_PLANE
vldr d0, [r3]
//Pack the top[-1] ~ top[6] to d1
sub r3, r1, r2
sub r1, r3, #1
vld1.8 d1, [r1]
//Pack the top[-1] ~ top[6] to d1
sub r3, r1, r2
sub r1, r3, #1
vld1.8 d1, [r1]
//Pack the top[8] ~ top[15] to d2
add r1, #9
vld1.8 d2, [r1]
//Pack the top[8] ~ top[15] to d2
add r1, #9
vld1.8 d2, [r1]
//Save the top[15] to d6 for next step
vdup.u8 d6, d2[7]
//Save the top[15] to d6 for next step
vdup.u8 d6, d2[7]
//Get and pack left[-1] ~ left[6] to d4
sub r1, r3, #1
GET_8BYTE_DATA d4, r1, r2
//Get and pack left[-1] ~ left[6] to d4
sub r1, r3, #1
GET_8BYTE_DATA d4, r1, r2
//Get and pack left[8] ~ left[15] to d3
add r1, r2
GET_8BYTE_DATA d3, r1, r2
//Get and pack left[8] ~ left[15] to d3
add r1, r2
GET_8BYTE_DATA d3, r1, r2
//Save the left[15] to d7 for next step
vdup.u8 d7, d3[7]
//Save the left[15] to d7 for next step
vdup.u8 d7, d3[7]
//reverse the byte order of d2, d3
vrev64.8 q1, q1
//reverse the byte order of d2, d3
vrev64.8 q1, q1
vsubl.u8 q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...}
vsubl.u8 q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...}
vsubl.u8 q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...}
vsubl.u8 q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...}
vmovl.u8 q0, d0
vmul.s16 q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5}
vmul.s16 q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5}
vmovl.u8 q0, d0
vmul.s16 q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5}
vmul.s16 q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5}
//Calculate the sum of items of q1, q2
vpadd.s16 d0, d2, d3
vpadd.s16 d1, d4, d5
vpaddl.s16 q0, q0
vpaddl.s32 q0, q0
//Calculate the sum of items of q1, q2
vpadd.s16 d0, d2, d3
vpadd.s16 d1, d4, d5
vpaddl.s16 q0, q0
vpaddl.s32 q0, q0
//Get the value of 'b', 'c' and extend to q1, q2.
vrshr.s64 q0, #6
vdup.s16 q1, d0[0]
vdup.s16 q2, d1[0]
//Get the value of 'b', 'c' and extend to q1, q2.
vrshr.s64 q0, #6
vdup.s16 q1, d0[0]
vdup.s16 q2, d1[0]
//Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0
adr r3, CONST1_GET_I16X16_LUMA_PRED_PLANE
vld1.32 {d0}, [r3]
//Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0
adr r3, CONST1_GET_I16X16_LUMA_PRED_PLANE
vld1.32 {d0}, [r3]
//Get the value of 'a' and save to q3
vaddl.u8 q3, d6, d7
vshl.u16 q3, #4
//Get the value of 'a' and save to q3
vaddl.u8 q3, d6, d7
vshl.u16 q3, #4
//calculate a+'b'*{-7,-6,-5,-4,-3,-2,-1,0} + c*{-7}
vmovl.s8 q0, d0
vmla.s16 q3, q0, q1
vmla.s16 q3, q2, d0[0]
//calculate a+'b'*{-7,-6,-5,-4,-3,-2,-1,0} + c*{-7}
vmovl.s8 q0, d0
vmla.s16 q3, q0, q1
vmla.s16 q3, q2, d0[0]
//Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7}
vshl.s16 q8, q1, #3
vadd.s16 q8, q3
//Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7}
vshl.s16 q8, q1, #3
vadd.s16 q8, q3
//right shift 5 bits and rounding
vqrshrun.s16 d0, q3, #5
vqrshrun.s16 d1, q8, #5
//right shift 5 bits and rounding
vqrshrun.s16 d0, q3, #5
vqrshrun.s16 d1, q8, #5
//Set the line of MB
vst1.u32 {d0,d1}, [r0]!
//Set the line of MB
vst1.u32 {d0,d1}, [r0]!
//Do the same processing for setting other lines
mov r3, #15
//Do the same processing for setting other lines
mov r3, #15
loop_0_get_i16x16_luma_pred_plane:
vadd.s16 q3, q2
vadd.s16 q8, q2
vqrshrun.s16 d0, q3, #5
vqrshrun.s16 d1, q8, #5
vst1.u32 {d0,d1}, [r0]!
subs r3, #1
bne loop_0_get_i16x16_luma_pred_plane
vadd.s16 q3, q2
vadd.s16 q8, q2
vqrshrun.s16 d0, q3, #5
vqrshrun.s16 d1, q8, #5
vst1.u32 {d0,d1}, [r0]!
subs r3, #1
bne loop_0_get_i16x16_luma_pred_plane
WELS_ASM_FUNC_END
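The plane routine above follows the standard H.264 Intra_16x16_Plane derivation (a, b, c from the outer row and column, then a clipped first-order surface). A hedged scalar sketch, with the same assumed argument layout as before:

#include <stdint.h>

static uint8_t Clip255(int v) {
    return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

void I16x16LumaPredPlaneSketch(uint8_t* pPred, const uint8_t* pRef, int kiStride) {
    const uint8_t* pTop = pRef - kiStride;  /* top[-1..15] live at pTop[-1..15] */
    int iH = 0, iV = 0;
    for (int i = 1; i <= 8; i++) {
        iH += i * (pTop[7 + i] - pTop[7 - i]);
        iV += i * (pRef[(7 + i) * kiStride - 1] - pRef[(7 - i) * kiStride - 1]);
    }
    int iA = 16 * (pTop[15] + pRef[15 * kiStride - 1]);
    int iB = (5 * iH + 32) >> 6;
    int iC = (5 * iV + 32) >> 6;
    for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++)
            pPred[y * 16 + x] = Clip255((iA + iB * (x - 7) + iC * (y - 7) + 16) >> 5);
}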
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredV_neon
//stmdb sp!, { r2-r5, lr}
//Load the top row (4 bytes)
sub r3, r1, r2
ldr r3, [r3]
//stmdb sp!, { r2-r5, lr}
//Load the top row (4 bytes)
sub r3, r1, r2
ldr r3, [r3]
//Set the luma MB using top line
str r3, [r0], #4
str r3, [r0], #4
str r3, [r0], #4
str r3, [r0]
//Set the luma MB using top line
str r3, [r0], #4
str r3, [r0], #4
str r3, [r0], #4
str r3, [r0]
WELS_ASM_FUNC_END
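The vertical 4x4 mode is the simplest of the group: the four bytes above the block are replicated into every output row. A minimal sketch under the same assumptions:

#include <stdint.h>
#include <string.h>

void I4x4LumaPredVSketch(uint8_t* pPred, const uint8_t* pRef, int kiStride) {
    for (int y = 0; y < 4; y++)
        memcpy(pPred + y * 4, pRef - kiStride, 4);  /* copy the top row into each of the 4 rows */
}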
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredH_neon
//stmdb sp!, { r2-r5, lr}
//Load the left column (4 bytes)
sub r3, r1, #1
vld1.8 {d0[]}, [r3], r2
vld1.8 {d1[]}, [r3], r2
vld1.8 {d2[]}, [r3], r2
vld1.8 {d3[]}, [r3]
//stmdb sp!, { r2-r5, lr}
//Load the left column (4 bytes)
sub r3, r1, #1
vld1.8 {d0[]}, [r3], r2
vld1.8 {d1[]}, [r3], r2
vld1.8 {d2[]}, [r3], r2
vld1.8 {d3[]}, [r3]
//Set the luma MB using the left side byte
vst1.32 {d0[0]}, [r0]!
vst1.32 {d1[0]}, [r0]!
vst1.32 {d2[0]}, [r0]!
vst1.32 {d3[0]}, [r0]
//Set the luma MB using the left side byte
vst1.32 {d0[0]}, [r0]!
vst1.32 {d1[0]}, [r0]!
vst1.32 {d2[0]}, [r0]!
vst1.32 {d3[0]}, [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredDDL_neon
//stmdb sp!, { r2-r5, lr}
//Load the top row data(8 bytes)
sub r3, r1, r2
vld1.32 {d0}, [r3]
//stmdb sp!, { r2-r5, lr}
//Load the top row data(8 bytes)
sub r3, r1, r2
vld1.32 {d0}, [r3]
//For "t7 + (t7<<1)"
vdup.8 d1, d0[7]
//For "t7 + (t7<<1)"
vdup.8 d1, d0[7]
//calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7"
vext.8 d1, d0, d1, #1
vaddl.u8 q1, d1, d0
//calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7"
vext.8 d1, d0, d1, #1
vaddl.u8 q1, d1, d0
//calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7"
vext.8 q2, q1, q1, #14
vadd.u16 q0, q1, q2
//calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7"
vext.8 q2, q1, q1, #14
vadd.u16 q0, q1, q2
//right shift 2 bits and rounding
vqrshrn.u16 d0, q0, #2
//right shift 2 bits and rounding
vqrshrn.u16 d0, q0, #2
//Save "ddl0, ddl1, ddl2, ddl3"
vext.8 d1, d0, d0, #1
vst1.32 d1[0], [r0]!
//Save "ddl0, ddl1, ddl2, ddl3"
vext.8 d1, d0, d0, #1
vst1.32 d1[0], [r0]!
//Save "ddl1, ddl2, ddl3, ddl4"
vext.8 d1, d0, d0, #2
vst1.32 d1[0], [r0]!
//Save "ddl1, ddl2, ddl3, ddl4"
vext.8 d1, d0, d0, #2
vst1.32 d1[0], [r0]!
//Save "ddl2, ddl3, ddl4, ddl5"
vext.8 d1, d0, d0, #3
vst1.32 d1[0], [r0]!
//Save "ddl2, ddl3, ddl4, ddl5"
vext.8 d1, d0, d0, #3
vst1.32 d1[0], [r0]!
//Save "ddl3, ddl4, ddl5, ddl6"
vst1.32 d0[1], [r0]
//Save "ddl3, ddl4, ddl5, ddl6"
vst1.32 d0[1], [r0]
WELS_ASM_FUNC_END
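The diagonal-down-left routine implements the usual (t[i] + 2*t[i+1] + t[i+2] + 2) >> 2 filter over the top and top-right samples, with t7 replicated so the last tap stays in range (that is what the "t7 + (t7<<1)" preparation achieves). A scalar sketch under the same assumed layout:

#include <stdint.h>

void I4x4LumaPredDDLSketch(uint8_t* pPred, const uint8_t* pRef, int kiStride) {
    uint8_t t[9];
    for (int i = 0; i < 8; i++)
        t[i] = pRef[i - kiStride];  /* top and top-right neighbours t0..t7 */
    t[8] = t[7];                    /* replicate t7 so the last tap stays valid */
    for (int y = 0; y < 4; y++)
        for (int x = 0; x < 4; x++)
            pPred[y * 4 + x] = (uint8_t)((t[x + y] + 2 * t[x + y + 1] + t[x + y + 2] + 2) >> 2);
}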
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredDDR_neon
//stmdb sp!, { r2-r5, lr}
//Load the top row (4 bytes)
sub r3, r1, r2
vld1.32 {d0[1]}, [r3]
//stmdb sp!, { r2-r5, lr}
//Load the top row (4 bytes)
sub r3, r1, r2
vld1.32 {d0[1]}, [r3]
//Load the left column (5 bytes)
sub r3, #1
vld1.8 {d0[3]}, [r3], r2
vld1.8 {d0[2]}, [r3], r2
vld1.8 {d0[1]}, [r3], r2
vld1.8 {d0[0]}, [r3], r2
vld1.8 {d1[7]}, [r3] //For packing the right sequence to do SIMD processing
//Load the left column (5 bytes)
sub r3, #1
vld1.8 {d0[3]}, [r3], r2
vld1.8 {d0[2]}, [r3], r2
vld1.8 {d0[1]}, [r3], r2
vld1.8 {d0[0]}, [r3], r2
vld1.8 {d1[7]}, [r3] //For packing the right sequence to do SIMD processing
vext.8 d2, d1, d0, #7 //d0:{L2,L1,L0,LT,T0,T1,T2,T3}
//d2:{L3,L2,L1,L0,LT,T0,T1,T2}
vext.8 d2, d1, d0, #7 //d0:{L2,L1,L0,LT,T0,T1,T2,T3}
//d2:{L3,L2,L1,L0,LT,T0,T1,T2}
//q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3}
vaddl.u8 q2, d2, d0
//q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3}
vaddl.u8 q2, d2, d0
//q1:{TL0+LT0,LT0+T01,...L12+L23}
vext.8 q3, q3, q2, #14
vadd.u16 q1, q2, q3
//q1:{TL0+LT0,LT0+T01,...L12+L23}
vext.8 q3, q3, q2, #14
vadd.u16 q1, q2, q3
//right shift 2 bits and rounding
vqrshrn.u16 d0, q1, #2
//right shift 2 bits and rounding
vqrshrn.u16 d0, q1, #2
//Adjust the data sequence for setting luma MB of 'pred'
vst1.32 d0[1], [r0]!
vext.8 d0, d0, d0, #7
vst1.32 d0[1], [r0]!
vext.8 d0, d0, d0, #7
vst1.32 d0[1], [r0]!
vext.8 d0, d0, d0, #7
vst1.32 d0[1], [r0]
//Adjust the data sequence for setting luma MB of 'pred'
vst1.32 d0[1], [r0]!
vext.8 d0, d0, d0, #7
vst1.32 d0[1], [r0]!
vext.8 d0, d0, d0, #7
vst1.32 d0[1], [r0]!
vext.8 d0, d0, d0, #7
vst1.32 d0[1], [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredVL_neon
//stmdb sp!, { r2-r5, lr}
//Load the top row (8 bytes)
sub r3, r1, r2
vld1.32 {d0}, [r3]
//stmdb sp!, { r2-r5, lr}
//Load the top row (8 bytes)
sub r3, r1, r2
vld1.32 {d0}, [r3]
vext.8 d1, d0, d0, #1
vaddl.u8 q1, d1, d0 //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x}
vext.8 d1, d0, d0, #1
vaddl.u8 q1, d1, d0 //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x}
vext.8 q2, q1, q1, #2
vadd.u16 q2, q1, q2 //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x}
vext.8 q2, q1, q1, #2
vadd.u16 q2, q1, q2 //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x}
//calculate the "vl0,vl1,vl2,vl3,vl4"
vqrshrn.u16 d0, q1, #1
//calculate the "vl0,vl1,vl2,vl3,vl4"
vqrshrn.u16 d0, q1, #1
//calculate the "vl5,vl6,vl7,vl8,vl9"
vqrshrn.u16 d1, q2, #2
//calculate the "vl5,vl6,vl7,vl8,vl9"
vqrshrn.u16 d1, q2, #2
//Adjust the data sequence for setting the luma MB
vst1.32 d0[0], [r0]!
vst1.32 d1[0], [r0]!
vext.8 d0, d0, d0, #1
vext.8 d1, d1, d1, #1
vst1.32 d0[0], [r0]!
vst1.32 d1[0], [r0]
//Adjust the data sequence for setting the luma MB
vst1.32 d0[0], [r0]!
vst1.32 d1[0], [r0]!
vext.8 d0, d0, d0, #1
vext.8 d1, d1, d1, #1
vst1.32 d0[0], [r0]!
vst1.32 d1[0], [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredVR_neon
//stmdb sp!, { r2-r5, lr}
//Load the top row (4 bytes)
sub r3, r1, r2
vld1.32 {d0[1]}, [r3]
//stmdb sp!, { r2-r5, lr}
//Load the top row (4 bytes)
sub r3, r1, r2
vld1.32 {d0[1]}, [r3]
//Load the left column (4 bytes)
sub r3, #1
vld1.8 {d0[3]}, [r3], r2
vld1.8 {d0[2]}, [r3], r2
vld1.8 {d0[1]}, [r3], r2
vld1.8 {d0[0]}, [r3]
//Load the left column (4 bytes)
sub r3, #1
vld1.8 {d0[3]}, [r3], r2
vld1.8 {d0[2]}, [r3], r2
vld1.8 {d0[1]}, [r3], r2
vld1.8 {d0[0]}, [r3]
vext.8 d1, d0, d0, #7
vaddl.u8 q1, d0, d1 //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}
vext.8 d1, d0, d0, #7
vaddl.u8 q1, d0, d1 //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}
vext.u8 q2, q1, q1, #14
vadd.u16 q2, q2, q1 //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3}
vext.u8 q2, q1, q1, #14
vadd.u16 q2, q2, q1 //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3}
//Calculate the vr0 ~ vr9
vqrshrn.u16 d1, q2, #2
vqrshrn.u16 d0, q1, #1
//Calculate the vr0 ~ vr9
vqrshrn.u16 d1, q2, #2
vqrshrn.u16 d0, q1, #1
//Adjust the data sequence for setting the luma MB
vst1.32 d0[1], [r0]!
vst1.32 d1[1], [r0]!
//add r2, r0, r1
vst1.8 d1[3], [r0]!
vst1.16 d0[2], [r0]!
vst1.8 d0[6], [r0]!
vst1.8 d1[2], [r0]!
vst1.16 d1[2], [r0]!
vst1.8 d1[6], [r0]
//Adjust the data sequence for setting the luma MB
vst1.32 d0[1], [r0]!
vst1.32 d1[1], [r0]!
//add r2, r0, r1
vst1.8 d1[3], [r0]!
vst1.16 d0[2], [r0]!
vst1.8 d0[6], [r0]!
vst1.8 d1[2], [r0]!
vst1.16 d1[2], [r0]!
vst1.8 d1[6], [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredHU_neon
//stmdb sp!, { r4, lr}
//Load the left column data
sub r3, r1, #1
mov r1, #3
mul r1, r2
add r1, r3
vld1.8 {d0[]}, [r1]
vld1.8 {d0[4]}, [r3], r2
vld1.8 {d0[5]}, [r3], r2
vld1.8 {d0[6]}, [r3], r2 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}
//stmdb sp!, { r4, lr}
//Load the left column data
sub r3, r1, #1
mov r1, #3
mul r1, r2
add r1, r3
vld1.8 {d0[]}, [r1]
vld1.8 {d0[4]}, [r3], r2
vld1.8 {d0[5]}, [r3], r2
vld1.8 {d0[6]}, [r3], r2 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}
vext.8 d1, d0, d0, #1
vaddl.u8 q2, d0, d1 //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
vext.8 d1, d0, d0, #1
vaddl.u8 q2, d0, d1 //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
vext.u8 d2, d5, d4, #2
vadd.u16 d3, d2, d5 //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}
vext.u8 d2, d5, d4, #2
vadd.u16 d3, d2, d5 //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}
//Calculate the hu0 ~ hu5
vqrshrn.u16 d2, q2, #1
vqrshrn.u16 d1, q1, #2
//Calculate the hu0 ~ hu5
vqrshrn.u16 d2, q2, #1
vqrshrn.u16 d1, q1, #2
//Adjust the data sequence for setting the luma MB
vzip.8 d2, d1
vst1.32 d1[0], [r0]!
vext.8 d2, d1, d1, #2
vst1.32 d2[0], [r0]!
vst1.32 d1[1], [r0]!
vst1.32 d0[0], [r0]
//Adjust the data sequence for setting the luma MB
vzip.8 d2, d1
vst1.32 d1[0], [r0]!
vext.8 d2, d1, d1, #2
vst1.32 d2[0], [r0]!
vst1.32 d1[1], [r0]!
vst1.32 d0[0], [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredHD_neon
//stmdb sp!, { r2-r5, lr}
//Load the data
sub r3, r1, r2
sub r3, #1
vld1.32 {d0[1]}, [r3], r2
vld1.8 {d0[3]}, [r3], r2
vld1.8 {d0[2]}, [r3], r2
vld1.8 {d0[1]}, [r3], r2
vld1.8 {d0[0]}, [r3] //d0:{L3,L2,L1,L0,LT,T0,T1,T2}
//stmdb sp!, { r2-r5, lr}
//Load the data
sub r3, r1, r2
sub r3, #1
vld1.32 {d0[1]}, [r3], r2
vld1.8 {d0[3]}, [r3], r2
vld1.8 {d0[2]}, [r3], r2
vld1.8 {d0[1]}, [r3], r2
vld1.8 {d0[0]}, [r3] //d0:{L3,L2,L1,L0,LT,T0,T1,T2}
vext.8 d1, d0, d0, #7
vaddl.u8 q1, d0, d1 //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2}
vext.8 d1, d0, d0, #7
vaddl.u8 q1, d0, d1 //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2}
vext.u8 q2, q1, q1, #14 //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1}
vadd.u16 q3, q2, q1 //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2}
vext.u8 q2, q1, q1, #14 //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1}
vadd.u16 q3, q2, q1 //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2}
//Calculate the hd0~hd9
vqrshrn.u16 d1, q3, #2
vqrshrn.u16 d0, q2, #1
//Calculate the hd0~hd9
vqrshrn.u16 d1, q3, #2
vqrshrn.u16 d0, q2, #1
//Adjust the data sequence for setting the luma MB
vmov d3, d1
vtrn.8 d0, d1
vext.u8 d2, d1, d1, #6
vst2.16 {d2[3], d3[3]}, [r0]!
vst2.16 {d0[2], d1[2]}, [r0]!
vmov d3, d0
vst2.16 {d2[2], d3[2]}, [r0]!
vst2.16 {d0[1], d1[1]}, [r0]
//Adjust the data sequence for setting the luma MB
vmov d3, d1
vtrn.8 d0, d1
vext.u8 d2, d1, d1, #6
vst2.16 {d2[3], d3[3]}, [r0]!
vst2.16 {d0[2], d1[2]}, [r0]!
vmov d3, d0
vst2.16 {d2[2], d3[2]}, [r0]!
vst2.16 {d0[1], d1[1]}, [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsIChromaPredV_neon
//stmdb sp!, { r2-r5, lr}
//Get the top row (8 bytes)
sub r3, r1, r2
vldr d0, [r3]
//stmdb sp!, { r2-r5, lr}
//Get the top row (8 bytes)
sub r3, r1, r2
vldr d0, [r3]
//Set the chroma MB using top row data
vst1.8 {d0}, [r0]!
vst1.8 {d0}, [r0]!
vst1.8 {d0}, [r0]!
vst1.8 {d0}, [r0]!
vst1.8 {d0}, [r0]!
vst1.8 {d0}, [r0]!
vst1.8 {d0}, [r0]!
vst1.8 {d0}, [r0]
//Set the chroma MB using top row data
vst1.8 {d0}, [r0]!
vst1.8 {d0}, [r0]!
vst1.8 {d0}, [r0]!
vst1.8 {d0}, [r0]!
vst1.8 {d0}, [r0]!
vst1.8 {d0}, [r0]!
vst1.8 {d0}, [r0]!
vst1.8 {d0}, [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsIChromaPredH_neon
//stmdb sp!, { r2-r5, lr}
//Get the left column (8 bytes)
sub r3, r1, #1
vld1.8 {d0[]}, [r3], r2
vld1.8 {d1[]}, [r3], r2
vld1.8 {d2[]}, [r3], r2
vld1.8 {d3[]}, [r3], r2
vld1.8 {d4[]}, [r3], r2
vld1.8 {d5[]}, [r3], r2
vld1.8 {d6[]}, [r3], r2
vld1.8 {d7[]}, [r3]
//stmdb sp!, { r2-r5, lr}
//Get the left column (8 bytes)
sub r3, r1, #1
vld1.8 {d0[]}, [r3], r2
vld1.8 {d1[]}, [r3], r2
vld1.8 {d2[]}, [r3], r2
vld1.8 {d3[]}, [r3], r2
vld1.8 {d4[]}, [r3], r2
vld1.8 {d5[]}, [r3], r2
vld1.8 {d6[]}, [r3], r2
vld1.8 {d7[]}, [r3]
//Set the chroma MB using left column data
vst1.8 {d0}, [r0]!
vst1.8 {d1}, [r0]!
vst1.8 {d2}, [r0]!
vst1.8 {d3}, [r0]!
vst1.8 {d4}, [r0]!
vst1.8 {d5}, [r0]!
vst1.8 {d6}, [r0]!
vst1.8 {d7}, [r0]
//Set the chroma MB using left column data
vst1.8 {d0}, [r0]!
vst1.8 {d1}, [r0]!
vst1.8 {d2}, [r0]!
vst1.8 {d3}, [r0]!
vst1.8 {d4}, [r0]!
vst1.8 {d5}, [r0]!
vst1.8 {d6}, [r0]!
vst1.8 {d7}, [r0]
WELS_ASM_FUNC_END
@ -575,73 +575,73 @@ CONST0_GET_I_CHROMA_PRED_PLANE: .long 0x44332211, 0x44332211//0x140f0a05, 0x2823
CONST1_GET_I_CHROMA_PRED_PLANE: .long 0xfffefffd, 0x0000ffff,0x00020001,0x00040003
WELS_ASM_FUNC_BEGIN WelsIChromaPredPlane_neon
//stmdb sp!, { r2-r5, lr}
//Load the top row data
sub r3, r1, #1
sub r3, r2
vld1.32 {d1[0]}, [r3]
add r3, #5
vld1.32 {d0[0]}, [r3]
//stmdb sp!, { r2-r5, lr}
//Load the top row data
sub r3, r1, #1
sub r3, r2
vld1.32 {d1[0]}, [r3]
add r3, #5
vld1.32 {d0[0]}, [r3]
//Load the left column data
sub r3, #5
vld1.8 {d1[4]}, [r3], r2
vld1.8 {d1[5]}, [r3], r2
vld1.8 {d1[6]}, [r3], r2
vld1.8 {d1[7]}, [r3], r2 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
add r3, r2
vld1.8 {d0[4]}, [r3], r2
vld1.8 {d0[5]}, [r3], r2
vld1.8 {d0[6]}, [r3], r2
vld1.8 {d0[7]}, [r3] //d0:{T4,T5,T6,T7,L4,L5,L6,L7}
//Load the left column data
sub r3, #5
vld1.8 {d1[4]}, [r3], r2
vld1.8 {d1[5]}, [r3], r2
vld1.8 {d1[6]}, [r3], r2
vld1.8 {d1[7]}, [r3], r2 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
add r3, r2
vld1.8 {d0[4]}, [r3], r2
vld1.8 {d0[5]}, [r3], r2
vld1.8 {d0[6]}, [r3], r2
vld1.8 {d0[7]}, [r3] //d0:{T4,T5,T6,T7,L4,L5,L6,L7}
//Save T7 to d3 for next step
vdup.u8 d3, d0[3]
//Save L7 to d4 for next step
vdup.u8 d4, d0[7]
//Save T7 to d3 for next step
vdup.u8 d3, d0[3]
//Save L7 to d4 for next step
vdup.u8 d4, d0[7]
//Calculate the value of 'a' and save to q2
vaddl.u8 q2, d3, d4
vshl.u16 q2, #4
//Calculate the value of 'a' and save to q2
vaddl.u8 q2, d3, d4
vshl.u16 q2, #4
//Load the table {{1,2,3,4,1,2,3,4}*17}
adr r3, CONST0_GET_I_CHROMA_PRED_PLANE
vld1.32 {d2}, [r3]
//Load the table {{1,2,3,4,1,2,3,4}*17}
adr r3, CONST0_GET_I_CHROMA_PRED_PLANE
vld1.32 {d2}, [r3]
//Calculate the 'b','c', and save to q0
vrev32.8 d1, d1
vsubl.u8 q0, d0, d1
vmovl.u8 q1, d2
vmul.s16 q0, q1
vpaddl.s16 q0, q0
vpaddl.s32 q0, q0
vrshr.s64 q0, #5
//Calculate the 'b','c', and save to q0
vrev32.8 d1, d1
vsubl.u8 q0, d0, d1
vmovl.u8 q1, d2
vmul.s16 q0, q1
vpaddl.s16 q0, q0
vpaddl.s32 q0, q0
vrshr.s64 q0, #5
//Load the table {-3,-2,-1,0,1,2,3,4} to q3
adr r3, CONST1_GET_I_CHROMA_PRED_PLANE
vld1.32 {d6, d7}, [r3]
//Load the table {-3,-2,-1,0,1,2,3,4} to q3
adr r3, CONST1_GET_I_CHROMA_PRED_PLANE
vld1.32 {d6, d7}, [r3]
//Duplicate the 'b','c' to q0, q1 for SIMD instruction
vdup.s16 q1, d1[0]
vdup.s16 q0, d0[0]
//Duplicate the 'b','c' to q0, q1 for SIMD instruction
vdup.s16 q1, d1[0]
vdup.s16 q0, d0[0]
//Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;"
vmla.s16 q2, q0, q3
vmla.s16 q2, q1, d6[0]
vqrshrun.s16 d0, q2, #5
//Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;"
vmla.s16 q2, q0, q3
vmla.s16 q2, q1, d6[0]
vqrshrun.s16 d0, q2, #5
//Set a line of chroma MB
vst1.u32 {d0}, [r0]!
//Set a line of chroma MB
vst1.u32 {d0}, [r0]!
//Do the same processing for each line.
mov r3, #7
//Do the same processing for each line.
mov r3, #7
loop_0_get_i_chroma_pred_plane:
vadd.s16 q2, q1
vqrshrun.s16 d0, q2, #5
vst1.u32 {d0}, [r0]!
subs r3, #1
bne loop_0_get_i_chroma_pred_plane
vadd.s16 q2, q1
vqrshrun.s16 d0, q2, #5
vst1.u32 {d0}, [r0]!
subs r3, #1
bne loop_0_get_i_chroma_pred_plane
WELS_ASM_FUNC_END

File diff suppressed because it is too large

View File

@ -66,10 +66,10 @@
vsub.s16 q3, q12, q13
vadd.s16 q8, q10, q11
vsub.s16 q9, q10, q11
vsub.s16 q9, q10, q11
vadd.s16 q10, q14, q15
vsub.s16 q11, q14, q15
vsub.s16 q11, q14, q15
vadd.s16 q12, q0, q2
vsub.s16 q14, q0, q2
@ -372,28 +372,28 @@ WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsSampleSad4x4_neon
stmdb sp!, {r4-r5, lr}
//Load a horizontal line of data (4 bytes)
//line 0
ldr r4, [r0], r1
ldr r5, [r2], r3
usad8 lr, r4, r5
//Load a horizontal line of data (4 bytes)
//line 0
ldr r4, [r0], r1
ldr r5, [r2], r3
usad8 lr, r4, r5
//line 1
ldr r4, [r0], r1
ldr r5, [r2], r3
usada8 lr, r4, r5, lr
ldr r4, [r0], r1
ldr r5, [r2], r3
usada8 lr, r4, r5, lr
//line 2
ldr r4, [r0], r1
ldr r5, [r2], r3
usada8 lr, r4, r5, lr
ldr r4, [r0], r1
ldr r5, [r2], r3
usada8 lr, r4, r5, lr
//line 3
ldr r4, [r0]
ldr r5, [r2]
usada8 r0, r4, r5, lr
//line 3
ldr r4, [r0]
ldr r5, [r2]
usada8 r0, r4, r5, lr
ldmia sp!, {r4-r5, lr}
ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END
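As a reference point for the usad8/usada8 sequence above, a plain-C 4x4 SAD looks like this (argument layout pix1/stride1/pix2/stride2 inferred from the register usage; not the project's C fallback):

#include <stdint.h>
#include <stdlib.h>

int32_t SampleSad4x4Sketch(const uint8_t* pSample1, int iStride1,
                           const uint8_t* pSample2, int iStride2) {
    int32_t iSad = 0;
    for (int y = 0; y < 4; y++) {
        for (int x = 0; x < 4; x++)
            iSad += abs(pSample1[x] - pSample2[x]);  /* sum of absolute differences */
        pSample1 += iStride1;
        pSample2 += iStride2;
    }
    return iSad;
}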
@ -401,340 +401,340 @@ WELS_ASM_FUNC_BEGIN WelsSampleSadFour16x16_neon
stmdb sp!, {r4-r5, lr}
//Generate the pix2 start addr
sub r4, r2, #1
add r5, r2, #1
sub r2, r3
//Generate the pix2 start addr
sub r4, r2, #1
add r5, r2, #1
sub r2, r3
//Load a horizontal line of data (16 bytes)
vld1.8 {q0}, [r0], r1 //save pix1
vld1.8 {q0}, [r0], r1 //save pix1
vld1.8 {q1}, [r2], r3 //save pix2 - stride
vld1.8 {q10}, [r2], r3 //save pix2
vld1.8 {q2}, [r2], r3 //save pix2 + stride
vld1.8 {q1}, [r2], r3 //save pix2 - stride
vld1.8 {q10}, [r2], r3 //save pix2
vld1.8 {q2}, [r2], r3 //save pix2 + stride
vld1.8 {q3}, [r4], r3 //save pix2 - 1
vld1.8 {q8}, [r5], r3 //save pix2 + 1
vld1.8 {q3}, [r4], r3 //save pix2 - 1
vld1.8 {q8}, [r5], r3 //save pix2 + 1
//Do the SAD for 16 bytes
vabdl.u8 q15, d0, d2
vabal.u8 q15, d1, d3
//Do the SAD for 16 bytes
vabdl.u8 q15, d0, d2
vabal.u8 q15, d1, d3
vabdl.u8 q13, d0, d4
vabal.u8 q13, d1, d5
vabdl.u8 q13, d0, d4
vabal.u8 q13, d1, d5
vabdl.u8 q11, d0, d6
vabal.u8 q11, d1, d7
vabdl.u8 q11, d0, d6
vabal.u8 q11, d1, d7
vabdl.u8 q9, d0, d16
vabal.u8 q9, d1, d17
vabdl.u8 q9, d0, d16
vabal.u8 q9, d1, d17
mov lr, #15
mov lr, #15
pixel_sad_4_16x16_loop_0:
//Load a horizontal line of data (16 bytes)
vld1.8 {q0}, [r0], r1 //save pix1
vmov.8 q1, q10 //save pix2 - stride
vmov.8 q10, q2
vabal.u8 q15, d0, d2
vld1.8 {q2}, [r2], r3 //save pix2 + stride
vabal.u8 q15, d1, d3
vld1.8 {q3}, [r4], r3 //save pix2 - 1
vabal.u8 q13, d0, d4
vld1.8 {q8}, [r5], r3 //save pix2 + 1
vld1.8 {q0}, [r0], r1 //save pix1
vmov.8 q1, q10 //save pix2 - stride
vmov.8 q10, q2
vabal.u8 q15, d0, d2
vld1.8 {q2}, [r2], r3 //save pix2 + stride
vabal.u8 q15, d1, d3
vld1.8 {q3}, [r4], r3 //save pix2 - 1
vabal.u8 q13, d0, d4
vld1.8 {q8}, [r5], r3 //save pix2 + 1
vabal.u8 q13, d1, d5
subs lr, #1
subs lr, #1
vabal.u8 q11, d0, d6
vabal.u8 q11, d1, d7
vabal.u8 q11, d0, d6
vabal.u8 q11, d1, d7
vabal.u8 q9, d0, d16
vabal.u8 q9, d1, d17
vabal.u8 q9, d0, d16
vabal.u8 q9, d1, d17
bne pixel_sad_4_16x16_loop_0
bne pixel_sad_4_16x16_loop_0
//Save SAD to 'r0'
ldr r0, [sp, #12]
ldr r0, [sp, #12]
vadd.u16 d0, d30, d31
vadd.u16 d1, d26, d27
vadd.u16 d2, d22, d23
vadd.u16 d3, d18, d19
vadd.u16 d0, d30, d31
vadd.u16 d1, d26, d27
vadd.u16 d2, d22, d23
vadd.u16 d3, d18, d19
vpaddl.u16 q0, q0
vpaddl.u16 q1, q1
vpaddl.u16 q0, q0
vpaddl.u16 q1, q1
vpaddl.u32 q0, q0
vpaddl.u32 q1, q1
vpaddl.u32 q0, q0
vpaddl.u32 q1, q1
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
ldmia sp!, {r4-r5, lr}
ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END
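The SadFour kernels compute, in one pass, the SADs of pix1 against pix2 shifted up, down, left and right by one pel, and write the four sums through the fifth argument. A hedged scalar sketch follows; the up/down/left/right output order is inferred from the accumulator and vst4.32 sequence above, and the 16x8, 8x16 and 8x8 variants only change the loop bounds.

#include <stdint.h>
#include <stdlib.h>

void SampleSadFour16x16Sketch(const uint8_t* pSample1, int iStride1,
                              const uint8_t* pSample2, int iStride2, int32_t* pSad) {
    pSad[0] = pSad[1] = pSad[2] = pSad[3] = 0;
    for (int y = 0; y < 16; y++) {
        for (int x = 0; x < 16; x++) {
            int iP1 = pSample1[x];
            pSad[0] += abs(iP1 - pSample2[x - iStride2]);  /* pix2 - stride (up)    */
            pSad[1] += abs(iP1 - pSample2[x + iStride2]);  /* pix2 + stride (down)  */
            pSad[2] += abs(iP1 - pSample2[x - 1]);         /* pix2 - 1      (left)  */
            pSad[3] += abs(iP1 - pSample2[x + 1]);         /* pix2 + 1      (right) */
        }
        pSample1 += iStride1;
        pSample2 += iStride2;
    }
}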
WELS_ASM_FUNC_BEGIN WelsSampleSadFour16x8_neon
stmdb sp!, {r4-r5, lr}
//Generate the pix2 start addr
sub r4, r2, #1
add r5, r2, #1
sub r2, r3
//Generate the pix2 start addr
sub r4, r2, #1
add r5, r2, #1
sub r2, r3
//Load a horizontal line of data (16 bytes)
vld1.8 {q0}, [r0], r1 //save pix1
vld1.8 {q0}, [r0], r1 //save pix1
vld1.8 {q1}, [r2], r3 //save pix2 - stride
vld1.8 {q10}, [r2], r3 //save pix2
vld1.8 {q2}, [r2], r3 //save pix2 + stride
vld1.8 {q1}, [r2], r3 //save pix2 - stride
vld1.8 {q10}, [r2], r3 //save pix2
vld1.8 {q2}, [r2], r3 //save pix2 + stride
vld1.8 {q3}, [r4], r3 //save pix2 - 1
vld1.8 {q8}, [r5], r3 //save pix2 + 1
vld1.8 {q3}, [r4], r3 //save pix2 - 1
vld1.8 {q8}, [r5], r3 //save pix2 + 1
//Do the SAD for 16 bytes
vabdl.u8 q15, d0, d2
vabal.u8 q15, d1, d3
//Do the SAD for 16 bytes
vabdl.u8 q15, d0, d2
vabal.u8 q15, d1, d3
vabdl.u8 q13, d0, d4
vabal.u8 q13, d1, d5
vabdl.u8 q13, d0, d4
vabal.u8 q13, d1, d5
vabdl.u8 q11, d0, d6
vabal.u8 q11, d1, d7
vabdl.u8 q11, d0, d6
vabal.u8 q11, d1, d7
vabdl.u8 q9, d0, d16
vabal.u8 q9, d1, d17
vabdl.u8 q9, d0, d16
vabal.u8 q9, d1, d17
mov lr, #7
mov lr, #7
pixel_sad_4_16x8_loop_0:
//Load a horizontal line of data (16 bytes)
vld1.8 {q0}, [r0], r1 //save pix1
vmov.8 q1, q10 //save pix2 - stride
vmov.8 q10, q2
vabal.u8 q15, d0, d2
vld1.8 {q2}, [r2], r3 //save pix2 + stride
vabal.u8 q15, d1, d3
vld1.8 {q3}, [r4], r3 //save pix2 - 1
vabal.u8 q13, d0, d4
vld1.8 {q8}, [r5], r3 //save pix2 + 1
vld1.8 {q0}, [r0], r1 //save pix1
vmov.8 q1, q10 //save pix2 - stride
vmov.8 q10, q2
vabal.u8 q15, d0, d2
vld1.8 {q2}, [r2], r3 //save pix2 + stride
vabal.u8 q15, d1, d3
vld1.8 {q3}, [r4], r3 //save pix2 - 1
vabal.u8 q13, d0, d4
vld1.8 {q8}, [r5], r3 //save pix2 + 1
vabal.u8 q13, d1, d5
subs lr, #1
subs lr, #1
vabal.u8 q11, d0, d6
vabal.u8 q11, d1, d7
vabal.u8 q11, d0, d6
vabal.u8 q11, d1, d7
vabal.u8 q9, d0, d16
vabal.u8 q9, d1, d17
vabal.u8 q9, d0, d16
vabal.u8 q9, d1, d17
bne pixel_sad_4_16x8_loop_0
bne pixel_sad_4_16x8_loop_0
//Save SAD to 'r0'
ldr r0, [sp, #12]
ldr r0, [sp, #12]
vadd.u16 d0, d30, d31
vadd.u16 d1, d26, d27
vadd.u16 d2, d22, d23
vadd.u16 d3, d18, d19
vadd.u16 d0, d30, d31
vadd.u16 d1, d26, d27
vadd.u16 d2, d22, d23
vadd.u16 d3, d18, d19
vpaddl.u16 q0, q0
vpaddl.u16 q1, q1
vpaddl.u16 q0, q0
vpaddl.u16 q1, q1
vpaddl.u32 q0, q0
vpaddl.u32 q1, q1
vpaddl.u32 q0, q0
vpaddl.u32 q1, q1
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
ldmia sp!, {r4-r5, lr}
ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsSampleSadFour8x16_neon
stmdb sp!, {r4-r5, lr}
//Generate the pix2 start addr
sub r4, r2, #1
add r5, r2, #1
sub r2, r3
//Generate the pix2 start addr
sub r4, r2, #1
add r5, r2, #1
sub r2, r3
//Load a horizontal line of data (8 bytes)
vld1.8 {d0}, [r0], r1 //save pix1
vld1.8 {d0}, [r0], r1 //save pix1
vld1.8 {d1}, [r2], r3 //save pix2 - stride
vld1.8 {d6}, [r2], r3 //save pix2
vld1.8 {d2}, [r2], r3 //save pix2 + stride
vld1.8 {d1}, [r2], r3 //save pix2 - stride
vld1.8 {d6}, [r2], r3 //save pix2
vld1.8 {d2}, [r2], r3 //save pix2 + stride
vld1.8 {d3}, [r4], r3 //save pix2 - 1
vld1.8 {d4}, [r5], r3 //save pix2 + 1
vld1.8 {d3}, [r4], r3 //save pix2 - 1
vld1.8 {d4}, [r5], r3 //save pix2 + 1
//Do the SAD for 8 bytes
vabdl.u8 q15, d0, d1
vabdl.u8 q14, d0, d2
vabdl.u8 q13, d0, d3
vabdl.u8 q12, d0, d4
//Do the SAD for 8 bytes
vabdl.u8 q15, d0, d1
vabdl.u8 q14, d0, d2
vabdl.u8 q13, d0, d3
vabdl.u8 q12, d0, d4
mov lr, #15
mov lr, #15
pixel_sad_4_8x16_loop_0:
//Load a horizontal line of data (8 bytes)
vld1.8 {d0}, [r0], r1 //save pix1
vmov.8 d1, d6 //save pix2 - stride
vmov.8 d6, d2
vld1.8 {d2}, [r2], r3 //save pix2 + stride
vld1.8 {d3}, [r4], r3 //save pix2 - 1
vabal.u8 q15, d0, d1
vld1.8 {d0}, [r0], r1 //save pix1
vmov.8 d1, d6 //save pix2 - stride
vmov.8 d6, d2
vld1.8 {d2}, [r2], r3 //save pix2 + stride
vld1.8 {d3}, [r4], r3 //save pix2 - 1
vabal.u8 q15, d0, d1
vld1.8 {d4}, [r5], r3 //save pix2 + 1
//Do the SAD for 8 bytes
vabal.u8 q14, d0, d2
vabal.u8 q13, d0, d3
vabal.u8 q12, d0, d4
vld1.8 {d4}, [r5], r3 //save pix2 + 1
//Do the SAD for 8 bytes
vabal.u8 q14, d0, d2
vabal.u8 q13, d0, d3
vabal.u8 q12, d0, d4
subs lr, #1
bne pixel_sad_4_8x16_loop_0
bne pixel_sad_4_8x16_loop_0
//Save SAD to 'r0'
ldr r0, [sp, #12]
ldr r0, [sp, #12]
vadd.u16 d0, d30, d31
vadd.u16 d1, d28, d29
vadd.u16 d2, d26, d27
vadd.u16 d3, d24, d25
vadd.u16 d0, d30, d31
vadd.u16 d1, d28, d29
vadd.u16 d2, d26, d27
vadd.u16 d3, d24, d25
vpaddl.u16 q0, q0
vpaddl.u16 q1, q1
vpaddl.u16 q0, q0
vpaddl.u16 q1, q1
vpaddl.u32 q0, q0
vpaddl.u32 q1, q1
vpaddl.u32 q0, q0
vpaddl.u32 q1, q1
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
ldmia sp!, {r4-r5, lr}
ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsSampleSadFour8x8_neon
stmdb sp!, {r4-r5, lr}
stmdb sp!, {r4-r5, lr}
//Generate the pix2 start addr
sub r4, r2, #1
add r5, r2, #1
sub r2, r3
//Generate the pix2 start addr
sub r4, r2, #1
add r5, r2, #1
sub r2, r3
//Load a horizontal line of data (8 bytes)
vld1.8 {d0}, [r0], r1 //save pix1
vld1.8 {d0}, [r0], r1 //save pix1
vld1.8 {d1}, [r2], r3 //save pix2 - stride
vld1.8 {d6}, [r2], r3 //save pix2
vld1.8 {d2}, [r2], r3 //save pix2 + stride
vld1.8 {d1}, [r2], r3 //save pix2 - stride
vld1.8 {d6}, [r2], r3 //save pix2
vld1.8 {d2}, [r2], r3 //save pix2 + stride
vld1.8 {d3}, [r4], r3 //save pix2 - 1
vld1.8 {d4}, [r5], r3 //save pix2 + 1
vld1.8 {d3}, [r4], r3 //save pix2 - 1
vld1.8 {d4}, [r5], r3 //save pix2 + 1
//Do the SAD for 8 bytes
vabdl.u8 q15, d0, d1
vabdl.u8 q14, d0, d2
vabdl.u8 q13, d0, d3
vabdl.u8 q12, d0, d4
//Do the SAD for 8 bytes
vabdl.u8 q15, d0, d1
vabdl.u8 q14, d0, d2
vabdl.u8 q13, d0, d3
vabdl.u8 q12, d0, d4
mov lr, #7
mov lr, #7
pixel_sad_4_8x8_loop_0:
//Load a horizontal line of data (8 bytes)
vld1.8 {d0}, [r0], r1 //save pix1
vmov.8 d1, d6 //save pix2 - stride
vmov.8 d6, d2
vld1.8 {d2}, [r2], r3 //save pix2 + stride
vld1.8 {d3}, [r4], r3 //save pix2 - 1
vabal.u8 q15, d0, d1
vld1.8 {d0}, [r0], r1 //save pix1
vmov.8 d1, d6 //save pix2 - stride
vmov.8 d6, d2
vld1.8 {d2}, [r2], r3 //save pix2 + stride
vld1.8 {d3}, [r4], r3 //save pix2 - 1
vabal.u8 q15, d0, d1
vld1.8 {d4}, [r5], r3 //save pix2 + 1
//Do the SAD for 8 bytes
vabal.u8 q14, d0, d2
vabal.u8 q13, d0, d3
vabal.u8 q12, d0, d4
vld1.8 {d4}, [r5], r3 //save pix2 + 1
//Do the SAD for 8 bytes
vabal.u8 q14, d0, d2
vabal.u8 q13, d0, d3
vabal.u8 q12, d0, d4
subs lr, #1
bne pixel_sad_4_8x8_loop_0
bne pixel_sad_4_8x8_loop_0
//Save SAD to 'r0'
ldr r0, [sp, #12]
ldr r0, [sp, #12]
vadd.u16 d0, d30, d31
vadd.u16 d1, d28, d29
vadd.u16 d2, d26, d27
vadd.u16 d3, d24, d25
vadd.u16 d0, d30, d31
vadd.u16 d1, d28, d29
vadd.u16 d2, d26, d27
vadd.u16 d3, d24, d25
vpaddl.u16 q0, q0
vpaddl.u16 q1, q1
vpaddl.u16 q0, q0
vpaddl.u16 q1, q1
vpaddl.u32 q0, q0
vpaddl.u32 q1, q1
vpaddl.u32 q0, q0
vpaddl.u32 q1, q1
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
ldmia sp!, {r4-r5, lr}
ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsSampleSadFour4x4_neon
vld1.32 {d0[0]}, [r0], r1
vld1.32 {d0[1]}, [r0], r1
vld1.32 {d1[0]}, [r0], r1
vld1.32 {d1[1]}, [r0]
vld1.32 {d0[0]}, [r0], r1
vld1.32 {d0[1]}, [r0], r1
vld1.32 {d1[0]}, [r0], r1
vld1.32 {d1[1]}, [r0]
sub r0, r2, r3
vld1.32 {d2[0]}, [r0], r3
vld1.32 {d2[1]}, [r0], r3
vld1.32 {d3[0]}, [r0], r3
vld1.32 {d3[1]}, [r0], r3
vld1.32 {d4[0]}, [r0], r3
vld1.32 {d4[1]}, [r0]
sub r0, r2, r3
vld1.32 {d2[0]}, [r0], r3
vld1.32 {d2[1]}, [r0], r3
vld1.32 {d3[0]}, [r0], r3
vld1.32 {d3[1]}, [r0], r3
vld1.32 {d4[0]}, [r0], r3
vld1.32 {d4[1]}, [r0]
sub r0, r2, #1
vld1.32 {d5[0]}, [r0], r3
vld1.32 {d5[1]}, [r0], r3
vld1.32 {d6[0]}, [r0], r3
vld1.32 {d6[1]}, [r0]
sub r0, r2, #1
vld1.32 {d5[0]}, [r0], r3
vld1.32 {d5[1]}, [r0], r3
vld1.32 {d6[0]}, [r0], r3
vld1.32 {d6[1]}, [r0]
add r0, r2, #1
vld1.32 {d7[0]}, [r0], r3
vld1.32 {d7[1]}, [r0], r3
vld1.32 {d8[0]}, [r0], r3
vld1.32 {d8[1]}, [r0]
add r0, r2, #1
vld1.32 {d7[0]}, [r0], r3
vld1.32 {d7[1]}, [r0], r3
vld1.32 {d8[0]}, [r0], r3
vld1.32 {d8[1]}, [r0]
vabdl.u8 q15, d0, d2
vabdl.u8 q14, d1, d3
vabdl.u8 q15, d0, d2
vabdl.u8 q14, d1, d3
vabdl.u8 q13, d0, d3
vabdl.u8 q12, d1, d4
vabdl.u8 q13, d0, d3
vabdl.u8 q12, d1, d4
vabdl.u8 q11, d0, d5
vabdl.u8 q10, d1, d6
vabdl.u8 q11, d0, d5
vabdl.u8 q10, d1, d6
vabdl.u8 q9, d0, d7
vabdl.u8 q8, d1, d8
vabdl.u8 q9, d0, d7
vabdl.u8 q8, d1, d8
//Save SAD to 'r0'
ldr r0, [sp]
vadd.u16 q0, q14, q15
vadd.u16 q1, q12, q13
vadd.u16 q2, q10, q11
vadd.u16 q3, q8 , q9
//Save SAD to 'r0'
ldr r0, [sp]
vadd.u16 q0, q14, q15
vadd.u16 q1, q12, q13
vadd.u16 q2, q10, q11
vadd.u16 q3, q8 , q9
vadd.u16 d0, d1
vadd.u16 d1, d2, d3
vadd.u16 d2, d4, d5
vadd.u16 d3, d6, d7
vadd.u16 d0, d1
vadd.u16 d1, d2, d3
vadd.u16 d2, d4, d5
vadd.u16 d3, d6, d7
vpaddl.u16 q0, q0
vpaddl.u16 q1, q1
vpaddl.u16 q0, q0
vpaddl.u16 q1, q1
vpaddl.u32 q0, q0
vpaddl.u32 q1, q1
vpaddl.u32 q0, q0
vpaddl.u32 q1, q1
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
WELS_ASM_FUNC_END
@ -834,16 +834,16 @@ WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsSampleSatd4x4_neon
//Load the pix1 data --- 16 bytes
vld1.32 {d0[0]}, [r0], r1
vld1.32 {d0[1]}, [r0], r1
vld1.32 {d1[0]}, [r0], r1
vld1.32 {d1[1]}, [r0]
vld1.32 {d0[0]}, [r0], r1
vld1.32 {d0[1]}, [r0], r1
vld1.32 {d1[0]}, [r0], r1
vld1.32 {d1[1]}, [r0]
//Load the pix2 data --- 16 bytes
vld1.32 {d2[0]}, [r2], r3
vld1.32 {d2[1]}, [r2], r3
vld1.32 {d3[0]}, [r2], r3
vld1.32 {d3[1]}, [r2]
vld1.32 {d2[0]}, [r2], r3
vld1.32 {d2[1]}, [r2], r3
vld1.32 {d3[0]}, [r2], r3
vld1.32 {d3[1]}, [r2]
//Get the difference
vsubl.u8 q15, d0, d2 //{0,1,2,3,4,5,6,7}
@ -874,7 +874,7 @@ WELS_ASM_FUNC_BEGIN WelsSampleSatd4x4_neon
vpaddl.u16 d0, d0
vpaddl.u32 d0, d0
vmov.u32 r0, d0[0]
vmov.u32 r0, d0[0]
WELS_ASM_FUNC_END

File diff suppressed because it is too large

View File

@ -55,262 +55,262 @@ sse2_b_1 db -1, -1, -1, -1, -1, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1, -1
align 16
byte_1pos_table:
db 0,0,0,0,0,0,0,0, ;0
db 0,0,0,0,0,0,0,1, ;1
db 1,0,0,0,0,0,0,1, ;2
db 1,0,0,0,0,0,0,2, ;3
db 2,0,0,0,0,0,0,1, ;4
db 2,0,0,0,0,0,0,2, ;5
db 2,1,0,0,0,0,0,2, ;6
db 2,1,0,0,0,0,0,3, ;7
db 3,0,0,0,0,0,0,1, ;8
db 3,0,0,0,0,0,0,2, ;9
db 3,1,0,0,0,0,0,2, ;10
db 3,1,0,0,0,0,0,3, ;11
db 3,2,0,0,0,0,0,2, ;12
db 3,2,0,0,0,0,0,3, ;13
db 3,2,1,0,0,0,0,3, ;14
db 3,2,1,0,0,0,0,4, ;15
db 4,0,0,0,0,0,0,1, ;16
db 4,0,0,0,0,0,0,2, ;17
db 4,1,0,0,0,0,0,2, ;18
db 4,1,0,0,0,0,0,3, ;19
db 4,2,0,0,0,0,0,2, ;20
db 4,2,0,0,0,0,0,3, ;21
db 4,2,1,0,0,0,0,3, ;22
db 4,2,1,0,0,0,0,4, ;23
db 4,3,0,0,0,0,0,2, ;24
db 4,3,0,0,0,0,0,3, ;25
db 4,3,1,0,0,0,0,3, ;26
db 4,3,1,0,0,0,0,4, ;27
db 4,3,2,0,0,0,0,3, ;28
db 4,3,2,0,0,0,0,4, ;29
db 4,3,2,1,0,0,0,4, ;30
db 4,3,2,1,0,0,0,5, ;31
db 5,0,0,0,0,0,0,1, ;32
db 5,0,0,0,0,0,0,2, ;33
db 5,1,0,0,0,0,0,2, ;34
db 5,1,0,0,0,0,0,3, ;35
db 5,2,0,0,0,0,0,2, ;36
db 5,2,0,0,0,0,0,3, ;37
db 5,2,1,0,0,0,0,3, ;38
db 5,2,1,0,0,0,0,4, ;39
db 5,3,0,0,0,0,0,2, ;40
db 5,3,0,0,0,0,0,3, ;41
db 5,3,1,0,0,0,0,3, ;42
db 5,3,1,0,0,0,0,4, ;43
db 5,3,2,0,0,0,0,3, ;44
db 5,3,2,0,0,0,0,4, ;45
db 5,3,2,1,0,0,0,4, ;46
db 5,3,2,1,0,0,0,5, ;47
db 5,4,0,0,0,0,0,2, ;48
db 5,4,0,0,0,0,0,3, ;49
db 5,4,1,0,0,0,0,3, ;50
db 5,4,1,0,0,0,0,4, ;51
db 5,4,2,0,0,0,0,3, ;52
db 5,4,2,0,0,0,0,4, ;53
db 5,4,2,1,0,0,0,4, ;54
db 5,4,2,1,0,0,0,5, ;55
db 5,4,3,0,0,0,0,3, ;56
db 5,4,3,0,0,0,0,4, ;57
db 5,4,3,1,0,0,0,4, ;58
db 5,4,3,1,0,0,0,5, ;59
db 5,4,3,2,0,0,0,4, ;60
db 5,4,3,2,0,0,0,5, ;61
db 5,4,3,2,1,0,0,5, ;62
db 5,4,3,2,1,0,0,6, ;63
db 6,0,0,0,0,0,0,1, ;64
db 6,0,0,0,0,0,0,2, ;65
db 6,1,0,0,0,0,0,2, ;66
db 6,1,0,0,0,0,0,3, ;67
db 6,2,0,0,0,0,0,2, ;68
db 6,2,0,0,0,0,0,3, ;69
db 6,2,1,0,0,0,0,3, ;70
db 6,2,1,0,0,0,0,4, ;71
db 6,3,0,0,0,0,0,2, ;72
db 6,3,0,0,0,0,0,3, ;73
db 6,3,1,0,0,0,0,3, ;74
db 6,3,1,0,0,0,0,4, ;75
db 6,3,2,0,0,0,0,3, ;76
db 6,3,2,0,0,0,0,4, ;77
db 6,3,2,1,0,0,0,4, ;78
db 6,3,2,1,0,0,0,5, ;79
db 6,4,0,0,0,0,0,2, ;80
db 6,4,0,0,0,0,0,3, ;81
db 6,4,1,0,0,0,0,3, ;82
db 6,4,1,0,0,0,0,4, ;83
db 6,4,2,0,0,0,0,3, ;84
db 6,4,2,0,0,0,0,4, ;85
db 6,4,2,1,0,0,0,4, ;86
db 6,4,2,1,0,0,0,5, ;87
db 6,4,3,0,0,0,0,3, ;88
db 6,4,3,0,0,0,0,4, ;89
db 6,4,3,1,0,0,0,4, ;90
db 6,4,3,1,0,0,0,5, ;91
db 6,4,3,2,0,0,0,4, ;92
db 6,4,3,2,0,0,0,5, ;93
db 6,4,3,2,1,0,0,5, ;94
db 6,4,3,2,1,0,0,6, ;95
db 6,5,0,0,0,0,0,2, ;96
db 6,5,0,0,0,0,0,3, ;97
db 6,5,1,0,0,0,0,3, ;98
db 6,5,1,0,0,0,0,4, ;99
db 6,5,2,0,0,0,0,3, ;100
db 6,5,2,0,0,0,0,4, ;101
db 6,5,2,1,0,0,0,4, ;102
db 6,5,2,1,0,0,0,5, ;103
db 6,5,3,0,0,0,0,3, ;104
db 6,5,3,0,0,0,0,4, ;105
db 6,5,3,1,0,0,0,4, ;106
db 6,5,3,1,0,0,0,5, ;107
db 6,5,3,2,0,0,0,4, ;108
db 6,5,3,2,0,0,0,5, ;109
db 6,5,3,2,1,0,0,5, ;110
db 6,5,3,2,1,0,0,6, ;111
db 6,5,4,0,0,0,0,3, ;112
db 6,5,4,0,0,0,0,4, ;113
db 6,5,4,1,0,0,0,4, ;114
db 6,5,4,1,0,0,0,5, ;115
db 6,5,4,2,0,0,0,4, ;116
db 6,5,4,2,0,0,0,5, ;117
db 6,5,4,2,1,0,0,5, ;118
db 6,5,4,2,1,0,0,6, ;119
db 6,5,4,3,0,0,0,4, ;120
db 6,5,4,3,0,0,0,5, ;121
db 6,5,4,3,1,0,0,5, ;122
db 6,5,4,3,1,0,0,6, ;123
db 6,5,4,3,2,0,0,5, ;124
db 6,5,4,3,2,0,0,6, ;125
db 6,5,4,3,2,1,0,6, ;126
db 6,5,4,3,2,1,0,7, ;127
db 7,0,0,0,0,0,0,1, ;128
db 7,0,0,0,0,0,0,2, ;129
db 7,1,0,0,0,0,0,2, ;130
db 7,1,0,0,0,0,0,3, ;131
db 7,2,0,0,0,0,0,2, ;132
db 7,2,0,0,0,0,0,3, ;133
db 7,2,1,0,0,0,0,3, ;134
db 7,2,1,0,0,0,0,4, ;135
db 7,3,0,0,0,0,0,2, ;136
db 7,3,0,0,0,0,0,3, ;137
db 7,3,1,0,0,0,0,3, ;138
db 7,3,1,0,0,0,0,4, ;139
db 7,3,2,0,0,0,0,3, ;140
db 7,3,2,0,0,0,0,4, ;141
db 7,3,2,1,0,0,0,4, ;142
db 7,3,2,1,0,0,0,5, ;143
db 7,4,0,0,0,0,0,2, ;144
db 7,4,0,0,0,0,0,3, ;145
db 7,4,1,0,0,0,0,3, ;146
db 7,4,1,0,0,0,0,4, ;147
db 7,4,2,0,0,0,0,3, ;148
db 7,4,2,0,0,0,0,4, ;149
db 7,4,2,1,0,0,0,4, ;150
db 7,4,2,1,0,0,0,5, ;151
db 7,4,3,0,0,0,0,3, ;152
db 7,4,3,0,0,0,0,4, ;153
db 7,4,3,1,0,0,0,4, ;154
db 7,4,3,1,0,0,0,5, ;155
db 7,4,3,2,0,0,0,4, ;156
db 7,4,3,2,0,0,0,5, ;157
db 7,4,3,2,1,0,0,5, ;158
db 7,4,3,2,1,0,0,6, ;159
db 7,5,0,0,0,0,0,2, ;160
db 7,5,0,0,0,0,0,3, ;161
db 7,5,1,0,0,0,0,3, ;162
db 7,5,1,0,0,0,0,4, ;163
db 7,5,2,0,0,0,0,3, ;164
db 7,5,2,0,0,0,0,4, ;165
db 7,5,2,1,0,0,0,4, ;166
db 7,5,2,1,0,0,0,5, ;167
db 7,5,3,0,0,0,0,3, ;168
db 7,5,3,0,0,0,0,4, ;169
db 7,5,3,1,0,0,0,4, ;170
db 7,5,3,1,0,0,0,5, ;171
db 7,5,3,2,0,0,0,4, ;172
db 7,5,3,2,0,0,0,5, ;173
db 7,5,3,2,1,0,0,5, ;174
db 7,5,3,2,1,0,0,6, ;175
db 7,5,4,0,0,0,0,3, ;176
db 7,5,4,0,0,0,0,4, ;177
db 7,5,4,1,0,0,0,4, ;178
db 7,5,4,1,0,0,0,5, ;179
db 7,5,4,2,0,0,0,4, ;180
db 7,5,4,2,0,0,0,5, ;181
db 7,5,4,2,1,0,0,5, ;182
db 7,5,4,2,1,0,0,6, ;183
db 7,5,4,3,0,0,0,4, ;184
db 7,5,4,3,0,0,0,5, ;185
db 7,5,4,3,1,0,0,5, ;186
db 7,5,4,3,1,0,0,6, ;187
db 7,5,4,3,2,0,0,5, ;188
db 7,5,4,3,2,0,0,6, ;189
db 7,5,4,3,2,1,0,6, ;190
db 7,5,4,3,2,1,0,7, ;191
db 7,6,0,0,0,0,0,2, ;192
db 7,6,0,0,0,0,0,3, ;193
db 7,6,1,0,0,0,0,3, ;194
db 7,6,1,0,0,0,0,4, ;195
db 7,6,2,0,0,0,0,3, ;196
db 7,6,2,0,0,0,0,4, ;197
db 7,6,2,1,0,0,0,4, ;198
db 7,6,2,1,0,0,0,5, ;199
db 7,6,3,0,0,0,0,3, ;200
db 7,6,3,0,0,0,0,4, ;201
db 7,6,3,1,0,0,0,4, ;202
db 7,6,3,1,0,0,0,5, ;203
db 7,6,3,2,0,0,0,4, ;204
db 7,6,3,2,0,0,0,5, ;205
db 7,6,3,2,1,0,0,5, ;206
db 7,6,3,2,1,0,0,6, ;207
db 7,6,4,0,0,0,0,3, ;208
db 7,6,4,0,0,0,0,4, ;209
db 7,6,4,1,0,0,0,4, ;210
db 7,6,4,1,0,0,0,5, ;211
db 7,6,4,2,0,0,0,4, ;212
db 7,6,4,2,0,0,0,5, ;213
db 7,6,4,2,1,0,0,5, ;214
db 7,6,4,2,1,0,0,6, ;215
db 7,6,4,3,0,0,0,4, ;216
db 7,6,4,3,0,0,0,5, ;217
db 7,6,4,3,1,0,0,5, ;218
db 7,6,4,3,1,0,0,6, ;219
db 7,6,4,3,2,0,0,5, ;220
db 7,6,4,3,2,0,0,6, ;221
db 7,6,4,3,2,1,0,6, ;222
db 7,6,4,3,2,1,0,7, ;223
db 7,6,5,0,0,0,0,3, ;224
db 7,6,5,0,0,0,0,4, ;225
db 7,6,5,1,0,0,0,4, ;226
db 7,6,5,1,0,0,0,5, ;227
db 7,6,5,2,0,0,0,4, ;228
db 7,6,5,2,0,0,0,5, ;229
db 7,6,5,2,1,0,0,5, ;230
db 7,6,5,2,1,0,0,6, ;231
db 7,6,5,3,0,0,0,4, ;232
db 7,6,5,3,0,0,0,5, ;233
db 7,6,5,3,1,0,0,5, ;234
db 7,6,5,3,1,0,0,6, ;235
db 7,6,5,3,2,0,0,5, ;236
db 7,6,5,3,2,0,0,6, ;237
db 7,6,5,3,2,1,0,6, ;238
db 7,6,5,3,2,1,0,7, ;239
db 7,6,5,4,0,0,0,4, ;240
db 7,6,5,4,0,0,0,5, ;241
db 7,6,5,4,1,0,0,5, ;242
db 7,6,5,4,1,0,0,6, ;243
db 7,6,5,4,2,0,0,5, ;244
db 7,6,5,4,2,0,0,6, ;245
db 7,6,5,4,2,1,0,6, ;246
db 7,6,5,4,2,1,0,7, ;247
db 7,6,5,4,3,0,0,5, ;248
db 7,6,5,4,3,0,0,6, ;249
db 7,6,5,4,3,1,0,6, ;250
db 7,6,5,4,3,1,0,7, ;251
db 7,6,5,4,3,2,0,6, ;252
db 7,6,5,4,3,2,0,7, ;253
db 7,6,5,4,3,2,1,7, ;254
db 7,6,5,4,3,2,1,8, ;255
db 0,0,0,0,0,0,0,0, ;0
db 0,0,0,0,0,0,0,1, ;1
db 1,0,0,0,0,0,0,1, ;2
db 1,0,0,0,0,0,0,2, ;3
db 2,0,0,0,0,0,0,1, ;4
db 2,0,0,0,0,0,0,2, ;5
db 2,1,0,0,0,0,0,2, ;6
db 2,1,0,0,0,0,0,3, ;7
db 3,0,0,0,0,0,0,1, ;8
db 3,0,0,0,0,0,0,2, ;9
db 3,1,0,0,0,0,0,2, ;10
db 3,1,0,0,0,0,0,3, ;11
db 3,2,0,0,0,0,0,2, ;12
db 3,2,0,0,0,0,0,3, ;13
db 3,2,1,0,0,0,0,3, ;14
db 3,2,1,0,0,0,0,4, ;15
db 4,0,0,0,0,0,0,1, ;16
db 4,0,0,0,0,0,0,2, ;17
db 4,1,0,0,0,0,0,2, ;18
db 4,1,0,0,0,0,0,3, ;19
db 4,2,0,0,0,0,0,2, ;20
db 4,2,0,0,0,0,0,3, ;21
db 4,2,1,0,0,0,0,3, ;22
db 4,2,1,0,0,0,0,4, ;23
db 4,3,0,0,0,0,0,2, ;24
db 4,3,0,0,0,0,0,3, ;25
db 4,3,1,0,0,0,0,3, ;26
db 4,3,1,0,0,0,0,4, ;27
db 4,3,2,0,0,0,0,3, ;28
db 4,3,2,0,0,0,0,4, ;29
db 4,3,2,1,0,0,0,4, ;30
db 4,3,2,1,0,0,0,5, ;31
db 5,0,0,0,0,0,0,1, ;32
db 5,0,0,0,0,0,0,2, ;33
db 5,1,0,0,0,0,0,2, ;34
db 5,1,0,0,0,0,0,3, ;35
db 5,2,0,0,0,0,0,2, ;36
db 5,2,0,0,0,0,0,3, ;37
db 5,2,1,0,0,0,0,3, ;38
db 5,2,1,0,0,0,0,4, ;39
db 5,3,0,0,0,0,0,2, ;40
db 5,3,0,0,0,0,0,3, ;41
db 5,3,1,0,0,0,0,3, ;42
db 5,3,1,0,0,0,0,4, ;43
db 5,3,2,0,0,0,0,3, ;44
db 5,3,2,0,0,0,0,4, ;45
db 5,3,2,1,0,0,0,4, ;46
db 5,3,2,1,0,0,0,5, ;47
db 5,4,0,0,0,0,0,2, ;48
db 5,4,0,0,0,0,0,3, ;49
db 5,4,1,0,0,0,0,3, ;50
db 5,4,1,0,0,0,0,4, ;51
db 5,4,2,0,0,0,0,3, ;52
db 5,4,2,0,0,0,0,4, ;53
db 5,4,2,1,0,0,0,4, ;54
db 5,4,2,1,0,0,0,5, ;55
db 5,4,3,0,0,0,0,3, ;56
db 5,4,3,0,0,0,0,4, ;57
db 5,4,3,1,0,0,0,4, ;58
db 5,4,3,1,0,0,0,5, ;59
db 5,4,3,2,0,0,0,4, ;60
db 5,4,3,2,0,0,0,5, ;61
db 5,4,3,2,1,0,0,5, ;62
db 5,4,3,2,1,0,0,6, ;63
db 6,0,0,0,0,0,0,1, ;64
db 6,0,0,0,0,0,0,2, ;65
db 6,1,0,0,0,0,0,2, ;66
db 6,1,0,0,0,0,0,3, ;67
db 6,2,0,0,0,0,0,2, ;68
db 6,2,0,0,0,0,0,3, ;69
db 6,2,1,0,0,0,0,3, ;70
db 6,2,1,0,0,0,0,4, ;71
db 6,3,0,0,0,0,0,2, ;72
db 6,3,0,0,0,0,0,3, ;73
db 6,3,1,0,0,0,0,3, ;74
db 6,3,1,0,0,0,0,4, ;75
db 6,3,2,0,0,0,0,3, ;76
db 6,3,2,0,0,0,0,4, ;77
db 6,3,2,1,0,0,0,4, ;78
db 6,3,2,1,0,0,0,5, ;79
db 6,4,0,0,0,0,0,2, ;80
db 6,4,0,0,0,0,0,3, ;81
db 6,4,1,0,0,0,0,3, ;82
db 6,4,1,0,0,0,0,4, ;83
db 6,4,2,0,0,0,0,3, ;84
db 6,4,2,0,0,0,0,4, ;85
db 6,4,2,1,0,0,0,4, ;86
db 6,4,2,1,0,0,0,5, ;87
db 6,4,3,0,0,0,0,3, ;88
db 6,4,3,0,0,0,0,4, ;89
db 6,4,3,1,0,0,0,4, ;90
db 6,4,3,1,0,0,0,5, ;91
db 6,4,3,2,0,0,0,4, ;92
db 6,4,3,2,0,0,0,5, ;93
db 6,4,3,2,1,0,0,5, ;94
db 6,4,3,2,1,0,0,6, ;95
db 6,5,0,0,0,0,0,2, ;96
db 6,5,0,0,0,0,0,3, ;97
db 6,5,1,0,0,0,0,3, ;98
db 6,5,1,0,0,0,0,4, ;99
db 6,5,2,0,0,0,0,3, ;100
db 6,5,2,0,0,0,0,4, ;101
db 6,5,2,1,0,0,0,4, ;102
db 6,5,2,1,0,0,0,5, ;103
db 6,5,3,0,0,0,0,3, ;104
db 6,5,3,0,0,0,0,4, ;105
db 6,5,3,1,0,0,0,4, ;106
db 6,5,3,1,0,0,0,5, ;107
db 6,5,3,2,0,0,0,4, ;108
db 6,5,3,2,0,0,0,5, ;109
db 6,5,3,2,1,0,0,5, ;110
db 6,5,3,2,1,0,0,6, ;111
db 6,5,4,0,0,0,0,3, ;112
db 6,5,4,0,0,0,0,4, ;113
db 6,5,4,1,0,0,0,4, ;114
db 6,5,4,1,0,0,0,5, ;115
db 6,5,4,2,0,0,0,4, ;116
db 6,5,4,2,0,0,0,5, ;117
db 6,5,4,2,1,0,0,5, ;118
db 6,5,4,2,1,0,0,6, ;119
db 6,5,4,3,0,0,0,4, ;120
db 6,5,4,3,0,0,0,5, ;121
db 6,5,4,3,1,0,0,5, ;122
db 6,5,4,3,1,0,0,6, ;123
db 6,5,4,3,2,0,0,5, ;124
db 6,5,4,3,2,0,0,6, ;125
db 6,5,4,3,2,1,0,6, ;126
db 6,5,4,3,2,1,0,7, ;127
db 7,0,0,0,0,0,0,1, ;128
db 7,0,0,0,0,0,0,2, ;129
db 7,1,0,0,0,0,0,2, ;130
db 7,1,0,0,0,0,0,3, ;131
db 7,2,0,0,0,0,0,2, ;132
db 7,2,0,0,0,0,0,3, ;133
db 7,2,1,0,0,0,0,3, ;134
db 7,2,1,0,0,0,0,4, ;135
db 7,3,0,0,0,0,0,2, ;136
db 7,3,0,0,0,0,0,3, ;137
db 7,3,1,0,0,0,0,3, ;138
db 7,3,1,0,0,0,0,4, ;139
db 7,3,2,0,0,0,0,3, ;140
db 7,3,2,0,0,0,0,4, ;141
db 7,3,2,1,0,0,0,4, ;142
db 7,3,2,1,0,0,0,5, ;143
db 7,4,0,0,0,0,0,2, ;144
db 7,4,0,0,0,0,0,3, ;145
db 7,4,1,0,0,0,0,3, ;146
db 7,4,1,0,0,0,0,4, ;147
db 7,4,2,0,0,0,0,3, ;148
db 7,4,2,0,0,0,0,4, ;149
db 7,4,2,1,0,0,0,4, ;150
db 7,4,2,1,0,0,0,5, ;151
db 7,4,3,0,0,0,0,3, ;152
db 7,4,3,0,0,0,0,4, ;153
db 7,4,3,1,0,0,0,4, ;154
db 7,4,3,1,0,0,0,5, ;155
db 7,4,3,2,0,0,0,4, ;156
db 7,4,3,2,0,0,0,5, ;157
db 7,4,3,2,1,0,0,5, ;158
db 7,4,3,2,1,0,0,6, ;159
db 7,5,0,0,0,0,0,2, ;160
db 7,5,0,0,0,0,0,3, ;161
db 7,5,1,0,0,0,0,3, ;162
db 7,5,1,0,0,0,0,4, ;163
db 7,5,2,0,0,0,0,3, ;164
db 7,5,2,0,0,0,0,4, ;165
db 7,5,2,1,0,0,0,4, ;166
db 7,5,2,1,0,0,0,5, ;167
db 7,5,3,0,0,0,0,3, ;168
db 7,5,3,0,0,0,0,4, ;169
db 7,5,3,1,0,0,0,4, ;170
db 7,5,3,1,0,0,0,5, ;171
db 7,5,3,2,0,0,0,4, ;172
db 7,5,3,2,0,0,0,5, ;173
db 7,5,3,2,1,0,0,5, ;174
db 7,5,3,2,1,0,0,6, ;175
db 7,5,4,0,0,0,0,3, ;176
db 7,5,4,0,0,0,0,4, ;177
db 7,5,4,1,0,0,0,4, ;178
db 7,5,4,1,0,0,0,5, ;179
db 7,5,4,2,0,0,0,4, ;180
db 7,5,4,2,0,0,0,5, ;181
db 7,5,4,2,1,0,0,5, ;182
db 7,5,4,2,1,0,0,6, ;183
db 7,5,4,3,0,0,0,4, ;184
db 7,5,4,3,0,0,0,5, ;185
db 7,5,4,3,1,0,0,5, ;186
db 7,5,4,3,1,0,0,6, ;187
db 7,5,4,3,2,0,0,5, ;188
db 7,5,4,3,2,0,0,6, ;189
db 7,5,4,3,2,1,0,6, ;190
db 7,5,4,3,2,1,0,7, ;191
db 7,6,0,0,0,0,0,2, ;192
db 7,6,0,0,0,0,0,3, ;193
db 7,6,1,0,0,0,0,3, ;194
db 7,6,1,0,0,0,0,4, ;195
db 7,6,2,0,0,0,0,3, ;196
db 7,6,2,0,0,0,0,4, ;197
db 7,6,2,1,0,0,0,4, ;198
db 7,6,2,1,0,0,0,5, ;199
db 7,6,3,0,0,0,0,3, ;200
db 7,6,3,0,0,0,0,4, ;201
db 7,6,3,1,0,0,0,4, ;202
db 7,6,3,1,0,0,0,5, ;203
db 7,6,3,2,0,0,0,4, ;204
db 7,6,3,2,0,0,0,5, ;205
db 7,6,3,2,1,0,0,5, ;206
db 7,6,3,2,1,0,0,6, ;207
db 7,6,4,0,0,0,0,3, ;208
db 7,6,4,0,0,0,0,4, ;209
db 7,6,4,1,0,0,0,4, ;210
db 7,6,4,1,0,0,0,5, ;211
db 7,6,4,2,0,0,0,4, ;212
db 7,6,4,2,0,0,0,5, ;213
db 7,6,4,2,1,0,0,5, ;214
db 7,6,4,2,1,0,0,6, ;215
db 7,6,4,3,0,0,0,4, ;216
db 7,6,4,3,0,0,0,5, ;217
db 7,6,4,3,1,0,0,5, ;218
db 7,6,4,3,1,0,0,6, ;219
db 7,6,4,3,2,0,0,5, ;220
db 7,6,4,3,2,0,0,6, ;221
db 7,6,4,3,2,1,0,6, ;222
db 7,6,4,3,2,1,0,7, ;223
db 7,6,5,0,0,0,0,3, ;224
db 7,6,5,0,0,0,0,4, ;225
db 7,6,5,1,0,0,0,4, ;226
db 7,6,5,1,0,0,0,5, ;227
db 7,6,5,2,0,0,0,4, ;228
db 7,6,5,2,0,0,0,5, ;229
db 7,6,5,2,1,0,0,5, ;230
db 7,6,5,2,1,0,0,6, ;231
db 7,6,5,3,0,0,0,4, ;232
db 7,6,5,3,0,0,0,5, ;233
db 7,6,5,3,1,0,0,5, ;234
db 7,6,5,3,1,0,0,6, ;235
db 7,6,5,3,2,0,0,5, ;236
db 7,6,5,3,2,0,0,6, ;237
db 7,6,5,3,2,1,0,6, ;238
db 7,6,5,3,2,1,0,7, ;239
db 7,6,5,4,0,0,0,4, ;240
db 7,6,5,4,0,0,0,5, ;241
db 7,6,5,4,1,0,0,5, ;242
db 7,6,5,4,1,0,0,6, ;243
db 7,6,5,4,2,0,0,5, ;244
db 7,6,5,4,2,0,0,6, ;245
db 7,6,5,4,2,1,0,6, ;246
db 7,6,5,4,2,1,0,7, ;247
db 7,6,5,4,3,0,0,5, ;248
db 7,6,5,4,3,0,0,6, ;249
db 7,6,5,4,3,1,0,6, ;250
db 7,6,5,4,3,1,0,7, ;251
db 7,6,5,4,3,2,0,6, ;252
db 7,6,5,4,3,2,0,7, ;253
db 7,6,5,4,3,2,1,7, ;254
db 7,6,5,4,3,2,1,8, ;255
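byte_1pos_table maps a byte, used as a bit mask of non-zero positions, to the positions of its set bits (most significant first, zero padded) with the population count in the eighth entry. A small illustrative C generator (not part of the sources) reproduces the table:

#include <stdio.h>

int main(void) {
    for (int v = 0; v < 256; v++) {
        int pos[8] = {0};
        int n = 0;
        for (int b = 7; b >= 0; b--)        /* collect set-bit positions, MSB first */
            if (v & (1 << b))
                pos[n++] = b;
        printf("db ");
        for (int j = 0; j < 7; j++)
            printf("%d,", pos[j]);
        printf("%d, ;%d\n", n, v);          /* eighth entry: number of set bits */
    }
    return 0;
}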
;***********************************************************************
; Code
@ -323,43 +323,43 @@ SECTION .text
;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
;***********************************************************************
WELS_EXTERN CavlcParamCal_sse2
push ebx
push edi
push esi
push ebx
push edi
push esi
mov eax, [esp+16] ;coffLevel
mov edi, [esp+24] ;Level
mov ebx, [esp+32] ;endIdx
cmp ebx, 3
jne .Level16
pxor xmm1, xmm1
movq xmm0, [eax] ; removed QWORD
jmp .Cal_begin
mov eax, [esp+16] ;coffLevel
mov edi, [esp+24] ;Level
mov ebx, [esp+32] ;endIdx
cmp ebx, 3
jne .Level16
pxor xmm1, xmm1
movq xmm0, [eax] ; removed QWORD
jmp .Cal_begin
.Level16:
movdqa xmm0, [eax]
movdqa xmm1, [eax+16]
movdqa xmm0, [eax]
movdqa xmm1, [eax+16]
.Cal_begin:
movdqa xmm2, xmm0
packsswb xmm0, xmm1
movdqa xmm4, xmm0
pxor xmm3, xmm3
pcmpgtb xmm0, xmm3
pcmpgtb xmm3, xmm4
por xmm0, xmm3
pmovmskb edx, xmm0
cmp edx, 0
je near .return
movdqa xmm6, [sse2_b_1]
pcmpeqw xmm7, xmm7 ;generate -1
mov ebx, 0xff
;pinsrw xmm6, ebx, 3
movdqa xmm2, xmm0
packsswb xmm0, xmm1
movdqa xmm4, xmm0
pxor xmm3, xmm3
pcmpgtb xmm0, xmm3
pcmpgtb xmm3, xmm4
por xmm0, xmm3
pmovmskb edx, xmm0
cmp edx, 0
je near .return
movdqa xmm6, [sse2_b_1]
pcmpeqw xmm7, xmm7 ;generate -1
mov ebx, 0xff
;pinsrw xmm6, ebx, 3
mov bl, dh
lea ebx, [byte_1pos_table+8*ebx]
movq xmm0, [ebx]
pextrw ecx, xmm0, 3
shr ecx, 8
lea ebx, [byte_1pos_table+8*ebx]
movq xmm0, [ebx]
pextrw ecx, xmm0, 3
shr ecx, 8
mov dh, cl
.loopHighFind0:
@ -367,19 +367,19 @@ WELS_EXTERN CavlcParamCal_sse2
je .loopHighFind0End
;mov esi, [ebx]
;and esi, 0xff
movzx esi, byte [ebx]
movzx esi, byte [ebx]
add esi, 8
mov esi, [eax+2*esi]
mov [edi], si
add edi, 2
;add ebx, 1
inc ebx
inc ebx
dec ecx
jmp .loopHighFind0
jmp .loopHighFind0
.loopHighFind0End:
mov cl, dh
cmp cl, 8
pand xmm0, xmm6
pand xmm0, xmm6
jne .LowByteFind0
sub edi, 2
mov esi, [eax+16]
@ -387,8 +387,8 @@ WELS_EXTERN CavlcParamCal_sse2
add edi, 2
.LowByteFind0:
and edx, 0xff
lea ebx, [byte_1pos_table+8*edx]
movq xmm1, [ebx]
lea ebx, [byte_1pos_table+8*edx]
movq xmm1, [ebx]
pextrw esi, xmm1, 3
or esi, 0xff
or ecx, 0xff00
@ -398,16 +398,16 @@ WELS_EXTERN CavlcParamCal_sse2
.loopLowFind0:
cmp esi, 0
je .loopLowFind0End
;mov edx, [ebx]
;and edx, 0xff
movzx edx, byte [ebx]
mov edx, [eax+2*edx]
mov [edi], dx
add edi, 2
;add ebx, 1
inc ebx
;mov edx, [ebx]
;and edx, 0xff
movzx edx, byte [ebx]
mov edx, [eax+2*edx]
mov [edi], dx
add edi, 2
;add ebx, 1
inc ebx
dec esi
jmp .loopLowFind0
jmp .loopLowFind0
.loopLowFind0End:
cmp ch, 8
jne .getLevelEnd
@ -415,12 +415,12 @@ WELS_EXTERN CavlcParamCal_sse2
mov edx, [eax]
mov [edi], dx
.getLevelEnd:
mov edx, [esp+28] ;total_coeffs
mov edx, [esp+28] ;total_coeffs
;mov ebx, ecx
;and ebx, 0xff
movzx ebx, byte cl
movzx ebx, byte cl
add cl, ch
mov [edx], cl
mov [edx], cl
;getRun
movq xmm5, [sse2_b8]
paddb xmm0, xmm5
@ -430,7 +430,7 @@ WELS_EXTERN CavlcParamCal_sse2
sub eax, ebx
shl eax, 3
shl ebx, 3
pinsrw xmm2, ebx, 0
pinsrw xmm2, ebx, 0
pinsrw xmm3, eax, 0
psllq xmm0, xmm3
psrlq xmm0, xmm3
@ -441,19 +441,19 @@ WELS_EXTERN CavlcParamCal_sse2
por xmm0, xmm1
pextrw eax, xmm0, 0
and eax, 0xff
and eax, 0xff
inc eax
sub al, cl
movdqa xmm1, xmm0
paddb xmm1, xmm7
psrldq xmm0, 1
psubb xmm1, xmm0
movdqa xmm1, xmm0
paddb xmm1, xmm7
psrldq xmm0, 1
psubb xmm1, xmm0
mov ecx, [esp+20] ;run
movdqa [ecx], xmm1
movdqa [ecx], xmm1
;getRunEnd
.return:
pop esi
pop edi
pop ebx
ret
pop esi
pop edi
pop ebx
ret
%endif

View File

@ -50,17 +50,17 @@ SECTION .rodata align=16
align 16
SSE2_DeQuant8 dw 10, 13, 10, 13, 13, 16, 13, 16,
dw 10, 13, 10, 13, 13, 16, 13, 16,
dw 10, 13, 10, 13, 13, 16, 13, 16,
dw 11, 14, 11, 14, 14, 18, 14, 18,
dw 11, 14, 11, 14, 14, 18, 14, 18,
dw 13, 16, 13, 16, 16, 20, 16, 20,
dw 13, 16, 13, 16, 16, 20, 16, 20,
dw 11, 14, 11, 14, 14, 18, 14, 18,
dw 13, 16, 13, 16, 16, 20, 16, 20,
dw 13, 16, 13, 16, 16, 20, 16, 20,
dw 14, 18, 14, 18, 18, 23, 18, 23,
dw 14, 18, 14, 18, 18, 23, 18, 23,
dw 16, 20, 16, 20, 20, 25, 20, 25,
dw 16, 20, 16, 20, 20, 25, 20, 25,
dw 14, 18, 14, 18, 18, 23, 18, 23,
dw 16, 20, 16, 20, 20, 25, 20, 25,
dw 16, 20, 16, 20, 20, 25, 20, 25,
dw 18, 23, 18, 23, 23, 29, 23, 29,
dw 18, 23, 18, 23, 23, 29, 23, 29
dw 18, 23, 18, 23, 23, 29, 23, 29
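The rows of SSE2_DeQuant8 are the H.264 dequantisation factors for qp%6 = 0..5, laid out per 4x4 coefficient position. A hedged sketch of how such a table is typically applied to an AC coefficient block; the real decoder's DC and low-QP handling differs in detail, and the function name is illustrative only.

#include <stdint.h>

static const int16_t kiDequant[6][8] = {
    {10, 13, 10, 13, 13, 16, 13, 16},
    {11, 14, 11, 14, 14, 18, 14, 18},
    {13, 16, 13, 16, 16, 20, 16, 20},
    {14, 18, 14, 18, 18, 23, 18, 23},
    {16, 20, 16, 20, 20, 25, 20, 25},
    {18, 23, 18, 23, 23, 29, 23, 29},
};

/* iCoeff holds a 4x4 block in raster order; rows 2 and 3 reuse the same
 * factors as rows 0 and 1, which is why 8 entries per QP class suffice. */
void Dequant4x4Sketch(int16_t iCoeff[16], int iQp) {
    const int16_t* pV = kiDequant[iQp % 6];
    for (int i = 0; i < 16; i++)
        iCoeff[i] = (int16_t)((iCoeff[i] * pV[i & 7]) << (iQp / 6));
}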
;***********************************************************************
@ -68,27 +68,27 @@ SSE2_DeQuant8 dw 10, 13, 10, 13, 13, 16, 13, 16,
;***********************************************************************
%macro MMX_LoadDiff4P 5
movd %1, [%3]
movd %2, [%4]
punpcklbw %1, %5
punpcklbw %2, %5
psubw %1, %2
movd %1, [%3]
movd %2, [%4]
punpcklbw %1, %5
punpcklbw %2, %5
psubw %1, %2
%endmacro
%macro MMX_LoadDiff4x4P 10 ;d0, d1, d2, d3, pix1address, pix1stride, pix2address, pix2stride, tmp(mm), 0(mm)
MMX_LoadDiff4P %1, %9, %5, %7, %10
MMX_LoadDiff4P %2, %9, %5+%6, %7+%8, %10
lea %5, [%5+2*%6]
lea %7, [%7+2*%8]
MMX_LoadDiff4P %3, %9, %5, %7, %10
MMX_LoadDiff4P %4, %9, %5+%6, %7+%8, %10
MMX_LoadDiff4P %1, %9, %5, %7, %10
MMX_LoadDiff4P %2, %9, %5+%6, %7+%8, %10
lea %5, [%5+2*%6]
lea %7, [%7+2*%8]
MMX_LoadDiff4P %3, %9, %5, %7, %10
MMX_LoadDiff4P %4, %9, %5+%6, %7+%8, %10
%endmacro
%macro MMX_SumSubMul2 3
movq %3, %1
psllw %1, $01
paddw %1, %2
psllw %2, $01
movq %3, %1
psllw %1, $01
paddw %1, %2
psllw %2, $01
psubw %3, %2
%endmacro
@ -101,23 +101,23 @@ SSE2_DeQuant8 dw 10, 13, 10, 13, 13, 16, 13, 16,
%endmacro
%macro MMX_SumSub 3
movq %3, %2
movq %3, %2
psubw %2, %1
paddw %1, %3
%endmacro
%macro MMX_DCT 6
MMX_SumSub %4, %1, %6
MMX_SumSub %3, %2, %6
MMX_SumSub %3, %4, %6
MMX_SumSub %4, %1, %6
MMX_SumSub %3, %2, %6
MMX_SumSub %3, %4, %6
MMX_SumSubMul2 %1, %2, %5
%endmacro
%macro MMX_IDCT 6
MMX_SumSub %4, %5, %6
MMX_SumSubDiv2 %3, %2, %1
MMX_SumSub %1, %4, %6
MMX_SumSub %3, %5, %6
MMX_SumSub %1, %4, %6
MMX_SumSub %3, %5, %6
%endmacro
%macro MMX_StoreDiff4P 6
@ -142,11 +142,11 @@ WELS_EXTERN WelsDctT4_mmx
MMX_LoadDiff4x4P mm1, mm2, mm3, mm4, r1, r2, r3, r4, mm0, mm7
MMX_DCT mm1, mm2, mm3 ,mm4, mm5, mm6
MMX_Trans4x4W mm3, mm1, mm4, mm5, mm2
MMX_DCT mm1, mm2, mm3 ,mm4, mm5, mm6
MMX_Trans4x4W mm3, mm1, mm4, mm5, mm2
MMX_DCT mm3, mm5, mm2 ,mm4, mm1, mm6
MMX_Trans4x4W mm2, mm3, mm4, mm1, mm5
MMX_DCT mm3, mm5, mm2 ,mm4, mm1, mm6
MMX_Trans4x4W mm2, mm3, mm4, mm1, mm5
movq [r0+ 0], mm2
movq [r0+ 8], mm1
@ -170,22 +170,22 @@ WELS_EXTERN WelsIDctT4Rec_mmx
movq mm2, [r4+16]
movq mm3, [r4+24]
MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4
MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6
MMX_Trans4x4W mm1, mm3, mm0, mm4, mm2
MMX_IDCT mm3, mm0, mm4, mm2, mm1, mm6
MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4
MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6
MMX_Trans4x4W mm1, mm3, mm0, mm4, mm2
MMX_IDCT mm3, mm0, mm4, mm2, mm1, mm6
WELS_Zero mm7
WELS_DW32 mm6
WELS_Zero mm7
WELS_DW32 mm6
MMX_StoreDiff4P mm3, mm0, mm6, mm7, [r0], [r2]
MMX_StoreDiff4P mm4, mm0, mm6, mm7, [r0+r1], [r2+r3]
MMX_StoreDiff4P mm3, mm0, mm6, mm7, [r0], [r2]
MMX_StoreDiff4P mm4, mm0, mm6, mm7, [r0+r1], [r2+r3]
lea r0, [r0+2*r1]
lea r2, [r2+2*r3]
MMX_StoreDiff4P mm1, mm0, mm6, mm7, [r0], [r2]
MMX_StoreDiff4P mm2, mm0, mm6, mm7, [r0+r1], [r2+r3]
MMX_StoreDiff4P mm1, mm0, mm6, mm7, [r0], [r2]
MMX_StoreDiff4P mm2, mm0, mm6, mm7, [r0+r1], [r2+r3]
WELSEMMS
WELSEMMS
LOAD_5_PARA_POP
ret
@ -194,21 +194,21 @@ WELS_EXTERN WelsIDctT4Rec_mmx
; SSE2 functions
;***********************************************************************
%macro SSE2_Store4x8p 6
SSE2_XSawp qdq, %2, %3, %6
SSE2_XSawp qdq, %4, %5, %3
MOVDQ [%1+0x00], %2
MOVDQ [%1+0x10], %4
MOVDQ [%1+0x20], %6
MOVDQ [%1+0x30], %3
SSE2_XSawp qdq, %2, %3, %6
SSE2_XSawp qdq, %4, %5, %3
MOVDQ [%1+0x00], %2
MOVDQ [%1+0x10], %4
MOVDQ [%1+0x20], %6
MOVDQ [%1+0x30], %3
%endmacro
%macro SSE2_Load4x8p 6
MOVDQ %2, [%1+0x00]
MOVDQ %4, [%1+0x10]
MOVDQ %6, [%1+0x20]
MOVDQ %3, [%1+0x30]
SSE2_XSawp qdq, %4, %3, %5
SSE2_XSawp qdq, %2, %6, %3
MOVDQ %2, [%1+0x00]
MOVDQ %4, [%1+0x10]
MOVDQ %6, [%1+0x20]
MOVDQ %3, [%1+0x30]
SSE2_XSawp qdq, %4, %3, %5
SSE2_XSawp qdq, %2, %6, %3
%endmacro
%macro SSE2_SumSubMul2 3
@ -231,57 +231,57 @@ WELS_EXTERN WelsIDctT4Rec_mmx
%macro SSE2_StoreDiff8p 6
paddw %1, %3
psraw %1, $06
movq %2, %6
movq %2, %6
punpcklbw %2, %4
paddsw %2, %1
packuswb %2, %2
movq %5, %2
movq %5, %2
%endmacro
%macro SSE2_StoreDiff8p 5
movq %2, %5
movq %2, %5
punpcklbw %2, %3
paddsw %2, %1
packuswb %2, %2
movq %4, %2
movq %4, %2
%endmacro
%macro SSE2_Load8DC 6
movdqa %1, %6 ; %1 = dc0 dc1
paddw %1, %5
psraw %1, $06 ; (dc + 32) >> 6
%macro SSE2_Load8DC 6
movdqa %1, %6 ; %1 = dc0 dc1
paddw %1, %5
psraw %1, $06 ; (dc + 32) >> 6
movdqa %2, %1
psrldq %2, 4
punpcklwd %2, %2
punpckldq %2, %2 ; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
movdqa %2, %1
psrldq %2, 4
punpcklwd %2, %2
punpckldq %2, %2 ; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
movdqa %3, %1
psrldq %3, 8
punpcklwd %3, %3
punpckldq %3, %3 ; %3 = dc4 dc4 dc4 dc4 dc5 dc5 dc5 dc5
movdqa %3, %1
psrldq %3, 8
punpcklwd %3, %3
punpckldq %3, %3 ; %3 = dc4 dc4 dc4 dc4 dc5 dc5 dc5 dc5
movdqa %4, %1
psrldq %4, 12
punpcklwd %4, %4
punpckldq %4, %4 ; %4 = dc6 dc6 dc6 dc6 dc7 dc7 dc7 dc7
movdqa %4, %1
psrldq %4, 12
punpcklwd %4, %4
punpckldq %4, %4 ; %4 = dc6 dc6 dc6 dc6 dc7 dc7 dc7 dc7
punpcklwd %1, %1
punpckldq %1, %1 ; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
punpcklwd %1, %1
punpckldq %1, %1 ; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
%endmacro
%macro SSE2_DCT 6
SSE2_SumSub %6, %3, %5
SSE2_SumSub %1, %2, %5
SSE2_SumSub %3, %2, %5
SSE2_SumSubMul2 %6, %1, %4
SSE2_SumSub %6, %3, %5
SSE2_SumSub %1, %2, %5
SSE2_SumSub %3, %2, %5
SSE2_SumSubMul2 %6, %1, %4
%endmacro
%macro SSE2_IDCT 7
SSE2_SumSub %7, %2, %6
SSE2_SumSubDiv2 %1, %3, %5, %4
SSE2_SumSub %2, %1, %5
SSE2_SumSub %7, %4, %5
SSE2_SumSub %2, %1, %5
SSE2_SumSub %7, %4, %5
%endmacro
;***********************************************************************
@ -294,42 +294,42 @@ WELS_EXTERN WelsDctFourT4_sse2
SIGN_EXTENSION r2, r2d
SIGN_EXTENSION r4, r4d
pxor xmm7, xmm7
;Load 4x8
SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1], [r3]
;Load 4x8
SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1], [r3]
SSE2_LoadDiff8P xmm1, xmm6, xmm7, [r1+r2], [r3+r4]
lea r1, [r1 + 2 * r2]
lea r3, [r3 + 2 * r4]
SSE2_LoadDiff8P xmm2, xmm6, xmm7, [r1], [r3]
SSE2_LoadDiff8P xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1
SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0
SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
lea r1, [r1 + 2 * r2]
lea r3, [r3 + 2 * r4]
;Load 4x8
SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1 ], [r3 ]
SSE2_LoadDiff8P xmm1, xmm6, xmm7, [r1+r2 ], [r3+r4]
lea r1, [r1 + 2 * r2]
lea r3, [r3 + 2 * r4]
lea r1, [r1 + 2 * r2]
lea r3, [r3 + 2 * r4]
SSE2_LoadDiff8P xmm2, xmm6, xmm7, [r1], [r3]
SSE2_LoadDiff8P xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1
SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0
SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1
SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0
lea r0, [r0+64]
SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
POP_XMM
LOAD_5_PARA_POP
lea r1, [r1 + 2 * r2]
lea r3, [r3 + 2 * r4]
;Load 4x8
SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1 ], [r3 ]
SSE2_LoadDiff8P xmm1, xmm6, xmm7, [r1+r2 ], [r3+r4]
lea r1, [r1 + 2 * r2]
lea r3, [r3 + 2 * r4]
SSE2_LoadDiff8P xmm2, xmm6, xmm7, [r1], [r3]
SSE2_LoadDiff8P xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1
SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0
lea r0, [r0+64]
SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
POP_XMM
LOAD_5_PARA_POP
ret
@ -337,168 +337,168 @@ WELS_EXTERN WelsDctFourT4_sse2
; void WelsIDctFourT4Rec_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs);
;***********************************************************************
WELS_EXTERN WelsIDctFourT4Rec_sse2
%assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
;Load 4x8
SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5
%assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
;Load 4x8
SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5
SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
SSE2_TransTwo4x4W xmm1, xmm4, xmm0, xmm2, xmm3
SSE2_IDCT xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
WELS_Zero xmm7
WELS_DW32 xmm6
SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [r0 ], [r2]
SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2]
SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
add r4, 64
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5
SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
SSE2_TransTwo4x4W xmm1, xmm4, xmm0, xmm2, xmm3
SSE2_IDCT xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
SSE2_IDCT xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
WELS_Zero xmm7
WELS_DW32 xmm6
WELS_Zero xmm7
WELS_DW32 xmm6
SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [r0 ], [r2]
SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2]
SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1], [r2 + r3]
POP_XMM
LOAD_5_PARA_POP
; pop esi
; pop ebx
SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [r0 ], [r2]
SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2]
SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
add r4, 64
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5
SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
SSE2_TransTwo4x4W xmm1, xmm4, xmm0, xmm2, xmm3
SSE2_IDCT xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
WELS_Zero xmm7
WELS_DW32 xmm6
SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [r0 ], [r2]
SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2]
SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1], [r2 + r3]
POP_XMM
LOAD_5_PARA_POP
; pop esi
; pop ebx
ret
%macro SSE2_StoreDiff4x8p 8
SSE2_StoreDiff8p %1, %3, %4, [%5], [%6]
SSE2_StoreDiff8p %1, %3, %4, [%5 + %7], [%6 + %8]
SSE2_StoreDiff8p %2, %3, %4, [%5 + 8], [%6 + 8]
SSE2_StoreDiff8p %2, %3, %4, [%5 + %7 + 8], [%6 + %8 + 8]
SSE2_StoreDiff8p %1, %3, %4, [%5], [%6]
SSE2_StoreDiff8p %1, %3, %4, [%5 + %7], [%6 + %8]
SSE2_StoreDiff8p %2, %3, %4, [%5 + 8], [%6 + 8]
SSE2_StoreDiff8p %2, %3, %4, [%5 + %7 + 8], [%6 + %8 + 8]
%endmacro
;***********************************************************************
; void WelsIDctRecI16x16Dc_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *dct_dc)
;***********************************************************************
WELS_EXTERN WelsIDctRecI16x16Dc_sse2
%assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
pxor xmm7, xmm7
WELS_DW32 xmm6
SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [r4]
SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [r4 + 16]
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
POP_XMM
LOAD_5_PARA_POP
ret
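A scalar sketch of WelsIDctRecI16x16Dc_sse2: with only DC coefficients left, each 4x4 sub-block adds its rounded, scaled DC term to the prediction and clips to 8 bits (WELS_DW32 supplies the +32 used before the >>6). The function name and the raster ordering of dct_dc[16] are assumptions of the sketch, not the shipped C code.

#include <stdint.h>

static void IDctRecI16x16Dc_c(uint8_t *rec, int32_t stride, const uint8_t *pred,
                              int32_t pred_stride, const int16_t dct_dc[16]) {
    for (int y = 0; y < 16; y++) {
        for (int x = 0; x < 16; x++) {
            int v = pred[y * pred_stride + x] + ((dct_dc[(y / 4) * 4 + (x / 4)] + 32) >> 6);
            rec[y * stride + x] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
        }
    }
}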
%macro SSE2_SumSubD 3
movdqa %3, %2
paddd %2, %1
psubd %1, %3
%endmacro
%macro SSE2_SumSubDiv2D 4
paddd %1, %2
paddd %1, %3
psrad %1, 1
movdqa %4, %1
psubd %4, %2
%endmacro
%macro SSE2_Load4Col 5
movsx r2, WORD[%5]
movd %1, r2d
movsx r2, WORD[%5 + 0x20]
movd %2, r2d
punpckldq %1, %2
movsx r2, WORD[%5 + 0x80]
movd %3, r2d
movsx r2, WORD[%5 + 0xa0]
movd %4, r2d
punpckldq %3, %4
punpcklqdq %1, %3
%endmacro
;***********************************************************************
;void WelsHadamardT4Dc_sse2( int16_t *luma_dc, int16_t *pDct)
;***********************************************************************
WELS_EXTERN WelsHadamardT4Dc_sse2
%assign push_num 0
LOAD_2_PARA
PUSH_XMM 8
SSE2_Load4Col xmm1, xmm5, xmm6, xmm0, r1
SSE2_Load4Col xmm2, xmm5, xmm6, xmm0, r1 + 0x40
SSE2_Load4Col xmm3, xmm5, xmm6, xmm0, r1 + 0x100
SSE2_Load4Col xmm4, xmm5, xmm6, xmm0, r1 + 0x140
SSE2_SumSubD xmm1, xmm2, xmm7
SSE2_SumSubD xmm3, xmm4, xmm7
SSE2_SumSubD xmm2, xmm4, xmm7
SSE2_SumSubD xmm1, xmm3, xmm7
SSE2_Trans4x4D xmm4, xmm2, xmm1, xmm3, xmm5 ; pOut: xmm4,xmm3,xmm5,xmm1
SSE2_SumSubD xmm4, xmm3, xmm7
SSE2_SumSubD xmm5, xmm1, xmm7
WELS_DD1 xmm6
SSE2_SumSubDiv2D xmm3, xmm1, xmm6, xmm0 ; pOut: xmm3 = (xmm3+xmm1+1)/2, xmm0 = (xmm3-xmm1+1)/2
SSE2_SumSubDiv2D xmm4, xmm5, xmm6, xmm1 ; pOut: xmm4 = (xmm4+xmm5+1)/2, xmm1 = (xmm4-xmm5+1)/2
SSE2_Trans4x4D xmm3, xmm0, xmm1, xmm4, xmm2 ; pOut: xmm3,xmm4,xmm2,xmm1
packssdw xmm3, xmm4
packssdw xmm2, xmm1
movdqa [r0+ 0], xmm3
movdqa [r0+16], xmm2
POP_XMM
ret
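A scalar sketch of the luma-DC Hadamard computed by WelsHadamardT4Dc_sse2 above: a 4x4 Hadamard over the 16 DC terms (rows, then columns) with a rounded halving of the outputs, matching the (a+b+1)/2 noted for SSE2_SumSubDiv2D. The sketch assumes the DC terms are already gathered into a contiguous dc[16] in raster order; the asm gathers them from the strided DCT buffer via SSE2_Load4Col, and the exact output permutation of the asm is not reproduced here.

#include <stdint.h>

static void HadamardT4Dc_c(int16_t luma_dc[16], const int16_t dc[16]) {
    int32_t t[16];
    for (int i = 0; i < 4; i++) {                       /* row butterflies */
        int32_t s0 = dc[i*4+0] + dc[i*4+3], d0 = dc[i*4+0] - dc[i*4+3];
        int32_t s1 = dc[i*4+1] + dc[i*4+2], d1 = dc[i*4+1] - dc[i*4+2];
        t[i*4+0] = s0 + s1;  t[i*4+1] = s0 - s1;
        t[i*4+2] = d0 - d1;  t[i*4+3] = d0 + d1;
    }
    for (int i = 0; i < 4; i++) {                       /* column butterflies + (x+1)>>1 */
        int32_t s0 = t[i] + t[12+i], d0 = t[i] - t[12+i];
        int32_t s1 = t[4+i] + t[8+i], d1 = t[4+i] - t[8+i];
        luma_dc[i]    = (int16_t)((s0 + s1 + 1) >> 1);
        luma_dc[4+i]  = (int16_t)((s0 - s1 + 1) >> 1);
        luma_dc[8+i]  = (int16_t)((d0 - d1 + 1) >> 1);
        luma_dc[12+i] = (int16_t)((d0 + d1 + 1) >> 1);
    }
}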


@ -34,362 +34,362 @@
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m0, m3, m5, m2, m7, m1, m6, m4
%macro TRANSPOSE_8x8B_MMX 10
MMX_XSwap bw, %1, %2, %8
MMX_XSwap bw, %3, %4, %2
MMX_XSwap bw, %5, %6, %4
movq %6, %9
movq %10, %4
MMX_XSwap bw, %7, %6, %4
MMX_XSwap wd, %1, %3, %6
MMX_XSwap wd, %8, %2, %3
MMX_XSwap wd, %5, %7, %2
movq %7, %10
movq %10, %3
MMX_XSwap wd, %7, %4, %3
MMX_XSwap dq, %1, %5, %4
MMX_XSwap dq, %6, %2, %5
MMX_XSwap dq, %8, %7, %2
movq %7, %10
movq %10, %5
MMX_XSwap dq, %7, %3, %5
movq %3, %10
movq %3, %10
%endmacro
;in: m0, m3, m5, m2, m7, m1, m6, m4
%macro TRANSPOSE8x8_WRITE_MMX 2 ; dst, dst_stride
movq [%1], mm0 ; result of line 1, x8 bytes
movq [%1+%2], mm3 ; result of line 2
lea %1, [%1+2*%2]
movq [%1], mm5 ; result of line 3
movq [%1+%2], mm2 ; result of line 4
lea %1, [%1+2*%2]
movq [%1], mm7 ; result of line 5
movq [%1+%2], mm1 ; result of line 6
lea %1, [%1+2*%2]
movq [%1], mm6 ; result of line 7
movq [%1+%2], mm4 ; result of line 8
%endmacro
;in: m0, m3, m5, m2, m7, m1, m6, m4
%macro TRANSPOSE8x8_WRITE_ALT_MMX 3 ; dst, dst_stride, reg32
movq [%1], mm0 ; result of line 1, x8 bytes
movq [%1+%2], mm3 ; result of line 2
lea %3, [%1+2*%2]
movq [%3], mm5 ; result of line 3
movq [%3+%2], mm2 ; result of line 4
lea %3, [%3+2*%2]
movq [%3], mm7 ; result of line 5
movq [%3+%2], mm1 ; result of line 6
lea %3, [%3+2*%2]
movq [%3], mm6 ; result of line 7
movq [%3+%2], mm4 ; result of line 8
%endmacro ; end of TRANSPOSE8x8_WRITE_ALT_MMX
; for transpose 16x8
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m4, m2, m3, m7, m5, m1, m6, m0
%macro TRANSPOSE_8x16B_SSE2 10
SSE2_XSawp bw, %1, %2, %8
SSE2_XSawp bw, %3, %4, %2
SSE2_XSawp bw, %5, %6, %4
movdqa %6, %9
movdqa %10, %4
SSE2_XSawp bw, %7, %6, %4
SSE2_XSawp wd, %1, %3, %6
SSE2_XSawp wd, %8, %2, %3
SSE2_XSawp wd, %5, %7, %2
movdqa %7, %10
movdqa %10, %3
SSE2_XSawp wd, %7, %4, %3
SSE2_XSawp dq, %1, %5, %4
SSE2_XSawp dq, %6, %2, %5
SSE2_XSawp dq, %8, %7, %2
movdqa %7, %10
movdqa %10, %5
SSE2_XSawp dq, %7, %3, %5
SSE2_XSawp qdq, %1, %8, %3
SSE2_XSawp qdq, %4, %2, %8
SSE2_XSawp qdq, %6, %7, %2
movdqa %7, %10
movdqa %10, %1
SSE2_XSawp qdq, %7, %5, %1
movdqa %5, %10
%endmacro ; end of TRANSPOSE_8x16B_SSE2
%macro TRANSPOSE8x16_WRITE_SSE2 2 ; dst, dst_stride
movq [%1], xmm4 ; result of line 1, x8 bytes
movq [%1+%2], xmm2 ; result of line 2
lea %1, [%1+2*%2]
movq [%1], xmm3 ; result of line 3
movq [%1+%2], xmm7 ; result of line 4
lea %1, [%1+2*%2]
movq [%1], xmm5 ; result of line 5
movq [%1+%2], xmm1 ; result of line 6
lea %1, [%1+2*%2]
movq [%1], xmm6 ; result of line 7
movq [%1+%2], xmm0 ; result of line 8
lea %1, [%1+2*%2]
movhpd [%1], xmm4 ; result of line 9
movhpd [%1+%2], xmm2 ; result of line 10
lea %1, [%1+2*%2]
movhpd [%1], xmm3 ; result of line 11
movhpd [%1+%2], xmm7 ; result of line 12
lea %1, [%1+2*%2]
movhpd [%1], xmm5 ; result of line 13
movhpd [%1+%2], xmm1 ; result of line 14
lea %1, [%1+2*%2]
movhpd [%1], xmm6 ; result of line 15
movhpd [%1+%2], xmm0 ; result of line 16
%endmacro ; end of TRANSPOSE_WRITE_RESULT_SSE2
%macro TRANSPOSE8x16_WRITE_ALT_SSE2 3 ; dst, dst_stride, reg32
movq [%1], xmm4 ; result of line 1, x8 bytes
movq [%1+%2], xmm2 ; result of line 2
lea %3, [%1+2*%2]
movq [%3], xmm3 ; result of line 3
movq [%3+%2], xmm7 ; result of line 4
lea %3, [%3+2*%2]
movq [%3], xmm5 ; result of line 5
movq [%3+%2], xmm1 ; result of line 6
lea %3, [%3+2*%2]
movq [%3], xmm6 ; result of line 7
movq [%3+%2], xmm0 ; result of line 8
lea %3, [%3+2*%2]
movhpd [%3], xmm4 ; result of line 9
movhpd [%3+%2], xmm2 ; result of line 10
lea %3, [%3+2*%2]
movhpd [%3], xmm3 ; result of line 11
movhpd [%3+%2], xmm7 ; result of line 12
lea %3, [%3+2*%2]
movhpd [%3], xmm5 ; result of line 13
movhpd [%3+%2], xmm1 ; result of line 14
lea %3, [%3+2*%2]
movhpd [%3], xmm6 ; result of line 15
movhpd [%3+%2], xmm0 ; result of line 16
%endmacro ; end of TRANSPOSE8x16_WRITE_ALT_SSE2
SECTION .text
WELS_EXTERN TransposeMatrixBlock16x16_sse2
; void TransposeMatrixBlock16x16_sse2( void *dst/*16x16*/, const int32_t dst_stride, void *src/*16x16*/, const int32_t src_stride );
push r4
push r5
%assign push_num 2
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
mov r4, r7
and r4, 0Fh
sub r7, 10h
sub r7, r4
lea r5, [r3+r3*2]
; top 8x16 block
movdqa xmm0, [r2]
movdqa xmm1, [r2+r3]
movdqa xmm2, [r2+r3*2]
movdqa xmm3, [r2+r5]
lea r2, [r2+r3*4]
movdqa xmm4, [r2]
movdqa xmm5, [r2+r3]
movdqa xmm6, [r2+r3*2]
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m4, m2, m3, m7, m5, m1, m6, m0
TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m4, m2, m3, m7, m5, m1, m6, m0
TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]
TRANSPOSE8x16_WRITE_SSE2 r0, r1
TRANSPOSE8x16_WRITE_SSE2 r0, r1
; bottom 8x16 block
lea r2, [r2+r3*4]
movdqa xmm0, [r2]
movdqa xmm1, [r2+r3]
movdqa xmm2, [r2+r3*2]
movdqa xmm3, [r2+r5]
lea r2, [r2+r3*4]
movdqa xmm4, [r2]
movdqa xmm5, [r2+r3]
movdqa xmm6, [r2+r3*2]
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m4, m2, m3, m7, m5, m1, m6, m0
TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m4, m2, m3, m7, m5, m1, m6, m0
TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]
mov r5, r1
sal r5, 4
sub r0, r5
lea r0, [r0+r1*2+8]
TRANSPOSE8x16_WRITE_SSE2 r0, r1
mov r5, r1
sal r5, 4
sub r0, r5
lea r0, [r0+r1*2+8]
TRANSPOSE8x16_WRITE_SSE2 r0, r1
add r7, r4
add r7, 10h
POP_XMM
LOAD_4_PARA_POP
pop r5
pop r4
ret
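A scalar sketch of what the Transpose* routines in this file compute: a plain byte-matrix transpose, dst[x][y] = src[y][x]. The MMX/SSE2 versions reach the same result through staged punpck interleaves; the helper below is only an illustration, not the library's C fallback.

#include <stdint.h>

static void TransposeBlock_c(uint8_t *dst, int32_t dst_stride,
                             const uint8_t *src, int32_t src_stride,
                             int n /* 8 or 16 */) {
    for (int y = 0; y < n; y++)
        for (int x = 0; x < n; x++)
            dst[x * dst_stride + y] = src[y * src_stride + x];
}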
WELS_EXTERN TransposeMatrixBlocksx16_sse2
; void TransposeMatrixBlocksx16_sse2( void *dst/*W16x16*/, const int32_t dst_stride, void *src/*16xW16*/, const int32_t src_stride, const int32_t num_blocks );
push r5
push r6
%assign push_num 2
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
mov r5, r7
and r5, 0Fh
sub r7, 10h
sub r7, r5
TRANSPOSE_LOOP_SSE2:
; explicitly loading next loop data
lea r6, [r2+r3*8]
push r4
%rep 8
mov r4, [r6]
mov r4, [r6+r3]
lea r6, [r6+r3*2]
mov r4, [r6]
mov r4, [r6+r3]
lea r6, [r6+r3*2]
%endrep
pop r4
; top 8x16 block
movdqa xmm0, [r2]
movdqa xmm1, [r2+r3]
lea r2, [r2+r3*2]
movdqa xmm2, [r2]
movdqa xmm3, [r2+r3]
lea r2, [r2+r3*2]
movdqa xmm4, [r2]
movdqa xmm5, [r2+r3]
lea r2, [r2+r3*2]
movdqa xmm6, [r2]
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m4, m2, m3, m7, m5, m1, m6, m0
TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
TRANSPOSE8x16_WRITE_ALT_SSE2 r0, r1, r6
lea r2, [r2+r3*2]
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m4, m2, m3, m7, m5, m1, m6, m0
TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
TRANSPOSE8x16_WRITE_ALT_SSE2 r0, r1, r6
lea r2, [r2+r3*2]
; bottom 8x16 block
movdqa xmm0, [r2]
movdqa xmm1, [r2+r3]
lea r2, [r2+r3*2]
movdqa xmm2, [r2]
movdqa xmm3, [r2+r3]
lea r2, [r2+r3*2]
movdqa xmm4, [r2]
movdqa xmm5, [r2+r3]
lea r2, [r2+r3*2]
movdqa xmm6, [r2]
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m4, m2, m3, m7, m5, m1, m6, m0
TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
TRANSPOSE8x16_WRITE_ALT_SSE2 r0+8, r1, r6
lea r2, [r2+r3*2]
lea r0, [r0+16]
dec r4
jg near TRANSPOSE_LOOP_SSE2
add r7, r5
add r7, 10h
POP_XMM
LOAD_5_PARA_POP
pop r6
pop r5
ret
WELS_EXTERN TransposeMatrixBlock8x8_mmx
; void TransposeMatrixBlock8x8_mmx( void *dst/*8x8*/, const int32_t dst_stride, void *src/*8x8*/, const int32_t src_stride );
%assign push_num 0
LOAD_4_PARA
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
sub r7, 8
%assign push_num 0
LOAD_4_PARA
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
sub r7, 8
movq mm0, [r2]
movq mm1, [r2+r3]
lea r2, [r2+2*r3]
movq mm2, [r2]
movq mm3, [r2+r3]
lea r2, [r2+2*r3]
movq mm4, [r2]
movq mm5, [r2+r3]
lea r2, [r2+2*r3]
movq mm6, [r2]
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m0, m3, m5, m2, m7, m1, m6, m4
TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m0, m3, m5, m2, m7, m1, m6, m4
TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
TRANSPOSE8x8_WRITE_MMX r0, r1
TRANSPOSE8x8_WRITE_MMX r0, r1
emms
add r7, 8
LOAD_4_PARA_POP
ret
emms
add r7, 8
LOAD_4_PARA_POP
ret
WELS_EXTERN TransposeMatrixBlocksx8_mmx
; void TransposeMatrixBlocksx8_mmx( void *dst/*8xW8*/, const int32_t dst_stride, void *src/*W8x8*/, const int32_t src_stride, const int32_t num_blocks );
push r5
push r6
%assign push_num 2
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
sub r7, 8
lea r5, [r2+r3*8]
lea r5, [r2+r3*8]
TRANSPOSE_BLOCKS_X8_LOOP_MMX:
; explicitly loading next loop data
%rep 4
mov r6, [r5]
mov r6, [r5+r3]
lea r5, [r5+r3*2]
mov r6, [r5]
mov r6, [r5+r3]
lea r5, [r5+r3*2]
%endrep
movq mm0, [r2]
movq mm1, [r2+r3]
lea r2, [r2+2*r3]
movq mm2, [r2]
movq mm3, [r2+r3]
lea r2, [r2+2*r3]
movq mm4, [r2]
movq mm5, [r2+r3]
lea r2, [r2+2*r3]
movq mm6, [r2]
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m0, m3, m5, m2, m7, m1, m6, m4
TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m0, m3, m5, m2, m7, m1, m6, m4
TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
TRANSPOSE8x8_WRITE_ALT_MMX r0, r1, r6
lea r0, [r0+8]
lea r2, [r2+2*r3]
dec r4
jg near TRANSPOSE_BLOCKS_X8_LOOP_MMX
TRANSPOSE8x8_WRITE_ALT_MMX r0, r1, r6
lea r0, [r0+8]
lea r2, [r2+2*r3]
dec r4
jg near TRANSPOSE_BLOCKS_X8_LOOP_MMX
emms
add r7, 8
LOAD_5_PARA_POP
pop r6
pop r5
ret


@ -51,10 +51,10 @@ SECTION .text
;void WelsPrefetchZero_mmx(int8_t const*_A);
;***********************************************************************
WELS_EXTERN WelsPrefetchZero_mmx
%assign push_num 0
LOAD_1_PARA
prefetchnta [r0]
ret
;***********************************************************************
@ -62,71 +62,71 @@ WELS_EXTERN WelsPrefetchZero_mmx
;***********************************************************************
WELS_EXTERN WelsSetMemZeroAligned64_sse2
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
neg r1
pxor xmm0, xmm0
pxor xmm0, xmm0
.memzeroa64_sse2_loops:
movdqa [r0], xmm0
movdqa [r0+16], xmm0
movdqa [r0+32], xmm0
movdqa [r0+48], xmm0
add r0, 0x40
add r1, 0x40
jnz near .memzeroa64_sse2_loops
add r1, 0x40
jnz near .memzeroa64_sse2_loops
ret
ret
;***********************************************************************
; void WelsSetMemZeroSize64_mmx(void *dst, int32_t size)
;***********************************************************************
WELS_EXTERN WelsSetMemZeroSize64_mmx
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
neg r1
pxor mm0, mm0
pxor mm0, mm0
.memzero64_mmx_loops:
movq [r0], mm0
movq [r0+8], mm0
movq [r0+16], mm0
movq [r0+24], mm0
movq [r0+32], mm0
movq [r0+40], mm0
movq [r0+48], mm0
movq [r0+56], mm0
add r0, 0x40
add r1, 0x40
jnz near .memzero64_mmx_loops
add r1, 0x40
jnz near .memzero64_mmx_loops
WELSEMMS
ret
WELSEMMS
ret
;***********************************************************************
; void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
;***********************************************************************
WELS_EXTERN WelsSetMemZeroSize8_mmx
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
neg r1
pxor mm0, mm0
.memzero8_mmx_loops:
movq [r0], mm0
add r0, 0x08
movq [r0], mm0
add r0, 0x08
add r1, 0x08
jnz near .memzero8_mmx_loops
add r1, 0x08
jnz near .memzero8_mmx_loops
WELSEMMS
ret
WELSEMMS
ret


@ -49,241 +49,241 @@ SECTION .text
;************************************************
%macro SSE2_Quant8 5
MOVDQ %1, %5
pxor %2, %2
pcmpgtw %2, %1
pxor %1, %2
psubw %1, %2
paddusw %1, %3
pmulhuw %1, %4
pxor %1, %2
psubw %1, %2
MOVDQ %5, %1
%endmacro
%macro SSE2_QuantMax8 6
MOVDQ %1, %5
pxor %2, %2
pcmpgtw %2, %1
pxor %1, %2
psubw %1, %2
paddusw %1, %3
pmulhuw %1, %4
pmaxsw %6, %1
pxor %1, %2
psubw %1, %2
MOVDQ %5, %1
%endmacro
%define pDct esp + 4
%define ff esp + 8
%define mf esp + 12
%define max esp + 16
;***********************************************************************
; void WelsQuant4x4_sse2(int16_t *pDct, int16_t* ff, int16_t *mf);
; void WelsQuant4x4_sse2(int16_t *pDct, int16_t* ff, int16_t *mf);
;***********************************************************************
WELS_EXTERN WelsQuant4x4_sse2
%assign push_num 0
LOAD_3_PARA
movdqa xmm2, [r1]
movdqa xmm3, [r2]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
ret
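A scalar model of the SSE2_Quant8 step used by the WelsQuant* functions above (a sketch, not the shipped C fallback): work on the magnitude, add the rounding term ff, keep the high 16 bits of the product with mf (pmulhuw), then restore the sign. The 8-entry ff/mf tables are reused for both halves of the 4x4 block; the saturation of paddusw is ignored here for brevity.

#include <stdint.h>

static void Quant4x4_c(int16_t *dct, const int16_t *ff, const int16_t *mf) {
    for (int i = 0; i < 16; i++) {
        int neg = dct[i] < 0;
        uint32_t level = (uint32_t)(uint16_t)(neg ? -dct[i] : dct[i]);   /* |coeff| */
        level = ((level + (uint16_t)ff[i & 7]) * (uint16_t)mf[i & 7]) >> 16;
        dct[i] = (int16_t)(neg ? -(int32_t)level : (int32_t)level);
    }
}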
;***********************************************************************
;void WelsQuant4x4Dc_sse2(int16_t *pDct, const int16_t ff, int16_t mf);
;***********************************************************************
WELS_EXTERN WelsQuant4x4Dc_sse2
%assign push_num 0
LOAD_3_PARA
SIGN_EXTENSIONW r1, r1w
SIGN_EXTENSIONW r2, r2w
SSE2_Copy8Times xmm3, r2d
SSE2_Copy8Times xmm2, r1d
SSE2_Copy8Times xmm2, r1d
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
ret
ret
;***********************************************************************
; void WelsQuantFour4x4_sse2(int16_t *pDct, int16_t* ff, int16_t *mf);
; void WelsQuantFour4x4_sse2(int16_t *pDct, int16_t* ff, int16_t *mf);
;***********************************************************************
WELS_EXTERN WelsQuantFour4x4_sse2
%assign push_num 0
LOAD_3_PARA
MOVDQ xmm2, [r1]
MOVDQ xmm3, [r2]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x20]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x30]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x40]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x50]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x60]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x70]
ret
ret
;***********************************************************************
; void WelsQuantFour4x4Max_sse2(int16_t *pDct, int32_t* f, int16_t *mf, int16_t *max);
; void WelsQuantFour4x4Max_sse2(int16_t *pDct, int32_t* f, int16_t *mf, int16_t *max);
;***********************************************************************
WELS_EXTERN WelsQuantFour4x4Max_sse2
%assign push_num 0
LOAD_4_PARA
PUSH_XMM 8
MOVDQ xmm2, [r1]
MOVDQ xmm3, [r2]
pxor xmm4, xmm4
pxor xmm5, xmm5
pxor xmm6, xmm6
pxor xmm7, xmm7
SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 ], xmm4
SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10], xmm4
SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x20], xmm5
SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x30], xmm5
SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x40], xmm6
SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x50], xmm6
SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x60], xmm7
SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x70], xmm7
SSE2_TransTwo4x4W xmm4, xmm5, xmm6, xmm7, xmm0
pmaxsw xmm0, xmm4
pmaxsw xmm0, xmm5
pmaxsw xmm0, xmm7
movdqa xmm1, xmm0
punpckhqdq xmm0, xmm1
pmaxsw xmm0, xmm1
movq [r3], xmm0
POP_XMM
LOAD_4_PARA_POP
ret
movq [r3], xmm0
POP_XMM
LOAD_4_PARA_POP
ret
%macro MMX_Copy4Times 2
movd %1, %2
punpcklwd %1, %1
punpckldq %1, %1
movd %1, %2
punpcklwd %1, %1
punpckldq %1, %1
%endmacro
SECTION .text
%macro MMX_Quant4 4
pxor %2, %2
pcmpgtw %2, %1
pxor %1, %2
psubw %1, %2
paddusw %1, %3
pmulhuw %1, %4
pxor %1, %2
psubw %1, %2
%endmacro
;***********************************************************************
;int32_t WelsHadamardQuant2x2_mmx(int16_t *rs, const int16_t ff, int16_t mf, int16_t * pDct, int16_t * block);
;***********************************************************************
WELS_EXTERN WelsHadamardQuant2x2_mmx
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSIONW r1, r1w
SIGN_EXTENSIONW r2, r2w
movd mm0, [r0]
movd mm1, [r0 + 0x20]
punpcklwd mm0, mm1
movd mm3, [r0 + 0x40]
movd mm1, [r0 + 0x60]
punpcklwd mm3, mm1
;hdm_2x2, mm0 = dct0 dct1, mm3 = dct2 dct3
movq mm5, mm3
paddw mm3, mm0
psubw mm0, mm5
punpcklwd mm3, mm0
movq mm1, mm3
psrlq mm1, 32
movq mm5, mm1
paddw mm1, mm3
psubw mm3, mm5
punpcklwd mm1, mm3
;quant_2x2_dc
MMX_Copy4Times mm3, r2d
MMX_Copy4Times mm2, r1d
MMX_Quant4 mm1, mm0, mm2, mm3
;quant_2x2_dc
MMX_Copy4Times mm3, r2d
MMX_Copy4Times mm2, r1d
MMX_Quant4 mm1, mm0, mm2, mm3
; store dct_2x2
movq [r3], mm1
movq [r4], mm1
; store dct_2x2
movq [r3], mm1
movq [r4], mm1
; pNonZeroCount of dct_2x2
pcmpeqb mm2, mm2 ; mm2 = FF
pxor mm3, mm3
packsswb mm1, mm3
pcmpeqb mm1, mm3 ; set FF if equal, 0 if not equal
psubsb mm1, mm2 ; set 0 if equal, 1 if not equal
psadbw mm1, mm3 ;
mov r1w, 0
mov [r0], r1w
mov [r0 + 0x20], r1w
mov [r0 + 0x40], r1w
mov [r0 + 0x60], r1w
movd retrd, mm1
movd retrd, mm1
WELSEMMS
LOAD_5_PARA_POP
ret
WELSEMMS
LOAD_5_PARA_POP
ret
;***********************************************************************
;int32_t WelsHadamardQuant2x2Skip_mmx(int16_t *pDct, int16_t ff, int16_t mf);
;***********************************************************************
WELS_EXTERN WelsHadamardQuant2x2Skip_mmx
%assign push_num 0
LOAD_3_PARA
SIGN_EXTENSIONW r1, r1w
SIGN_EXTENSIONW r2, r2w
movd mm0, [r0]
movd mm1, [r0 + 0x20]
punpcklwd mm0, mm1
movd mm3, [r0 + 0x40]
movd mm1, [r0 + 0x60]
punpcklwd mm3, mm1
;hdm_2x2, mm0 = dct0 dct1, mm3 = dct2 dct3
movq mm5, mm3
paddw mm3, mm0
psubw mm0, mm5
punpcklwd mm3, mm0
movq mm1, mm3
psrlq mm1, 32
movq mm5, mm1
paddw mm1, mm3
psubw mm3, mm5
punpcklwd mm1, mm3
;quant_2x2_dc
MMX_Copy4Times mm3, r2d
MMX_Copy4Times mm2, r1d
MMX_Quant4 mm1, mm0, mm2, mm3
;quant_2x2_dc
MMX_Copy4Times mm3, r2d
MMX_Copy4Times mm2, r1d
MMX_Quant4 mm1, mm0, mm2, mm3
; pNonZeroCount of dct_2x2
pcmpeqb mm2, mm2 ; mm2 = FF
pxor mm3, mm3
packsswb mm1, mm3
pcmpeqb mm1, mm3 ; set FF if equal, 0 if not equal
psubsb mm1, mm2 ; set 0 if equal, 1 if not equal
psadbw mm1, mm3 ;
movd retrd, mm1
WELSEMMS
ret
WELSEMMS
ret
%macro SSE2_DeQuant8 3
@ -297,12 +297,12 @@ WELS_EXTERN WelsHadamardQuant2x2Skip_mmx
; void WelsDequant4x4_sse2(int16_t *pDct, const uint16_t* mf);
;***********************************************************************
WELS_EXTERN WelsDequant4x4_sse2
%assign push_num 0
LOAD_2_PARA
%assign push_num 0
LOAD_2_PARA
movdqa xmm1, [r1]
SSE2_DeQuant8 [r0 ], xmm0, xmm1
SSE2_DeQuant8 [r0 + 0x10], xmm0, xmm1
movdqa xmm1, [r1]
SSE2_DeQuant8 [r0 ], xmm0, xmm1
SSE2_DeQuant8 [r0 + 0x10], xmm0, xmm1
ret
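A scalar sketch of WelsDequant4x4_sse2: every coefficient is simply multiplied by its dequantisation factor (the pmullw inside SSE2_DeQuant8), with the 8-entry mf table reused for both halves of the block. The function name below is illustrative only.

#include <stdint.h>

static void Dequant4x4_c(int16_t *dct, const uint16_t *mf) {
    for (int i = 0; i < 16; i++)
        dct[i] = (int16_t)(dct[i] * mf[i & 7]);
}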
@ -311,18 +311,18 @@ WELS_EXTERN WelsDequant4x4_sse2
;***********************************************************************====
WELS_EXTERN WelsDequantFour4x4_sse2
%assign push_num 0
LOAD_2_PARA
%assign push_num 0
LOAD_2_PARA
movdqa xmm1, [r1]
SSE2_DeQuant8 [r0 ], xmm0, xmm1
SSE2_DeQuant8 [r0+0x10 ], xmm0, xmm1
SSE2_DeQuant8 [r0+0x20 ], xmm0, xmm1
SSE2_DeQuant8 [r0+0x30 ], xmm0, xmm1
SSE2_DeQuant8 [r0+0x40 ], xmm0, xmm1
SSE2_DeQuant8 [r0+0x50 ], xmm0, xmm1
SSE2_DeQuant8 [r0+0x60 ], xmm0, xmm1
SSE2_DeQuant8 [r0+0x70 ], xmm0, xmm1
ret
@ -330,41 +330,41 @@ WELS_EXTERN WelsDequantFour4x4_sse2
;void WelsDequantIHadamard4x4_sse2(int16_t *rs, const uint16_t mf);
;***********************************************************************
WELS_EXTERN WelsDequantIHadamard4x4_sse2
%assign push_num 0
LOAD_2_PARA
%ifndef X86_32
movzx r1, r1w
%endif
%assign push_num 0
LOAD_2_PARA
%ifndef X86_32
movzx r1, r1w
%endif
; WelsDequantLumaDc4x4
SSE2_Copy8Times xmm1, r1d
;psrlw xmm1, 2 ; for the (>>2) in ihdm
MOVDQ xmm0, [r0]
MOVDQ xmm2, [r0+0x10]
pmullw xmm0, xmm1
pmullw xmm2, xmm1
; ihdm_4x4
movdqa xmm1, xmm0
psrldq xmm1, 8
movdqa xmm3, xmm2
psrldq xmm3, 8
SSE2_SumSub xmm0, xmm3, xmm5 ; xmm0 = xmm0 - xmm3, xmm3 = xmm0 + xmm3
SSE2_SumSub xmm1, xmm2, xmm5 ; xmm1 = xmm1 - xmm2, xmm2 = xmm1 + xmm2
SSE2_SumSub xmm3, xmm2, xmm5 ; xmm3 = xmm3 - xmm2, xmm2 = xmm3 + xmm2
SSE2_SumSub xmm0, xmm1, xmm5 ; xmm0 = xmm0 - xmm1, xmm1 = xmm0 + xmm1
SSE2_SumSub xmm0, xmm3, xmm5 ; xmm0 = xmm0 - xmm3, xmm3 = xmm0 + xmm3
SSE2_SumSub xmm1, xmm2, xmm5 ; xmm1 = xmm1 - xmm2, xmm2 = xmm1 + xmm2
SSE2_SumSub xmm3, xmm2, xmm5 ; xmm3 = xmm3 - xmm2, xmm2 = xmm3 + xmm2
SSE2_SumSub xmm0, xmm1, xmm5 ; xmm0 = xmm0 - xmm1, xmm1 = xmm0 + xmm1
SSE2_TransTwo4x4W xmm2, xmm1, xmm3, xmm0, xmm4
SSE2_SumSub xmm2, xmm4, xmm5
SSE2_SumSub xmm1, xmm0, xmm5
SSE2_SumSub xmm4, xmm0, xmm5
SSE2_SumSub xmm2, xmm1, xmm5
SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
punpcklqdq xmm0, xmm1
MOVDQ [r0], xmm0
punpcklqdq xmm0, xmm1
MOVDQ [r0], xmm0
punpcklqdq xmm2, xmm3
MOVDQ [r0+16], xmm2
ret
punpcklqdq xmm2, xmm3
MOVDQ [r0+16], xmm2
ret


@ -35,189 +35,189 @@ SECTION .text
;**********************************************************************************************************************************
;
; uint32_t SampleSad16x16Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16 base_cost[8], int32_t *index_min_cost )
; uint32_t SampleSad16x16Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16 base_cost[8], int32_t *index_min_cost )
;
; \note:
; src needs to be 16-byte aligned; ref alignment is optional
; \return value:
; returns the minimal SAD cost; the corresponding index is carried by index_min_cost
;**********************************************************************************************************************************
; try 8 mv via offset
; xmm7 store sad costs
%macro SAD_16x16_LINE_SSE41 4 ; src, ref, stride_src, stride_ref
movdqa xmm0, [%1]
movdqu xmm1, [%2]
movdqu xmm2, [%2+8h]
movdqa xmm3, xmm1
movdqa xmm4, xmm2
mpsadbw xmm1, xmm0, 0 ; 000 B
paddw xmm7, xmm1 ; accumulate cost
mpsadbw xmm1, xmm0, 0 ; 000 B
paddw xmm7, xmm1 ; accumulate cost
mpsadbw xmm3, xmm0, 5 ; 101 B
paddw xmm7, xmm3 ; accumulate cost
mpsadbw xmm3, xmm0, 5 ; 101 B
paddw xmm7, xmm3 ; accumulate cost
mpsadbw xmm2, xmm0, 2 ; 010 B
paddw xmm7, xmm2 ; accumulate cost
mpsadbw xmm2, xmm0, 2 ; 010 B
paddw xmm7, xmm2 ; accumulate cost
mpsadbw xmm4, xmm0, 7 ; 111 B
paddw xmm7, xmm4 ; accumulate cost
mpsadbw xmm4, xmm0, 7 ; 111 B
paddw xmm7, xmm4 ; accumulate cost
add %1, %3
add %2, %4
%endmacro ; end of SAD_16x16_LINE_SSE41
%macro SAD_16x16_LINE_SSE41E 4 ; src, ref, stride_src, stride_ref
movdqa xmm0, [%1]
movdqu xmm1, [%2]
movdqu xmm2, [%2+8h]
movdqa xmm3, xmm1
movdqa xmm4, xmm2
mpsadbw xmm1, xmm0, 0 ; 000 B
paddw xmm7, xmm1 ; accumulate cost
mpsadbw xmm1, xmm0, 0 ; 000 B
paddw xmm7, xmm1 ; accumulate cost
mpsadbw xmm3, xmm0, 5 ; 101 B
paddw xmm7, xmm3 ; accumulate cost
mpsadbw xmm3, xmm0, 5 ; 101 B
paddw xmm7, xmm3 ; accumulate cost
mpsadbw xmm2, xmm0, 2 ; 010 B
paddw xmm7, xmm2 ; accumulate cost
mpsadbw xmm2, xmm0, 2 ; 010 B
paddw xmm7, xmm2 ; accumulate cost
mpsadbw xmm4, xmm0, 7 ; 111 B
paddw xmm7, xmm4 ; accumulate cost
%endmacro ; end of SAD_16x16_LINE_SSE41E
mpsadbw xmm4, xmm0, 7 ; 111 B
paddw xmm7, xmm4 ; accumulate cost
%endmacro ; end of SAD_16x16_LINE_SSE41E
WELS_EXTERN SampleSad16x16Hor8_sse41
;push ebx
;push esi
;mov eax, [esp+12] ; src
;mov ecx, [esp+16] ; stride_src
;mov ebx, [esp+20] ; ref
;mov edx, [esp+24] ; stride_ref
;mov esi, [esp+28] ; base_cost
%assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
pxor xmm7, xmm7
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
pxor xmm7, xmm7
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41E r0, r2, r1, r3
pxor xmm0, xmm0
movdqa xmm6, xmm7
punpcklwd xmm6, xmm0
punpckhwd xmm7, xmm0
movdqa xmm5, [r4]
movdqa xmm4, xmm5
punpcklwd xmm4, xmm0
punpckhwd xmm5, xmm0
paddd xmm4, xmm6
paddd xmm5, xmm7
movdqa xmm3, xmm4
pminud xmm3, xmm5
pshufd xmm2, xmm3, 01001110B
pminud xmm2, xmm3
pshufd xmm3, xmm2, 10110001B
pminud xmm2, xmm3
movd retrd, xmm2
pcmpeqd xmm4, xmm2
movmskps r2d, xmm4
bsf r1d, r2d
jnz near WRITE_INDEX
pcmpeqd xmm5, xmm2
movmskps r2d, xmm5
bsf r1d, r2d
add r1d, 4
WRITE_INDEX:
mov [r5], r1d
mov [r5], r1d
POP_XMM
LOAD_6_PARA_POP
ret
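A scalar model of what SampleSad16x16Hor8_sse41 evaluates (a sketch; the mpsadbw offsets 0/5/2/7 above cover the same eight candidates across the two 16-byte loads): the 16x16 SAD at eight consecutive horizontal offsets of ref, plus the per-offset base cost, returning the minimum and reporting its index.

#include <stdint.h>
#include <stdlib.h>

static uint32_t SampleSad16x16Hor8_c(const uint8_t *src, int32_t stride_src,
                                     const uint8_t *ref, int32_t stride_ref,
                                     const uint16_t base_cost[8],
                                     int32_t *index_min_cost) {
    uint32_t cost[8] = {0};
    for (int off = 0; off < 8; off++) {                 /* 8 horizontal candidates */
        for (int y = 0; y < 16; y++)
            for (int x = 0; x < 16; x++)
                cost[off] += (uint32_t)abs(src[y * stride_src + x] -
                                           ref[y * stride_ref + x + off]);
        cost[off] += base_cost[off];
    }
    uint32_t best = cost[0];
    *index_min_cost = 0;
    for (int i = 1; i < 8; i++)
        if (cost[i] < best) { best = cost[i]; *index_min_cost = i; }
    return best;
}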
;**********************************************************************************************************************************
;
; uint32_t SampleSad8x8Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
; uint32_t SampleSad8x8Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
;
; \note:
; src and ref need not be 16-byte aligned, due to inter 8x8
; \return value:
; returns the minimal SAD cost; the corresponding index is carried by index_min_cost
;
;**********************************************************************************************************************************
; try 8 mv via offset
; xmm7 store sad costs
%macro SAD_8x8_LINE_SSE41 4 ; src, ref, stride_src, stride_ref
movdqu xmm0, [%1]
movdqu xmm1, [%2]
movdqa xmm2, xmm1
mpsadbw xmm1, xmm0, 0 ; 000 B
paddw xmm7, xmm1 ; accumulate cost
mpsadbw xmm1, xmm0, 0 ; 000 B
paddw xmm7, xmm1 ; accumulate cost
mpsadbw xmm2, xmm0, 5 ; 101 B
paddw xmm7, xmm2 ; accumulate cost
mpsadbw xmm2, xmm0, 5 ; 101 B
paddw xmm7, xmm2 ; accumulate cost
add %1, %3
add %2, %4
%endmacro ; end of SAD_8x8_LINE_SSE41
%macro SAD_8x8_LINE_SSE41E 4 ; src, ref, stride_src, stride_ref
movdqu xmm0, [%1]
movdqu xmm1, [%2]
movdqa xmm2, xmm1
mpsadbw xmm1, xmm0, 0 ; 000 B
paddw xmm7, xmm1 ; accumulate cost
mpsadbw xmm1, xmm0, 0 ; 000 B
paddw xmm7, xmm1 ; accumulate cost
mpsadbw xmm2, xmm0, 5 ; 101 B
paddw xmm7, xmm2 ; accumulate cost
%endmacro ; end of SAD_8x8_LINE_SSE41E
mpsadbw xmm2, xmm0, 5 ; 101 B
paddw xmm7, xmm2 ; accumulate cost
%endmacro ; end of SAD_8x8_LINE_SSE41E
WELS_EXTERN SampleSad8x8Hor8_sse41
%assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
movdqa xmm7, [r4] ; load base cost list
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
movdqa xmm7, [r4] ; load base cost list
SAD_8x8_LINE_SSE41 r0, r2, r1, r3
SAD_8x8_LINE_SSE41 r0, r2, r1, r3
SAD_8x8_LINE_SSE41 r0, r2, r1, r3
SAD_8x8_LINE_SSE41 r0, r2, r1, r3
SAD_8x8_LINE_SSE41 r0, r2, r1, r3
SAD_8x8_LINE_SSE41 r0, r2, r1, r3
SAD_8x8_LINE_SSE41 r0, r2, r1, r3
SAD_8x8_LINE_SSE41E r0, r2, r1, r3
phminposuw xmm0, xmm7 ; horizontal search for the minimal sad cost and its index
movd retrd, xmm0 ; for return: DEST[15:0] <- MIN, DEST[31:16] <- INDEX
mov r1d, retrd
and retrd, 0xFFFF
sar r1d, 16
mov [r5], r1d
POP_XMM
LOAD_6_PARA_POP


@ -104,32 +104,32 @@ db 6,7,6,7,7,8
align 16
high_mask_table:
db 0, 0, 0, 3, 0, 2, 3, 6, 0, 2
db 2, 5, 3, 5, 6, 9, 0, 1, 2, 5
db 2, 4, 5, 8, 3, 5, 5, 8, 6, 8
db 9,12, 0, 1, 1, 4, 2, 4, 5, 8
db 2, 4, 4, 7, 5, 7, 8,11, 3, 4
db 5, 8, 5, 7, 8,11, 6, 8, 8,11
db 9,11,12,15, 0, 1, 1, 4, 1, 3
db 4, 7, 2, 4, 4, 7, 5, 7, 8,11
db 2, 3, 4, 7, 4, 6, 7,10, 5, 7
db 7,10, 8,10,11,14, 3, 4, 4, 7
db 5, 7, 8,11, 5, 7, 7,10, 8,10
db 11,14, 6, 7, 8,11, 8,10,11,14
db 9,11,11,14,12,14,15,18, 0, 0
db 1, 4, 1, 3, 4, 7, 1, 3, 3, 6
db 4, 6, 7,10, 2, 3, 4, 7, 4, 6
db 7,10, 5, 7, 7,10, 8,10,11,14
db 2, 3, 3, 6, 4, 6, 7,10, 4, 6
db 6, 9, 7, 9,10,13, 5, 6, 7,10
db 7, 9,10,13, 8,10,10,13,11,13
db 14,17, 3, 4, 4, 7, 4, 6, 7,10
db 5, 7, 7,10, 8,10,11,14, 5, 6
db 7,10, 7, 9,10,13, 8,10,10,13
db 11,13,14,17, 6, 7, 7,10, 8,10
db 11,14, 8,10,10,13,11,13,14,17
db 9,10,11,14,11,13,14,17,12,14
db 14,17,15,17,18,21
align 16
low_mask_table:
@ -167,173 +167,173 @@ SECTION .text
;void WelsScan4x4DcAc_sse2( int16_t level[16], int16_t *pDct )
;***********************************************************************
WELS_EXTERN WelsScan4x4DcAc_sse2
%ifdef X86_32
push r3
%assign push_num 1
%else
%assign push_num 0
%endif
LOAD_2_PARA
movdqa xmm0, [r1] ; 7 6 5 4 3 2 1 0
movdqa xmm1, [r1+16] ; f e d c b a 9 8
pextrw r2d, xmm0, 7 ; ecx = 7
pextrw r3d, xmm1, 2 ; edx = a
pextrw r1d, xmm0, 5 ; eax = 5
pinsrw xmm1, r2d, 2 ; f e d c b 7 9 8
pinsrw xmm0, r1d, 7 ; 5 6 5 4 3 2 1 0
pextrw r2d, xmm1, 0 ; ecx = 8
pinsrw xmm0, r2d, 5 ; 5 6 8 4 3 2 1 0
pinsrw xmm1, r3d, 0 ; f e d c b 7 9 a
pshufd xmm2, xmm0, 0xd8 ; 5 6 3 2 8 4 1 0
pshufd xmm3, xmm1, 0xd8 ; f e b 7 d c 9 a
pshufhw xmm0, xmm2, 0x93 ; 6 3 2 5 8 4 1 0
pshuflw xmm1, xmm3, 0x39 ; f e b 7 a d c 9
movdqa [r0],xmm0
movdqa [r0+16], xmm1
%ifdef X86_32
pop r3
%endif
ret
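A scalar sketch of the reordering the shuffle sequence above performs; the index comments in the asm ("0 1 4 8 5 2 3 6" and "9 c d a 7 b e f", read low to high) spell out the standard H.264 4x4 zig-zag scan used here.

#include <stdint.h>

static const uint8_t kZigzagScan4x4[16] = {
    0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};

static void Scan4x4DcAc_c(int16_t level[16], const int16_t *dct) {
    for (int i = 0; i < 16; i++)
        level[i] = dct[kZigzagScan4x4[i]];
}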
;***********************************************************************
;void WelsScan4x4DcAc_ssse3( int16_t level[16], int16_t *pDct )
;***********************************************************************
WELS_EXTERN WelsScan4x4DcAc_ssse3
%assign push_num 0
LOAD_2_PARA
movdqa xmm0, [r1]
movdqa xmm1, [r1+16]
pextrw r2d, xmm0, 7 ; ecx = [7]
pextrw r1d, xmm1, 0 ; eax = [8]
pinsrw xmm0, r1d, 7 ; xmm0[7] = [8]
pinsrw xmm1, r2d, 0 ; xmm1[0] = [7]
pshufb xmm1, [pb_scanacdc_maskb]
pshufb xmm0, [pb_scanacdc_maska]
movdqa [r0],xmm0
movdqa [r0+16], xmm1
ret
movdqa [r0],xmm0
movdqa [r0+16], xmm1
ret
;***********************************************************************
;void WelsScan4x4Ac_sse2( int16_t* zig_value, int16_t* pDct )
;***********************************************************************
WELS_EXTERN WelsScan4x4Ac_sse2
%assign push_num 0
LOAD_2_PARA
movdqa xmm0, [r1]
movdqa xmm1, [r1+16]
movdqa xmm2, xmm0
punpcklqdq xmm0, xmm1
punpckhqdq xmm2, xmm1
movdqa xmm3, xmm0
punpckldq xmm0, xmm2
punpckhdq xmm3, xmm2
pextrw r1d , xmm0, 3
pextrw r2d , xmm0, 7
pinsrw xmm0, r1d, 7
pextrw r1d, xmm3, 4
pinsrw xmm3, r2d, 4
pextrw r2d, xmm3, 0
pinsrw xmm3, r1d, 0
pinsrw xmm0, r2d, 3
pshufhw xmm1, xmm0, 0x93
pshuflw xmm2, xmm3, 0x39
pshufhw xmm1, xmm0, 0x93
pshuflw xmm2, xmm3, 0x39
movdqa xmm3, xmm2
psrldq xmm1, 2
pslldq xmm3, 14
por xmm1, xmm3
psrldq xmm2, 2
movdqa [r0],xmm1
movdqa [r0+16], xmm2
ret
movdqa [r0],xmm1
movdqa [r0+16], xmm2
ret
;***********************************************************************
;void int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct );
;***********************************************************************
WELS_EXTERN WelsCalculateSingleCtr4x4_sse2
%ifdef X86_32
push r3
%assign push_num 1
%else
%assign push_num 0
%endif
LOAD_1_PARA
movdqa xmm0, [r0]
movdqa xmm1, [r0+16]
packsswb xmm0, xmm1
; below is the register map: r0 - eax, r1 - ebx, r2 - ecx, r3 - edx
xor r3, r3
packsswb xmm0, xmm1
; below is the register map: r0 - eax, r1 - ebx, r2 - ecx, r3 - edx
xor r3, r3
pxor xmm3, xmm3
pcmpeqb xmm0, xmm3
pmovmskb r3d, xmm0
xor r3, 0xffff
xor r0, r0
mov r2, 7
mov r1, 8
xor r0, r0
mov r2, 7
mov r1, 8
.loop_low8_find1:
bt r3, r2
jc .loop_high8_find1
dec r2
jnz .loop_low8_find1
bt r3, r2
jc .loop_high8_find1
dec r2
jnz .loop_low8_find1
.loop_high8_find1:
bt r3, r1
jc .find1end
inc r1
cmp r1,16
jb .loop_high8_find1
bt r3, r1
jc .find1end
inc r1
cmp r1,16
jb .loop_high8_find1
.find1end:
sub r1, r2
sub r1, 1
lea r2, [i_ds_table]
add r0b, [r2+r1]
mov r1, r3
and r3, 0xff
shr r1, 8
and r1, 0xff
lea r2 , [low_mask_table]
add r0b, [r2 +r3]
lea r2, [high_mask_table]
add r0b, [r2+r1]
%ifdef X86_32
pop r3
%else
mov retrd, r0d
%endif
ret
;***********************************************************************
; int32_t WelsGetNoneZeroCount_sse2(int16_t* level);
;***********************************************************************
WELS_EXTERN WelsGetNoneZeroCount_sse2
%assign push_num 0
LOAD_1_PARA
movdqa xmm0, [r0]
movdqa xmm1, [r0+16]
pxor xmm2, xmm2
pcmpeqw xmm0, xmm2
pcmpeqw xmm1, xmm2
packsswb xmm1, xmm0
xor r1, r1
pmovmskb r1d, xmm1
xor r1d, 0xffff
mov r2, r1
and r1, 0xff
shr r2, 8
; and ecx, 0xff ; we do not need this due to high 16bits equal to 0 yet
; xor retr, retr
;add al, [nozero_count_table+r2]
lea r0 , [nozero_count_table]
movzx r2, byte [r0+r2]
movzx r1, byte [r0+r1]
mov retrq, r2
add retrq, r1
;add al, [nozero_count_table+r1]
ret
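A scalar sketch of WelsGetNoneZeroCount_sse2: it counts the non-zero coefficients among the 16 levels. The asm builds a 16-bit zero/non-zero mask with pcmpeqw/pmovmskb and adds two 8-bit popcount lookups from nozero_count_table; the plain loop below gives the same result.

#include <stdint.h>

static int32_t GetNoneZeroCount_c(const int16_t *level) {
    int32_t n = 0;
    for (int i = 0; i < 16; i++)
        n += (level[i] != 0);
    return n;
}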


@ -36,17 +36,17 @@
#ifdef __APPLE__
.macro SQR_ADD_16BYTES
vmull.u8 q3, $0, $0
vmull.u8 q8, $1, $1
vpadal.u16 $2, q3
vpadal.u16 $2, q8
.endm
#else
.macro SQR_ADD_16BYTES arg0, arg1, arg2
vmull.u8 q3, \arg0, \arg0
vmull.u8 q8, \arg1, \arg1
vpadal.u16 \arg2, q3
vpadal.u16 \arg2, q8
.endm
#endif
@ -54,66 +54,66 @@
WELS_ASM_FUNC_BEGIN SampleVariance16x16_neon
stmdb sp!, {r4}
vld1.8 {q15}, [r0], r1 //save the ref data (16bytes)
vld1.8 {q14}, [r2], r3 //save the src data (16bytes)
vld1.8 {q15}, [r0], r1 //save the ref data (16bytes)
vld1.8 {q14}, [r2], r3 //save the src data (16bytes)
vabd.u8 q13, q14, q15
vmull.u8 q12, d27, d27
vmull.u8 q11, d26, d26
vaddl.u16 q12, d24, d25
vpadal.u16 q12, q11 //sqr
vaddl.u8 q13, d26, d27 //sum
vaddl.u8 q10, d28, d29 //sum_cur
vaddl.u8 q10, d28, d29 //sum_cur
vmull.u8 q9, d29, d29
vmull.u8 q8, d28, d28
vaddl.u16 q9, d18, d19 //sqr_cur
vpadal.u16 q9, q8
mov r4, #15
mov r4, #15
pixel_var_16x16_loop0:
vld1.8 {q0}, [r0], r1 //save the ref data (16bytes)
vld1.8 {q1}, [r2], r3 //save the src data (16bytes)
vld1.8 {q0}, [r0], r1 //save the ref data (16bytes)
vld1.8 {q1}, [r2], r3 //save the src data (16bytes)
vabd.u8 q2, q0, q1
vabd.u8 q2, q0, q1
//q10 save sum_cur
vpadal.u8 q10, q1
//q10 save sum_cur
vpadal.u8 q10, q1
//q12 save sqr
SQR_ADD_16BYTES d4, d5, q12
//q12 save sqr
SQR_ADD_16BYTES d4, d5, q12
//q13 save sum
vpadal.u8 q13, q2
vpadal.u8 q13, q2
subs r4, #1
subs r4, #1
//q9 save sqr_cur
SQR_ADD_16BYTES d2, d3, q9
//q9 save sqr_cur
SQR_ADD_16BYTES d2, d3, q9
bne pixel_var_16x16_loop0
bne pixel_var_16x16_loop0
vadd.u16 d0, d26, d27 //sum
vadd.u16 d1, d20, d21 //sum_cur
vpaddl.u16 q0, q0
vadd.u32 d2, d24, d25 //sqr
vadd.u32 d3, d18, d19 //sqr_cur
vpadd.u32 d0, d0, d1
vpadd.u32 d1, d2, d3
ldr r4, [sp, #4]
ldr r4, [sp, #4]
vshr.u32 q0, q0, #8
vmul.u32 d0, d0
vsub.u32 d0, d1, d0
vshr.u32 q0, q0, #8
vmul.u32 d0, d0
vsub.u32 d0, d1, d0
vmovl.u32 q0, d0
vst2.16 {d0[0], d1[0]}, [r4]
vst2.16 {d0[0], d1[0]}, [r4]
ldmia sp!, {r4}
ldmia sp!, {r4}
WELS_ASM_FUNC_END
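A scalar sketch of SampleVariance16x16_neon: over the 16x16 block it accumulates the sum and sum of squares of |ref - src| ("motion") and of src itself ("texture"), then forms mean-removed energies, each divided by 256 as in the vshr.u32 #8 above. The two 16-bit results are written through the fifth argument; the exact output structure is an assumption of the sketch.

#include <stdint.h>
#include <stdlib.h>

static void SampleVariance16x16_c(const uint8_t *ref, int32_t ref_stride,
                                  const uint8_t *src, int32_t src_stride,
                                  uint16_t out[2] /* {motion, texture}, assumed layout */) {
    uint32_t sum = 0, sqr = 0, sum_cur = 0, sqr_cur = 0;
    for (int y = 0; y < 16; y++) {
        for (int x = 0; x < 16; x++) {
            uint32_t d = (uint32_t)abs(ref[x] - src[x]);
            sum += d;            sqr += d * d;
            sum_cur += src[x];   sqr_cur += (uint32_t)src[x] * src[x];
        }
        ref += ref_stride;
        src += src_stride;
    }
    uint32_t mean = sum >> 8, mean_cur = sum_cur >> 8;            /* /256 */
    out[0] = (uint16_t)((sqr >> 8) - mean * mean);                /* motion index */
    out[1] = (uint16_t)((sqr_cur >> 8) - mean_cur * mean_cur);    /* texture index */
}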


@ -30,313 +30,313 @@
*
*/
#ifdef HAVE_NEON
#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"
WELS_ASM_FUNC_BEGIN DyadicBilinearDownsampler_neon
stmdb sp!, {r4-r8, lr}
WELS_ASM_FUNC_BEGIN DyadicBilinearDownsampler_neon
stmdb sp!, {r4-r8, lr}
//Get the width and height
ldr r4, [sp, #24] //src_width
ldr r5, [sp, #28] //src_height
//Get the width and height
ldr r4, [sp, #24] //src_width
ldr r5, [sp, #28] //src_height
//Initialize the register
mov r6, r2
mov r8, r0
mov lr, #0
lsr r5, #1
//Save the tail for the unaligned remainder of the width
mla r7, r1, r5, r0
vld1.32 {q15}, [r7]
add r7, r2, r3
//processing a column of data
comp_ds_bilinear_loop0:
vld1.8 {q0,q1}, [r2]!
vld1.8 {q2,q3}, [r7]!
vpaddl.u8 q0, q0
vpaddl.u8 q1, q1
vpaddl.u8 q2, q2
vpaddl.u8 q3, q3
vrshr.u16 q0, #1
vrshr.u16 q1, #1
vrshr.u16 q2, #1
vrshr.u16 q3, #1
vrhadd.u16 q0, q2
vrhadd.u16 q1, q3
vmovn.u16 d0, q0
vmovn.u16 d1, q1
vst1.32 {q0}, [r0]!
add lr, #32
cmp lr, r4
movcs lr, #0
addcs r6, r6, r3, lsl #1
movcs r2, r6
addcs r7, r2, r3
addcs r8, r1
movcs r0, r8
subscs r5, #1
bne comp_ds_bilinear_loop0
//restore the tail for the unaligned remainder of the width
vst1.32 {q15}, [r0]
ldmia sp!, {r4-r8,lr}
ldmia sp!, {r4-r8,lr}
WELS_ASM_FUNC_END
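A scalar sketch of the dyadic (2:1) bilinear downsampling done by the NEON routines in this file: every output pixel is the average of a 2x2 source block, computed exactly as the vpaddl/vrshr #1/vrhadd sequence does (two rounded halvings rather than a single /4). The function name is illustrative only.

#include <stdint.h>

static void DyadicBilinearDownsample_c(uint8_t *dst, int32_t dst_stride,
                                       const uint8_t *src, int32_t src_stride,
                                       int32_t src_width, int32_t src_height) {
    for (int y = 0; y < src_height / 2; y++) {
        const uint8_t *row0 = src + 2 * y * src_stride;
        const uint8_t *row1 = row0 + src_stride;
        uint8_t *out = dst + y * dst_stride;
        for (int x = 0; x < src_width / 2; x++) {
            uint32_t top = (row0[2 * x] + row0[2 * x + 1] + 1) >> 1;  /* vpaddl + vrshr #1 */
            uint32_t bot = (row1[2 * x] + row1[2 * x + 1] + 1) >> 1;
            out[x] = (uint8_t)((top + bot + 1) >> 1);                 /* vrhadd */
        }
    }
}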
WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x8_neon
stmdb sp!, {r4-r7, lr}

//Get the width and height
ldr r4, [sp, #20] //src_width
ldr r5, [sp, #24] //src_height

//Get the difference
sub lr, r3, r4
sub r1, r1, r4, lsr #1

lsr r5, #1

//processing a column of data
comp_ds_bilinear_w_x8_loop0:

lsr r6, r4, #3
add r7, r2, r3
//processing a line of data
comp_ds_bilinear_w_x8_loop1:

vld1.8 {d0}, [r2]!
vld1.8 {d1}, [r7]!
vpaddl.u8 q0, q0
vrshr.u16 q0, #1
vrhadd.u16 d0, d1

vmovn.u16 d0, q0
vst1.32 {d0[0]}, [r0]!
subs r6, #1
bne comp_ds_bilinear_w_x8_loop1

add r2, r7, lr
add r0, r1
subs r5, #1
bne comp_ds_bilinear_w_x8_loop0

ldmia sp!, {r4-r7,lr}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x16_neon
stmdb sp!, {r4-r7, lr}

//Get the width and height
ldr r4, [sp, #20] //src_width
ldr r5, [sp, #24] //src_height

//Get the difference
sub lr, r3, r4
sub r1, r1, r4, lsr #1

lsr r5, #1

//processing a column of data
comp_ds_bilinear_w_x16_loop0:

lsr r6, r4, #4
add r7, r2, r3
//processing a line of data
comp_ds_bilinear_w_x16_loop1:

vld1.8 {q0}, [r2]!
vld1.8 {q1}, [r7]!
vpaddl.u8 q0, q0
vpaddl.u8 q1, q1
vrshr.u16 q0, #1
vrshr.u16 q1, #1
vrhadd.u16 q0, q1

vmovn.u16 d0, q0
vst1.32 {d0}, [r0]!
subs r6, #1
bne comp_ds_bilinear_w_x16_loop1

add r2, r7, lr
add r0, r1
subs r5, #1
bne comp_ds_bilinear_w_x16_loop0

ldmia sp!, {r4-r7,lr}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN DyadicBilinearDownsamplerWidthx32_neon
stmdb sp!, {r4-r7, lr}

//Get the width and height
ldr r4, [sp, #20] //src_width
ldr r5, [sp, #24] //src_height

//Get the difference
sub lr, r3, r4
sub r1, r1, r4, lsr #1

lsr r5, #1

//processing a column of data
comp_ds_bilinear_w_x32_loop0:

lsr r6, r4, #5
add r7, r2, r3
//processing a line of data
comp_ds_bilinear_w_x32_loop1:

vld1.8 {q0,q1}, [r2]!
vld1.8 {q2,q3}, [r7]!
vpaddl.u8 q0, q0
vpaddl.u8 q1, q1
vpaddl.u8 q2, q2
vpaddl.u8 q3, q3
vrshr.u16 q0, #1
vrshr.u16 q1, #1
vrshr.u16 q2, #1
vrshr.u16 q3, #1
vrhadd.u16 q0, q2
vrhadd.u16 q1, q3

vmovn.u16 d0, q0
vmovn.u16 d1, q1
vst1.32 {q0}, [r0]!
subs r6, #1
bne comp_ds_bilinear_w_x32_loop1

add r2, r7, lr
add r0, r1
subs r5, #1
bne comp_ds_bilinear_w_x32_loop0

ldmia sp!, {r4-r7,lr}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN GeneralBilinearAccurateDownsampler_neon
stmdb sp!, {r4-r12, lr}
//Get the data from stack
ldr r4, [sp, #40] //the addr of src
ldr r5, [sp, #44] //the value of src_stride
ldr r6, [sp, #48] //the value of scaleX
ldr r7, [sp, #52] //the value of scaleY

mov r10, #32768
sub r10, #1
and r8, r6, r10 // r8 uinc(scaleX mod 32767)

mov r11, #-1
mul r11, r8 // r11 -uinc

vdup.s16 d2, r8
vdup.s16 d0, r11
vzip.s16 d0, d2 // uinc -uinc uinc -uinc

and r9, r7, r10 // r9 vinc(scaleY mod 32767)

mov r11, #-1
mul r11, r9 // r11 -vinc

vdup.s16 d2, r9
vdup.s16 d3, r11
vext.8 d5, d3, d2, #4 // vinc vinc -vinc -vinc

mov r11, #0x40000000
mov r12, #0x4000
sub r12, #1
add r11, r12
vdup.s32 d1, r11 //init u 16384 16383 16384 16383

mov r11, #16384
vdup.s16 d16, r11
sub r11, #1
vdup.s16 d17, r11
vext.8 d7, d17, d16, #4 //init v 16384 16384 16383 16383

veor q14, q14
sub r1, r2 // stride - width
mov r8, #16384 // yInverse
sub r3, #1

_HEIGHT:
ldr r4, [sp, #40] //the addr of src
mov r11, r8
lsr r11, #15
mul r11, r5
add r11, r4 // get current row address
mov r12, r11
add r12, r5

mov r9, #16384 // xInverse
sub r10, r2, #1

vmov.s16 d6, d1
_WIDTH:
mov lr, r9
lsr lr, #15

add r4, r11, lr
vld2.8 {d28[0],d29[0]}, [r4] //q14: 0000000b0000000a;
add r4, r12, lr
vld2.8 {d28[4],d29[4]}, [r4] //q14: 000d000b000c000a;
vzip.32 d28, d29 //q14: 000d000c000b000a;

vmull.u16 q13, d6, d7 //q13: init u * init v
vmull.u32 q12, d26, d28
vmlal.u32 q12, d27, d29
vqadd.u64 d24, d24, d25
vrshr.u64 d24, #30

vst1.8 {d24[0]}, [r0]!
add r9, r6
vadd.u16 d6, d0 // inc u
vshl.u16 d6, #1
vshr.u16 d6, #1
subs r10, #1
bne _WIDTH

WIDTH_END:
lsr r9, #15
add r4, r11, r9
vld1.8 {d24[0]}, [r4]
vst1.8 {d24[0]}, [r0]
add r0, #1
add r8, r7
add r0, r1
vadd.s16 d7, d5 // inc v
vshl.u16 d7, #1
vshr.u16 d7, #1
subs r3, #1
bne _HEIGHT

LAST_ROW:
ldr r4, [sp, #40] //the addr of src
lsr r8, #15
mul r8, r5
add r4, r8 // get current row address
mov r9, #16384

_LAST_ROW_WIDTH:
mov r11, r9
lsr r11, #15

add r3, r4, r11
vld1.8 {d0[0]}, [r3]
vst1.8 {d0[0]}, [r0]
add r0, #1
add r9, r6
subs r2, #1
bne _LAST_ROW_WIDTH

ldmia sp!, {r4-r12, lr}
WELS_ASM_FUNC_END
#endif
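The general downsampler above walks the source in Q15 fixed-point steps (the scaleX/scaleY parameters) and bilinearly blends a 2x2 neighbourhood per output pixel. A simplified, hypothetical scalar sketch of that idea follows; it omits the separate last-column and last-row paths (WIDTH_END, LAST_ROW) and does not reproduce the exact NEON fixed-point details:

#include <stdint.h>

void general_bilinear_downsample_c(uint8_t* dst, int dst_stride, int dst_w, int dst_h,
                                   const uint8_t* src, int src_stride,
                                   uint32_t scale_x, uint32_t scale_y) {
    uint32_t y_pos = 1u << 14;                       // yInverse, Q15
    for (int j = 0; j < dst_h; j++, y_pos += scale_y) {
        const uint8_t* row = src + (y_pos >> 15) * src_stride;
        uint32_t v = y_pos & 0x7fff;                 // vertical fraction, Q15
        uint32_t x_pos = 1u << 14;                   // xInverse, Q15
        for (int i = 0; i < dst_w; i++, x_pos += scale_x) {
            const uint8_t* p = row + (x_pos >> 15);
            uint32_t u = x_pos & 0x7fff;             // horizontal fraction, Q15
            uint32_t a = p[0], b = p[1];
            uint32_t c = p[src_stride], d = p[src_stride + 1];
            // weighted 2x2 average, weights in Q30, rounded back to 8 bits
            uint64_t acc = (uint64_t)a * (32768 - u) * (32768 - v)
                         + (uint64_t)b * u * (32768 - v)
                         + (uint64_t)c * (32768 - u) * v
                         + (uint64_t)d * u * v;
            dst[j * dst_stride + i] = (uint8_t)((acc + (1u << 29)) >> 30);
        }
    }
}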

@ -37,32 +37,32 @@
WELS_ASM_FUNC_BEGIN WelsProcessingSampleSad8x8_neon
stmdb sp!, {lr}
//Load a horizontal line of data (8 bytes)
vld1.8 {d0}, [r0], r1
vld1.8 {d1}, [r2], r3

//Do the SAD for 8 bytes
vabdl.u8 q1, d0, d1

mov lr, #7

pixel_sad_8x8_loop0:

//Load a horizontal line of data (8 bytes)
vld1.8 {d0}, [r0], r1
vld1.8 {d1}, [r2], r3

subs lr, #1

//Do the SAD for 8 bytes
vabal.u8 q1, d0, d1
bne pixel_sad_8x8_loop0

vadd.u16 d2, d3
vpaddl.u16 d2, d2
vpaddl.u32 d2, d2
vmov.u32 r0, d2[0] //TBO...

ldmia sp!, {lr}
WELS_ASM_FUNC_END
#endif
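For reference, a plain scalar equivalent of the 8x8 SAD computed above (illustrative sketch):

#include <stdint.h>
#include <stdlib.h>

int32_t sample_sad_8x8_c(const uint8_t* a, int stride_a,
                         const uint8_t* b, int stride_b) {
    int32_t sad = 0;
    for (int y = 0; y < 8; y++) {
        for (int x = 0; x < 8; x++)
            sad += abs((int)a[x] - (int)b[x]);  // vabdl/vabal accumulate these
        a += stride_a;
        b += stride_b;
    }
    return sad;
}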

@ -56,217 +56,217 @@ sse2_20 times 8 dw 20
;***********************************************************************
SECTION .text
%macro WEIGHT_LINE 9
movq %2, %9
punpcklbw %2, %7
movdqa %8, %2

movdqa %1, %6
psubusb %1, %8
psubusb %8, %6
por %8, %1 ; ABS(curPixel - centerPixel);

movdqa %1, %3
psubusb %1, %8

pmullw %1, %1
psrlw %1, 5
pmullw %2, %1
paddusw %4, %1
paddusw %5, %2
%endmacro
%macro WEIGHT_LINE1_UV 4
movdqa %2, %1
punpcklbw %2, %4
paddw %3, %2

movdqa %2, %1
psrldq %2, 1
punpcklbw %2, %4
paddw %3, %2

movdqa %2, %1
psrldq %2, 2
punpcklbw %2, %4
psllw %2, 1
paddw %3, %2

movdqa %2, %1
psrldq %2, 3
punpcklbw %2, %4
paddw %3, %2

movdqa %2, %1
psrldq %2, 4
punpcklbw %2, %4
paddw %3, %2
%endmacro
%macro WEIGHT_LINE2_UV 4
movdqa %2, %1
punpcklbw %2, %4
paddw %3, %2

movdqa %2, %1
psrldq %2, 1
punpcklbw %2, %4
psllw %2, 1
paddw %3, %2

movdqa %2, %1
psrldq %2, 2
punpcklbw %2, %4
psllw %2, 2
paddw %3, %2

movdqa %2, %1
psrldq %2, 3
punpcklbw %2, %4
psllw %2, 1
paddw %3, %2

movdqa %2, %1
psrldq %2, 4
punpcklbw %2, %4
paddw %3, %2
%endmacro
%macro WEIGHT_LINE3_UV 4
movdqa %2, %1
punpcklbw %2, %4
psllw %2, 1
paddw %3, %2

movdqa %2, %1
psrldq %2, 1
punpcklbw %2, %4
psllw %2, 2
paddw %3, %2

movdqa %2, %1
psrldq %2, 2
punpcklbw %2, %4
pmullw %2, [sse2_20]
paddw %3, %2

movdqa %2, %1
psrldq %2, 3
punpcklbw %2, %4
psllw %2, 2
paddw %3, %2

movdqa %2, %1
psrldq %2, 4
punpcklbw %2, %4
psllw %2, 1
paddw %3, %2
%endmacro
;***********************************************************************
; BilateralLumaFilter8_sse2(uint8_t *pixels, int stride);
;***********************************************************************
; 1 2 3
; 4 0 5
; 6 7 8
; 0: the center point
WELS_EXTERN BilateralLumaFilter8_sse2
push r3
%assign push_num 1
LOAD_2_PARA
PUSH_XMM 8

pxor xmm7, xmm7

mov r3, r0

movq xmm6, [r0]
punpcklbw xmm6, xmm7
movdqa xmm3, [sse2_32]
pxor xmm4, xmm4 ; nTotWeight
pxor xmm5, xmm5 ; nSum

dec r0
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 4
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 5

sub r0, r1
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 1
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 1] ; pixel 2
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 3

lea r0, [r0 + r1 * 2]
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 6
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 1] ; pixel 7
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 8

pcmpeqw xmm0, xmm0
psrlw xmm0, 15
psllw xmm0, 8
psubusw xmm0, xmm4
pmullw xmm0, xmm6
paddusw xmm5, xmm0
psrlw xmm5, 8
packuswb xmm5, xmm5
movq [r3], xmm5

POP_XMM
pop r3
%assign push_num 0

ret
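A rough scalar sketch of the whole filter (illustrative only; the SSE2 code above processes eight adjacent luma pixels in parallel, while this hypothetical helper handles a single pixel):

#include <stdint.h>
#include <stdlib.h>

static uint8_t bilateral_luma_pixel_c(const uint8_t* p, int stride) {
    static const int off_x[8] = { -1, 1, -1, 0, 1, -1, 0, 1 };
    static const int off_y[8] = {  0, 0, -1, -1, -1, 1, 1, 1 };
    int center = p[0];
    int total_weight = 0, weighted_sum = 0;          // nTotWeight / nSum above
    for (int k = 0; k < 8; k++) {                    // the eight neighbours 1..8
        int cur  = p[off_y[k] * stride + off_x[k]];
        int diff = abs(cur - center);
        if (diff > 32) diff = 32;                    // psubusb saturates at 0
        int weight = ((32 - diff) * (32 - diff)) >> 5;   // WEIGHT_LINE formula
        total_weight += weight;
        weighted_sum += weight * cur;
    }
    // The center pixel keeps whatever weight remains of 256, then the result
    // is normalized back to 8 bits (psubusw/pmullw/paddusw/psrlw 8 above).
    weighted_sum += (256 - total_weight) * center;
    return (uint8_t)(weighted_sum >> 8);
}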
;***********************************************************************
; void WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
;***********************************************************************
;5x5 filter:
;1 1  2 1 1
;1 2  4 2 1
;2 4 20 4 2
;1 2  4 2 1
;1 1  2 1 1
WELS_EXTERN WaverageChromaFilter8_sse2
push r3
%assign push_num 1
LOAD_2_PARA

mov r3, r1
add r3, r3
sub r0, r3 ; pixels - 2 * stride
sub r0, 2

pxor xmm0, xmm0
pxor xmm3, xmm3

movdqu xmm1, [r0]
WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0

movdqu xmm1, [r0 + r1]
WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0

add r0, r3
movdqu xmm1, [r0]
WEIGHT_LINE3_UV xmm1, xmm2, xmm3, xmm0

movdqu xmm1, [r0 + r1]
WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0

movdqu xmm1, [r0 + r1 * 2]
WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0

psrlw xmm3, 6
packuswb xmm3, xmm3
movq [r0 + 2], xmm3

pop r3
%assign push_num 0
ret
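In scalar form, the function above applies the 5x5 kernel listed in the comment and normalizes by the total kernel weight of 64 (hence the final psrlw xmm3, 6). A hypothetical per-pixel C sketch:

#include <stdint.h>

static const int kWaverageKernel[5][5] = {
    { 1, 1,  2, 1, 1 },
    { 1, 2,  4, 2, 1 },
    { 2, 4, 20, 4, 2 },
    { 1, 2,  4, 2, 1 },
    { 1, 1,  2, 1, 1 },
};

static uint8_t waverage_chroma_pixel_c(const uint8_t* p, int stride) {
    int sum = 0;
    for (int dy = -2; dy <= 2; dy++)
        for (int dx = -2; dx <= 2; dx++)
            sum += kWaverageKernel[dy + 2][dx + 2] * p[dy * stride + dx];
    return (uint8_t)(sum >> 6);   // 64 = total kernel weight
}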
