Merge pull request #2111 from GuangweiWang/downsampler
Add new C and assembly functions to optimize the downsampler when the downscale ratio equals 1:3 or 1:4
This commit is contained in:
commit
5373b8a3aa
@ -338,4 +338,121 @@ _LAST_ROW_WIDTH:
|
||||
ldmia sp!, {r4-r12, lr}
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
// 1:3 bilinear downsampler (ARMv7 NEON).
//
// Each output pixel is built from the top-left 2x2 pixels of a 3x3 source
// cell:  out = avg(avg(a, b), avg(c, d))  with  avg(x, y) = (x + y + 1) >> 1.
//
// In:   r0       = pDst
//       r1       = destination stride
//       r2       = pSrc
//       r3       = source stride
//       [sp]     = source width in bytes
//       [sp, #4] = number of output rows (one per three source rows)
// Work: r4 = source width, r5 = output rows left,
//       r6 = start of the current source row, r7 = source row below it,
//       r8 = start of the current destination row,
//       lr = source bytes consumed in the current row
WELS_ASM_FUNC_BEGIN DyadicBilinearOneThirdDownsampler_neon
    stmdb sp!, {r4-r8, lr}

    // Stack arguments sit above the 24 bytes just pushed.
    ldr r4, [sp, #24] // source width
    ldr r5, [sp, #28] // output row count

    // Initialize the row bookkeeping registers.
    mov r6, r2
    mov r8, r0
    mov lr, #0

    // Save the 16 bytes at pDst + stride * rows. They are written back after
    // the loop, presumably to undo an overrun of the 16-byte stores below.
    mla r7, r1, r5, r0
    vld1.32 {q15}, [r7]

    add r7, r2, r3 // r7 -> second source row of the pair

    // One iteration consumes 48 source bytes per row and emits 16 pixels.
comp_ds_bilinear_onethird_loop0:

    // vld3.8 de-interleaves by 3: first register = 1st pixel of each triple,
    // second = 2nd pixel, third = 3rd pixel (unused).
    vld3.8 {d0, d1, d2}, [r2]!
    vld3.8 {d3, d4, d5}, [r2]!
    vld3.8 {d16, d17, d18}, [r7]!
    vld3.8 {d19, d20, d21}, [r7]!

    // Horizontal pair sums widened to 16 bit, then rounded halving.
    vaddl.u8 q11, d0, d1
    vaddl.u8 q12, d3, d4
    vaddl.u8 q13, d16, d17
    vaddl.u8 q14, d19, d20
    vrshr.u16 q11, #1
    vrshr.u16 q12, #1
    vrshr.u16 q13, #1
    vrshr.u16 q14, #1

    // Rounded average of the two rows.
    vrhadd.u16 q11, q13
    vrhadd.u16 q12, q14

    // Narrow back to bytes and store 16 output pixels.
    vmovn.u16 d0, q11
    vmovn.u16 d1, q12
    vst1.8 {q0}, [r0]!

    add lr, #48
    cmp lr, r4

    // End of row (lr >= width, unsigned): step 3 source rows and 1 output row.
    movcs lr, #0
    addcs r6, r3, lsl #1
    addcs r6, r6, r3
    movcs r2, r6
    addcs r7, r2, r3
    addcs r8, r1
    movcs r0, r8
    // Z is only set here once the row counter reaches zero; while still inside
    // a row the cmp above left Z clear (lr < width), so the branch is taken.
    subscs r5, #1
    bne comp_ds_bilinear_onethird_loop0

    // Write back the bytes saved before the loop (r0 == pDst + stride * rows).
    vst1.32 {q15}, [r0]

    ldmia sp!, {r4-r8,lr}
WELS_ASM_FUNC_END
|
||||
|
||||
// 1:4 bilinear downsampler (ARMv7 NEON).
//
// Each output pixel is built from the top-left 2x2 pixels of a 4x4 source
// cell:  out = avg(avg(a, b), avg(c, d))  with  avg(x, y) = (x + y + 1) >> 1.
//
// In:   r0       = pDst
//       r1       = destination stride
//       r2       = pSrc
//       r3       = source stride
//       [sp]     = source width in bytes
//       [sp, #4] = source height (divided by 4 below to get output rows)
// Work: r4 = source width, r5 = output rows left,
//       r6 = start of the current source row, r7 = source row below it,
//       r8 = start of the current destination row,
//       lr = source bytes consumed in the current row
WELS_ASM_FUNC_BEGIN DyadicBilinearQuarterDownsampler_neon
    stmdb sp!, {r4-r8, lr}

    // Stack arguments sit above the 24 bytes just pushed.
    ldr r4, [sp, #24] // source width
    ldr r5, [sp, #28] // source height

    // Initialize the row bookkeeping registers.
    mov r6, r2
    mov r8, r0
    mov lr, #0
    lsr r5, #2 // output rows = source height / 4

    // Save the 16 bytes at pDst + stride * rows. They are written back after
    // the loop, presumably to undo an overrun of the 16-byte stores below.
    mla r7, r1, r5, r0
    vld1.32 {q15}, [r7]

    add r7, r2, r3 // r7 -> second source row of the pair

    // One iteration consumes 64 source bytes per row and emits 16 pixels.
comp_ds_bilinear_quarter_loop0:

    // vld2.16 de-interleaves halfwords: the first Q register receives byte
    // pairs 0-1, 4-5, 8-9, ...; the second one (pairs 2-3, 6-7, ...) is unused.
    vld2.16 {q0, q1}, [r2]!
    vld2.16 {q2, q3}, [r2]!
    vld2.16 {q8, q9}, [r7]!
    vld2.16 {q10, q11}, [r7]!

    // Sum the two bytes of every kept pair, then rounded halving.
    vpaddl.u8 q0, q0
    vpaddl.u8 q2, q2
    vpaddl.u8 q8, q8
    vpaddl.u8 q10, q10
    vrshr.u16 q0, #1
    vrshr.u16 q2, #1
    vrshr.u16 q8, #1
    vrshr.u16 q10, #1

    // Rounded average of the two rows.
    vrhadd.u16 q0, q8
    vrhadd.u16 q2, q10
    // Narrow back to bytes and store 16 output pixels.
    vmovn.u16 d0, q0
    vmovn.u16 d1, q2
    vst1.8 {q0}, [r0]!

    add lr, #64
    cmp lr, r4

    // End of row (lr >= width, unsigned): step 4 source rows and 1 output row.
    movcs lr, #0
    addcs r6, r3, lsl #2
    movcs r2, r6
    addcs r7, r2, r3
    addcs r8, r1
    movcs r0, r8
    // Z is only set here once the row counter reaches zero; while still inside
    // a row the cmp above left Z clear (lr < width), so the branch is taken.
    subscs r5, #1
    bne comp_ds_bilinear_quarter_loop0

    // Write back the bytes saved before the loop (r0 == pDst + stride * rows).
    vst1.32 {q15}, [r0]

    ldmia sp!, {r4-r8,lr}
WELS_ASM_FUNC_END
|
||||
|
||||
#endif
|
||||
|
@ -84,7 +84,6 @@ comp_ds_bilinear_loop0:
|
||||
|
||||
WELS_ASM_AARCH64_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_AARCH64_FUNC_BEGIN DyadicBilinearDownsamplerWidthx32_AArch64_neon
|
||||
sub w9, w3, w4
|
||||
sub w1, w1, w4, lsr #1
|
||||
@ -123,6 +122,113 @@ comp_ds_bilinear_w_x32_loop1:
|
||||
cbnz w5, comp_ds_bilinear_w_x32_loop0
|
||||
WELS_ASM_AARCH64_FUNC_END
|
||||
|
||||
// 1:3 bilinear downsampler (AArch64 NEON).
//
// Each output pixel is built from the top-left 2x2 pixels of a 3x3 source
// cell:  out = avg(avg(a, b), avg(c, d))  with  avg(x, y) = (x + y + 1) >> 1.
//
// In:   x0 = pDst, w1 = destination stride, x2 = pSrc, w3 = source stride,
//       w4 = source width in bytes, w5 = number of output rows
// Work: x6 = start of the current source row, x7 = source row below it,
//       x8 = start of the current destination row,
//       w9 = source bytes consumed in the current row
WELS_ASM_AARCH64_FUNC_BEGIN DyadicBilinearOneThirdDownsampler_AArch64_neon

    // Initialize the row bookkeeping registers.
    mov x6, x2
    mov x8, x0
    mov w9, #0

    // Save the 16 bytes at pDst + stride * rows. They are written back after
    // the loop, presumably to undo an overrun of the 16-byte stores below.
    smaddl x7, w1, w5, x0
    ld1 {v16.16b}, [x7]

    add x7, x2, w3, sxtw // x7 -> second source row of the pair

    // One iteration consumes 48 source bytes per row and emits 16 pixels.
comp_ds_bilinear_onethird_loop0:

    // ld3 de-interleaves by 3: v0/v4 = 1st pixel of each triple,
    // v1/v5 = 2nd pixel, v2/v6 = 3rd pixel (unused, overwritten below).
    ld3 {v0.16b, v1.16b, v2.16b}, [x2], #48
    ld3 {v4.16b, v5.16b, v6.16b}, [x7], #48

    // Horizontal pair sums widened to 16 bit, then rounded halving.
    uaddl v2.8h, v0.8b, v1.8b
    uaddl2 v3.8h, v0.16b, v1.16b
    uaddl v6.8h, v4.8b, v5.8b
    uaddl2 v7.8h, v4.16b, v5.16b
    urshr v2.8h, v2.8h, #1
    urshr v3.8h, v3.8h, #1
    urshr v6.8h, v6.8h, #1
    urshr v7.8h, v7.8h, #1

    // Rounded average of the two rows, narrowed back to bytes.
    urhadd v0.8h, v2.8h, v6.8h
    urhadd v1.8h, v3.8h, v7.8h
    xtn v0.8b, v0.8h
    xtn v1.8b, v1.8h
    st1 {v0.8b,v1.8b}, [x0], #16

    add w9, w9, #48

    // Stay in this row while fewer than w4 source bytes were consumed.
    cmp w9, w4
    b.cc comp_ds_bilinear_onethird_loop0

    // End of row: step 3 source rows and 1 destination row.
    mov w9, #0
    add x6, x6, w3, sxtw #1
    add x6, x6, w3, sxtw
    mov x2, x6
    add x7, x2, w3, sxtw
    add x8, x8, w1, sxtw
    mov x0, x8
    sub w5, w5, #1

    cbnz w5, comp_ds_bilinear_onethird_loop0

    // Write back the bytes saved before the loop (x0 == pDst + stride * rows).
    st1 {v16.16b}, [x0]
WELS_ASM_AARCH64_FUNC_END
|
||||
|
||||
// 1:4 bilinear downsampler (AArch64 NEON).
//
// Each output pixel is built from the top-left 2x2 pixels of a 4x4 source
// cell:  out = avg(avg(a, b), avg(c, d))  with  avg(x, y) = (x + y + 1) >> 1.
//
// In:   x0 = pDst, w1 = destination stride, x2 = pSrc, w3 = source stride,
//       w4 = source width in bytes,
//       w5 = source height (divided by 4 below to get output rows)
// Work: x6 = start of the current source row, x7 = source row below it,
//       x8 = start of the current destination row,
//       w9 = source bytes consumed in the current row
WELS_ASM_AARCH64_FUNC_BEGIN DyadicBilinearQuarterDownsampler_AArch64_neon
    // Initialize the row bookkeeping registers.
    mov x6, x2
    mov x8, x0
    mov w9, #0
    lsr w5, w5, #2 // output rows = source height / 4

    // Save the 16 bytes at pDst + stride * rows. They are written back after
    // the loop, presumably to undo an overrun of the 16-byte stores below.
    smaddl x7, w1, w5, x0
    ld1 {v16.16b}, [x7]

    add x7, x2, w3, sxtw // x7 -> second source row of the pair

    // One iteration consumes 64 source bytes per row and emits 16 pixels.
comp_ds_bilinear_quarter_loop0:

    // ld2 on halfwords: the first register receives byte pairs 0-1, 4-5, ...;
    // the second one (pairs 2-3, 6-7, ...) is not needed.
    ld2 {v0.8h, v1.8h}, [x2], #32
    ld2 {v2.8h, v3.8h}, [x2], #32
    ld2 {v4.8h, v5.8h}, [x7], #32
    ld2 {v6.8h, v7.8h}, [x7], #32

    // Sum the two bytes of every kept pair, then rounded halving.
    uaddlp v0.8h, v0.16b
    uaddlp v1.8h, v2.16b
    uaddlp v4.8h, v4.16b
    uaddlp v5.8h, v6.16b
    urshr v0.8h, v0.8h, #1
    urshr v1.8h, v1.8h, #1
    urshr v4.8h, v4.8h, #1
    urshr v5.8h, v5.8h, #1

    // Rounded average of the two rows, narrowed back to bytes.
    urhadd v0.8h, v0.8h, v4.8h
    urhadd v1.8h, v1.8h, v5.8h
    xtn v0.8b, v0.8h
    xtn v1.8b, v1.8h
    st1 {v0.8b,v1.8b}, [x0], #16

    add w9, w9, #64

    // Stay in this row while fewer than w4 source bytes were consumed.
    cmp w9, w4
    b.cc comp_ds_bilinear_quarter_loop0

    // End of row: step 4 source rows and 1 destination row.
    mov w9, #0
    add x6, x6, w3, sxtw #2
    mov x2, x6
    add x7, x2, w3, sxtw
    add x8, x8, w1, sxtw
    mov x0, x8
    sub w5, w5, #1

    cbnz w5, comp_ds_bilinear_quarter_loop0

    // Write back the bytes saved before the loop (x0 == pDst + stride * rows).
    st1 {v16.16b}, [x0]
WELS_ASM_AARCH64_FUNC_END
|
||||
|
||||
WELS_ASM_AARCH64_FUNC_BEGIN GeneralBilinearAccurateDownsampler_AArch64_neon
|
||||
mov w10, #32767
|
||||
and w8, w6, w10
|
||||
|
@ -53,6 +53,8 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int
|
||||
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_c;
|
||||
sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_c;
|
||||
sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_c;
|
||||
sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_c;
|
||||
sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_c;
|
||||
sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsampler_c;
|
||||
sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearFastDownsampler_c;
|
||||
#if defined(X86_ASM)
|
||||
@ -60,6 +62,7 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int
|
||||
sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_sse;
|
||||
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_sse;
|
||||
sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsamplerWidthx8_sse;
|
||||
sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_sse;
|
||||
}
|
||||
if (iCpuFlag & WELS_CPU_SSE2) {
|
||||
sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_sse2;
|
||||
@ -68,10 +71,14 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int
|
||||
if (iCpuFlag & WELS_CPU_SSSE3) {
|
||||
sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_ssse3;
|
||||
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_ssse3;
|
||||
sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_ssse3;
|
||||
sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_ssse3;
|
||||
}
|
||||
if (iCpuFlag & WELS_CPU_SSE41) {
|
||||
sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_sse4;
|
||||
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_sse4;
|
||||
sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_sse4;
|
||||
sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_sse4;
|
||||
}
|
||||
#endif//X86_ASM
|
||||
|
||||
@ -81,6 +88,8 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int
|
||||
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_neon;
|
||||
sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_neon;
|
||||
sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_neon;
|
||||
sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_neon;
|
||||
sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_neon;
|
||||
sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_neon;
|
||||
sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearAccurateDownsamplerWrap_neon;
|
||||
}
|
||||
@ -92,6 +101,8 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int
|
||||
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_AArch64_neon;
|
||||
sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_AArch64_neon;
|
||||
sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_AArch64_neon;
|
||||
sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_AArch64_neon;
|
||||
sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_AArch64_neon;
|
||||
sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_AArch64_neon;
|
||||
sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearAccurateDownsamplerWrap_AArch64_neon;
|
||||
}
|
||||
@ -124,6 +135,28 @@ EResult CDownsampling::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pDs
|
||||
(uint8_t*)pSrcPixMap->pPixel[1], pSrcPixMap->iStride[1], iSrcWidthUV, iSrcHeightUV);
|
||||
m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2],
|
||||
(uint8_t*)pSrcPixMap->pPixel[2], pSrcPixMap->iStride[2], iSrcWidthUV, iSrcHeightUV);
|
||||
} else if ((iSrcWidthY >> 2) == iDstWidthY && (iSrcHeightY >> 2) == iDstHeightY) {
|
||||
|
||||
m_pfDownsample.pfQuarterDownsampler ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0],
|
||||
(uint8_t*)pSrcPixMap->pPixel[0], pSrcPixMap->iStride[0], iSrcWidthY, iSrcHeightY);
|
||||
|
||||
m_pfDownsample.pfQuarterDownsampler ((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1],
|
||||
(uint8_t*)pSrcPixMap->pPixel[1], pSrcPixMap->iStride[1], iSrcWidthUV, iSrcHeightUV);
|
||||
|
||||
m_pfDownsample.pfQuarterDownsampler ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2],
|
||||
(uint8_t*)pSrcPixMap->pPixel[2], pSrcPixMap->iStride[2], iSrcWidthUV, iSrcHeightUV);
|
||||
|
||||
} else if ((iSrcWidthY / 3) == iDstWidthY && (iSrcHeightY / 3) == iDstHeightY) {
|
||||
|
||||
m_pfDownsample.pfOneThirdDownsampler ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0],
|
||||
(uint8_t*)pSrcPixMap->pPixel[0], pSrcPixMap->iStride[0], iSrcWidthY, iDstHeightY);
|
||||
|
||||
m_pfDownsample.pfOneThirdDownsampler ((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1],
|
||||
(uint8_t*)pSrcPixMap->pPixel[1], pSrcPixMap->iStride[1], iSrcWidthUV, iDstHeightUV);
|
||||
|
||||
m_pfDownsample.pfOneThirdDownsampler ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2],
|
||||
(uint8_t*)pSrcPixMap->pPixel[2], pSrcPixMap->iStride[2], iSrcWidthUV, iDstHeightUV);
|
||||
|
||||
} else {
|
||||
m_pfDownsample.pfGeneralRatioLuma ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0], iDstWidthY, iDstHeightY,
|
||||
(uint8_t*)pSrcPixMap->pPixel[0], pSrcPixMap->iStride[0], iSrcWidthY, iSrcHeightY);
|
||||
|
@ -54,20 +54,29 @@ typedef void (HalveDownsampleFunc) (uint8_t* pDst, const int32_t kiDstStride,
|
||||
uint8_t* pSrc, const int32_t kiSrcStride,
|
||||
const int32_t kiSrcWidth, const int32_t kiSrcHeight);
|
||||
|
||||
typedef void (SpecificDownsampleFunc) (uint8_t* pDst, const int32_t kiDstStride,
|
||||
uint8_t* pSrc, const int32_t kiSrcStride,
|
||||
const int32_t kiSrcWidth, const int32_t kiHeight);
|
||||
|
||||
typedef void (GeneralDownsampleFunc) (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
|
||||
const int32_t kiDstHeight,
|
||||
uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight);
|
||||
|
||||
typedef HalveDownsampleFunc* PHalveDownsampleFunc;
|
||||
typedef SpecificDownsampleFunc* PSpecificDownsampleFunc;
|
||||
typedef GeneralDownsampleFunc* PGeneralDownsampleFunc;
|
||||
|
||||
HalveDownsampleFunc DyadicBilinearDownsampler_c;
|
||||
HalveDownsampleFunc DyadicBilinearDownsampler_c;
|
||||
GeneralDownsampleFunc GeneralBilinearFastDownsampler_c;
|
||||
GeneralDownsampleFunc GeneralBilinearAccurateDownsampler_c;
|
||||
SpecificDownsampleFunc DyadicBilinearOneThirdDownsampler_c;
|
||||
SpecificDownsampleFunc DyadicBilinearQuarterDownsampler_c;
|
||||
|
||||
typedef struct {
|
||||
// align_index: 0 = x32; 1 = x16; 2 = x8; 3 = common case left;
|
||||
PHalveDownsampleFunc pfHalfAverage[4];
|
||||
PSpecificDownsampleFunc pfOneThirdDownsampler;
|
||||
PSpecificDownsampleFunc pfQuarterDownsampler;
|
||||
PGeneralDownsampleFunc pfGeneralRatioLuma;
|
||||
PGeneralDownsampleFunc pfGeneralRatioChroma;
|
||||
} SDownsampleFuncs;
|
||||
@ -93,10 +102,19 @@ HalveDownsampleFunc DyadicBilinearDownsamplerWidthx32_sse4;
|
||||
GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_sse2;
|
||||
GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse2;
|
||||
|
||||
SpecificDownsampleFunc DyadicBilinearOneThirdDownsampler_ssse3;
|
||||
SpecificDownsampleFunc DyadicBilinearOneThirdDownsampler_sse4;
|
||||
SpecificDownsampleFunc DyadicBilinearQuarterDownsampler_sse;
|
||||
SpecificDownsampleFunc DyadicBilinearQuarterDownsampler_ssse3;
|
||||
SpecificDownsampleFunc DyadicBilinearQuarterDownsampler_sse4;
|
||||
|
||||
void GeneralBilinearFastDownsampler_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
|
||||
const int32_t kiDstHeight, uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY);
|
||||
const int32_t kiDstHeight, uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX,
|
||||
const uint32_t kuiScaleY);
|
||||
void GeneralBilinearAccurateDownsampler_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
|
||||
const int32_t kiDstHeight, uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY);
|
||||
const int32_t kiDstHeight, uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX,
|
||||
const uint32_t kuiScaleY);
|
||||
|
||||
WELSVP_EXTERN_C_END
|
||||
#endif
|
||||
|
||||
@ -109,6 +127,10 @@ HalveDownsampleFunc DyadicBilinearDownsamplerWidthx32_neon;
|
||||
|
||||
GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_neon;
|
||||
|
||||
SpecificDownsampleFunc DyadicBilinearOneThirdDownsampler_neon;
|
||||
|
||||
SpecificDownsampleFunc DyadicBilinearQuarterDownsampler_neon;
|
||||
|
||||
void GeneralBilinearAccurateDownsampler_neon (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
|
||||
const int32_t kiDstHeight,
|
||||
uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY);
|
||||
@ -125,8 +147,13 @@ HalveDownsampleFunc DyadicBilinearDownsamplerWidthx32_AArch64_neon;
|
||||
|
||||
GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_AArch64_neon;
|
||||
|
||||
void GeneralBilinearAccurateDownsampler_AArch64_neon (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight,
|
||||
uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY);
|
||||
SpecificDownsampleFunc DyadicBilinearOneThirdDownsampler_AArch64_neon;
|
||||
|
||||
SpecificDownsampleFunc DyadicBilinearQuarterDownsampler_AArch64_neon;
|
||||
|
||||
void GeneralBilinearAccurateDownsampler_AArch64_neon (uint8_t* pDst, const int32_t kiDstStride,
|
||||
const int32_t kiDstWidth, const int32_t kiDstHeight,
|
||||
uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY);
|
||||
|
||||
WELSVP_EXTERN_C_END
|
||||
#endif
|
||||
|
@ -68,6 +68,53 @@ void DyadicBilinearDownsampler_c (uint8_t* pDst, const int32_t kiDstStride,
|
||||
}
|
||||
}
|
||||
|
||||
/*!
 * \brief   Reference 1:4 bilinear downsampler.
 *
 * Every output pixel is computed from the top-left 2x2 pixels of a 4x4 source
 * cell: each row pair is averaged with rounding, then the two row results are
 * averaged with rounding again.
 *
 * \param   pDst          destination plane
 * \param   kiDstStride   bytes between consecutive destination rows
 * \param   pSrc          source plane
 * \param   kiSrcStride   bytes between consecutive source rows
 * \param   kiSrcWidth    source width in pixels  (output width  = kiSrcWidth  >> 2)
 * \param   kiSrcHeight   source height in pixels (output height = kiSrcHeight >> 2)
 */
void DyadicBilinearQuarterDownsampler_c (uint8_t* pDst, const int32_t kiDstStride,
    uint8_t* pSrc, const int32_t kiSrcStride,
    const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
  uint8_t* pDstLine = pDst;
  uint8_t* pSrcLine = pSrc;
  const int32_t kiSrcStridex4 = kiSrcStride << 2;
  const int32_t kiDstWidth    = kiSrcWidth >> 2;
  const int32_t kiDstHeight   = kiSrcHeight >> 2;

  for (int32_t j = 0; j < kiDstHeight; j ++) {
    // Only the first two rows of each group of four source rows are sampled.
    const uint8_t* pSrcLine2 = pSrcLine + kiSrcStride;

    for (int32_t i = 0; i < kiDstWidth; i ++) {
      const int32_t kiSrcX     = i << 2;
      const int32_t kiTempRow1 = (pSrcLine[kiSrcX] + pSrcLine[kiSrcX + 1] + 1) >> 1;
      const int32_t kiTempRow2 = (pSrcLine2[kiSrcX] + pSrcLine2[kiSrcX + 1] + 1) >> 1;

      pDstLine[i] = (uint8_t) ((kiTempRow1 + kiTempRow2 + 1) >> 1);
    }
    pDstLine += kiDstStride;
    pSrcLine += kiSrcStridex4;
  }
}
|
||||
|
||||
/*!
 * \brief   Reference 1:3 bilinear downsampler.
 *
 * Every output pixel is computed from the top-left 2x2 pixels of a 3x3 source
 * cell: each row pair is averaged with rounding, then the two row results are
 * averaged with rounding again.
 *
 * \param   pDst          destination plane
 * \param   kiDstStride   bytes between consecutive destination rows
 * \param   pSrc          source plane
 * \param   kiSrcStride   bytes between consecutive source rows
 * \param   kiSrcWidth    source width in pixels (output width = kiSrcWidth / 3)
 * \param   kiDstHeight   number of OUTPUT rows to produce (not the source height)
 */
void DyadicBilinearOneThirdDownsampler_c (uint8_t* pDst, const int32_t kiDstStride,
    uint8_t* pSrc, const int32_t kiSrcStride,
    const int32_t kiSrcWidth, const int32_t kiDstHeight) {
  uint8_t* pDstLine = pDst;
  uint8_t* pSrcLine = pSrc;
  const int32_t kiSrcStridex3 = kiSrcStride * 3;
  const int32_t kiDstWidth    = kiSrcWidth / 3;

  for (int32_t j = 0; j < kiDstHeight; j ++) {
    // Only the first two rows of each group of three source rows are sampled.
    const uint8_t* pSrcLine2 = pSrcLine + kiSrcStride;

    for (int32_t i = 0; i < kiDstWidth; i ++) {
      const int32_t kiSrcX     = i * 3;
      const int32_t kiTempRow1 = (pSrcLine[kiSrcX] + pSrcLine[kiSrcX + 1] + 1) >> 1;
      const int32_t kiTempRow2 = (pSrcLine2[kiSrcX] + pSrcLine2[kiSrcX + 1] + 1) >> 1;

      pDstLine[i] = (uint8_t) ((kiTempRow1 + kiTempRow2 + 1) >> 1);
    }
    pDstLine += kiDstStride;
    pSrcLine += kiSrcStridex3;
  }
}
|
||||
|
||||
void GeneralBilinearFastDownsampler_c (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
|
||||
const int32_t kiDstHeight,
|
||||
uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
|
||||
|
@ -67,6 +67,22 @@ shufb_mask_high:
|
||||
add_extra_half:
|
||||
dd 16384,0,0,0
|
||||
|
||||
; pshufb control vectors. Each byte selects a source byte by index; a control
; byte with bit 7 set (80h) clears the corresponding destination byte.

; Source bytes 0,4,8,12 -> result bytes 0..3, source bytes 1,5,9,13 -> result
; bytes 8..11, everything else cleared.
shufb_mask_quarter:
    db 00h, 04h, 08h, 0ch, 80h, 80h, 80h, 80h, 01h, 05h, 09h, 0dh, 80h, 80h, 80h, 80h

; 1:3 masks. A 48-byte source span is read as three 16-byte chunks (_1, _2, _3).
; The "low" masks pick the 1st byte of every 3-byte group, the "high" masks the
; 2nd byte. Each mask drops its picks into a different lane range
; (low: 0..5 / 6..10 / 11..15, high: 0..4 / 5..10 / 11..15), so the three
; shuffled chunks can be merged into one 16-byte vector.
shufb_mask_onethird_low_1:
    db 00h, 03h, 06h, 09h, 0ch, 0fh, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h
shufb_mask_onethird_low_2:
    db 80h, 80h, 80h, 80h, 80h, 80h, 02h, 05h, 08h, 0bh, 0eh, 80h, 80h, 80h, 80h, 80h
shufb_mask_onethird_low_3:
    db 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 01h, 04h, 07h, 0ah, 0dh

shufb_mask_onethird_high_1:
    db 01h, 04h, 07h, 0ah, 0dh, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h
shufb_mask_onethird_high_2:
    db 80h, 80h, 80h, 80h, 80h, 00h, 03h, 06h, 09h, 0ch, 0fh, 80h, 80h, 80h, 80h, 80h
shufb_mask_onethird_high_3:
    db 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 02h, 05h, 08h, 0bh, 0eh
|
||||
|
||||
;***********************************************************************
|
||||
; Code
|
||||
@ -1896,3 +1912,686 @@ FAST_LAST_ROW_END:
|
||||
pop r12
|
||||
ret
|
||||
%endif
|
||||
|
||||
;***********************************************************************
;   void DyadicBilinearOneThirdDownsampler_ssse3(   unsigned char* pDst, const int iDstStride,
;                   unsigned char* pSrc, const int iSrcStride,
;                   const int iSrcWidth, const int iDstHeight );
;
;   1:3 bilinear downsampler. Each output pixel is the rounded average of the
;   top-left 2x2 pixels of a 3x3 source cell. The last argument is used as the
;   number of OUTPUT rows.
;
;   Register roles: r0 = pDst (running), r1 = dst stride, r2 = pSrc (running),
;   r3 = src stride, r4 = source bytes left in the current row, r5 = output
;   rows left, r6 = scratch / start of the current dst row,
;   r12 (64-bit builds) = saved source width, xmm7 = saved tail bytes.
;   All vector loads/stores use aligned forms (movdqa).
;***********************************************************************
WELS_EXTERN DyadicBilinearOneThirdDownsampler_ssse3
%ifdef X86_32
    push r6
    %assign push_num 1
%else
    %assign push_num 0
%endif
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d

%ifndef X86_32
    push r12
    mov r12, r4                 ; keep the source width; r4 is consumed per row
%endif

    ; Save the 16 bytes at pDst + iDstStride * rows. They are written back after
    ; the loops, presumably to undo an overrun of the 16-byte stores below.
    mov r6, r1
    imul r6, r5
    add r6, r0
    movdqa xmm7, [r6]

.yloops_onethird_sse3:
%ifdef X86_32
    mov r4, arg5                ; reload the source width for this row
%else
    mov r4, r12
%endif

    mov r6, r0                  ; remember the start of this dst row
    ; each x iteration consumes 48 source bytes per line and emits 16 pixels
.xloops_onethird_sse3:
    ;               mem  hi<-       ->lo
    ;1st Line Src:  xmm0: F * e E * d D * c C * b B * a A
    ;               xmm2: k K * j J * i I * h H * g G * f
    ;               xmm2: * p P * o O * n N * m M * l L *
    ;
    ;2nd Line Src:  xmm2: F' *  e' E' *  d' D' *  c' C' *  b' B' *  a' A'
    ;               xmm1: k' K' *  j' J' *  i' I' *  h' H' *  g' G' *  f'
    ;               xmm1: *  p' P' *  o' O' *  n' N' *  m' M' *  l' L' *
    ;=> target:
    ;: P O N M L K J I H G F E D C B A
    ;: p o n m l k j i h g f e d c b a
    ;: P' ..                          A'
    ;: p' ..                          a'
    ;
    ; The shuffled chunks occupy disjoint lanes (the rest is zero), so the
    ; paddusb instructions below simply merge them.

    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    ;1st line
    movdqa xmm0, [r2]                         ;F * e E * d D * c C * b B * a A
    movdqa xmm1, xmm0
    movdqa xmm5, [shufb_mask_onethird_low_1]
    movdqa xmm6, [shufb_mask_onethird_high_1]
    pshufb xmm0, xmm5                           ;0 0 0 0 0 0 0 0 0 0 F E D C B A -> xmm0
    pshufb xmm1, xmm6                           ;0 0 0 0 0 0 0 0 0 0 0 e d c b a -> xmm1

    movdqa xmm2, [r2+16]                      ;k K * j J * i I * h H * g G * f
    movdqa xmm3, xmm2
    movdqa xmm5, [shufb_mask_onethird_low_2]
    movdqa xmm6, [shufb_mask_onethird_high_2]
    pshufb xmm2, xmm5                           ;0 0 0 0 0 K J I H G 0 0 0 0 0 0 -> xmm2
    pshufb xmm3, xmm6                           ;0 0 0 0 0 k j i h g f 0 0 0 0 0 -> xmm3

    paddusb xmm0, xmm2                          ;0 0 0 0 0 K J I H G F E D C B A -> xmm0
    paddusb xmm1, xmm3                          ;0 0 0 0 0 k j i h g f e d c b a -> xmm1

    movdqa xmm2, [r2+32]                      ;* p P * o O * n N * m M * l L *
    movdqa xmm3, xmm2
    movdqa xmm5, [shufb_mask_onethird_low_3]
    movdqa xmm6, [shufb_mask_onethird_high_3]
    pshufb xmm2, xmm5                           ;P O N M L 0 0 0 0 0 0 0 0 0 0 0 -> xmm2
    pshufb xmm3, xmm6                           ;p o n m l 0 0 0 0 0 0 0 0 0 0 0 -> xmm3

    paddusb xmm0, xmm2                          ;P O N M L K J I H G F E D C B A -> xmm0
    paddusb xmm1, xmm3                          ;p o n m l k j i h g f e d c b a -> xmm1
    pavgb xmm0, xmm1                            ;1st line average                -> xmm0

    ;2nd line
    movdqa xmm2, [r2+r3]                      ;F' *  e' E' *  d' D' *  c' C' *  b' B' *  a' A'
    movdqa xmm3, xmm2
    movdqa xmm5, [shufb_mask_onethird_low_1]
    movdqa xmm6, [shufb_mask_onethird_high_1]
    pshufb xmm2, xmm5                           ;0 0 0 0 0 0 0 0 0 0 F' E' D' C' B' A' -> xmm2
    pshufb xmm3, xmm6                           ;0 0 0 0 0 0 0 0 0 0 0  e' d' c' b' a' -> xmm3

    movdqa xmm1, [r2+r3+16]                   ;k' K' *  j' J' *  i' I' *  h' H' *  g' G' *  f'
    movdqa xmm4, xmm1
    movdqa xmm5, [shufb_mask_onethird_low_2]
    movdqa xmm6, [shufb_mask_onethird_high_2]
    pshufb xmm1, xmm5                           ;0 0 0 0 0 K' J' I' H' G' 0  0 0 0 0 0 -> xmm1
    pshufb xmm4, xmm6                           ;0 0 0 0 0 k' j' i' h' g' f' 0 0 0 0 0 -> xmm4

    paddusb xmm2, xmm1                          ;0 0 0 0 0 K' J' I' H' G' F' E' D' C' B' A' -> xmm2
    paddusb xmm3, xmm4                          ;0 0 0 0 0 k' j' i' h' g' f' e' d' c' b' a' -> xmm3

    movdqa xmm1, [r2+r3+32]                   ; *  p' P' *  o' O' *  n' N' *  m' M' *  l' L' *
    movdqa xmm4, xmm1
    movdqa xmm5, [shufb_mask_onethird_low_3]
    movdqa xmm6, [shufb_mask_onethird_high_3]
    pshufb xmm1, xmm5                           ;P' O' N' M' L' 0 0 0 0 0 0 0 0 0 0 0 -> xmm1
    pshufb xmm4, xmm6                           ;p' o' n' m' l' 0 0 0 0 0 0 0 0 0 0 0 -> xmm4

    paddusb xmm2, xmm1                          ;P' O' N' M' L' K' J' I' H' G' F' E' D' C' B' A' -> xmm2
    paddusb xmm3, xmm4                          ;p' o' n' m' l' k' j' i' h' g' f' e' d' c' b' a' -> xmm3
    pavgb xmm2, xmm3                            ;2nd line average                                -> xmm2

    pavgb xmm0, xmm2                            ; bytes-average(1st line , 2nd line )

    ; write pDst
    movdqa [r0], xmm0                           ;write result in dst

    ; next 48-byte source span
    lea r2, [r2+48]                             ;current src address
    lea r0, [r0+16]                             ;current dst address

    sub r4, 48                                  ;source bytes left in this row
    cmp r4, 0
    jg near .xloops_onethird_sse3

    ; r6 = row start - current dst = -(bytes written this row); the source
    ; pointer advanced three times as far, hence the 3 * r6 correction below.
    sub r6, r0
    lea r2, [r2+2*r3]
    lea r2, [r2+r3]                             ;+ 3 source lines
    lea r2, [r2+2*r6]
    lea r2, [r2+r6]                             ;back to the start of the source line
    lea r0, [r0+r1]
    lea r0, [r0+r6]                             ;start of the next dst line

    dec r5
    jg near .yloops_onethird_sse3

    movdqa [r0], xmm7                           ;write back the bytes saved before the loops

%ifndef X86_32
    pop r12
%endif

    POP_XMM
    LOAD_6_PARA_POP
%ifdef X86_32
    pop r6
%endif
    ret
|
||||
|
||||
;***********************************************************************
;   void DyadicBilinearOneThirdDownsampler_sse4(    unsigned char* pDst, const int iDstStride,
;                   unsigned char* pSrc, const int iSrcStride,
;                   const int iSrcWidth, const int iDstHeight );
;
;   1:3 bilinear downsampler. Each output pixel is the rounded average of the
;   top-left 2x2 pixels of a 3x3 source cell. The last argument is used as the
;   number of OUTPUT rows. The source is read with movntdqa (SSE4.1); the
;   rest of the arithmetic is the same shuffle/merge/average sequence.
;
;   Register roles: r0 = pDst (running), r1 = dst stride, r2 = pSrc (running),
;   r3 = src stride, r4 = source bytes left in the current row, r5 = output
;   rows left, r6 = scratch / start of the current dst row,
;   r12 (64-bit builds) = saved source width, xmm7 = saved tail bytes.
;***********************************************************************
WELS_EXTERN DyadicBilinearOneThirdDownsampler_sse4
%ifdef X86_32
    push r6
    %assign push_num 1
%else
    %assign push_num 0
%endif
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d

%ifndef X86_32
    push r12
    mov r12, r4                 ; keep the source width; r4 is consumed per row
%endif

    ; Save the 16 bytes at pDst + iDstStride * rows. They are written back after
    ; the loops, presumably to undo an overrun of the 16-byte stores below.
    mov r6, r1
    imul r6, r5
    add r6, r0
    movdqa xmm7, [r6]

.yloops_onethird_sse4:
%ifdef X86_32
    mov r4, arg5                ; reload the source width for this row
%else
    mov r4, r12
%endif

    mov r6, r0                  ; remember the start of this dst row
    ; each x iteration consumes 48 source bytes per line and emits 16 pixels
.xloops_onethird_sse4:
    ;               mem  hi<-       ->lo
    ;1st Line Src:  xmm0: F * e E * d D * c C * b B * a A
    ;               xmm2: k K * j J * i I * h H * g G * f
    ;               xmm2: * p P * o O * n N * m M * l L *
    ;
    ;2nd Line Src:  xmm2: F' *  e' E' *  d' D' *  c' C' *  b' B' *  a' A'
    ;               xmm1: k' K' *  j' J' *  i' I' *  h' H' *  g' G' *  f'
    ;               xmm1: *  p' P' *  o' O' *  n' N' *  m' M' *  l' L' *
    ;=> target:
    ;: P O N M L K J I H G F E D C B A
    ;: p o n m l k j i h g f e d c b a
    ;: P' ..                          A'
    ;: p' ..                          a'
    ;
    ; The shuffled chunks occupy disjoint lanes (the rest is zero), so the
    ; paddusb instructions below simply merge them.

    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    ;1st line
    movntdqa xmm0, [r2]                         ;F * e E * d D * c C * b B * a A
    movdqa xmm1, xmm0
    movdqa xmm5, [shufb_mask_onethird_low_1]
    movdqa xmm6, [shufb_mask_onethird_high_1]
    pshufb xmm0, xmm5                           ;0 0 0 0 0 0 0 0 0 0 F E D C B A -> xmm0
    pshufb xmm1, xmm6                           ;0 0 0 0 0 0 0 0 0 0 0 e d c b a -> xmm1

    movntdqa xmm2, [r2+16]                      ;k K * j J * i I * h H * g G * f
    movdqa xmm3, xmm2
    movdqa xmm5, [shufb_mask_onethird_low_2]
    movdqa xmm6, [shufb_mask_onethird_high_2]
    pshufb xmm2, xmm5                           ;0 0 0 0 0 K J I H G 0 0 0 0 0 0 -> xmm2
    pshufb xmm3, xmm6                           ;0 0 0 0 0 k j i h g f 0 0 0 0 0 -> xmm3

    paddusb xmm0, xmm2                          ;0 0 0 0 0 K J I H G F E D C B A -> xmm0
    paddusb xmm1, xmm3                          ;0 0 0 0 0 k j i h g f e d c b a -> xmm1

    movntdqa xmm2, [r2+32]                      ;* p P * o O * n N * m M * l L *
    movdqa xmm3, xmm2
    movdqa xmm5, [shufb_mask_onethird_low_3]
    movdqa xmm6, [shufb_mask_onethird_high_3]
    pshufb xmm2, xmm5                           ;P O N M L 0 0 0 0 0 0 0 0 0 0 0 -> xmm2
    pshufb xmm3, xmm6                           ;p o n m l 0 0 0 0 0 0 0 0 0 0 0 -> xmm3

    paddusb xmm0, xmm2                          ;P O N M L K J I H G F E D C B A -> xmm0
    paddusb xmm1, xmm3                          ;p o n m l k j i h g f e d c b a -> xmm1
    pavgb xmm0, xmm1                            ;1st line average                -> xmm0

    ;2nd line
    movntdqa xmm2, [r2+r3]                      ;F' *  e' E' *  d' D' *  c' C' *  b' B' *  a' A'
    movdqa xmm3, xmm2
    movdqa xmm5, [shufb_mask_onethird_low_1]
    movdqa xmm6, [shufb_mask_onethird_high_1]
    pshufb xmm2, xmm5                           ;0 0 0 0 0 0 0 0 0 0 F' E' D' C' B' A' -> xmm2
    pshufb xmm3, xmm6                           ;0 0 0 0 0 0 0 0 0 0 0  e' d' c' b' a' -> xmm3

    movntdqa xmm1, [r2+r3+16]                   ;k' K' *  j' J' *  i' I' *  h' H' *  g' G' *  f'
    movdqa xmm4, xmm1
    movdqa xmm5, [shufb_mask_onethird_low_2]
    movdqa xmm6, [shufb_mask_onethird_high_2]
    pshufb xmm1, xmm5                           ;0 0 0 0 0 K' J' I' H' G' 0  0 0 0 0 0 -> xmm1
    pshufb xmm4, xmm6                           ;0 0 0 0 0 k' j' i' h' g' f' 0 0 0 0 0 -> xmm4

    paddusb xmm2, xmm1                          ;0 0 0 0 0 K' J' I' H' G' F' E' D' C' B' A' -> xmm2
    paddusb xmm3, xmm4                          ;0 0 0 0 0 k' j' i' h' g' f' e' d' c' b' a' -> xmm3

    movntdqa xmm1, [r2+r3+32]                   ; *  p' P' *  o' O' *  n' N' *  m' M' *  l' L' *
    movdqa xmm4, xmm1
    movdqa xmm5, [shufb_mask_onethird_low_3]
    movdqa xmm6, [shufb_mask_onethird_high_3]
    pshufb xmm1, xmm5                           ;P' O' N' M' L' 0 0 0 0 0 0 0 0 0 0 0 -> xmm1
    pshufb xmm4, xmm6                           ;p' o' n' m' l' 0 0 0 0 0 0 0 0 0 0 0 -> xmm4

    paddusb xmm2, xmm1                          ;P' O' N' M' L' K' J' I' H' G' F' E' D' C' B' A' -> xmm2
    paddusb xmm3, xmm4                          ;p' o' n' m' l' k' j' i' h' g' f' e' d' c' b' a' -> xmm3
    pavgb xmm2, xmm3                            ;2nd line average                                -> xmm2

    pavgb xmm0, xmm2                            ; bytes-average(1st line , 2nd line )

    ; write pDst
    movdqa [r0], xmm0                           ;write result in dst

    ; next 48-byte source span
    lea r2, [r2+48]                             ;current src address
    lea r0, [r0+16]                             ;current dst address

    sub r4, 48                                  ;source bytes left in this row
    cmp r4, 0
    jg near .xloops_onethird_sse4

    ; r6 = row start - current dst = -(bytes written this row); the source
    ; pointer advanced three times as far, hence the 3 * r6 correction below.
    sub r6, r0
    lea r2, [r2+2*r3]
    lea r2, [r2+r3]                             ;+ 3 source lines
    lea r2, [r2+2*r6]
    lea r2, [r2+r6]                             ;back to the start of the source line
    lea r0, [r0+r1]
    lea r0, [r0+r6]                             ;start of the next dst line

    dec r5
    jg near .yloops_onethird_sse4

    movdqa [r0], xmm7                           ;write back the bytes saved before the loops

%ifndef X86_32
    pop r12
%endif

    POP_XMM
    LOAD_6_PARA_POP
%ifdef X86_32
    pop r6
%endif
    ret
|
||||
|
||||
;***********************************************************************
;   void DyadicBilinearQuarterDownsampler_sse( unsigned char* pDst, const int iDstStride,
;                   unsigned char* pSrc, const int iSrcStride,
;                   const int iSrcWidth, const int iSrcHeight );
;
;   1:4 downsampler working on MMX registers (movq / pshufw / pavgb).
;
;   For every group of four source lines only the first two are read, and
;   from every group of four source bytes only bytes 0 and 1 are used.
;   Each destination byte is pavgb( pavgb(line0[0], line0[1]),
;                                   pavgb(line1[0], line1[1]) ).
;   One inner iteration consumes 32 source bytes per line and stores
;   8 destination bytes; the inner loop runs while the remaining source
;   width is > 0, the outer loop runs (iSrcHeight >> 2) times (and at
;   least once, because the count is tested after the body).
;
;   Register roles after LOAD_6_PARA:
;     r0 = pDst cursor        r1 = iDstStride
;     r2 = pSrc cursor        r3 = iSrcStride
;     r4 = source bytes left in the current line
;     r5 = destination lines left
;     r6 = scratch (tail address, then start of the current dst line)
;     mm7 = the 8 bytes found at pDst + iDstStride * (iSrcHeight >> 2)
;           on entry; they are written back on exit.
;
;   The tail bytes are kept in mm7 (not xmm7): this routine does not use
;   PUSH_XMM/POP_XMM, so touching xmm7 would clobber a register that is
;   callee-saved on Win64, and "movq xmm, m64" is an SSE2 instruction
;   while this variant is meant for SSE-only dispatch.  mm7 is not used
;   anywhere else in the body.
;***********************************************************************
WELS_EXTERN DyadicBilinearQuarterDownsampler_sse
%ifdef X86_32
    push r6
    %assign push_num 1
%else
    %assign push_num 0
%endif
    LOAD_6_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d

%ifndef X86_32
    push r12
    mov r12, r4                     ; keep iSrcWidth; r4 is consumed by the inner loop
%endif
    sar r5, $02                     ; iSrcHeight >> 2 = number of destination lines

    ; Save the 8 bytes that follow the last destination line.
    mov r6, r1
    imul r6, r5
    add r6, r0                      ; r6 = pDst + iDstStride * (iSrcHeight >> 2)
    movq mm7, [r6]

.yloops_quarter_sse:
%ifdef X86_32
    mov r4, arg5                    ; reload iSrcWidth from the stack argument
%else
    mov r4, r12
%endif

    mov r6, r0                      ; remember the start of this destination line

    ; each loop = source bandwidth: 32 bytes
.xloops_quarter_sse:
    ; 1st part of the horizontal loop: 16 source bytes per line
    ;               mem  hi<-       ->lo
    ;1st Line Src:  mm0: d D c C b B a A    mm1: h H g G f F e E
    ;2nd Line Src:  mm2: l L k K j J i I    mm3: p P o O n N m M
    ;
    ;=> target:
    ;: G E C A,
    ;:
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    movq mm0, [r2]                  ; 1st pSrc line
    movq mm1, [r2+8]                ; 1st pSrc line + 8
    movq mm2, [r2+r3]               ; 2nd pSrc line
    movq mm3, [r2+r3+8]             ; 2nd pSrc line + 8

    pshufw mm0, mm0, 0d8h           ; x X x X c C a A
    pshufw mm1, mm1, 0d8h           ; x X x X g G e E
    pshufw mm2, mm2, 0d8h           ; x X x X k K i I
    pshufw mm3, mm3, 0d8h           ; x X x X o O m M

    punpckldq mm0, mm1              ; g G e E c C a A
    punpckldq mm2, mm3              ; o O m M k K i I

    ; split mm0 / mm2 into their even and odd bytes
    pshufw mm4, mm0, 0d8h           ; g G c C e E a A
    pshufw mm5, mm4, 04eh           ; e E a A g G c C
    punpcklbw mm4, mm5              ; g e G E c a C A -> mm4
    pshufw mm4, mm4, 0d8h           ; g e c a G E C A -> mm4

    pshufw mm5, mm2, 0d8h           ; o O k K m M i I
    pshufw mm6, mm5, 04eh           ; m M i I o O k K
    punpcklbw mm5, mm6              ; o m O M k i K I
    pshufw mm5, mm5, 0d8h           ; o m k i O M K I -> mm5

    ; Only the low dword of each result is used below, so the upper
    ; dword (filled from whatever mm6 holds) is a don't-care.
    movq mm0, mm4
    punpckldq mm0, mm6              ; x x x x G E C A
    punpckhdq mm4, mm6              ; x x x x g e c a

    movq mm1, mm5
    punpckldq mm1, mm6              ; x x x x O M K I
    punpckhdq mm5, mm6              ; x x x x o m k i

    ; horizontal averages per line, then the vertical average
    pavgb mm0, mm4                  ; (A+a+1)>>1, ..., temp_row1
    pavgb mm1, mm5                  ; (I+i+1)>>1, ..., temp_row2
    pavgb mm0, mm1                  ; (temp_row1+temp_row2+1)>>1, kept until the 2nd part is done

    ; 2nd part of the horizontal loop: next 16 source bytes per line
    movq mm1, [r2+16]               ; 1st pSrc line + 16
    movq mm2, [r2+24]               ; 1st pSrc line + 24
    movq mm3, [r2+r3+16]            ; 2nd pSrc line + 16
    movq mm4, [r2+r3+24]            ; 2nd pSrc line + 24

    pshufw mm1, mm1, 0d8h
    pshufw mm2, mm2, 0d8h
    pshufw mm3, mm3, 0d8h
    pshufw mm4, mm4, 0d8h

    punpckldq mm1, mm2
    punpckldq mm3, mm4

    ; split mm1 / mm3 into their even and odd bytes
    pshufw mm4, mm1, 0d8h
    pshufw mm5, mm4, 04eh
    punpcklbw mm4, mm5
    pshufw mm4, mm4, 0d8h

    pshufw mm5, mm3, 0d8h
    pshufw mm6, mm5, 04eh
    punpcklbw mm5, mm6
    pshufw mm5, mm5, 0d8h

    movq mm2, mm4
    punpckldq mm2, mm6
    punpckhdq mm4, mm6

    movq mm3, mm5
    punpckldq mm3, mm6
    punpckhdq mm5, mm6

    ; horizontal averages per line, then the vertical average
    pavgb mm2, mm4                  ; temp_row1 of the 2nd part
    pavgb mm3, mm5                  ; temp_row2 of the 2nd part
    pavgb mm2, mm3                  ; (temp_row1+temp_row2+1)>>1 of the 2nd part

    movd [r0  ], mm0                ; 4 destination bytes from the 1st part
    movd [r0+4], mm2                ; 4 destination bytes from the 2nd part

    ; next 32-byte source block
    lea r2, [r2+32]
    lea r0, [r0+8]

    sub r4, 32
    cmp r4, 0
    jg near .xloops_quarter_sse

    sub r6, r0                      ; r6 = -(destination bytes written in this line)
    ; next line
    lea r2, [r2+4*r3]               ; advance four source lines
    lea r2, [r2+4*r6]               ; rewind the 4 * (dst bytes) consumed in this line
    lea r0, [r0+r1]
    lea r0, [r0+r6]                 ; start of the next destination line

    dec r5
    jg near .yloops_quarter_sse

    movq [r0], mm7                  ; restore the saved tail bytes (must precede WELSEMMS)

    WELSEMMS
%ifndef X86_32
    pop r12
%endif
    LOAD_6_PARA_POP
%ifdef X86_32
    pop r6
%endif
    ret
|
||||
|
||||
;***********************************************************************
;   void DyadicBilinearQuarterDownsampler_ssse3( unsigned char* pDst, const int iDstStride,
;                   unsigned char* pSrc, const int iSrcStride,
;                   const int iSrcWidth, const int iSrcHeight );
;
;   1:4 downsampler using pshufb + pavgb.
;
;   Per inner iteration, 32 bytes are loaded with movdqa from each of two
;   adjacent source lines (so pSrc/iSrcStride must keep those loads
;   16-byte aligned) and 8 destination bytes are stored.  Four source
;   lines are skipped per destination line; (iSrcHeight >> 2) lines are
;   produced, and the body always runs at least once because the counter
;   is tested after it.
;
;   Register roles after LOAD_6_PARA:
;     r0 = pDst cursor        r1 = iDstStride
;     r2 = pSrc cursor        r3 = iSrcStride
;     r4 = source bytes left in the current line
;     r5 = destination lines left
;     r6 = scratch (tail address, then start of the current dst line)
;     xmm6 = shufb_mask_quarter (defined outside this block)
;     xmm7 = the 8 bytes found at pDst + iDstStride * (iSrcHeight >> 2)
;            on entry; written back on exit.
;***********************************************************************
WELS_EXTERN DyadicBilinearQuarterDownsampler_ssse3
%ifdef X86_32
    push r6
    %assign push_num 1
%else
    %assign push_num 0
%endif
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d

%ifndef X86_32
    push r12
    mov r12, r4                         ; keep iSrcWidth across lines
%endif
    sar r5, 2                           ; destination line count = iSrcHeight >> 2

    movdqa xmm6, [shufb_mask_quarter]   ; byte-selection mask for pshufb

    ; Remember the 8 bytes just past the last destination line.
    mov r6, r5
    imul r6, r1
    add r6, r0                          ; r6 = pDst + iDstStride * (iSrcHeight >> 2)
    movq xmm7, [r6]

.quarter_ssse3_line:
%ifdef X86_32
    mov r4, arg5                        ; iSrcWidth from the stack argument
%else
    mov r4, r12
%endif
    mov r6, r0                          ; start of this destination line

    ; One pass = 32 source bytes per line -> 8 destination bytes.
.quarter_ssse3_block:
    ;               mem  hi<-                        ->lo
    ; each line:    lo 16 bytes: h H g G f F e E d D c C b B a A
    ;               hi 16 bytes: p P o O n N m M l L k K j J i I
    movdqa xmm0, [r2]                   ; line 0, bytes  0..15
    movdqa xmm1, [r2+16]                ; line 0, bytes 16..31
    movdqa xmm2, [r2+r3]                ; line 1, bytes  0..15
    movdqa xmm3, [r2+r3+16]             ; line 1, bytes 16..31

    ; ---- line 0: gather, regroup, horizontal average ----
    pshufb xmm0, xmm6                   ; 0 0 0 0 g e c a 0 0 0 0 G E C A
    pshufb xmm1, xmm6                   ; 0 0 0 0 o m k i 0 0 0 0 O M K I
    movdqa xmm4, xmm0
    punpckldq xmm0, xmm1                ; 0 .. 0 O M K I G E C A
    punpckhdq xmm4, xmm1                ; 0 .. 0 o m k i g e c a
    pavgb xmm0, xmm4

    ; ---- line 1: same steps ----
    pshufb xmm2, xmm6                   ; 0 0 0 0 g e c a 0 0 0 0 G E C A
    pshufb xmm3, xmm6                   ; 0 0 0 0 o m k i 0 0 0 0 O M K I
    movdqa xmm5, xmm2
    punpckldq xmm2, xmm3                ; 0 .. 0 O M K I G E C A
    punpckhdq xmm5, xmm3                ; 0 .. 0 o m k i g e c a
    pavgb xmm2, xmm5

    ; ---- vertical average and store ----
    pavgb xmm0, xmm2
    movq [r0], xmm0

    ; step to the next 32-byte source block
    add r2, 32
    add r0, 8
    sub r4, 32
    test r4, r4
    jg near .quarter_ssse3_block

    ; step to the next line
    sub r6, r0                          ; r6 = -(destination bytes written in this line)
    lea r2, [r2+4*r3]                   ; four source lines down
    lea r2, [r2+4*r6]                   ; back to the start of the line (4 src bytes per dst byte)
    add r0, r1                          ; one destination line down
    add r0, r6                          ; back to the start of the line

    dec r5
    jg near .quarter_ssse3_line

    movq [r0], xmm7                     ; put the saved tail bytes back

%ifndef X86_32
    pop r12
%endif

    POP_XMM
    LOAD_6_PARA_POP
%ifdef X86_32
    pop r6
%endif
    ret
|
||||
|
||||
;***********************************************************************
;   void DyadicBilinearQuarterDownsampler_sse4( unsigned char* pDst, const int iDstStride,
;                   unsigned char* pSrc, const int iSrcStride,
;                   const int iSrcWidth, const int iSrcHeight );
;
;   1:4 downsampler using movntdqa loads, pshufb and pavgb.
;
;   Per inner iteration, 32 bytes are loaded with movntdqa from each of
;   two adjacent source lines (movntdqa requires 16-byte aligned
;   addresses) and 8 destination bytes are stored.  Four source lines are
;   skipped per destination line; (iSrcHeight >> 2) lines are produced,
;   and the body always runs at least once because the counter is tested
;   after it.
;
;   Register roles after LOAD_6_PARA:
;     r0 = pDst cursor        r1 = iDstStride
;     r2 = pSrc cursor        r3 = iSrcStride
;     r4 = source bytes left in the current line
;     r5 = destination lines left
;     r6 = scratch (tail address, then start of the current dst line)
;     xmm6 = shufb_mask_quarter (defined outside this block)
;     xmm7 = the 8 bytes found at pDst + iDstStride * (iSrcHeight >> 2)
;            on entry; written back on exit.
;***********************************************************************
WELS_EXTERN DyadicBilinearQuarterDownsampler_sse4
%ifdef X86_32
    push r6
    %assign push_num 1
%else
    %assign push_num 0
%endif
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d

%ifndef X86_32
    push r12
    mov r12, r4                         ; keep iSrcWidth across lines
%endif
    sar r5, 2                           ; destination line count = iSrcHeight >> 2

    movdqa xmm6, [shufb_mask_quarter]   ; byte-selection mask for pshufb

    ; Remember the 8 bytes just past the last destination line.
    mov r6, r5
    imul r6, r1
    add r6, r0                          ; r6 = pDst + iDstStride * (iSrcHeight >> 2)
    movq xmm7, [r6]

.quarter_sse4_line:
%ifdef X86_32
    mov r4, arg5                        ; iSrcWidth from the stack argument
%else
    mov r4, r12
%endif
    mov r6, r0                          ; start of this destination line

    ; One pass = 32 source bytes per line -> 8 destination bytes.
.quarter_sse4_block:
    ;               mem  hi<-                        ->lo
    ; each line:    lo 16 bytes: h H g G f F e E d D c C b B a A
    ;               hi 16 bytes: p P o O n N m M l L k K j J i I
    movntdqa xmm0, [r2]                 ; line 0, bytes  0..15
    movntdqa xmm1, [r2+16]              ; line 0, bytes 16..31
    movntdqa xmm2, [r2+r3]              ; line 1, bytes  0..15
    movntdqa xmm3, [r2+r3+16]           ; line 1, bytes 16..31

    ; ---- line 0: gather, regroup, horizontal average ----
    pshufb xmm0, xmm6                   ; 0 0 0 0 g e c a 0 0 0 0 G E C A
    pshufb xmm1, xmm6                   ; 0 0 0 0 o m k i 0 0 0 0 O M K I
    movdqa xmm4, xmm0
    punpckldq xmm0, xmm1                ; 0 .. 0 O M K I G E C A
    punpckhdq xmm4, xmm1                ; 0 .. 0 o m k i g e c a
    pavgb xmm0, xmm4

    ; ---- line 1: same steps ----
    pshufb xmm2, xmm6                   ; 0 0 0 0 g e c a 0 0 0 0 G E C A
    pshufb xmm3, xmm6                   ; 0 0 0 0 o m k i 0 0 0 0 O M K I
    movdqa xmm5, xmm2
    punpckldq xmm2, xmm3                ; 0 .. 0 O M K I G E C A
    punpckhdq xmm5, xmm3                ; 0 .. 0 o m k i g e c a
    pavgb xmm2, xmm5

    ; ---- vertical average and store ----
    pavgb xmm0, xmm2
    movq [r0], xmm0

    ; step to the next 32-byte source block
    add r2, 32
    add r0, 8
    sub r4, 32
    test r4, r4
    jg near .quarter_sse4_block

    ; step to the next line
    sub r6, r0                          ; r6 = -(destination bytes written in this line)
    lea r2, [r2+4*r3]                   ; four source lines down
    lea r2, [r2+4*r6]                   ; back to the start of the line (4 src bytes per dst byte)
    add r0, r1                          ; one destination line down
    add r0, r6                          ; back to the start of the line

    dec r5
    jg near .quarter_sse4_line

    movq [r0], xmm7                     ; put the saved tail bytes back

%ifndef X86_32
    pop r12
%endif

    POP_XMM
    LOAD_6_PARA_POP
%ifdef X86_32
    pop r6
%endif
    ret
|
||||
|
||||
|
@ -2512,7 +2512,7 @@ const uint32_t kiHeight = 96; //DO NOT CHANGE!
|
||||
const uint32_t kiFrameRate = 12; //DO NOT CHANGE!
|
||||
const uint32_t kiFrameNum = 100; //DO NOT CHANGE!
|
||||
const char* pHashStr[] = { //DO NOT CHANGE!
|
||||
"058076b265686fc85b2b99cf7a53106f216f16c3",
|
||||
"585663f78cadb70d9c9f179b9b53b90ffddf3178",
|
||||
"f350001c333902029800bd291fbed915a4bdf19a",
|
||||
"eb9d853b7daec03052c4850027ac94adc84c3a7e"
|
||||
};
|
||||
|
@ -131,7 +131,7 @@ static const EncodeFileParam kFileParamArray[] = {
|
||||
},
|
||||
{
|
||||
"res/Cisco_Absolute_Power_1280x720_30fps.yuv",
|
||||
"a4707845cacc437fb52010eb020fca6d4bc1102d", CAMERA_VIDEO_REAL_TIME, 1280, 720, 30.0f, SM_SINGLE_SLICE, false, 4, false, false, false
|
||||
"2b5965c752e1f722592c3ce9a1eb82445c9dbaa3", CAMERA_VIDEO_REAL_TIME, 1280, 720, 30.0f, SM_SINGLE_SLICE, false, 4, false, false, false
|
||||
},
|
||||
// the following values may be adjusted for times since we start tuning the strategy
|
||||
{
|
||||
|
@ -199,6 +199,79 @@ TEST (DownSampleTest, func) { \
|
||||
} \
|
||||
}
|
||||
|
||||
/* Generates a DownSampleTest case that runs `func` and
 * DyadicBilinearOneThirdDownsampler_c on identical random buffers
 * (480x30 source, stride 560) and requires every byte of the
 * (width / 3) x (height / 3) destination area to match.
 * When ASM is non-zero the case returns early if none of the CPUFLAGS
 * bits are reported by WelsCPUFeatureDetect.
 * Both functions receive src_height / 3 as their last argument. */
#define GENERATE_DyadicBilinearOneThirdDownsampler_UT(func, ASM, CPUFLAGS) \
TEST (DownSampleTest, func) { \
  if (ASM) { \
    int32_t iCpuCores = 0; \
    const uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores); \
    if ((m_uiCpuFeatureFlag & CPUFLAGS) == 0) \
      return; \
  } \
  ENFORCE_STACK_ALIGN_1D (uint8_t, dst_c, 50000, 16); \
  ENFORCE_STACK_ALIGN_1D (uint8_t, src_c, 50000, 16); \
  const int dst_stride_c = 560; \
  const int src_stride_c = 560; \
  const int src_width_c = 480; \
  const int src_height_c = 30; \
  ENFORCE_STACK_ALIGN_1D (uint8_t, dst_a, 50000, 16); \
  ENFORCE_STACK_ALIGN_1D (uint8_t, src_a, 50000, 16); \
  const int dst_stride_a = dst_stride_c; \
  const int src_stride_a = src_stride_c; \
  const int src_width_a = src_width_c; \
  const int src_height_a = src_height_c; \
  /* Same random content for the reference and the tested buffers. */ \
  for (int k = 0; k < 50000; ++k) { \
    dst_c[k] = dst_a[k] = rand() % 256; \
    src_c[k] = src_a[k] = rand() % 256; \
  } \
  DyadicBilinearOneThirdDownsampler_c (dst_c, dst_stride_c, src_c, src_stride_c, src_width_c, src_height_c / 3); \
  func (dst_a, dst_stride_a, src_a, src_stride_a, src_width_a, src_height_a / 3); \
  const int kiRows = src_height_c / 3; \
  const int kiCols = src_width_c / 3; \
  for (int j = 0; j < kiRows; ++j) { \
    for (int m = 0; m < kiCols; ++m) { \
      ASSERT_EQ (dst_c[m + j * dst_stride_c], dst_a[m + j * dst_stride_a]); \
    } \
  } \
}
|
||||
|
||||
/* Generates a DownSampleTest case that runs `func` and
 * DyadicBilinearQuarterDownsampler_c on identical random buffers and
 * requires every byte of the (width >> 2) x (height >> 2) destination
 * area to match.
 * When ASM is non-zero the case returns early if none of the CPUFLAGS
 * bits are reported by WelsCPUFeatureDetect.
 *
 * Geometry: the source width must not exceed the source stride,
 * otherwise each "row" runs into the next one and the buffers no longer
 * describe a valid image.  The width is therefore 480 (a multiple of 32
 * that fits the 560-byte stride) instead of the former 640. */
#define GENERATE_DyadicBilinearQuarterDownsampler_UT(func, ASM, CPUFLAGS) \
TEST (DownSampleTest, func) { \
  if (ASM) {\
    int32_t iCpuCores = 0; \
    uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores); \
    if (0 == (m_uiCpuFeatureFlag & CPUFLAGS)) \
      return; \
  } \
  ENFORCE_STACK_ALIGN_1D (uint8_t, dst_c, 50000, 16); \
  ENFORCE_STACK_ALIGN_1D (uint8_t, src_c, 50000, 16); \
  int dst_stride_c; \
  int src_stride_c; \
  int src_width_c; \
  int src_height_c; \
  ENFORCE_STACK_ALIGN_1D (uint8_t, dst_a, 50000, 16); \
  ENFORCE_STACK_ALIGN_1D (uint8_t, src_a, 50000, 16); \
  int dst_stride_a; \
  int src_stride_a; \
  int src_width_a; \
  int src_height_a; \
  dst_stride_c = dst_stride_a = 560; \
  src_stride_c = src_stride_a = 560; \
  src_width_c = src_width_a = 480; /* must stay <= src_stride and a multiple of 32 */ \
  src_height_c = src_height_a = 80; \
  /* Same random content for the reference and the tested buffers. */ \
  for (int j = 0; j < 50000; j++) { \
    dst_c[j] = dst_a[j] = rand() % 256; \
    src_c[j] = src_a[j] = rand() % 256; \
  } \
  DyadicBilinearQuarterDownsampler_c (dst_c, dst_stride_c, src_c, src_stride_c, src_width_c, src_height_c); \
  func (dst_a, dst_stride_a, src_a, src_stride_a, src_width_a, src_height_a); \
  for (int j = 0; j < (src_height_c >> 2); j++) { \
    for (int m = 0; m < (src_width_c >> 2); m++) { \
      ASSERT_EQ (dst_c[m + j * dst_stride_c], dst_a[m + j * dst_stride_a]); \
    } \
  } \
}
|
||||
#define GENERATE_GeneralBilinearDownsampler_UT(func, ref, ASM, CPUFLAGS) \
|
||||
TEST (DownSampleTest, func) { \
|
||||
if (ASM) {\
|
||||
@ -259,6 +332,13 @@ GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx16_ssse3,
|
||||
GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx32_sse4, 1, WELS_CPU_SSE41)
|
||||
GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx16_sse4, 1, WELS_CPU_SSE41)
|
||||
|
||||
GENERATE_DyadicBilinearOneThirdDownsampler_UT (DyadicBilinearOneThirdDownsampler_ssse3, 1, WELS_CPU_SSSE3)
|
||||
GENERATE_DyadicBilinearOneThirdDownsampler_UT (DyadicBilinearOneThirdDownsampler_sse4, 1, WELS_CPU_SSE41)
|
||||
|
||||
GENERATE_DyadicBilinearQuarterDownsampler_UT (DyadicBilinearQuarterDownsampler_sse, 1, WELS_CPU_SSE)
|
||||
GENERATE_DyadicBilinearQuarterDownsampler_UT (DyadicBilinearQuarterDownsampler_ssse3, 1, WELS_CPU_SSSE3)
|
||||
GENERATE_DyadicBilinearQuarterDownsampler_UT (DyadicBilinearQuarterDownsampler_sse4, 1, WELS_CPU_SSE41)
|
||||
|
||||
GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearFastDownsamplerWrap_sse2, GeneralBilinearFastDownsampler_ref, 1,
|
||||
WELS_CPU_SSE2)
|
||||
GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_sse2,
|
||||
@ -269,6 +349,10 @@ GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_s
|
||||
GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx32_neon, 1, WELS_CPU_NEON)
|
||||
GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsampler_neon, 1, WELS_CPU_NEON)
|
||||
|
||||
GENERATE_DyadicBilinearOneThirdDownsampler_UT (DyadicBilinearOneThirdDownsampler_neon, 1, WELS_CPU_NEON)
|
||||
|
||||
GENERATE_DyadicBilinearQuarterDownsampler_UT (DyadicBilinearQuarterDownsampler_neon, 1, WELS_CPU_NEON)
|
||||
|
||||
GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_neon,
|
||||
GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_NEON)
|
||||
#endif
|
||||
@ -277,6 +361,10 @@ GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_n
|
||||
GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx32_AArch64_neon, 1, WELS_CPU_NEON)
|
||||
GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsampler_AArch64_neon, 1, WELS_CPU_NEON)
|
||||
|
||||
GENERATE_DyadicBilinearOneThirdDownsampler_UT (DyadicBilinearOneThirdDownsampler_AArch64_neon, 1, WELS_CPU_NEON)
|
||||
|
||||
GENERATE_DyadicBilinearQuarterDownsampler_UT (DyadicBilinearQuarterDownsampler_AArch64_neon, 1, WELS_CPU_NEON)
|
||||
|
||||
GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_AArch64_neon,
|
||||
GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_NEON)
|
||||
#endif
|
||||
|
Loading…
x
Reference in New Issue
Block a user