Merge pull request #2111 from GuangweiWang/downsampler

Add new C and assembly functions to optimize the downsampler when the downscale ratio equals 1:3 or 1:4
This commit is contained in:
HaiboZhu 2015-09-11 17:36:13 +08:00
commit 5373b8a3aa
9 changed files with 1125 additions and 8 deletions

View File

@ -338,4 +338,121 @@ _LAST_ROW_WIDTH:
ldmia sp!, {r4-r12, lr}
WELS_ASM_FUNC_END
// 1:3 bilinear downsampler (ARM32 NEON).
// Register arguments: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride.
// Stack arguments:    source width in bytes, then the number of rows to produce.
// Every output pixel is the rounded average of a 2x2 patch: the first two
// pixels of each 3-pixel group, taken from the first two rows of each 3-row
// group. The third column is loaded but ignored; the third row is never read.
// One loop iteration consumes 48 source bytes per row and stores 16 output bytes.
WELS_ASM_FUNC_BEGIN DyadicBilinearOneThirdDownsampler_neon
stmdb sp!, {r4-r8, lr}
//Fetch the stack arguments (the 6 registers pushed above occupy 24 bytes)
ldr r4, [sp, #24] //source width
ldr r5, [sp, #28] //row counter: decremented once per output row (the call site passes the destination height)
//Initialize the working registers
mov r6, r2 //r6 = start of the current top source row
mov r8, r0 //r8 = start of the current destination row
mov lr, #0 //lr = source bytes consumed so far in the current row
//Save the 16 bytes at dst + dst_stride * rows; they are written back before returning
//NOTE(review): presumably this protects data behind the last output row from the 16-byte wide stores - confirm
mla r7, r1, r5, r0
vld1.32 {q15}, [r7]
add r7, r2, r3 //r7 = second source row of the current row pair
//Process one 48-byte column block of both rows per iteration
comp_ds_bilinear_onethird_loop0:
//De-interleave by 3: d0/d3 = bytes 3k, d1/d4 = bytes 3k+1, d2/d5 = bytes 3k+2 (unused)
vld3.8 {d0, d1, d2}, [r2]!
vld3.8 {d3, d4, d5}, [r2]!
//Same layout for the second row: d16/d19 and d17/d20 are the two used columns
vld3.8 {d16, d17, d18}, [r7]!
vld3.8 {d19, d20, d21}, [r7]!
//Widening horizontal sums of the two used columns
vaddl.u8 q11, d0, d1
vaddl.u8 q12, d3, d4
vaddl.u8 q13, d16, d17
vaddl.u8 q14, d19, d20
//Rounded halving: (a + b + 1) >> 1 for each row
vrshr.u16 q11, #1
vrshr.u16 q12, #1
vrshr.u16 q13, #1
vrshr.u16 q14, #1
//Rounded average of the two row results
vrhadd.u16 q11, q13
vrhadd.u16 q12, q14
//Narrow back to bytes and store 16 output pixels
vmovn.u16 d0, q11
vmovn.u16 d1, q12
vst1.8 {q0}, [r0]!
add lr, #48
cmp lr, r4 //carry set once the whole row width has been consumed
//The conditional instructions below run only at the end of a row.
movcs lr, #0
addcs r6, r3, lsl #1
addcs r6, r6, r3 //r6 += 3 * src stride: next row group
movcs r2, r6
addcs r7, r2, r3
addcs r8, r1 //next destination row
movcs r0, r8
//Mid-row the flags still come from cmp (lr < r4, so Z is clear) and the branch is taken;
//at the end of a row subscs sets Z only when the row counter reaches zero.
//NOTE(review): a row count of 0 on entry would wrap the counter instead of exiting - callers must pass at least 1
subscs r5, #1
bne comp_ds_bilinear_onethird_loop0
//Restore the saved tail bytes (r0 now equals dst + dst_stride * rows)
vst1.32 {q15}, [r0]
ldmia sp!, {r4-r8,lr}
WELS_ASM_FUNC_END
// 1:4 bilinear downsampler (ARM32 NEON).
// Register arguments: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride.
// Stack arguments:    source width in bytes, then the source height.
// Every output pixel is the rounded average of a 2x2 patch: the first two
// pixels of each 4-pixel group, taken from the first two rows of each 4-row
// group. One loop iteration consumes 64 source bytes per row and stores
// 16 output bytes.
WELS_ASM_FUNC_BEGIN DyadicBilinearQuarterDownsampler_neon
stmdb sp!, {r4-r8, lr}
//Fetch the stack arguments (the 6 registers pushed above occupy 24 bytes)
ldr r4, [sp, #24] //source width
ldr r5, [sp, #28] //source height
//Initialize the working registers
mov r6, r2 //r6 = start of the current top source row
mov r8, r0 //r8 = start of the current destination row
mov lr, #0 //lr = source bytes consumed so far in the current row
lsr r5, #2 //r5 = number of output rows (source height / 4)
//Save the 16 bytes at dst + dst_stride * output_rows; they are written back before returning
//NOTE(review): presumably this protects data behind the last output row from the 16-byte wide stores - confirm
mla r7, r1, r5, r0
vld1.32 {q15}, [r7]
add r7, r2, r3 //r7 = second source row of the current row pair
//Process one 64-byte column block of both rows per iteration
comp_ds_bilinear_quarter_loop0:
//De-interleave 16-bit units: q0/q2 (q8/q10) receive bytes 4k and 4k+1,
//q1/q3 (q9/q11) receive bytes 4k+2 and 4k+3, which are not used afterwards
vld2.16 {q0, q1}, [r2]!
vld2.16 {q2, q3}, [r2]!
vld2.16 {q8, q9}, [r7]!
vld2.16 {q10, q11}, [r7]!
//Pairwise widening add: each 16-bit lane = byte 4k + byte 4k+1
vpaddl.u8 q0, q0
vpaddl.u8 q2, q2
vpaddl.u8 q8, q8
vpaddl.u8 q10, q10
//Rounded halving: (a + b + 1) >> 1 for each row
vrshr.u16 q0, #1
vrshr.u16 q2, #1
vrshr.u16 q8, #1
vrshr.u16 q10, #1
//Rounded average of the two row results
vrhadd.u16 q0, q8
vrhadd.u16 q2, q10
//Narrow back to bytes and store 16 output pixels
vmovn.u16 d0, q0
vmovn.u16 d1, q2
vst1.8 {q0}, [r0]!
add lr, #64
cmp lr, r4 //carry set once the whole row width has been consumed
//The conditional instructions below run only at the end of a row.
movcs lr, #0
addcs r6, r3, lsl #2 //r6 += 4 * src stride: next row group
movcs r2, r6
addcs r7, r2, r3
addcs r8, r1 //next destination row
movcs r0, r8
//Mid-row the flags still come from cmp (lr < r4, so Z is clear) and the branch is taken;
//at the end of a row subscs sets Z only when the row counter reaches zero.
//NOTE(review): a source height below 4 yields a row count of 0, which would wrap the counter instead of exiting
subscs r5, #1
bne comp_ds_bilinear_quarter_loop0
//Restore the saved tail bytes (r0 now equals dst + dst_stride * output_rows)
vst1.32 {q15}, [r0]
ldmia sp!, {r4-r8,lr}
WELS_ASM_FUNC_END
#endif

View File

@ -84,7 +84,6 @@ comp_ds_bilinear_loop0:
WELS_ASM_AARCH64_FUNC_END
WELS_ASM_AARCH64_FUNC_BEGIN DyadicBilinearDownsamplerWidthx32_AArch64_neon
sub w9, w3, w4
sub w1, w1, w4, lsr #1
@ -123,6 +122,113 @@ comp_ds_bilinear_w_x32_loop1:
cbnz w5, comp_ds_bilinear_w_x32_loop0
WELS_ASM_AARCH64_FUNC_END
// 1:3 bilinear downsampler (AArch64 NEON).
// Arguments: x0 = dst, w1 = dst stride, x2 = src, w3 = src stride,
//            w4 = source width in bytes, w5 = number of rows to produce.
// Every output pixel is the rounded average of a 2x2 patch: the first two
// pixels of each 3-pixel group, taken from the first two rows of each 3-row
// group. One inner iteration consumes 48 source bytes per row and stores
// 16 output bytes.
WELS_ASM_AARCH64_FUNC_BEGIN DyadicBilinearOneThirdDownsampler_AArch64_neon
//Initialize the working registers
mov x6, x2 //x6 = start of the current top source row
mov x8, x0 //x8 = start of the current destination row
mov w9, #0 //w9 = source bytes consumed so far in the current row
//Save the 16 bytes at dst + dst_stride * rows; they are written back before returning
//NOTE(review): presumably this protects data behind the last output row from the 16-byte wide stores - confirm
smaddl x7, w1, w5, x0
ld1 {v16.16b}, [x7]
add x7, x2, w3, sxtw //x7 = second source row of the current row pair
//Process one 48-byte column block of both rows per iteration
comp_ds_bilinear_onethird_loop0:
//De-interleave by 3: v0/v4 = bytes 3k, v1/v5 = bytes 3k+1, v2/v6 = bytes 3k+2 (unused)
ld3 {v0.16b, v1.16b, v2.16b}, [x2], #48
ld3 {v4.16b, v5.16b, v6.16b}, [x7], #48
//Widening horizontal sums; v2/v3 and v6/v7 are reused as 16-bit accumulators
uaddl v2.8h, v0.8b, v1.8b
uaddl2 v3.8h, v0.16b, v1.16b
uaddl v6.8h, v4.8b, v5.8b
uaddl2 v7.8h, v4.16b, v5.16b
//Rounded halving: (a + b + 1) >> 1 for each row
urshr v2.8h, v2.8h, #1
urshr v3.8h, v3.8h, #1
urshr v6.8h, v6.8h, #1
urshr v7.8h, v7.8h, #1
//Rounded average of the two row results
urhadd v0.8h, v2.8h, v6.8h
urhadd v1.8h, v3.8h, v7.8h
//Narrow back to bytes and store 16 output pixels
xtn v0.8b, v0.8h
xtn v1.8b, v1.8h
st1 {v0.8b,v1.8b}, [x0], #16
add w9, w9, #48
cmp w9, w4
b.cc comp_ds_bilinear_onethird_loop0 //keep going until the row width is consumed
//End of row: step the source by 3 rows and the destination by 1 row
mov w9, #0
add x6, x6, w3, sxtw #1
add x6, x6, w3, sxtw
mov x2, x6
add x7, x2, w3, sxtw
add x8, x8, w1, sxtw
mov x0, x8
//NOTE(review): a row count of 0 on entry would wrap the counter instead of exiting - callers must pass at least 1
sub w5, w5, #1
cbnz w5, comp_ds_bilinear_onethird_loop0
//Restore the saved tail bytes (x0 now equals dst + dst_stride * rows)
st1 {v16.16b}, [x0]
WELS_ASM_AARCH64_FUNC_END
// 1:4 bilinear downsampler (AArch64 NEON).
// Arguments: x0 = dst, w1 = dst stride, x2 = src, w3 = src stride,
//            w4 = source width in bytes, w5 = source height.
// Every output pixel is the rounded average of a 2x2 patch: the first two
// pixels of each 4-pixel group, taken from the first two rows of each 4-row
// group. One inner iteration consumes 64 source bytes per row and stores
// 16 output bytes.
WELS_ASM_AARCH64_FUNC_BEGIN DyadicBilinearQuarterDownsampler_AArch64_neon
//Initialize the working registers
mov x6, x2 //x6 = start of the current top source row
mov x8, x0 //x8 = start of the current destination row
mov w9, #0 //w9 = source bytes consumed so far in the current row
lsr w5, w5, #2 //w5 = number of output rows (source height / 4)
//Save the 16 bytes at dst + dst_stride * output_rows; they are written back before returning
//NOTE(review): presumably this protects data behind the last output row from the 16-byte wide stores - confirm
smaddl x7, w1, w5, x0
ld1 {v16.16b}, [x7]
add x7, x2, w3, sxtw //x7 = second source row of the current row pair
//Process one 64-byte column block of both rows per iteration
comp_ds_bilinear_quarter_loop0:
//De-interleave 16-bit units: v0/v2 (v4/v6) receive bytes 4k and 4k+1,
//v1/v3 (v5/v7) receive bytes 4k+2 and 4k+3, which are overwritten or ignored below
ld2 {v0.8h, v1.8h}, [x2], #32
ld2 {v2.8h, v3.8h}, [x2], #32
ld2 {v4.8h, v5.8h}, [x7], #32
ld2 {v6.8h, v7.8h}, [x7], #32
//Pairwise widening add: each 16-bit lane = byte 4k + byte 4k+1
uaddlp v0.8h, v0.16b
uaddlp v1.8h, v2.16b
uaddlp v4.8h, v4.16b
uaddlp v5.8h, v6.16b
//Rounded halving: (a + b + 1) >> 1 for each row
urshr v0.8h, v0.8h, #1
urshr v1.8h, v1.8h, #1
urshr v4.8h, v4.8h, #1
urshr v5.8h, v5.8h, #1
//Rounded average of the two row results
urhadd v0.8h, v0.8h, v4.8h
urhadd v1.8h, v1.8h, v5.8h
//Narrow back to bytes and store 16 output pixels
xtn v0.8b, v0.8h
xtn v1.8b, v1.8h
st1 {v0.8b,v1.8b}, [x0], #16
add w9, w9, #64
cmp w9, w4
b.cc comp_ds_bilinear_quarter_loop0 //keep going until the row width is consumed
//End of row: step the source by 4 rows and the destination by 1 row
mov w9, #0
add x6, x6, w3, sxtw #2
mov x2, x6
add x7, x2, w3, sxtw
add x8, x8, w1, sxtw
mov x0, x8
//NOTE(review): a source height below 4 yields a row count of 0, which would wrap the counter instead of exiting
sub w5, w5, #1
cbnz w5, comp_ds_bilinear_quarter_loop0
//Restore the saved tail bytes (x0 now equals dst + dst_stride * output_rows)
st1 {v16.16b}, [x0]
WELS_ASM_AARCH64_FUNC_END
WELS_ASM_AARCH64_FUNC_BEGIN GeneralBilinearAccurateDownsampler_AArch64_neon
mov w10, #32767
and w8, w6, w10

View File

@ -53,6 +53,8 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_c;
sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_c;
sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_c;
sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_c;
sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_c;
sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsampler_c;
sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearFastDownsampler_c;
#if defined(X86_ASM)
@ -60,6 +62,7 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int
sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_sse;
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_sse;
sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsamplerWidthx8_sse;
sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_sse;
}
if (iCpuFlag & WELS_CPU_SSE2) {
sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_sse2;
@ -68,10 +71,14 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int
if (iCpuFlag & WELS_CPU_SSSE3) {
sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_ssse3;
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_ssse3;
sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_ssse3;
sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_ssse3;
}
if (iCpuFlag & WELS_CPU_SSE41) {
sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_sse4;
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_sse4;
sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_sse4;
sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_sse4;
}
#endif//X86_ASM
@ -81,6 +88,8 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_neon;
sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_neon;
sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_neon;
sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_neon;
sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_neon;
sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_neon;
sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearAccurateDownsamplerWrap_neon;
}
@ -92,6 +101,8 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_AArch64_neon;
sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_AArch64_neon;
sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_AArch64_neon;
sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_AArch64_neon;
sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_AArch64_neon;
sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_AArch64_neon;
sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearAccurateDownsamplerWrap_AArch64_neon;
}
@ -124,6 +135,28 @@ EResult CDownsampling::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pDs
(uint8_t*)pSrcPixMap->pPixel[1], pSrcPixMap->iStride[1], iSrcWidthUV, iSrcHeightUV);
m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2],
(uint8_t*)pSrcPixMap->pPixel[2], pSrcPixMap->iStride[2], iSrcWidthUV, iSrcHeightUV);
} else if ((iSrcWidthY >> 2) == iDstWidthY && (iSrcHeightY >> 2) == iDstHeightY) {
m_pfDownsample.pfQuarterDownsampler ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0],
(uint8_t*)pSrcPixMap->pPixel[0], pSrcPixMap->iStride[0], iSrcWidthY, iSrcHeightY);
m_pfDownsample.pfQuarterDownsampler ((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1],
(uint8_t*)pSrcPixMap->pPixel[1], pSrcPixMap->iStride[1], iSrcWidthUV, iSrcHeightUV);
m_pfDownsample.pfQuarterDownsampler ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2],
(uint8_t*)pSrcPixMap->pPixel[2], pSrcPixMap->iStride[2], iSrcWidthUV, iSrcHeightUV);
} else if ((iSrcWidthY / 3) == iDstWidthY && (iSrcHeightY / 3) == iDstHeightY) {
m_pfDownsample.pfOneThirdDownsampler ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0],
(uint8_t*)pSrcPixMap->pPixel[0], pSrcPixMap->iStride[0], iSrcWidthY, iDstHeightY);
m_pfDownsample.pfOneThirdDownsampler ((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1],
(uint8_t*)pSrcPixMap->pPixel[1], pSrcPixMap->iStride[1], iSrcWidthUV, iDstHeightUV);
m_pfDownsample.pfOneThirdDownsampler ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2],
(uint8_t*)pSrcPixMap->pPixel[2], pSrcPixMap->iStride[2], iSrcWidthUV, iDstHeightUV);
} else {
m_pfDownsample.pfGeneralRatioLuma ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0], iDstWidthY, iDstHeightY,
(uint8_t*)pSrcPixMap->pPixel[0], pSrcPixMap->iStride[0], iSrcWidthY, iSrcHeightY);

View File

@ -54,20 +54,29 @@ typedef void (HalveDownsampleFunc) (uint8_t* pDst, const int32_t kiDstStride,
uint8_t* pSrc, const int32_t kiSrcStride,
const int32_t kiSrcWidth, const int32_t kiSrcHeight);
// Signature shared by the fixed-ratio (1:3 and 1:4) downsamplers.
// The meaning of kiHeight depends on the implementation bound to it:
// DyadicBilinearQuarterDownsampler_c treats it as the SOURCE height (it shifts
// it right by 2 itself), whereas DyadicBilinearOneThirdDownsampler_c treats it
// as the number of DESTINATION rows to produce.
typedef void (SpecificDownsampleFunc) (uint8_t* pDst, const int32_t kiDstStride,
uint8_t* pSrc, const int32_t kiSrcStride,
const int32_t kiSrcWidth, const int32_t kiHeight);
typedef void (GeneralDownsampleFunc) (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
const int32_t kiDstHeight,
uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight);
typedef HalveDownsampleFunc* PHalveDownsampleFunc;
typedef SpecificDownsampleFunc* PSpecificDownsampleFunc;
typedef GeneralDownsampleFunc* PGeneralDownsampleFunc;
HalveDownsampleFunc DyadicBilinearDownsampler_c;
HalveDownsampleFunc DyadicBilinearDownsampler_c;
GeneralDownsampleFunc GeneralBilinearFastDownsampler_c;
GeneralDownsampleFunc GeneralBilinearAccurateDownsampler_c;
SpecificDownsampleFunc DyadicBilinearOneThirdDownsampler_c;
SpecificDownsampleFunc DyadicBilinearQuarterDownsampler_c;
// Dispatch table of downsampling kernels, grouped by scaling ratio.
typedef struct {
// 1:2 kernels, selected by source-width alignment.
// align_index: 0 = x32; 1 = x16; 2 = x8; 3 = common case left;
PHalveDownsampleFunc pfHalfAverage[4];
// 1:3 kernel; its last argument is the destination height.
PSpecificDownsampleFunc pfOneThirdDownsampler;
// 1:4 kernel; its last argument is the source height.
PSpecificDownsampleFunc pfQuarterDownsampler;
// Arbitrary-ratio kernels for the luma and chroma planes.
PGeneralDownsampleFunc pfGeneralRatioLuma;
PGeneralDownsampleFunc pfGeneralRatioChroma;
} SDownsampleFuncs;
@ -93,10 +102,19 @@ HalveDownsampleFunc DyadicBilinearDownsamplerWidthx32_sse4;
GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_sse2;
GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse2;
SpecificDownsampleFunc DyadicBilinearOneThirdDownsampler_ssse3;
SpecificDownsampleFunc DyadicBilinearOneThirdDownsampler_sse4;
SpecificDownsampleFunc DyadicBilinearQuarterDownsampler_sse;
SpecificDownsampleFunc DyadicBilinearQuarterDownsampler_ssse3;
SpecificDownsampleFunc DyadicBilinearQuarterDownsampler_sse4;
void GeneralBilinearFastDownsampler_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
const int32_t kiDstHeight, uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY);
const int32_t kiDstHeight, uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX,
const uint32_t kuiScaleY);
void GeneralBilinearAccurateDownsampler_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
const int32_t kiDstHeight, uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY);
const int32_t kiDstHeight, uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX,
const uint32_t kuiScaleY);
WELSVP_EXTERN_C_END
#endif
@ -109,6 +127,10 @@ HalveDownsampleFunc DyadicBilinearDownsamplerWidthx32_neon;
GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_neon;
SpecificDownsampleFunc DyadicBilinearOneThirdDownsampler_neon;
SpecificDownsampleFunc DyadicBilinearQuarterDownsampler_neon;
void GeneralBilinearAccurateDownsampler_neon (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
const int32_t kiDstHeight,
uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY);
@ -125,8 +147,13 @@ HalveDownsampleFunc DyadicBilinearDownsamplerWidthx32_AArch64_neon;
GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_AArch64_neon;
void GeneralBilinearAccurateDownsampler_AArch64_neon (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight,
uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY);
SpecificDownsampleFunc DyadicBilinearOneThirdDownsampler_AArch64_neon;
SpecificDownsampleFunc DyadicBilinearQuarterDownsampler_AArch64_neon;
void GeneralBilinearAccurateDownsampler_AArch64_neon (uint8_t* pDst, const int32_t kiDstStride,
const int32_t kiDstWidth, const int32_t kiDstHeight,
uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY);
WELSVP_EXTERN_C_END
#endif

View File

@ -68,6 +68,53 @@ void DyadicBilinearDownsampler_c (uint8_t* pDst, const int32_t kiDstStride,
}
}
void DyadicBilinearQuarterDownsampler_c (uint8_t* pDst, const int32_t kiDstStride,
uint8_t* pSrc, const int32_t kiSrcStride,
const int32_t kiSrcWidth, const int32_t kiSrcHeight)
{
  // One output pixel is produced per 4x4 source block. Only the top-left 2x2
  // corner of each block contributes: both rows are averaged horizontally
  // with rounding, then the two row results are averaged with rounding.
  const int32_t kiOutCols    = kiSrcWidth >> 2;
  const int32_t kiOutRows    = kiSrcHeight >> 2;
  const int32_t kiRowAdvance = kiSrcStride << 2;   // four source rows per output row

  uint8_t* pOutRow = pDst;
  uint8_t* pTopRow = pSrc;
  int32_t iRow = 0;
  while (iRow < kiOutRows) {
    int32_t iCol = 0;
    while (iCol < kiOutCols) {
      const int32_t kiLeft    = iCol << 2;              // first column of the block
      const int32_t kiBelow   = kiLeft + kiSrcStride;   // same column, next source row
      const int32_t kiTopAvg  = (pTopRow[kiLeft] + pTopRow[kiLeft + 1] + 1) >> 1;
      const int32_t kiBotAvg  = (pTopRow[kiBelow] + pTopRow[kiBelow + 1] + 1) >> 1;
      pOutRow[iCol] = (uint8_t) ((kiTopAvg + kiBotAvg + 1) >> 1);
      ++iCol;
    }
    pOutRow += kiDstStride;
    pTopRow += kiRowAdvance;
    ++iRow;
  }
}
void DyadicBilinearOneThirdDownsampler_c (uint8_t* pDst, const int32_t kiDstStride,
uint8_t* pSrc, const int32_t kiSrcStride,
const int32_t kiSrcWidth, const int32_t kiDstHeight)
{
  // One output pixel is produced per 3x3 source block. Only the top-left 2x2
  // corner of each block contributes: both rows are averaged horizontally
  // with rounding, then the two row results are averaged with rounding.
  // Note that the row count passed in is already the destination height.
  const int32_t kiOutCols    = kiSrcWidth / 3;
  const int32_t kiRowAdvance = kiSrcStride * 3;   // three source rows per output row

  uint8_t* pOutRow = pDst;
  uint8_t* pTopRow = pSrc;
  int32_t iRow = 0;
  while (iRow < kiDstHeight) {
    int32_t iCol = 0;
    while (iCol < kiOutCols) {
      const int32_t kiLeft    = iCol * 3;               // first column of the block
      const int32_t kiBelow   = kiLeft + kiSrcStride;   // same column, next source row
      const int32_t kiTopAvg  = (pTopRow[kiLeft] + pTopRow[kiLeft + 1] + 1) >> 1;
      const int32_t kiBotAvg  = (pTopRow[kiBelow] + pTopRow[kiBelow + 1] + 1) >> 1;
      pOutRow[iCol] = (uint8_t) ((kiTopAvg + kiBotAvg + 1) >> 1);
      ++iCol;
    }
    pOutRow += kiDstStride;
    pTopRow += kiRowAdvance;
    ++iRow;
  }
}
void GeneralBilinearFastDownsampler_c (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
const int32_t kiDstHeight,
uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {

View File

@ -67,6 +67,22 @@ shufb_mask_high:
add_extra_half:
dd 16384,0,0,0
; pshufb control masks for the 1:4 and 1:3 downsamplers.
; An index byte of 80h makes pshufb write zero to that destination byte.

; 1:4 -- from 16 source bytes, gather bytes 0,4,8,12 into dword 0 and
; bytes 1,5,9,13 into dword 2 (first / second pixel of each 4-pixel group).
shufb_mask_quarter:
db 00h, 04h, 08h, 0ch, 80h, 80h, 80h, 80h, 01h, 05h, 09h, 0dh, 80h, 80h, 80h, 80h
; 1:3 -- a 48-byte source block is read as three 16-byte vectors (_1, _2, _3).
; The "low" masks gather the first pixel of every 3-pixel group (source bytes
; 3k) and the "high" masks gather the second pixel (source bytes 3k+1). Each
; mask drops its bytes at the output positions belonging to its vector, so the
; three shuffled vectors can simply be merged into 16 packed pixels.
; low_1: source bytes 0,3,..,15 -> output positions 0..5
shufb_mask_onethird_low_1:
db 00h, 03h, 06h, 09h, 0ch, 0fh, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h
; low_2: source bytes 18,21,..,30 -> output positions 6..10
shufb_mask_onethird_low_2:
db 80h, 80h, 80h, 80h, 80h, 80h, 02h, 05h, 08h, 0bh, 0eh, 80h, 80h, 80h, 80h, 80h
; low_3: source bytes 33,36,..,45 -> output positions 11..15
shufb_mask_onethird_low_3:
db 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 01h, 04h, 07h, 0ah, 0dh
; high_1: source bytes 1,4,..,13 -> output positions 0..4
shufb_mask_onethird_high_1:
db 01h, 04h, 07h, 0ah, 0dh, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h
; high_2: source bytes 16,19,..,31 -> output positions 5..10
shufb_mask_onethird_high_2:
db 80h, 80h, 80h, 80h, 80h, 00h, 03h, 06h, 09h, 0ch, 0fh, 80h, 80h, 80h, 80h, 80h
; high_3: source bytes 34,37,..,46 -> output positions 11..15
shufb_mask_onethird_high_3:
db 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 02h, 05h, 08h, 0bh, 0eh
;***********************************************************************
; Code
@ -1896,3 +1912,686 @@ FAST_LAST_ROW_END:
pop r12
ret
%endif
;***********************************************************************
; void DyadicBilinearOneThirdDownsampler_ssse3( unsigned char* pDst, const int iDstStride,
; unsigned char* pSrc, const int iSrcStride,
; const int iSrcWidth, const int iDstHeight );
;
; 1:3 downsampler. Each output pixel is the rounded average of the first two
; pixels of a 3-pixel group on the first two rows of a 3-row group.
; Register roles after the prologue, as used by the code below:
; r0 = pDst cursor, r1 = iDstStride, r2 = pSrc cursor, r3 = iSrcStride,
; r4 = remaining source width of the current row, r5 = remaining output rows,
; r6 = scratch (tail address, then start of the current dst row).
; The last argument is the number of OUTPUT rows: r5 is decremented once per
; destination row and the source advances three rows each time.
; All loads and stores use movdqa, so every accessed source and destination
; address must be 16-byte aligned.
;***********************************************************************
WELS_EXTERN DyadicBilinearOneThirdDownsampler_ssse3
%ifdef X86_32
push r6
%assign push_num 1
%else
%assign push_num 0
%endif
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
%ifndef X86_32
push r12
mov r12, r4 ;keep iSrcWidth so r4 can be reloaded for every row
%endif
mov r6, r1 ;Save the 16 tail bytes at pDst + iDstStride * rows (restored on exit)
imul r6, r5
add r6, r0
movdqa xmm7, [r6]
.yloops_onethird_sse3:
;reload the per-row width counter
%ifdef X86_32
mov r4, arg5
%else
mov r4, r12
%endif
mov r6, r0 ;save base address of the current dst row
; each loop = source bandwidth: 48 bytes
.xloops_onethird_sse3:
; horizontal loop body: 48 source bytes -> 16 output bytes
; mem hi<- ->lo
;1st Line Src: xmm0: F * e E * d D * c C * b B * a A
; xmm2: k K * j J * i I * h H * g G * f
; xmm2: * p P * o O * n N * m M * l L *
;
;2nd Line Src: xmm2: F' * e' E' * d' D' * c' C' * b' B' * a' A'
; xmm1: k' K' * j' J' * i' I' * h' H' * g' G' * f'
; xmm1: * p' P' * o' O' * n' N' * m' M' * l' L' *
;=> target:
;: P O N M L K J I H G F E D C B A
;: p o n m l k j i h g f e d c b a
;: P' .. A'
;: p' .. a'
; (upper case = first pixel of a group, lower case = second pixel, * = skipped third pixel)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;1st line
movdqa xmm0, [r2] ;F * e E * d D * c C * b B * a A
movdqa xmm1, xmm0
movdqa xmm5, [shufb_mask_onethird_low_1]
movdqa xmm6, [shufb_mask_onethird_high_1]
pshufb xmm0, xmm5 ;0 0 0 0 0 0 0 0 0 0 F E D C B A -> xmm0
pshufb xmm1, xmm6 ;0 0 0 0 0 0 0 0 0 0 0 e d c b a -> xmm1
movdqa xmm2, [r2+16] ;k K * j J * i I * h H * g G * f
movdqa xmm3, xmm2
movdqa xmm5, [shufb_mask_onethird_low_2]
movdqa xmm6, [shufb_mask_onethird_high_2]
pshufb xmm2, xmm5 ;0 0 0 0 0 K J I H G 0 0 0 0 0 0 -> xmm2
pshufb xmm3, xmm6 ;0 0 0 0 0 k j i h g f 0 0 0 0 0 -> xmm3
; the shuffled vectors never overlap in non-zero bytes, so the adds below only merge them
paddusb xmm0, xmm2 ;0 0 0 0 0 K J I H G F E D C B A -> xmm0
paddusb xmm1, xmm3 ;0 0 0 0 0 k j i h g f e d c b a -> xmm1
movdqa xmm2, [r2+32] ;* p P * o O * n N * m M * l L *
movdqa xmm3, xmm2
movdqa xmm5, [shufb_mask_onethird_low_3]
movdqa xmm6, [shufb_mask_onethird_high_3]
pshufb xmm2, xmm5 ;P O N M L 0 0 0 0 0 0 0 0 0 0 0 -> xmm2
pshufb xmm3, xmm6 ;p o n m l 0 0 0 0 0 0 0 0 0 0 0 -> xmm3
paddusb xmm0, xmm2 ;P O N M L K J I H G F E D C B A -> xmm0
paddusb xmm1, xmm3 ;p o n m l k j i h g f e d c b a -> xmm1
pavgb xmm0, xmm1 ;1st line average -> xmm0
;2nd line
movdqa xmm2, [r2+r3] ;F' * e' E' * d' D' * c' C' * b' B' * a' A'
movdqa xmm3, xmm2
movdqa xmm5, [shufb_mask_onethird_low_1]
movdqa xmm6, [shufb_mask_onethird_high_1]
pshufb xmm2, xmm5 ;0 0 0 0 0 0 0 0 0 0 F' E' D' C' B' A' -> xmm2
pshufb xmm3, xmm6 ;0 0 0 0 0 0 0 0 0 0 0 e' d' c' b' a' -> xmm3
movdqa xmm1, [r2+r3+16] ;k' K' * j' J' * i' I' * h' H' * g' G' * f'
movdqa xmm4, xmm1
movdqa xmm5, [shufb_mask_onethird_low_2]
movdqa xmm6, [shufb_mask_onethird_high_2]
pshufb xmm1, xmm5 ;0 0 0 0 0 K' J' I' H' G' 0 0 0 0 0 0 -> xmm1
pshufb xmm4, xmm6 ;0 0 0 0 0 k' j' i' h' g' f' 0 0 0 0 0 -> xmm4
paddusb xmm2, xmm1 ;0 0 0 0 0 K' J' I' H' G' F' E' D' C' B' A' -> xmm2
paddusb xmm3, xmm4 ;0 0 0 0 0 k' j' i' h' g' f' e' d' c' b' a' -> xmm3
movdqa xmm1, [r2+r3+32] ; * p' P' * o' O' * n' N' * m' M' * l' L' *
movdqa xmm4, xmm1
movdqa xmm5, [shufb_mask_onethird_low_3]
movdqa xmm6, [shufb_mask_onethird_high_3]
pshufb xmm1, xmm5 ;P' O' N' M' L' 0 0 0 0 0 0 0 0 0 0 0 -> xmm1
pshufb xmm4, xmm6 ;p' o' n' m' l' 0 0 0 0 0 0 0 0 0 0 0 -> xmm4
paddusb xmm2, xmm1 ;P' O' N' M' L' K' J' I' H' G' F' E' D' C' B' A' -> xmm2
paddusb xmm3, xmm4 ;p' o' n' m' l' k' j' i' h' g' f' e' d' c' b' a' -> xmm3
pavgb xmm2, xmm3 ;2nd line average -> xmm2
pavgb xmm0, xmm2 ; bytes-average(1st line , 2nd line )
; write pDst
movdqa [r0], xmm0 ;write result in dst
; next 48-byte source block
lea r2, [r2+48] ;current src address
lea r0, [r0+16] ;current dst address
sub r4, 48 ;xloops counter
cmp r4, 0
jg near .xloops_onethird_sse3
sub r6, r0 ;offset = base address - current address (minus the bytes written this row)
lea r2, [r2+2*r3] ;advance the source by 3 rows ...
lea r2, [r2+r3] ;
lea r2, [r2+2*r6] ;... and rewind it by 3 * bytes written = source bytes consumed
lea r2, [r2+r6]
lea r0, [r0+r1]
lea r0, [r0+r6] ;current dst line + 1 line
dec r5
jg near .yloops_onethird_sse3
movdqa [r0], xmm7 ;restore the tail bytes saved on entry
%ifndef X86_32
pop r12
%endif
POP_XMM
LOAD_6_PARA_POP
%ifdef X86_32
pop r6
%endif
ret
;***********************************************************************
; void DyadicBilinearOneThirdDownsampler_sse4( unsigned char* pDst, const int iDstStride,
; unsigned char* pSrc, const int iSrcStride,
; const int iSrcWidth, const int iDstHeight );
;
; Same algorithm as the ssse3 variant, but source rows are read with movntdqa.
; Each output pixel is the rounded average of the first two pixels of a
; 3-pixel group on the first two rows of a 3-row group.
; Register roles after the prologue, as used by the code below:
; r0 = pDst cursor, r1 = iDstStride, r2 = pSrc cursor, r3 = iSrcStride,
; r4 = remaining source width of the current row, r5 = remaining output rows,
; r6 = scratch (tail address, then start of the current dst row).
; The last argument is the number of OUTPUT rows: r5 is decremented once per
; destination row and the source advances three rows each time.
; movntdqa / movdqa require every accessed source and destination address to
; be 16-byte aligned.
;***********************************************************************
WELS_EXTERN DyadicBilinearOneThirdDownsampler_sse4
%ifdef X86_32
push r6
%assign push_num 1
%else
%assign push_num 0
%endif
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
%ifndef X86_32
push r12
mov r12, r4 ;keep iSrcWidth so r4 can be reloaded for every row
%endif
mov r6, r1 ;Save the 16 tail bytes at pDst + iDstStride * rows (restored on exit)
imul r6, r5
add r6, r0
movdqa xmm7, [r6]
.yloops_onethird_sse4:
;reload the per-row width counter
%ifdef X86_32
mov r4, arg5
%else
mov r4, r12
%endif
mov r6, r0 ;save base address of the current dst row
; each loop = source bandwidth: 48 bytes
.xloops_onethird_sse4:
; horizontal loop body: 48 source bytes -> 16 output bytes
; mem hi<- ->lo
;1st Line Src: xmm0: F * e E * d D * c C * b B * a A
; xmm2: k K * j J * i I * h H * g G * f
; xmm2: * p P * o O * n N * m M * l L *
;
;2nd Line Src: xmm2: F' * e' E' * d' D' * c' C' * b' B' * a' A'
; xmm1: k' K' * j' J' * i' I' * h' H' * g' G' * f'
; xmm1: * p' P' * o' O' * n' N' * m' M' * l' L' *
;=> target:
;: P O N M L K J I H G F E D C B A
;: p o n m l k j i h g f e d c b a
;: P' .. A'
;: p' .. a'
; (upper case = first pixel of a group, lower case = second pixel, * = skipped third pixel)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;1st line
movntdqa xmm0, [r2] ;F * e E * d D * c C * b B * a A
movdqa xmm1, xmm0
movdqa xmm5, [shufb_mask_onethird_low_1]
movdqa xmm6, [shufb_mask_onethird_high_1]
pshufb xmm0, xmm5 ;0 0 0 0 0 0 0 0 0 0 F E D C B A -> xmm0
pshufb xmm1, xmm6 ;0 0 0 0 0 0 0 0 0 0 0 e d c b a -> xmm1
movntdqa xmm2, [r2+16] ;k K * j J * i I * h H * g G * f
movdqa xmm3, xmm2
movdqa xmm5, [shufb_mask_onethird_low_2]
movdqa xmm6, [shufb_mask_onethird_high_2]
pshufb xmm2, xmm5 ;0 0 0 0 0 K J I H G 0 0 0 0 0 0 -> xmm2
pshufb xmm3, xmm6 ;0 0 0 0 0 k j i h g f 0 0 0 0 0 -> xmm3
; the shuffled vectors never overlap in non-zero bytes, so the adds below only merge them
paddusb xmm0, xmm2 ;0 0 0 0 0 K J I H G F E D C B A -> xmm0
paddusb xmm1, xmm3 ;0 0 0 0 0 k j i h g f e d c b a -> xmm1
movntdqa xmm2, [r2+32] ;* p P * o O * n N * m M * l L *
movdqa xmm3, xmm2
movdqa xmm5, [shufb_mask_onethird_low_3]
movdqa xmm6, [shufb_mask_onethird_high_3]
pshufb xmm2, xmm5 ;P O N M L 0 0 0 0 0 0 0 0 0 0 0 -> xmm2
pshufb xmm3, xmm6 ;p o n m l 0 0 0 0 0 0 0 0 0 0 0 -> xmm3
paddusb xmm0, xmm2 ;P O N M L K J I H G F E D C B A -> xmm0
paddusb xmm1, xmm3 ;p o n m l k j i h g f e d c b a -> xmm1
pavgb xmm0, xmm1 ;1st line average -> xmm0
;2nd line
movntdqa xmm2, [r2+r3] ;F' * e' E' * d' D' * c' C' * b' B' * a' A'
movdqa xmm3, xmm2
movdqa xmm5, [shufb_mask_onethird_low_1]
movdqa xmm6, [shufb_mask_onethird_high_1]
pshufb xmm2, xmm5 ;0 0 0 0 0 0 0 0 0 0 F' E' D' C' B' A' -> xmm2
pshufb xmm3, xmm6 ;0 0 0 0 0 0 0 0 0 0 0 e' d' c' b' a' -> xmm3
movntdqa xmm1, [r2+r3+16] ;k' K' * j' J' * i' I' * h' H' * g' G' * f'
movdqa xmm4, xmm1
movdqa xmm5, [shufb_mask_onethird_low_2]
movdqa xmm6, [shufb_mask_onethird_high_2]
pshufb xmm1, xmm5 ;0 0 0 0 0 K' J' I' H' G' 0 0 0 0 0 0 -> xmm1
pshufb xmm4, xmm6 ;0 0 0 0 0 k' j' i' h' g' f' 0 0 0 0 0 -> xmm4
paddusb xmm2, xmm1 ;0 0 0 0 0 K' J' I' H' G' F' E' D' C' B' A' -> xmm2
paddusb xmm3, xmm4 ;0 0 0 0 0 k' j' i' h' g' f' e' d' c' b' a' -> xmm3
movntdqa xmm1, [r2+r3+32] ; * p' P' * o' O' * n' N' * m' M' * l' L' *
movdqa xmm4, xmm1
movdqa xmm5, [shufb_mask_onethird_low_3]
movdqa xmm6, [shufb_mask_onethird_high_3]
pshufb xmm1, xmm5 ;P' O' N' M' L' 0 0 0 0 0 0 0 0 0 0 0 -> xmm1
pshufb xmm4, xmm6 ;p' o' n' m' l' 0 0 0 0 0 0 0 0 0 0 0 -> xmm4
paddusb xmm2, xmm1 ;P' O' N' M' L' K' J' I' H' G' F' E' D' C' B' A' -> xmm2
paddusb xmm3, xmm4 ;p' o' n' m' l' k' j' i' h' g' f' e' d' c' b' a' -> xmm3
pavgb xmm2, xmm3 ;2nd line average -> xmm2
pavgb xmm0, xmm2 ; bytes-average(1st line , 2nd line )
; write pDst
movdqa [r0], xmm0 ;write result in dst
; next 48-byte source block
lea r2, [r2+48] ;current src address
lea r0, [r0+16] ;current dst address
sub r4, 48 ;xloops counter
cmp r4, 0
jg near .xloops_onethird_sse4
sub r6, r0 ;offset = base address - current address (minus the bytes written this row)
lea r2, [r2+2*r3] ;advance the source by 3 rows ...
lea r2, [r2+r3] ;
lea r2, [r2+2*r6] ;... and rewind it by 3 * bytes written = source bytes consumed
lea r2, [r2+r6]
lea r0, [r0+r1]
lea r0, [r0+r6] ;current dst line + 1 line
dec r5
jg near .yloops_onethird_sse4
movdqa [r0], xmm7 ;restore the tail bytes saved on entry
%ifndef X86_32
pop r12
%endif
POP_XMM
LOAD_6_PARA_POP
%ifdef X86_32
pop r6
%endif
ret
;***********************************************************************
; void DyadicBilinearQuarterDownsampler_sse( unsigned char* pDst, const int iDstStride,
; unsigned char* pSrc, const int iSrcStride,
; const int iSrcWidth, const int iSrcHeight );
;
; 1:4 downsampler built on MMX registers plus the SSE integer extensions
; (pshufw, pavgb). Each output pixel is the rounded average of the first two
; pixels of a 4-pixel group on the first two rows of a 4-row group.
; Register roles after the prologue, as used by the code below:
; r0 = pDst cursor, r1 = iDstStride, r2 = pSrc cursor, r3 = iSrcStride,
; r4 = remaining source width of the current row, r5 = remaining output rows,
; r6 = scratch (tail address, then start of the current dst row).
; The 8 tail bytes behind the last output row are parked in mm7, which the
; loop body never touches. Keeping them in an MMX register (instead of xmm7)
; means no callee-saved XMM register is modified on Win64, so no
; PUSH_XMM/POP_XMM pair is needed, and no SSE2 instruction is executed in a
; routine that is dispatched on the plain SSE CPU flag.
;***********************************************************************
WELS_EXTERN DyadicBilinearQuarterDownsampler_sse
%ifdef X86_32
push r6
%assign push_num 1
%else
%assign push_num 0
%endif
LOAD_6_PARA
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
%ifndef X86_32
push r12
mov r12, r4 ;keep iSrcWidth so r4 can be reloaded for every row
%endif
sar r5, $02 ; iSrcHeight >> 2 = number of output rows
mov r6, r1 ;Save the 8 tail bytes at pDst + iDstStride * rows (restored on exit)
imul r6, r5
add r6, r0
movq mm7, [r6] ;mm7 is reserved for the tail until the end of the function
.yloops_quarter_sse:
;reload the per-row width counter
%ifdef X86_32
mov r4, arg5
%else
mov r4, r12
%endif
mov r6, r0 ;save base address of the current dst row
; each loop = source bandwidth: 32 bytes
.xloops_quarter_sse:
; 1st part horizontal loop: x16 bytes
; mem hi<- ->lo
;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E
;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M
;
;=> target:
;: G E C A,
;:
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
movq mm0, [r2] ; 1st pSrc line
movq mm1, [r2+8] ; 1st pSrc line + 8
movq mm2, [r2+r3] ; 2nd pSrc line
movq mm3, [r2+r3+8] ; 2nd pSrc line + 8
pshufw mm0, mm0, 0d8h ; x X x X c C a A
pshufw mm1, mm1, 0d8h ; x X x X g G e E
pshufw mm2, mm2, 0d8h ; x X x X k K i I
pshufw mm3, mm3, 0d8h ; x X x X o O m M
punpckldq mm0, mm1 ; g G e E c C a A
punpckldq mm2, mm3 ; o O m M k K i I
; to handle mm0,mm2
pshufw mm4, mm0, 0d8h ;g G c C e E a A
pshufw mm5, mm4, 04eh ;e E a A g G c C
punpcklbw mm4, mm5 ;g e G E c a C A -> mm4
pshufw mm4, mm4, 0d8h ;g e c a G E C A -> mm4
pshufw mm5, mm2, 0d8h ;o O k K m M i I
pshufw mm6, mm5, 04eh ;m M i I o O k K
punpcklbw mm5, mm6 ;o m O M k i K I
pshufw mm5, mm5, 0d8h ;o m k i O M K I -> mm5
; to handle mm4, mm5 (the upper dwords become don't-care: only 4 bytes are stored)
movq mm0, mm4
punpckldq mm0, mm6 ;x x x x G E C A
punpckhdq mm4, mm6 ;x x x x g e c a
movq mm1, mm5
punpckldq mm1, mm6 ;x x x x O M K I
punpckhdq mm5, mm6 ;x x x x o m k i
; avg within MB horizon width (8 x 2 lines)
pavgb mm0, mm4 ; (A+a+1)>>1, .., (G+g+1)>>1, temp_row1
pavgb mm1, mm5 ; (I+i+1)>>1, .., (O+o+1)>>1, temp_row2
pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, kept until the 2nd horizontal part is done, then both are written
; 2nd part horizontal loop: x16 bytes (same steps on source bytes 16..31)
movq mm1, [r2+16] ; 1st pSrc line + 16
movq mm2, [r2+24] ; 1st pSrc line + 24
movq mm3, [r2+r3+16] ; 2nd pSrc line + 16
movq mm4, [r2+r3+24] ; 2nd pSrc line + 24
pshufw mm1, mm1, 0d8h
pshufw mm2, mm2, 0d8h
pshufw mm3, mm3, 0d8h
pshufw mm4, mm4, 0d8h
punpckldq mm1, mm2
punpckldq mm3, mm4
; to handle mm1, mm3
pshufw mm4, mm1, 0d8h
pshufw mm5, mm4, 04eh
punpcklbw mm4, mm5
pshufw mm4, mm4, 0d8h
pshufw mm5, mm3, 0d8h
pshufw mm6, mm5, 04eh
punpcklbw mm5, mm6
pshufw mm5, mm5, 0d8h
; to handle mm4, mm5
movq mm2, mm4
punpckldq mm2, mm6
punpckhdq mm4, mm6
movq mm3, mm5
punpckldq mm3, mm6
punpckhdq mm5, mm6
; avg within MB horizon width (8 x 2 lines)
pavgb mm2, mm4 ; temp_row1 of the 2nd part
pavgb mm3, mm5 ; temp_row2 of the 2nd part
pavgb mm2, mm3 ; (temp_row1+temp_row2+1)>>1, done in another 2nd horizontal part
movd [r0 ], mm0
movd [r0+4], mm2
; next 32-byte source block
lea r2, [r2+32]
lea r0, [r0+8]
sub r4, 32
cmp r4, 0
jg near .xloops_quarter_sse
sub r6, r0 ;r6 = minus the bytes written this row
; next line
lea r2, [r2+4*r3] ; advance the source by 4 rows
lea r2, [r2+4*r6] ; rewind by 4 * bytes written = source bytes consumed
lea r0, [r0+r1]
lea r0, [r0+r6] ; start of the next dst row
dec r5
jg near .yloops_quarter_sse
movq [r0], mm7 ;restore the tail bytes saved on entry (must precede WELSEMMS)
WELSEMMS
%ifndef X86_32
pop r12
%endif
LOAD_6_PARA_POP
%ifdef X86_32
pop r6
%endif
ret
;***********************************************************************
; void DyadicBilinearQuarterDownsampler_ssse3( unsigned char* pDst, const int iDstStride,
; unsigned char* pSrc, const int iSrcStride,
; const int iSrcWidth, const int iSrcHeight );
;
; 1:4 downsampler. Each output pixel is the rounded average of the first two
; pixels of a 4-pixel group on the first two rows of a 4-row group.
; Register roles after the prologue, as used by the code below:
; r0 = pDst cursor, r1 = iDstStride, r2 = pSrc cursor, r3 = iSrcStride,
; r4 = remaining source width of the current row, r5 = remaining output rows,
; r6 = scratch (tail address, then start of the current dst row),
; xmm6 = shufb_mask_quarter, xmm7 = saved tail bytes.
; Source rows are read with movdqa, so every accessed source address must be
; 16-byte aligned.
;***********************************************************************
WELS_EXTERN DyadicBilinearQuarterDownsampler_ssse3
; Legacy hand-written x86-32 prologue, kept for reference only:
;push ebx
;push edx
;push esi
;push edi
;push ebp
;mov edi, [esp+24] ; pDst
;mov edx, [esp+28] ; iDstStride
;mov esi, [esp+32] ; pSrc
;mov ecx, [esp+36] ; iSrcStride
;mov ebp, [esp+44] ; iSrcHeight
%ifdef X86_32
push r6
%assign push_num 1
%else
%assign push_num 0
%endif
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
%ifndef X86_32
push r12
mov r12, r4 ;keep iSrcWidth so r4 can be reloaded for every row
%endif
sar r5, $02 ; iSrcHeight >> 2 = number of output rows
mov r6, r1 ;Save the 8 tail bytes at pDst + iDstStride * rows (restored on exit)
imul r6, r5
add r6, r0
movq xmm7, [r6]
movdqa xmm6, [shufb_mask_quarter] ;loop-invariant shuffle mask
.yloops_quarter_sse3:
; Legacy x86-32 width setup, kept for reference only:
;mov eax, [esp+40] ; iSrcWidth
;sar eax, $02 ; iSrcWidth >> 2
;mov ebx, eax ; iDstWidth restored at ebx
;sar eax, $04 ; (iSrcWidth >> 2) / 16 ; loop count = num_of_mb
;neg ebx ; - (iSrcWidth >> 2)
;reload the per-row width counter
%ifdef X86_32
mov r4, arg5
%else
mov r4, r12
%endif
mov r6, r0 ;save base address of the current dst row
; each loop = source bandwidth: 32 bytes
.xloops_quarter_sse3:
; horizontal loop body: 32 source bytes -> 8 output bytes
; mem hi<- ->lo
;1st Line Src: xmm0: h H g G f F e E d D c C b B a A
; xmm1: p P o O n N m M l L k K j J i I
;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A
; xmm3: p P o O n N m M l L k K j J i I
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
movdqa xmm0, [r2] ; 1st_src_line
movdqa xmm1, [r2+16] ; 1st_src_line + 16
movdqa xmm2, [r2+r3] ; 2nd_src_line
movdqa xmm3, [r2+r3+16] ; 2nd_src_line + 16
pshufb xmm0, xmm6 ;1st line: 0 0 0 0 g e c a 0 0 0 0 G E C A
pshufb xmm1, xmm6 ;1st line: 0 0 0 0 o m k i 0 0 0 0 O M K I
pshufb xmm2, xmm6 ;2nd line: 0 0 0 0 g e c a 0 0 0 0 G E C A
pshufb xmm3, xmm6 ;2nd line: 0 0 0 0 o m k i 0 0 0 0 O M K I
movdqa xmm4, xmm0
movdqa xmm5, xmm2
punpckldq xmm0, xmm1 ;1st line: 0 0 0 0 0 0 0 0 O M K I G E C A -> xmm0
punpckhdq xmm4, xmm1 ;1st line: 0 0 0 0 0 0 0 0 o m k i g e c a -> xmm4
punpckldq xmm2, xmm3 ;2nd line: 0 0 0 0 0 0 0 0 O M K I G E C A -> xmm2
punpckhdq xmm5, xmm3 ;2nd line: 0 0 0 0 0 0 0 0 o m k i g e c a -> xmm5
pavgb xmm0, xmm4 ;1st line horizontal average
pavgb xmm2, xmm5 ;2nd line horizontal average
pavgb xmm0, xmm2 ;average of the two lines
; write pDst
movq [r0], xmm0
; next 32-byte source block
lea r2, [r2+32]
lea r0, [r0+8]
sub r4, 32
cmp r4, 0
jg near .xloops_quarter_sse3
sub r6, r0 ;r6 = minus the bytes written this row
; next line
lea r2, [r2+4*r3] ; advance the source by 4 rows
lea r2, [r2+4*r6] ; rewind by 4 * bytes written = source bytes consumed
lea r0, [r0+r1]
lea r0, [r0+r6] ; start of the next dst row
dec r5
jg near .yloops_quarter_sse3
movq [r0], xmm7 ;restore the tail bytes saved on entry
%ifndef X86_32
pop r12
%endif
POP_XMM
LOAD_6_PARA_POP
%ifdef X86_32
pop r6
%endif
ret
;***********************************************************************
; void DyadicBilinearQuarterDownsampler_sse4( unsigned char* pDst, const int iDstStride,
; unsigned char* pSrc, const int iSrcStride,
; const int iSrcWidth, const int iSrcHeight );
;
; Same algorithm as the ssse3 variant, but source rows are read with movntdqa.
; Each output pixel is the rounded average of the first two pixels of a
; 4-pixel group on the first two rows of a 4-row group.
; Register roles after the prologue, as used by the code below:
; r0 = pDst cursor, r1 = iDstStride, r2 = pSrc cursor, r3 = iSrcStride,
; r4 = remaining source width of the current row, r5 = remaining output rows,
; r6 = scratch (tail address, then start of the current dst row),
; xmm6 = shufb_mask_quarter, xmm7 = saved tail bytes.
; movntdqa requires every accessed source address to be 16-byte aligned.
;***********************************************************************
WELS_EXTERN DyadicBilinearQuarterDownsampler_sse4
%ifdef X86_32
push r6
%assign push_num 1
%else
%assign push_num 0
%endif
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
%ifndef X86_32
push r12
mov r12, r4 ;keep iSrcWidth so r4 can be reloaded for every row
%endif
sar r5, $02 ; iSrcHeight >> 2 = number of output rows
mov r6, r1 ;Save the 8 tail bytes at pDst + iDstStride * rows (restored on exit)
imul r6, r5
add r6, r0
movq xmm7, [r6]
movdqa xmm6, [shufb_mask_quarter] ;loop-invariant shuffle mask
.yloops_quarter_sse4:
;reload the per-row width counter
%ifdef X86_32
mov r4, arg5
%else
mov r4, r12
%endif
mov r6, r0 ;save base address of the current dst row
; each loop = source bandwidth: 32 bytes
.xloops_quarter_sse4:
; horizontal loop body: 32 source bytes -> 8 output bytes
; mem hi<- ->lo
;1st Line Src: xmm0: h H g G f F e E d D c C b B a A
; xmm1: p P o O n N m M l L k K j J i I
;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A
; xmm3: p P o O n N m M l L k K j J i I
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
movntdqa xmm0, [r2] ; 1st_src_line
movntdqa xmm1, [r2+16] ; 1st_src_line + 16
movntdqa xmm2, [r2+r3] ; 2nd_src_line
movntdqa xmm3, [r2+r3+16] ; 2nd_src_line + 16
pshufb xmm0, xmm6 ;1st line: 0 0 0 0 g e c a 0 0 0 0 G E C A
pshufb xmm1, xmm6 ;1st line: 0 0 0 0 o m k i 0 0 0 0 O M K I
pshufb xmm2, xmm6 ;2nd line: 0 0 0 0 g e c a 0 0 0 0 G E C A
pshufb xmm3, xmm6 ;2nd line: 0 0 0 0 o m k i 0 0 0 0 O M K I
movdqa xmm4, xmm0
movdqa xmm5, xmm2
punpckldq xmm0, xmm1 ;1st line: 0 0 0 0 0 0 0 0 O M K I G E C A -> xmm0
punpckhdq xmm4, xmm1 ;1st line: 0 0 0 0 0 0 0 0 o m k i g e c a -> xmm4
punpckldq xmm2, xmm3 ;2nd line: 0 0 0 0 0 0 0 0 O M K I G E C A -> xmm2
punpckhdq xmm5, xmm3 ;2nd line: 0 0 0 0 0 0 0 0 o m k i g e c a -> xmm5
pavgb xmm0, xmm4 ;1st line horizontal average
pavgb xmm2, xmm5 ;2nd line horizontal average
pavgb xmm0, xmm2 ;average of the two lines
; write pDst
movq [r0], xmm0
; next 32-byte source block
lea r2, [r2+32]
lea r0, [r0+8]
sub r4, 32
cmp r4, 0
jg near .xloops_quarter_sse4
sub r6, r0 ;r6 = minus the bytes written this row
lea r2, [r2+4*r3] ; advance the source by 4 rows
lea r2, [r2+4*r6] ; rewind by 4 * bytes written = source bytes consumed
lea r0, [r0+r1]
lea r0, [r0+r6] ; start of the next dst row
dec r5
jg near .yloops_quarter_sse4
movq [r0], xmm7 ;restore the tail bytes saved on entry
%ifndef X86_32
pop r12
%endif
POP_XMM
LOAD_6_PARA_POP
%ifdef X86_32
pop r6
%endif
ret

View File

@ -2512,7 +2512,7 @@ const uint32_t kiHeight = 96; //DO NOT CHANGE!
const uint32_t kiFrameRate = 12; //DO NOT CHANGE!
const uint32_t kiFrameNum = 100; //DO NOT CHANGE!
const char* pHashStr[] = { //DO NOT CHANGE!
"058076b265686fc85b2b99cf7a53106f216f16c3",
"585663f78cadb70d9c9f179b9b53b90ffddf3178",
"f350001c333902029800bd291fbed915a4bdf19a",
"eb9d853b7daec03052c4850027ac94adc84c3a7e"
};

View File

@ -131,7 +131,7 @@ static const EncodeFileParam kFileParamArray[] = {
},
{
"res/Cisco_Absolute_Power_1280x720_30fps.yuv",
"a4707845cacc437fb52010eb020fca6d4bc1102d", CAMERA_VIDEO_REAL_TIME, 1280, 720, 30.0f, SM_SINGLE_SLICE, false, 4, false, false, false
"2b5965c752e1f722592c3ce9a1eb82445c9dbaa3", CAMERA_VIDEO_REAL_TIME, 1280, 720, 30.0f, SM_SINGLE_SLICE, false, 4, false, false, false
},
// the following values may be adjusted for times since we start tuning the strategy
{

View File

@ -199,6 +199,79 @@ TEST (DownSampleTest, func) { \
} \
}
// Generates TEST (DownSampleTest, func): runs the 1:3 downsampler `func` and the
// reference DyadicBilinearOneThirdDownsampler_c on identical random data and
// requires the top-left (width / 3) x (height / 3) output region to match.
//   func     - implementation under test; called with the same argument list
//              as DyadicBilinearOneThirdDownsampler_c.
//   ASM      - when non-zero, the test returns early (without failing) if the
//              detected CPU feature flags do not include CPUFLAGS.
//   CPUFLAGS - CPU feature bit(s) tested against WelsCPUFeatureDetect().
// Geometry: 480 x 30 source with stride 560, destination stride 560. Both
// functions receive src_height / 3 (= 10) as their final argument.
// Only block comments are used inside the macro body: a line comment followed
// by a continuation backslash would swallow the next line.
#define GENERATE_DyadicBilinearOneThirdDownsampler_UT(func, ASM, CPUFLAGS) \
TEST (DownSampleTest, func) { \
  if (ASM) { \
    /* Nothing to test when the required instruction set is unavailable. */ \
    int32_t iCpuCores = 0; \
    const uint32_t uiCpuFlags = WelsCPUFeatureDetect (&iCpuCores); \
    if (! (uiCpuFlags & CPUFLAGS)) \
      return; \
  } \
  /* 16-byte aligned work buffers: "_c" = reference run, "_a" = run of func. */ \
  ENFORCE_STACK_ALIGN_1D (uint8_t, dst_c, 50000, 16); \
  ENFORCE_STACK_ALIGN_1D (uint8_t, src_c, 50000, 16); \
  ENFORCE_STACK_ALIGN_1D (uint8_t, dst_a, 50000, 16); \
  ENFORCE_STACK_ALIGN_1D (uint8_t, src_a, 50000, 16); \
  const int dst_stride_c = 560, dst_stride_a = 560; \
  const int src_stride_c = 560, src_stride_a = 560; \
  const int src_width_c = 480, src_width_a = 480; \
  const int src_height_c = 30, src_height_a = 30; \
  /* Both runs start from byte-identical source and destination contents. */ \
  for (int j = 0; j < 50000; ++j) { \
    const uint8_t dst_fill = (uint8_t) (rand() % 256); \
    dst_c[j] = dst_fill; \
    dst_a[j] = dst_fill; \
    const uint8_t src_fill = (uint8_t) (rand() % 256); \
    src_c[j] = src_fill; \
    src_a[j] = src_fill; \
  } \
  const int dst_rows_c = src_height_c / 3; \
  const int dst_rows_a = src_height_a / 3; \
  const int dst_cols_c = src_width_c / 3; \
  DyadicBilinearOneThirdDownsampler_c (dst_c, dst_stride_c, src_c, src_stride_c, src_width_c, dst_rows_c); \
  func (dst_a, dst_stride_a, src_a, src_stride_a, src_width_a, dst_rows_a); \
  /* Compare only the produced region; bytes beyond it in each row are not checked. */ \
  for (int j = 0; j < dst_rows_c; ++j) { \
    for (int m = 0; m < dst_cols_c; ++m) { \
      ASSERT_EQ (dst_c[m + j * dst_stride_c], dst_a[m + j * dst_stride_a]); \
    } \
  } \
}
// Generates TEST (DownSampleTest, func): runs the 1:4 downsampler `func` and the
// reference DyadicBilinearQuarterDownsampler_c on identical random data and
// requires the top-left (width >> 2) x (height >> 2) output region to match.
//   func     - implementation under test; called with the same argument list
//              as DyadicBilinearQuarterDownsampler_c.
//   ASM      - when non-zero, the test returns early (without failing) if the
//              detected CPU feature flags do not include CPUFLAGS.
//   CPUFLAGS - CPU feature bit(s) tested against WelsCPUFeatureDetect().
// Geometry: 640 x 80 source with stride 560, destination stride 560. The
// unscaled source height is passed to both functions.
// NOTE(review): src_width (640) is larger than src_stride (560). If the
// implementations read src_width bytes per row at a pitch of src_stride, then
// consecutive source rows overlap in the buffer - confirm this is intended.
// Only block comments are used inside the macro body: a line comment followed
// by a continuation backslash would swallow the next line.
#define GENERATE_DyadicBilinearQuarterDownsampler_UT(func, ASM, CPUFLAGS) \
TEST (DownSampleTest, func) { \
  if (ASM) { \
    /* Nothing to test when the required instruction set is unavailable. */ \
    int32_t iCpuCores = 0; \
    const uint32_t uiCpuFlags = WelsCPUFeatureDetect (&iCpuCores); \
    if (! (uiCpuFlags & CPUFLAGS)) \
      return; \
  } \
  /* 16-byte aligned work buffers: "_c" = reference run, "_a" = run of func. */ \
  ENFORCE_STACK_ALIGN_1D (uint8_t, dst_c, 50000, 16); \
  ENFORCE_STACK_ALIGN_1D (uint8_t, src_c, 50000, 16); \
  ENFORCE_STACK_ALIGN_1D (uint8_t, dst_a, 50000, 16); \
  ENFORCE_STACK_ALIGN_1D (uint8_t, src_a, 50000, 16); \
  const int dst_stride_c = 560, dst_stride_a = 560; \
  const int src_stride_c = 560, src_stride_a = 560; \
  const int src_width_c = 640, src_width_a = 640; \
  const int src_height_c = 80, src_height_a = 80; \
  /* Both runs start from byte-identical source and destination contents. */ \
  for (int j = 0; j < 50000; ++j) { \
    const uint8_t dst_fill = (uint8_t) (rand() % 256); \
    dst_c[j] = dst_fill; \
    dst_a[j] = dst_fill; \
    const uint8_t src_fill = (uint8_t) (rand() % 256); \
    src_c[j] = src_fill; \
    src_a[j] = src_fill; \
  } \
  DyadicBilinearQuarterDownsampler_c (dst_c, dst_stride_c, src_c, src_stride_c, src_width_c, src_height_c); \
  func (dst_a, dst_stride_a, src_a, src_stride_a, src_width_a, src_height_a); \
  /* Compare only the produced region; bytes beyond it in each row are not checked. */ \
  const int dst_rows_c = src_height_c >> 2; \
  const int dst_cols_c = src_width_c >> 2; \
  for (int j = 0; j < dst_rows_c; ++j) { \
    for (int m = 0; m < dst_cols_c; ++m) { \
      ASSERT_EQ (dst_c[m + j * dst_stride_c], dst_a[m + j * dst_stride_a]); \
    } \
  } \
}
#define GENERATE_GeneralBilinearDownsampler_UT(func, ref, ASM, CPUFLAGS) \
TEST (DownSampleTest, func) { \
if (ASM) {\
@ -259,6 +332,13 @@ GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx16_ssse3,
// x86 SIMD instantiations: each line expands to one TEST (DownSampleTest, <func>).
// For the one-third and quarter generators the literal 1 is the ASM argument,
// which makes the generated test return early when the CPU does not report the
// feature bit given as the last argument.
GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx32_sse4, 1, WELS_CPU_SSE41)
GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx16_sse4, 1, WELS_CPU_SSE41)
// 1:3 downsampler variants.
GENERATE_DyadicBilinearOneThirdDownsampler_UT (DyadicBilinearOneThirdDownsampler_ssse3, 1, WELS_CPU_SSSE3)
GENERATE_DyadicBilinearOneThirdDownsampler_UT (DyadicBilinearOneThirdDownsampler_sse4, 1, WELS_CPU_SSE41)
// 1:4 downsampler variants.
GENERATE_DyadicBilinearQuarterDownsampler_UT (DyadicBilinearQuarterDownsampler_sse, 1, WELS_CPU_SSE)
GENERATE_DyadicBilinearQuarterDownsampler_UT (DyadicBilinearQuarterDownsampler_ssse3, 1, WELS_CPU_SSSE3)
GENERATE_DyadicBilinearQuarterDownsampler_UT (DyadicBilinearQuarterDownsampler_sse4, 1, WELS_CPU_SSE41)
// General (non-dyadic) downsampler wrapper, checked against the named _ref function.
GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearFastDownsamplerWrap_sse2, GeneralBilinearFastDownsampler_ref, 1,
WELS_CPU_SSE2)
GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_sse2,
@ -269,6 +349,10 @@ GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_s
// NEON instantiations: each line expands to one TEST (DownSampleTest, <func>)
// gated on WELS_CPU_NEON.
GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx32_neon, 1, WELS_CPU_NEON)
GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsampler_neon, 1, WELS_CPU_NEON)
// 1:3 and 1:4 downsampler variants.
GENERATE_DyadicBilinearOneThirdDownsampler_UT (DyadicBilinearOneThirdDownsampler_neon, 1, WELS_CPU_NEON)
GENERATE_DyadicBilinearQuarterDownsampler_UT (DyadicBilinearQuarterDownsampler_neon, 1, WELS_CPU_NEON)
// General (non-dyadic) downsampler wrapper, checked against the named _ref function.
GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_neon,
GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_NEON)
#endif
@ -277,6 +361,10 @@ GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_n
// AArch64 NEON instantiations: each line expands to one
// TEST (DownSampleTest, <func>) gated on WELS_CPU_NEON.
GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx32_AArch64_neon, 1, WELS_CPU_NEON)
GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsampler_AArch64_neon, 1, WELS_CPU_NEON)
// 1:3 and 1:4 downsampler variants.
GENERATE_DyadicBilinearOneThirdDownsampler_UT (DyadicBilinearOneThirdDownsampler_AArch64_neon, 1, WELS_CPU_NEON)
GENERATE_DyadicBilinearQuarterDownsampler_UT (DyadicBilinearQuarterDownsampler_AArch64_neon, 1, WELS_CPU_NEON)
// General (non-dyadic) downsampler wrapper, checked against the named _ref function.
GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_AArch64_neon,
GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_NEON)
#endif