diff --git a/codec/processing/src/arm/down_sample_neon.S b/codec/processing/src/arm/down_sample_neon.S index a306a160..60b24cef 100644 --- a/codec/processing/src/arm/down_sample_neon.S +++ b/codec/processing/src/arm/down_sample_neon.S @@ -338,4 +338,121 @@ _LAST_ROW_WIDTH: ldmia sp!, {r4-r12, lr} WELS_ASM_FUNC_END +WELS_ASM_FUNC_BEGIN DyadicBilinearOneThirdDownsampler_neon + stmdb sp!, {r4-r8, lr} + + //Get the width and height + ldr r4, [sp, #24] //src_width + ldr r5, [sp, #28] //src_height + + //Initialize the register + mov r6, r2 + mov r8, r0 + mov lr, #0 + + //Save the tailer for the un-aligned size + mla r7, r1, r5, r0 + vld1.32 {q15}, [r7] + + add r7, r2, r3 + //processing a colume data +comp_ds_bilinear_onethird_loop0: + + vld3.8 {d0, d1, d2}, [r2]! + vld3.8 {d3, d4, d5}, [r2]! + vld3.8 {d16, d17, d18}, [r7]! + vld3.8 {d19, d20, d21}, [r7]! + + vaddl.u8 q11, d0, d1 + vaddl.u8 q12, d3, d4 + vaddl.u8 q13, d16, d17 + vaddl.u8 q14, d19, d20 + vrshr.u16 q11, #1 + vrshr.u16 q12, #1 + vrshr.u16 q13, #1 + vrshr.u16 q14, #1 + + vrhadd.u16 q11, q13 + vrhadd.u16 q12, q14 + + vmovn.u16 d0, q11 + vmovn.u16 d1, q12 + vst1.8 {q0}, [r0]! + + add lr, #48 + cmp lr, r4 + movcs lr, #0 + addcs r6, r3, lsl #1 + addcs r6, r6, r3 + movcs r2, r6 + addcs r7, r2, r3 + addcs r8, r1 + movcs r0, r8 + subscs r5, #1 + bne comp_ds_bilinear_onethird_loop0 + + //restore the tailer for the un-aligned size + vst1.32 {q15}, [r0] + + ldmia sp!, {r4-r8,lr} +WELS_ASM_FUNC_END + +WELS_ASM_FUNC_BEGIN DyadicBilinearQuarterDownsampler_neon + stmdb sp!, {r4-r8, lr} + + //Get the width and height + ldr r4, [sp, #24] //src_width + ldr r5, [sp, #28] //src_height + + //Initialize the register + mov r6, r2 + mov r8, r0 + mov lr, #0 + lsr r5, #2 + + //Save the tailer for the un-aligned size + mla r7, r1, r5, r0 + vld1.32 {q15}, [r7] + + add r7, r2, r3 + //processing a colume data +comp_ds_bilinear_quarter_loop0: + + vld2.16 {q0, q1}, [r2]! + vld2.16 {q2, q3}, [r2]! + vld2.16 {q8, q9}, [r7]! 
+ vld2.16 {q10, q11}, [r7]! + + vpaddl.u8 q0, q0 + vpaddl.u8 q2, q2 + vpaddl.u8 q8, q8 + vpaddl.u8 q10, q10 + vrshr.u16 q0, #1 + vrshr.u16 q2, #1 + vrshr.u16 q8, #1 + vrshr.u16 q10, #1 + + vrhadd.u16 q0, q8 + vrhadd.u16 q2, q10 + vmovn.u16 d0, q0 + vmovn.u16 d1, q2 + vst1.8 {q0}, [r0]! + + add lr, #64 + cmp lr, r4 + movcs lr, #0 + addcs r6, r3, lsl #2 + movcs r2, r6 + addcs r7, r2, r3 + addcs r8, r1 + movcs r0, r8 + subscs r5, #1 + bne comp_ds_bilinear_quarter_loop0 + + //restore the tailer for the un-aligned size + vst1.32 {q15}, [r0] + + ldmia sp!, {r4-r8,lr} +WELS_ASM_FUNC_END + #endif diff --git a/codec/processing/src/arm64/down_sample_aarch64_neon.S b/codec/processing/src/arm64/down_sample_aarch64_neon.S index ed509965..2fb96b2f 100644 --- a/codec/processing/src/arm64/down_sample_aarch64_neon.S +++ b/codec/processing/src/arm64/down_sample_aarch64_neon.S @@ -84,7 +84,6 @@ comp_ds_bilinear_loop0: WELS_ASM_AARCH64_FUNC_END - WELS_ASM_AARCH64_FUNC_BEGIN DyadicBilinearDownsamplerWidthx32_AArch64_neon sub w9, w3, w4 sub w1, w1, w4, lsr #1 @@ -123,6 +122,113 @@ comp_ds_bilinear_w_x32_loop1: cbnz w5, comp_ds_bilinear_w_x32_loop0 WELS_ASM_AARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_BEGIN DyadicBilinearOneThirdDownsampler_AArch64_neon + + //Initialize the register + mov x6, x2 + mov x8, x0 + mov w9, #0 + + //Save the tailer for the unasigned size + smaddl x7, w1, w5, x0 + ld1 {v16.16b}, [x7] + + add x7, x2, w3, sxtw + //processing a colume data +comp_ds_bilinear_onethird_loop0: + + ld3 {v0.16b, v1.16b, v2.16b}, [x2], #48 + ld3 {v4.16b, v5.16b, v6.16b}, [x7], #48 + + uaddl v2.8h, v0.8b, v1.8b + uaddl2 v3.8h, v0.16b, v1.16b + uaddl v6.8h, v4.8b, v5.8b + uaddl2 v7.8h, v4.16b, v5.16b + urshr v2.8h, v2.8h, #1 + urshr v3.8h, v3.8h, #1 + urshr v6.8h, v6.8h, #1 + urshr v7.8h, v7.8h, #1 + + urhadd v0.8h, v2.8h, v6.8h + urhadd v1.8h, v3.8h, v7.8h + xtn v0.8b, v0.8h + xtn v1.8b, v1.8h + st1 {v0.8b,v1.8b}, [x0], #16 + + add w9, w9, #48 + + cmp w9, w4 + b.cc 
comp_ds_bilinear_onethird_loop0 + + mov w9, #0 + add x6, x6, w3, sxtw #1 + add x6, x6, w3, sxtw + mov x2, x6 + add x7, x2, w3, sxtw + add x8, x8, w1, sxtw + mov x0, x8 + sub w5, w5, #1 + + cbnz w5, comp_ds_bilinear_onethird_loop0 + + //restore the tailer for the unasigned size + st1 {v16.16b}, [x0] +WELS_ASM_AARCH64_FUNC_END + +WELS_ASM_AARCH64_FUNC_BEGIN DyadicBilinearQuarterDownsampler_AArch64_neon + //Initialize the register + mov x6, x2 + mov x8, x0 + mov w9, #0 + lsr w5, w5, #2 + + //Save the tailer for the unasigned size + smaddl x7, w1, w5, x0 + ld1 {v16.16b}, [x7] + + add x7, x2, w3, sxtw + //processing a colume data +comp_ds_bilinear_quarter_loop0: + + ld2 {v0.8h, v1.8h}, [x2], #32 + ld2 {v2.8h, v3.8h}, [x2], #32 + ld2 {v4.8h, v5.8h}, [x7], #32 + ld2 {v6.8h, v7.8h}, [x7], #32 + + uaddlp v0.8h, v0.16b + uaddlp v1.8h, v2.16b + uaddlp v4.8h, v4.16b + uaddlp v5.8h, v6.16b + urshr v0.8h, v0.8h, #1 + urshr v1.8h, v1.8h, #1 + urshr v4.8h, v4.8h, #1 + urshr v5.8h, v5.8h, #1 + + urhadd v0.8h, v0.8h, v4.8h + urhadd v1.8h, v1.8h, v5.8h + xtn v0.8b, v0.8h + xtn v1.8b, v1.8h + st1 {v0.8b,v1.8b}, [x0], #16 + + add w9, w9, #64 + + cmp w9, w4 + b.cc comp_ds_bilinear_quarter_loop0 + + mov w9, #0 + add x6, x6, w3, sxtw #2 + mov x2, x6 + add x7, x2, w3, sxtw + add x8, x8, w1, sxtw + mov x0, x8 + sub w5, w5, #1 + + cbnz w5, comp_ds_bilinear_quarter_loop0 + + //restore the tailer for the unasigned size + st1 {v16.16b}, [x0] +WELS_ASM_AARCH64_FUNC_END + WELS_ASM_AARCH64_FUNC_BEGIN GeneralBilinearAccurateDownsampler_AArch64_neon mov w10, #32767 and w8, w6, w10 diff --git a/codec/processing/src/downsample/downsample.cpp b/codec/processing/src/downsample/downsample.cpp index 8765dfe1..0598c20c 100644 --- a/codec/processing/src/downsample/downsample.cpp +++ b/codec/processing/src/downsample/downsample.cpp @@ -53,6 +53,8 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_c; 
sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_c; sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_c; + sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_c; + sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_c; sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsampler_c; sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearFastDownsampler_c; #if defined(X86_ASM) @@ -60,6 +62,7 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_sse; sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_sse; sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsamplerWidthx8_sse; + sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_sse; } if (iCpuFlag & WELS_CPU_SSE2) { sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_sse2; @@ -68,10 +71,14 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int if (iCpuFlag & WELS_CPU_SSSE3) { sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_ssse3; sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_ssse3; + sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_ssse3; + sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_ssse3; } if (iCpuFlag & WELS_CPU_SSE41) { sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_sse4; sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_sse4; + sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_sse4; + sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_sse4; } #endif//X86_ASM @@ -81,6 +88,8 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_neon; sDownsampleFunc.pfHalfAverage[2] = 
DyadicBilinearDownsampler_neon; sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_neon; + sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_neon; + sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_neon; sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_neon; sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearAccurateDownsamplerWrap_neon; } @@ -92,6 +101,8 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_AArch64_neon; sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_AArch64_neon; sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_AArch64_neon; + sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_AArch64_neon; + sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_AArch64_neon; sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_AArch64_neon; sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearAccurateDownsamplerWrap_AArch64_neon; } @@ -124,6 +135,28 @@ EResult CDownsampling::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pDs (uint8_t*)pSrcPixMap->pPixel[1], pSrcPixMap->iStride[1], iSrcWidthUV, iSrcHeightUV); m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2], (uint8_t*)pSrcPixMap->pPixel[2], pSrcPixMap->iStride[2], iSrcWidthUV, iSrcHeightUV); + } else if ((iSrcWidthY >> 2) == iDstWidthY && (iSrcHeightY >> 2) == iDstHeightY) { + + m_pfDownsample.pfQuarterDownsampler ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0], + (uint8_t*)pSrcPixMap->pPixel[0], pSrcPixMap->iStride[0], iSrcWidthY, iSrcHeightY); + + m_pfDownsample.pfQuarterDownsampler ((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1], + (uint8_t*)pSrcPixMap->pPixel[1], pSrcPixMap->iStride[1], iSrcWidthUV, iSrcHeightUV); + + m_pfDownsample.pfQuarterDownsampler 
((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2], + (uint8_t*)pSrcPixMap->pPixel[2], pSrcPixMap->iStride[2], iSrcWidthUV, iSrcHeightUV); + + } else if ((iSrcWidthY / 3) == iDstWidthY && (iSrcHeightY / 3) == iDstHeightY) { + + m_pfDownsample.pfOneThirdDownsampler ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0], + (uint8_t*)pSrcPixMap->pPixel[0], pSrcPixMap->iStride[0], iSrcWidthY, iDstHeightY); + + m_pfDownsample.pfOneThirdDownsampler ((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1], + (uint8_t*)pSrcPixMap->pPixel[1], pSrcPixMap->iStride[1], iSrcWidthUV, iDstHeightUV); + + m_pfDownsample.pfOneThirdDownsampler ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2], + (uint8_t*)pSrcPixMap->pPixel[2], pSrcPixMap->iStride[2], iSrcWidthUV, iDstHeightUV); + } else { m_pfDownsample.pfGeneralRatioLuma ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0], iDstWidthY, iDstHeightY, (uint8_t*)pSrcPixMap->pPixel[0], pSrcPixMap->iStride[0], iSrcWidthY, iSrcHeightY); diff --git a/codec/processing/src/downsample/downsample.h b/codec/processing/src/downsample/downsample.h index 09048f0b..795dade7 100644 --- a/codec/processing/src/downsample/downsample.h +++ b/codec/processing/src/downsample/downsample.h @@ -54,20 +54,29 @@ typedef void (HalveDownsampleFunc) (uint8_t* pDst, const int32_t kiDstStride, uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight); +typedef void (SpecificDownsampleFunc) (uint8_t* pDst, const int32_t kiDstStride, + uint8_t* pSrc, const int32_t kiSrcStride, + const int32_t kiSrcWidth, const int32_t kiHeight); + typedef void (GeneralDownsampleFunc) (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight, uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight); typedef HalveDownsampleFunc* PHalveDownsampleFunc; +typedef SpecificDownsampleFunc* PSpecificDownsampleFunc; typedef GeneralDownsampleFunc* 
PGeneralDownsampleFunc; -HalveDownsampleFunc DyadicBilinearDownsampler_c; +HalveDownsampleFunc DyadicBilinearDownsampler_c; GeneralDownsampleFunc GeneralBilinearFastDownsampler_c; GeneralDownsampleFunc GeneralBilinearAccurateDownsampler_c; +SpecificDownsampleFunc DyadicBilinearOneThirdDownsampler_c; +SpecificDownsampleFunc DyadicBilinearQuarterDownsampler_c; typedef struct { // align_index: 0 = x32; 1 = x16; 2 = x8; 3 = common case left; PHalveDownsampleFunc pfHalfAverage[4]; + PSpecificDownsampleFunc pfOneThirdDownsampler; + PSpecificDownsampleFunc pfQuarterDownsampler; PGeneralDownsampleFunc pfGeneralRatioLuma; PGeneralDownsampleFunc pfGeneralRatioChroma; } SDownsampleFuncs; @@ -93,10 +102,19 @@ HalveDownsampleFunc DyadicBilinearDownsamplerWidthx32_sse4; GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_sse2; GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse2; +SpecificDownsampleFunc DyadicBilinearOneThirdDownsampler_ssse3; +SpecificDownsampleFunc DyadicBilinearOneThirdDownsampler_sse4; +SpecificDownsampleFunc DyadicBilinearQuarterDownsampler_sse; +SpecificDownsampleFunc DyadicBilinearQuarterDownsampler_ssse3; +SpecificDownsampleFunc DyadicBilinearQuarterDownsampler_sse4; + void GeneralBilinearFastDownsampler_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, - const int32_t kiDstHeight, uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY); + const int32_t kiDstHeight, uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, + const uint32_t kuiScaleY); void GeneralBilinearAccurateDownsampler_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, - const int32_t kiDstHeight, uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY); + const int32_t kiDstHeight, uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, + const uint32_t kuiScaleY); + WELSVP_EXTERN_C_END #endif @@ -109,6 +127,10 @@ 
HalveDownsampleFunc DyadicBilinearDownsamplerWidthx32_neon; GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_neon; +SpecificDownsampleFunc DyadicBilinearOneThirdDownsampler_neon; + +SpecificDownsampleFunc DyadicBilinearQuarterDownsampler_neon; + void GeneralBilinearAccurateDownsampler_neon (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight, uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY); @@ -125,8 +147,13 @@ HalveDownsampleFunc DyadicBilinearDownsamplerWidthx32_AArch64_neon; GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_AArch64_neon; -void GeneralBilinearAccurateDownsampler_AArch64_neon (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight, - uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY); +SpecificDownsampleFunc DyadicBilinearOneThirdDownsampler_AArch64_neon; + +SpecificDownsampleFunc DyadicBilinearQuarterDownsampler_AArch64_neon; + +void GeneralBilinearAccurateDownsampler_AArch64_neon (uint8_t* pDst, const int32_t kiDstStride, + const int32_t kiDstWidth, const int32_t kiDstHeight, + uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY); WELSVP_EXTERN_C_END #endif diff --git a/codec/processing/src/downsample/downsamplefuncs.cpp b/codec/processing/src/downsample/downsamplefuncs.cpp index 4c3a0d46..aee939b0 100644 --- a/codec/processing/src/downsample/downsamplefuncs.cpp +++ b/codec/processing/src/downsample/downsamplefuncs.cpp @@ -68,6 +68,53 @@ void DyadicBilinearDownsampler_c (uint8_t* pDst, const int32_t kiDstStride, } } +void DyadicBilinearQuarterDownsampler_c (uint8_t* pDst, const int32_t kiDstStride, + uint8_t* pSrc, const int32_t kiSrcStride, + const int32_t kiSrcWidth, const int32_t kiSrcHeight) + +{ + uint8_t* pDstLine = pDst; + uint8_t* pSrcLine = pSrc; + const int32_t kiSrcStridex4 = kiSrcStride << 2; + const 
int32_t kiDstWidth = kiSrcWidth >> 2; + const int32_t kiDstHeight = kiSrcHeight >> 2; + + for (int32_t j = 0; j < kiDstHeight; j ++) { + for (int32_t i = 0; i < kiDstWidth; i ++) { + const int32_t kiSrcX = i << 2; + const int32_t kiTempRow1 = (pSrcLine[kiSrcX] + pSrcLine[kiSrcX + 1] + 1) >> 1; + const int32_t kiTempRow2 = (pSrcLine[kiSrcX + kiSrcStride] + pSrcLine[kiSrcX + kiSrcStride + 1] + 1) >> 1; + + pDstLine[i] = (uint8_t) ((kiTempRow1 + kiTempRow2 + 1) >> 1); + } + pDstLine += kiDstStride; + pSrcLine += kiSrcStridex4; + } +} + +void DyadicBilinearOneThirdDownsampler_c (uint8_t* pDst, const int32_t kiDstStride, + uint8_t* pSrc, const int32_t kiSrcStride, + const int32_t kiSrcWidth, const int32_t kiDstHeight) + +{ + uint8_t* pDstLine = pDst; + uint8_t* pSrcLine = pSrc; + const int32_t kiSrcStridex3 = kiSrcStride * 3; + const int32_t kiDstWidth = kiSrcWidth / 3; + + for (int32_t j = 0; j < kiDstHeight; j ++) { + for (int32_t i = 0; i < kiDstWidth; i ++) { + const int32_t kiSrcX = i * 3; + const int32_t kiTempRow1 = (pSrcLine[kiSrcX] + pSrcLine[kiSrcX + 1] + 1) >> 1; + const int32_t kiTempRow2 = (pSrcLine[kiSrcX + kiSrcStride] + pSrcLine[kiSrcX + kiSrcStride + 1] + 1) >> 1; + + pDstLine[i] = (uint8_t) ((kiTempRow1 + kiTempRow2 + 1) >> 1); + } + pDstLine += kiDstStride; + pSrcLine += kiSrcStridex3; + } +} + void GeneralBilinearFastDownsampler_c (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight, uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) { diff --git a/codec/processing/src/x86/downsample_bilinear.asm b/codec/processing/src/x86/downsample_bilinear.asm index b80073a8..42a87d8d 100644 --- a/codec/processing/src/x86/downsample_bilinear.asm +++ b/codec/processing/src/x86/downsample_bilinear.asm @@ -67,6 +67,22 @@ shufb_mask_high: add_extra_half: dd 16384,0,0,0 +shufb_mask_quarter: +db 00h, 04h, 08h, 0ch, 80h, 80h, 80h, 80h, 01h, 05h, 09h, 0dh, 80h, 80h, 80h, 80h + 
+shufb_mask_onethird_low_1: +db 00h, 03h, 06h, 09h, 0ch, 0fh, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h +shufb_mask_onethird_low_2: +db 80h, 80h, 80h, 80h, 80h, 80h, 02h, 05h, 08h, 0bh, 0eh, 80h, 80h, 80h, 80h, 80h +shufb_mask_onethird_low_3: +db 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 01h, 04h, 07h, 0ah, 0dh + +shufb_mask_onethird_high_1: +db 01h, 04h, 07h, 0ah, 0dh, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h +shufb_mask_onethird_high_2: +db 80h, 80h, 80h, 80h, 80h, 00h, 03h, 06h, 09h, 0ch, 0fh, 80h, 80h, 80h, 80h, 80h +shufb_mask_onethird_high_3: +db 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 02h, 05h, 08h, 0bh, 0eh ;*********************************************************************** ; Code @@ -1896,3 +1912,686 @@ FAST_LAST_ROW_END: pop r12 ret %endif + +;*********************************************************************** +; void DyadicBilinearOneThirdDownsampler_ssse3( unsigned char* pDst, const int iDstStride, +; unsigned char* pSrc, const int iSrcStride, +; const int iSrcWidth, const int iSrcHeight ); +;*********************************************************************** +WELS_EXTERN DyadicBilinearOneThirdDownsampler_ssse3 +%ifdef X86_32 + push r6 + %assign push_num 1 +%else + %assign push_num 0 +%endif + LOAD_6_PARA + PUSH_XMM 8 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + SIGN_EXTENSION r4, r4d + SIGN_EXTENSION r5, r5d + +%ifndef X86_32 + push r12 + mov r12, r4 +%endif + + mov r6, r1 ;Save the tailer for the unasigned size + imul r6, r5 + add r6, r0 + movdqa xmm7, [r6] + +.yloops_onethird_sse3: +%ifdef X86_32 + mov r4, arg5 +%else + mov r4, r12 +%endif + + mov r6, r0 ;save base address + ; each loop = source bandwidth: 48 bytes +.xloops_onethird_sse3: + ; 1st part horizonal loop: x48 bytes + ; mem hi<- ->lo + ;1st Line Src: xmm0: F * e E * d D * c C * b B * a A + ; xmm2: k K * j J * i I * h H * g G * f + ; xmm2: * p P * o O * n N * m M * l L * + ; + ;2nd Line Src: xmm2: F' * e' E' * d' D' * c' C' 
* b' B' * a' A' + ; xmm1: k' K' * j' J' * i' I' * h' H' * g' G' * f' + ; xmm1: * p' P' * o' O' * n' N' * m' M' * l' L' * + ;=> target: + ;: P O N M L K J I H G F E D C B A + ;: p o n m l k j i h g f e d c b a + ;: P' .. A' + ;: p' .. a' + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;1st line + movdqa xmm0, [r2] ;F * e E * d D * c C * b B * a A + movdqa xmm1, xmm0 + movdqa xmm5, [shufb_mask_onethird_low_1] + movdqa xmm6, [shufb_mask_onethird_high_1] + pshufb xmm0, xmm5 ;0 0 0 0 0 0 0 0 0 0 F E D C B A -> xmm0 + pshufb xmm1, xmm6 ;0 0 0 0 0 0 0 0 0 0 0 e d c b a -> xmm1 + + movdqa xmm2, [r2+16] ;k K * j J * i I * h H * g G * f + movdqa xmm3, xmm2 + movdqa xmm5, [shufb_mask_onethird_low_2] + movdqa xmm6, [shufb_mask_onethird_high_2] + pshufb xmm2, xmm5 ;0 0 0 0 0 K J I H G 0 0 0 0 0 0 -> xmm2 + pshufb xmm3, xmm6 ;0 0 0 0 0 k j i h g f 0 0 0 0 0 -> xmm3 + + paddusb xmm0, xmm2 ;0 0 0 0 0 K J I H G F E D C B A -> xmm0 + paddusb xmm1, xmm3 ;0 0 0 0 0 k j i h g f e d c b a -> xmm1 + + movdqa xmm2, [r2+32] ;* p P * o O * n N * m M * l L * + movdqa xmm3, xmm2 + movdqa xmm5, [shufb_mask_onethird_low_3] + movdqa xmm6, [shufb_mask_onethird_high_3] + pshufb xmm2, xmm5 ;P O N M L 0 0 0 0 0 0 0 0 0 0 0 -> xmm2 + pshufb xmm3, xmm6 ;p o n m l 0 0 0 0 0 0 0 0 0 0 0 -> xmm3 + + paddusb xmm0, xmm2 ;P O N M L K J I H G F E D C B A -> xmm0 + paddusb xmm1, xmm3 ;p o n m l k j i h g f e d c b a -> xmm1 + pavgb xmm0, xmm1 ;1st line average -> xmm0 + + ;2nd line + movdqa xmm2, [r2+r3] ;F' * e' E' * d' D' * c' C' * b' B' * a' A' + movdqa xmm3, xmm2 + movdqa xmm5, [shufb_mask_onethird_low_1] + movdqa xmm6, [shufb_mask_onethird_high_1] + pshufb xmm2, xmm5 ;0 0 0 0 0 0 0 0 0 0 F' E' D' C' B' A' -> xmm2 + pshufb xmm3, xmm6 ;0 0 0 0 0 0 0 0 0 0 0 e' d' c' b' a' -> xmm3 + + movdqa xmm1, [r2+r3+16] ;k' K' * j' J' * i' I' * h' H' * g' G' * f' + movdqa xmm4, xmm1 + movdqa xmm5, [shufb_mask_onethird_low_2] + movdqa xmm6, [shufb_mask_onethird_high_2] + pshufb xmm1, xmm5 ;0 0 0 0 0 K' J' I' H' G' 0 0 0 0 
0 0 -> xmm1 + pshufb xmm4, xmm6 ;0 0 0 0 0 k' j' i' h' g' f' 0 0 0 0 0 -> xmm4 + + paddusb xmm2, xmm1 ;0 0 0 0 0 K' J' I' H' G' F' E' D' C' B' A' -> xmm2 + paddusb xmm3, xmm4 ;0 0 0 0 0 k' j' i' h' g' f' e' d' c' b' a' -> xmm3 + + movdqa xmm1, [r2+r3+32] ; * p' P' * o' O' * n' N' * m' M' * l' L' * + movdqa xmm4, xmm1 + movdqa xmm5, [shufb_mask_onethird_low_3] + movdqa xmm6, [shufb_mask_onethird_high_3] + pshufb xmm1, xmm5 ;P' O' N' M' L' 0 0 0 0 0 0 0 0 0 0 0 -> xmm1 + pshufb xmm4, xmm6 ;p' o' n' m' l' 0 0 0 0 0 0 0 0 0 0 0 -> xmm4 + + paddusb xmm2, xmm1 ;P' O' N' M' L' K' J' I' H' G' F' E' D' C' B' A' -> xmm2 + paddusb xmm3, xmm4 ;p' o' n' m' l' k' j' i' h' g' f' e' d' c' b' a' -> xmm3 + pavgb xmm2, xmm3 ;2nd line average -> xmm2 + + pavgb xmm0, xmm2 ; bytes-average(1st line , 2nd line ) + + ; write pDst + movdqa [r0], xmm0 ;write result in dst + + ; next SMB + lea r2, [r2+48] ;current src address + lea r0, [r0+16] ;current dst address + + sub r4, 48 ;xloops counter + cmp r4, 0 + jg near .xloops_onethird_sse3 + + sub r6, r0 ;offset = base address - current address + lea r2, [r2+2*r3] ; + lea r2, [r2+r3] ; + lea r2, [r2+2*r6] ;current line + 3 lines + lea r2, [r2+r6] + lea r0, [r0+r1] + lea r0, [r0+r6] ;current dst lien + 1 line + + dec r5 + jg near .yloops_onethird_sse3 + + movdqa [r0], xmm7 ;restore the tailer for the unasigned size + +%ifndef X86_32 + pop r12 +%endif + + POP_XMM + LOAD_6_PARA_POP +%ifdef X86_32 + pop r6 +%endif + ret + +;*********************************************************************** +; void DyadicBilinearOneThirdDownsampler_sse4( unsigned char* pDst, const int iDstStride, +; unsigned char* pSrc, const int iSrcStride, +; const int iSrcWidth, const int iSrcHeight ); +;*********************************************************************** +WELS_EXTERN DyadicBilinearOneThirdDownsampler_sse4 +%ifdef X86_32 + push r6 + %assign push_num 1 +%else + %assign push_num 0 +%endif + LOAD_6_PARA + PUSH_XMM 8 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION 
r3, r3d + SIGN_EXTENSION r4, r4d + SIGN_EXTENSION r5, r5d + +%ifndef X86_32 + push r12 + mov r12, r4 +%endif + + mov r6, r1 ;Save the tailer for the unasigned size + imul r6, r5 + add r6, r0 + movdqa xmm7, [r6] + +.yloops_onethird_sse4: +%ifdef X86_32 + mov r4, arg5 +%else + mov r4, r12 +%endif + + mov r6, r0 ;save base address + ; each loop = source bandwidth: 48 bytes +.xloops_onethird_sse4: + ; 1st part horizonal loop: x48 bytes + ; mem hi<- ->lo + ;1st Line Src: xmm0: F * e E * d D * c C * b B * a A + ; xmm2: k K * j J * i I * h H * g G * f + ; xmm2: * p P * o O * n N * m M * l L * + ; + ;2nd Line Src: xmm2: F' * e' E' * d' D' * c' C' * b' B' * a' A' + ; xmm1: k' K' * j' J' * i' I' * h' H' * g' G' * f' + ; xmm1: * p' P' * o' O' * n' N' * m' M' * l' L' * + ;=> target: + ;: P O N M L K J I H G F E D C B A + ;: p o n m l k j i h g f e d c b a + ;: P' .. A' + ;: p' .. a' + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;1st line + movntdqa xmm0, [r2] ;F * e E * d D * c C * b B * a A + movdqa xmm1, xmm0 + movdqa xmm5, [shufb_mask_onethird_low_1] + movdqa xmm6, [shufb_mask_onethird_high_1] + pshufb xmm0, xmm5 ;0 0 0 0 0 0 0 0 0 0 F E D C B A -> xmm0 + pshufb xmm1, xmm6 ;0 0 0 0 0 0 0 0 0 0 0 e d c b a -> xmm1 + + movntdqa xmm2, [r2+16] ;k K * j J * i I * h H * g G * f + movdqa xmm3, xmm2 + movdqa xmm5, [shufb_mask_onethird_low_2] + movdqa xmm6, [shufb_mask_onethird_high_2] + pshufb xmm2, xmm5 ;0 0 0 0 0 K J I H G 0 0 0 0 0 0 -> xmm2 + pshufb xmm3, xmm6 ;0 0 0 0 0 k j i h g f 0 0 0 0 0 -> xmm3 + + paddusb xmm0, xmm2 ;0 0 0 0 0 K J I H G F E D C B A -> xmm0 + paddusb xmm1, xmm3 ;0 0 0 0 0 k j i h g f e d c b a -> xmm1 + + movntdqa xmm2, [r2+32] ;* p P * o O * n N * m M * l L * + movdqa xmm3, xmm2 + movdqa xmm5, [shufb_mask_onethird_low_3] + movdqa xmm6, [shufb_mask_onethird_high_3] + pshufb xmm2, xmm5 ;P O N M L 0 0 0 0 0 0 0 0 0 0 0 -> xmm2 + pshufb xmm3, xmm6 ;p o n m l 0 0 0 0 0 0 0 0 0 0 0 -> xmm3 + + paddusb xmm0, xmm2 ;P O N M L K J I H G F E D C B A -> xmm0 + 
paddusb xmm1, xmm3 ;p o n m l k j i h g f e d c b a -> xmm1 + pavgb xmm0, xmm1 ;1st line average -> xmm0 + + ;2nd line + movntdqa xmm2, [r2+r3] ;F' * e' E' * d' D' * c' C' * b' B' * a' A' + movdqa xmm3, xmm2 + movdqa xmm5, [shufb_mask_onethird_low_1] + movdqa xmm6, [shufb_mask_onethird_high_1] + pshufb xmm2, xmm5 ;0 0 0 0 0 0 0 0 0 0 F' E' D' C' B' A' -> xmm2 + pshufb xmm3, xmm6 ;0 0 0 0 0 0 0 0 0 0 0 e' d' c' b' a' -> xmm3 + + movntdqa xmm1, [r2+r3+16] ;k' K' * j' J' * i' I' * h' H' * g' G' * f' + movdqa xmm4, xmm1 + movdqa xmm5, [shufb_mask_onethird_low_2] + movdqa xmm6, [shufb_mask_onethird_high_2] + pshufb xmm1, xmm5 ;0 0 0 0 0 K' J' I' H' G' 0 0 0 0 0 0 -> xmm1 + pshufb xmm4, xmm6 ;0 0 0 0 0 k' j' i' h' g' f' 0 0 0 0 0 -> xmm4 + + paddusb xmm2, xmm1 ;0 0 0 0 0 K' J' I' H' G' F' E' D' C' B' A' -> xmm2 + paddusb xmm3, xmm4 ;0 0 0 0 0 k' j' i' h' g' f' e' d' c' b' a' -> xmm3 + + movntdqa xmm1, [r2+r3+32] ; * p' P' * o' O' * n' N' * m' M' * l' L' * + movdqa xmm4, xmm1 + movdqa xmm5, [shufb_mask_onethird_low_3] + movdqa xmm6, [shufb_mask_onethird_high_3] + pshufb xmm1, xmm5 ;P' O' N' M' L' 0 0 0 0 0 0 0 0 0 0 0 -> xmm1 + pshufb xmm4, xmm6 ;p' o' n' m' l' 0 0 0 0 0 0 0 0 0 0 0 -> xmm4 + + paddusb xmm2, xmm1 ;P' O' N' M' L' K' J' I' H' G' F' E' D' C' B' A' -> xmm2 + paddusb xmm3, xmm4 ;p' o' n' m' l' k' j' i' h' g' f' e' d' c' b' a' -> xmm3 + pavgb xmm2, xmm3 ;2nd line average -> xmm2 + + pavgb xmm0, xmm2 ; bytes-average(1st line , 2nd line ) + + ; write pDst + movdqa [r0], xmm0 ;write result in dst + + ; next SMB + lea r2, [r2+48] ;current src address + lea r0, [r0+16] ;current dst address + + sub r4, 48 ;xloops counter + cmp r4, 0 + jg near .xloops_onethird_sse4 + + sub r6, r0 ;offset = base address - current address + lea r2, [r2+2*r3] ; + lea r2, [r2+r3] ; + lea r2, [r2+2*r6] ;current line + 3 lines + lea r2, [r2+r6] + lea r0, [r0+r1] + lea r0, [r0+r6] ;current dst lien + 1 line + + dec r5 + jg near .yloops_onethird_sse4 + + movdqa [r0], xmm7 ;restore the tailer 
for the unasigned size + +%ifndef X86_32 + pop r12 +%endif + + POP_XMM + LOAD_6_PARA_POP +%ifdef X86_32 + pop r6 +%endif + ret + +;*********************************************************************** +; void DyadicBilinearQuarterDownsampler_sse( unsigned char* pDst, const int iDstStride, +; unsigned char* pSrc, const int iSrcStride, +; const int iSrcWidth, const int iSrcHeight ); +;*********************************************************************** +WELS_EXTERN DyadicBilinearQuarterDownsampler_sse +%ifdef X86_32 + push r6 + %assign push_num 1 +%else + %assign push_num 0 +%endif + LOAD_6_PARA + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + SIGN_EXTENSION r4, r4d + SIGN_EXTENSION r5, r5d + +%ifndef X86_32 + push r12 + mov r12, r4 +%endif + sar r5, $02 ; iSrcHeight >> 2 + + mov r6, r1 ;Save the tailer for the unasigned size + imul r6, r5 + add r6, r0 + movq xmm7, [r6] + +.yloops_quarter_sse: +%ifdef X86_32 + mov r4, arg5 +%else + mov r4, r12 +%endif + + mov r6, r0 ;save base address + ; each loop = source bandwidth: 32 bytes +.xloops_quarter_sse: + ; 1st part horizonal loop: x16 bytes + ; mem hi<- ->lo + ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E + ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M + ; + ;=> target: + ;: G E C A, + ;: + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + movq mm0, [r2] ; 1st pSrc line + movq mm1, [r2+8] ; 1st pSrc line + 8 + movq mm2, [r2+r3] ; 2nd pSrc line + movq mm3, [r2+r3+8] ; 2nd pSrc line + 8 + + pshufw mm0, mm0, 0d8h ; x X x X c C a A + pshufw mm1, mm1, 0d8h ; x X x X g G e E + pshufw mm2, mm2, 0d8h ; x X x X k K i I + pshufw mm3, mm3, 0d8h ; x X x X o O m M + + punpckldq mm0, mm1 ; g G e E c C a A + punpckldq mm2, mm3 ; o O m M k K i I + + ; to handle mm0,mm2 + pshufw mm4, mm0, 0d8h ;g G c C e E a A + pshufw mm5, mm4, 04eh ;e E a A g G c C + punpcklbw mm4, mm5 ;g e G E c a C A -> mm4 + pshufw mm4, mm4, 0d8h ;g e c a G E C A -> mm4 + + pshufw mm5, mm2, 0d8h ;o O k K m M i I + pshufw mm6, mm5, 04eh ;m M i I 
o O k K + punpcklbw mm5, mm6 ;o m O M k i K I + pshufw mm5, mm5, 0d8h ;o m k i O M K I -> mm5 + + ; to handle mm4, mm5 + movq mm0, mm4 + punpckldq mm0, mm6 ;x x x x G E C A + punpckhdq mm4, mm6 ;x x x x g e c a + + movq mm1, mm5 + punpckldq mm1, mm6 ;x x x x O M K I + punpckhdq mm5, mm6 ;x x x x o m k i + + ; avg within MB horizon width (8 x 2 lines) + pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1 + pavgb mm1, mm5 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2 + pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once + + ; 2nd part horizonal loop: x16 bytes + movq mm1, [r2+16] ; 1st pSrc line + 16 + movq mm2, [r2+24] ; 1st pSrc line + 24 + movq mm3, [r2+r3+16] ; 2nd pSrc line + 16 + movq mm4, [r2+r3+24] ; 2nd pSrc line + 24 + + pshufw mm1, mm1, 0d8h + pshufw mm2, mm2, 0d8h + pshufw mm3, mm3, 0d8h + pshufw mm4, mm4, 0d8h + + punpckldq mm1, mm2 + punpckldq mm3, mm4 + + ; to handle mm1, mm3 + pshufw mm4, mm1, 0d8h + pshufw mm5, mm4, 04eh + punpcklbw mm4, mm5 + pshufw mm4, mm4, 0d8h + + pshufw mm5, mm3, 0d8h + pshufw mm6, mm5, 04eh + punpcklbw mm5, mm6 + pshufw mm5, mm5, 0d8h + + ; to handle mm4, mm5 + movq mm2, mm4 + punpckldq mm2, mm6 + punpckhdq mm4, mm6 + + movq mm3, mm5 + punpckldq mm3, mm6 + punpckhdq mm5, mm6 + + ; avg within MB horizon width (8 x 2 lines) + pavgb mm2, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1 + pavgb mm3, mm5 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2 + pavgb mm2, mm3 ; (temp_row1+temp_row2+1)>>1, done in another 2nd horizonal part + + movd [r0 ], mm0 + movd [r0+4], mm2 + + ; next SMB + lea r2, [r2+32] + lea r0, [r0+8] + + sub r4, 32 + cmp r4, 0 + jg near .xloops_quarter_sse + + sub r6, r0 + ; next line + lea r2, [r2+4*r3] ; next 4 end of lines + lea r2, [r2+4*r6] ; reset to base 0 [- 4 * iDstWidth] + lea r0, [r0+r1] + lea r0, [r0+r6] ; reset to base 0 [- iDstWidth] + + dec r5 + jg near .yloops_quarter_sse + + movq [r0], xmm7 ;restored the tailer for the unasigned size + + WELSEMMS 
+%ifndef X86_32
+    pop     r12
+%endif
+    LOAD_6_PARA_POP
+%ifdef X86_32
+    pop     r6
+%endif
+    ret
+
+;***********************************************************************
+;   void DyadicBilinearQuarterDownsampler_ssse3( unsigned char* pDst, const int iDstStride,
+;                   unsigned char* pSrc, const int iSrcStride,
+;                   const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+WELS_EXTERN DyadicBilinearQuarterDownsampler_ssse3
+    ;push ebx
+    ;push edx
+    ;push esi
+    ;push edi
+    ;push ebp
+
+    ;mov edi, [esp+24]        ; pDst
+    ;mov edx, [esp+28]        ; iDstStride
+    ;mov esi, [esp+32]        ; pSrc
+    ;mov ecx, [esp+36]        ; iSrcStride
+    ;mov ebp, [esp+44]        ; iSrcHeight
+%ifdef X86_32
+    push    r6
+    %assign push_num 1
+%else
+    %assign push_num 0
+%endif
+    LOAD_6_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    SIGN_EXTENSION r4, r4d
+    SIGN_EXTENSION r5, r5d
+
+%ifndef X86_32
+    push    r12
+    mov     r12, r4
+%endif
+    sar     r5, $02           ; iSrcHeight >> 2
+
+    mov     r6, r1            ;Save the tail for the unaligned size
+    imul    r6, r5
+    add     r6, r0
+    movq    xmm7, [r6]
+
+    movdqa  xmm6, [shufb_mask_quarter]
+.yloops_quarter_sse3:
+    ;mov eax, [esp+40]        ; iSrcWidth
+    ;sar eax, $02             ; iSrcWidth >> 2
+    ;mov ebx, eax             ; iDstWidth restored at ebx
+    ;sar eax, $04             ; (iSrcWidth >> 2) / 16    ; loop count = num_of_mb
+    ;neg ebx                  ; - (iSrcWidth >> 2)
+%ifdef X86_32
+    mov     r4, arg5
+%else
+    mov     r4, r12
+%endif
+
+    mov     r6, r0
+    ; each loop = source bandwidth: 32 bytes
+.xloops_quarter_sse3:
+    ; 1st part horizontal loop: x32 bytes
+    ;               mem  hi<-       ->lo
+    ;1st Line Src:  xmm0: h H g G f F e E d D c C b B a A
+    ;               xmm1: p P o O n N m M l L k K j J i I
+    ;2nd Line Src:  xmm2: h H g G f F e E d D c C b B a A
+    ;               xmm3: p P o O n N m M l L k K j J i I
+
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    movdqa  xmm0, [r2]        ; 1st_src_line
+    movdqa  xmm1, [r2+16]     ; 1st_src_line + 16
+    movdqa  xmm2, [r2+r3]     ; 2nd_src_line
+    movdqa  xmm3, [r2+r3+16]  ; 2nd_src_line + 16
+
+    pshufb  xmm0, xmm6        ;1st line: 0 0 0 0 g e c a 0 0 0 0 G E C A
+    pshufb  xmm1, xmm6        ;1st line: 0 0 0 0 o m k i 0 0 0 0 O M K I
+    pshufb  xmm2, xmm6        ;2nd line: 0 0 0 0 g e c a 0 0 0 0 G E C A
+    pshufb  xmm3, xmm6        ;2nd line: 0 0 0 0 o m k i 0 0 0 0 O M K I
+
+    movdqa  xmm4, xmm0
+    movdqa  xmm5, xmm2
+    punpckldq xmm0, xmm1      ;1st line: 0 0 0 0 0 0 0 0 O M K I G E C A -> xmm0
+    punpckhdq xmm4, xmm1      ;1st line: 0 0 0 0 0 0 0 0 o m k i g e c a -> xmm4
+    punpckldq xmm2, xmm3      ;2nd line: 0 0 0 0 0 0 0 0 O M K I G E C A -> xmm2
+    punpckhdq xmm5, xmm3      ;2nd line: 0 0 0 0 0 0 0 0 o m k i g e c a -> xmm5
+
+    pavgb   xmm0, xmm4
+    pavgb   xmm2, xmm5
+    pavgb   xmm0, xmm2        ;average
+
+    ; write pDst
+    movq    [r0], xmm0
+
+    ; next SMB
+    lea     r2, [r2+32]
+    lea     r0, [r0+8]
+
+    sub     r4, 32
+    cmp     r4, 0
+    jg      near .xloops_quarter_sse3
+
+    sub     r6, r0
+    ; next line
+    lea     r2, [r2+4*r3]     ; next end of lines
+    lea     r2, [r2+4*r6]     ; reset to base 0 [- 4 * iDstWidth]
+    lea     r0, [r0+r1]
+    lea     r0, [r0+r6]       ; reset to base 0 [- iDstWidth]
+
+    dec     r5
+    jg      near .yloops_quarter_sse3
+
+    movq    [r0], xmm7        ;restore the tail for the unaligned size
+
+%ifndef X86_32
+    pop     r12
+%endif
+
+    POP_XMM
+    LOAD_6_PARA_POP
+%ifdef X86_32
+    pop     r6
+%endif
+    ret
+
+;***********************************************************************
+;   void DyadicBilinearQuarterDownsampler_sse4( unsigned char* pDst, const int iDstStride,
+;                   unsigned char* pSrc, const int iSrcStride,
+;                   const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+WELS_EXTERN DyadicBilinearQuarterDownsampler_sse4
+%ifdef X86_32
+    push    r6
+    %assign push_num 1
+%else
+    %assign push_num 0
+%endif
+    LOAD_6_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    SIGN_EXTENSION r4, r4d
+    SIGN_EXTENSION r5, r5d
+
+%ifndef X86_32
+    push    r12
+    mov     r12, r4
+%endif
+    sar     r5, $02           ; iSrcHeight >> 2
+
+    mov     r6, r1            ;Save the tail for the unaligned size
+    imul    r6, r5
+    add     r6, r0
+    movq    xmm7, [r6]
+
+    movdqa  xmm6, [shufb_mask_quarter]    ;mask
+
+.yloops_quarter_sse4:
+%ifdef X86_32
+    mov     r4, arg5
+%else
+    mov     r4, r12
+%endif
+
+    mov     r6, r0
+    ; each loop = source bandwidth: 32 bytes
+.xloops_quarter_sse4:
+    ; 1st part horizontal loop: x32 bytes
+    ;               mem  hi<-       ->lo
+    ;1st Line Src:  xmm0: h H g G f F e E d D c C b B a A
+    ;               xmm1: p P o O n N m M l L k K j J i I
+    ;2nd Line Src:  xmm2: h H g G f F e E d D c C b B a A
+    ;               xmm3: p P o O n N m M l L k K j J i I
+
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    movntdqa xmm0, [r2]       ; 1st_src_line
+    movntdqa xmm1, [r2+16]    ; 1st_src_line + 16
+    movntdqa xmm2, [r2+r3]    ; 2nd_src_line
+    movntdqa xmm3, [r2+r3+16] ; 2nd_src_line + 16
+
+    pshufb  xmm0, xmm6        ;1st line: 0 0 0 0 g e c a 0 0 0 0 G E C A
+    pshufb  xmm1, xmm6        ;1st line: 0 0 0 0 o m k i 0 0 0 0 O M K I
+    pshufb  xmm2, xmm6        ;2nd line: 0 0 0 0 g e c a 0 0 0 0 G E C A
+    pshufb  xmm3, xmm6        ;2nd line: 0 0 0 0 o m k i 0 0 0 0 O M K I
+
+    movdqa  xmm4, xmm0
+    movdqa  xmm5, xmm2
+    punpckldq xmm0, xmm1      ;1st line: 0 0 0 0 0 0 0 0 O M K I G E C A -> xmm0
+    punpckhdq xmm4, xmm1      ;1st line: 0 0 0 0 0 0 0 0 o m k i g e c a -> xmm4
+    punpckldq xmm2, xmm3      ;2nd line: 0 0 0 0 0 0 0 0 O M K I G E C A -> xmm2
+    punpckhdq xmm5, xmm3      ;2nd line: 0 0 0 0 0 0 0 0 o m k i g e c a -> xmm5
+
+    pavgb   xmm0, xmm4
+    pavgb   xmm2, xmm5
+    pavgb   xmm0, xmm2        ;average
+
+    ; write pDst
+    movq    [r0], xmm0
+
+    ; next SMB
+    lea     r2, [r2+32]
+    lea     r0, [r0+8]
+
+    sub     r4, 32
+    cmp     r4, 0
+    jg      near .xloops_quarter_sse4
+
+    sub     r6, r0
+    lea     r2, [r2+4*r3]     ; next end of lines
+    lea     r2, [r2+4*r6]     ; reset to base 0 [- 4 * iDstWidth]
+    lea     r0, [r0+r1]
+    lea     r0, [r0+r6]       ; reset to base 0 [- iDstWidth]
+
+    dec     r5
+    jg      near .yloops_quarter_sse4
+
+    movq    [r0], xmm7        ;restore the tail for the unaligned size
+
+%ifndef X86_32
+    pop     r12
+%endif
+
+    POP_XMM
+    LOAD_6_PARA_POP
+%ifdef X86_32
+    pop     r6
+%endif
+    ret
+
diff --git a/test/api/encode_decode_api_test.cpp b/test/api/encode_decode_api_test.cpp
index 386ede0b..65b3ec0d 100644
--- a/test/api/encode_decode_api_test.cpp
+++ b/test/api/encode_decode_api_test.cpp
@@ -2512,7 +2512,7 @@ const uint32_t kiHeight = 96; //DO NOT CHANGE!
 const uint32_t kiFrameRate = 12; //DO NOT CHANGE!
 const uint32_t kiFrameNum = 100; //DO NOT CHANGE!
 const char* pHashStr[] = { //DO NOT CHANGE!
-  "058076b265686fc85b2b99cf7a53106f216f16c3",
+  "585663f78cadb70d9c9f179b9b53b90ffddf3178",
   "f350001c333902029800bd291fbed915a4bdf19a",
   "eb9d853b7daec03052c4850027ac94adc84c3a7e"
 };
diff --git a/test/api/encoder_test.cpp b/test/api/encoder_test.cpp
index 9b55d516..a7f781ae 100644
--- a/test/api/encoder_test.cpp
+++ b/test/api/encoder_test.cpp
@@ -131,7 +131,7 @@ static const EncodeFileParam kFileParamArray[] = {
 },
 {
   "res/Cisco_Absolute_Power_1280x720_30fps.yuv",
-  "a4707845cacc437fb52010eb020fca6d4bc1102d", CAMERA_VIDEO_REAL_TIME, 1280, 720, 30.0f, SM_SINGLE_SLICE, false, 4, false, false, false
+  "2b5965c752e1f722592c3ce9a1eb82445c9dbaa3", CAMERA_VIDEO_REAL_TIME, 1280, 720, 30.0f, SM_SINGLE_SLICE, false, 4, false, false, false
 },
 // the following values may be adjusted for times since we start tuning the strategy
 {
diff --git a/test/processing/ProcessUT_DownSample.cpp b/test/processing/ProcessUT_DownSample.cpp
index d29b6fbc..ca04c049 100644
--- a/test/processing/ProcessUT_DownSample.cpp
+++ b/test/processing/ProcessUT_DownSample.cpp
@@ -199,6 +199,79 @@ TEST (DownSampleTest, func) { \
 } \
 }
 
+#define GENERATE_DyadicBilinearOneThirdDownsampler_UT(func, ASM, CPUFLAGS) \
+TEST (DownSampleTest, func) { \
+  if (ASM) {\
+    int32_t iCpuCores = 0; \
+    uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores); \
+    if (0 == (m_uiCpuFeatureFlag & CPUFLAGS)) \
+      return; \
+  } \
+  ENFORCE_STACK_ALIGN_1D (uint8_t, dst_c, 50000, 16); \
+  ENFORCE_STACK_ALIGN_1D (uint8_t, src_c, 50000, 16); \
+  int dst_stride_c; \
+  int src_stride_c; \
+  int src_width_c; \
+  int src_height_c; \
+  ENFORCE_STACK_ALIGN_1D (uint8_t, dst_a, 50000, 16); \
+  ENFORCE_STACK_ALIGN_1D (uint8_t, src_a, 50000, 16); \
+  int dst_stride_a; \
+  int src_stride_a; \
+  int src_width_a; \
+  int src_height_a; \
+  dst_stride_c = dst_stride_a = 560; \
+  src_stride_c = src_stride_a = 560; \
+  src_width_c = src_width_a = 480; \
+  src_height_c = src_height_a = 30; \
+  for (int j = 0; j < 50000; j++) { \
+    dst_c[j] = dst_a[j] = rand() % 256; \
+    src_c[j] = src_a[j] = rand() % 256; \
+  } \
+  DyadicBilinearOneThirdDownsampler_c (dst_c, dst_stride_c, src_c, src_stride_c, src_width_c, src_height_c/3); \
+  func (dst_a, dst_stride_a, src_a, src_stride_a, src_width_a, src_height_a/3); \
+  for (int j = 0; j < (src_height_c /3 ); j++) { \
+    for (int m = 0; m < (src_width_c /3); m++) { \
+      ASSERT_EQ (dst_c[m + j * dst_stride_c], dst_a[m + j * dst_stride_a]); \
+    } \
+  } \
+}
+
+#define GENERATE_DyadicBilinearQuarterDownsampler_UT(func, ASM, CPUFLAGS) \
+TEST (DownSampleTest, func) { \
+  if (ASM) {\
+    int32_t iCpuCores = 0; \
+    uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores); \
+    if (0 == (m_uiCpuFeatureFlag & CPUFLAGS)) \
+      return; \
+  } \
+  ENFORCE_STACK_ALIGN_1D (uint8_t, dst_c, 50000, 16); \
+  ENFORCE_STACK_ALIGN_1D (uint8_t, src_c, 50000, 16); \
+  int dst_stride_c; \
+  int src_stride_c; \
+  int src_width_c; \
+  int src_height_c; \
+  ENFORCE_STACK_ALIGN_1D (uint8_t, dst_a, 50000, 16); \
+  ENFORCE_STACK_ALIGN_1D (uint8_t, src_a, 50000, 16); \
+  int dst_stride_a; \
+  int src_stride_a; \
+  int src_width_a; \
+  int src_height_a; \
+  dst_stride_c = dst_stride_a = 560; \
+  src_stride_c = src_stride_a = 560; \
+  src_width_c = src_width_a = 640; \
+  src_height_c = src_height_a = 80; \
+  for (int j = 0; j < 50000; j++) { \
+    dst_c[j] = dst_a[j] = rand() % 256; \
+    src_c[j] = src_a[j] = rand() % 256; \
+  } \
+  DyadicBilinearQuarterDownsampler_c (dst_c, dst_stride_c, src_c, src_stride_c, src_width_c, src_height_c); \
+  func (dst_a, dst_stride_a, src_a, src_stride_a, src_width_a, src_height_a); \
+  for (int j = 0; j < (src_height_c >> 2); j++) { \
+    for (int m = 0; m < (src_width_c >> 2); m++) { \
+      ASSERT_EQ (dst_c[m + j * dst_stride_c], dst_a[m + j * dst_stride_a]); \
+    } \
+  } \
+}
 #define GENERATE_GeneralBilinearDownsampler_UT(func, ref, ASM, CPUFLAGS) \
 TEST (DownSampleTest, func) { \
 if (ASM) {\
@@ -259,6 +332,13 @@ GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx16_ssse3,
 GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx32_sse4, 1, WELS_CPU_SSE41)
 GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx16_sse4, 1, WELS_CPU_SSE41)
 
+GENERATE_DyadicBilinearOneThirdDownsampler_UT (DyadicBilinearOneThirdDownsampler_ssse3, 1, WELS_CPU_SSSE3)
+GENERATE_DyadicBilinearOneThirdDownsampler_UT (DyadicBilinearOneThirdDownsampler_sse4, 1, WELS_CPU_SSE41)
+
+GENERATE_DyadicBilinearQuarterDownsampler_UT (DyadicBilinearQuarterDownsampler_sse, 1, WELS_CPU_SSE)
+GENERATE_DyadicBilinearQuarterDownsampler_UT (DyadicBilinearQuarterDownsampler_ssse3, 1, WELS_CPU_SSSE3)
+GENERATE_DyadicBilinearQuarterDownsampler_UT (DyadicBilinearQuarterDownsampler_sse4, 1, WELS_CPU_SSE41)
+
 GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearFastDownsamplerWrap_sse2, GeneralBilinearFastDownsampler_ref, 1, WELS_CPU_SSE2)
 GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_sse2,
@@ -269,6 +349,10 @@ GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_s
 GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx32_neon, 1, WELS_CPU_NEON)
 GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsampler_neon, 1, WELS_CPU_NEON)
 
+GENERATE_DyadicBilinearOneThirdDownsampler_UT (DyadicBilinearOneThirdDownsampler_neon, 1, WELS_CPU_NEON)
+
+GENERATE_DyadicBilinearQuarterDownsampler_UT (DyadicBilinearQuarterDownsampler_neon, 1, WELS_CPU_NEON)
+
 GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_neon,
 GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_NEON)
 #endif
@@ -277,6 +361,10 @@ GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_n
 GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx32_AArch64_neon, 1, WELS_CPU_NEON)
 GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsampler_AArch64_neon, 1, WELS_CPU_NEON)
 
+GENERATE_DyadicBilinearOneThirdDownsampler_UT (DyadicBilinearOneThirdDownsampler_AArch64_neon, 1, WELS_CPU_NEON)
+
+GENERATE_DyadicBilinearQuarterDownsampler_UT (DyadicBilinearQuarterDownsampler_AArch64_neon, 1, WELS_CPU_NEON)
+
 GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_AArch64_neon,
 GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_NEON)
 #endif