Merge pull request #2111 from GuangweiWang/downsampler

Add new C and assembly functions to optimize the downsampler when the downscale ratio equals 1:3 or 1:4
2015-09-11 17:36:13 +08:00 · 2015-09-11 17:36:13 +08:00 · 5373b8a3aa
commit 5373b8a3aa
parent 69a62ea58e 64657d3cfd
9 changed files with 1125 additions and 8 deletions
--- a/codec/processing/src/arm/down_sample_neon.S
+++ b/codec/processing/src/arm/down_sample_neon.S
@ -338,4 +338,121 @@ _LAST_ROW_WIDTH:
    ldmia sp!, {r4-r12, lr}
 WELS_ASM_FUNC_END

+WELS_ASM_FUNC_BEGIN DyadicBilinearOneThirdDownsampler_neon
+    stmdb sp!, {r4-r8, lr}      // six registers = 24 bytes pushed, so the stack arguments start at [sp, #24]
+    // 1:3 downsample. r0 = destination, r1 = destination stride, r2 = source, r3 = source stride.
+    // Load the two stack-passed arguments
+	ldr  r4, [sp, #24]  // r4 = source width in bytes
+	ldr  r5, [sp, #28]	// r5 = number of output rows, used unscaled (downsample.cpp passes the destination height)
+
+	// r6 / r8 = start of the current source / destination row; lr = source bytes consumed in the current row
+	mov r6, r2
+	mov r8, r0
+	mov lr, #0
+
+	// Save the 16 bytes at dst + dst_stride * rows (just past the last output row); they are written back after the loop.
+	mla  r7, r1, r5, r0
+	vld1.32 {q15}, [r7]     // NOTE(review): presumably guards against the 16-byte stores overrunning - confirm
+
+	add r7, r2, r3          // r7 = second source row of the pair
+	// Main loop: every pass reads 48 bytes from each of two adjacent source rows and writes 16 output bytes
+comp_ds_bilinear_onethird_loop0:
+    // vld3.8 de-interleaves by three: first register = columns 3k, second = columns 3k+1, third = columns 3k+2 (unused)
+    vld3.8 {d0, d1, d2}, [r2]!          // row 0, bytes 0..23
+    vld3.8 {d3, d4, d5}, [r2]!          // row 0, bytes 24..47
+    vld3.8 {d16, d17, d18}, [r7]!       // row 1, bytes 0..23
+    vld3.8 {d19, d20, d21}, [r7]!       // row 1, bytes 24..47
+    // Horizontal step, per row: (col[3k] + col[3k+1] + 1) >> 1, held in 16-bit lanes
+    vaddl.u8 q11, d0, d1
+    vaddl.u8 q12, d3, d4
+    vaddl.u8 q13, d16, d17
+    vaddl.u8 q14, d19, d20
+    vrshr.u16 q11, #1
+    vrshr.u16 q12, #1
+    vrshr.u16 q13, #1
+    vrshr.u16 q14, #1
+    // Vertical step: (row0 + row1 + 1) >> 1
+    vrhadd.u16 q11, q13
+    vrhadd.u16 q12, q14
+    // Narrow back to bytes and store 16 output pixels
+    vmovn.u16 d0, q11
+    vmovn.u16 d1, q12
+    vst1.8 {q0}, [r0]!
+    // Row bookkeeping: the conditional (cs) instructions only run once lr >= r4, i.e. when the row is finished
+    add lr, #48
+    cmp lr, r4
+    movcs lr, #0
+    addcs r6, r3, lsl #1
+    addcs r6, r6, r3            // r6 += 3 * source stride in total
+    movcs r2, r6
+    addcs r7, r2, r3
+    addcs r8, r1                // next destination row
+    movcs r0, r8
+    subscs r5, #1               // one output row done; sets Z when none remain
+    bne	comp_ds_bilinear_onethird_loop0     // mid-row: Z is clear from cmp (lr < r4); row end: Z comes from subs
+
+	// Restore the 16 bytes saved before the loop (r0 == dst + dst_stride * rows here)
+	vst1.32 {q15}, [r0]
+
+    ldmia sp!, {r4-r8,lr}       // the return itself is presumably emitted by WELS_ASM_FUNC_END (macro not shown here)
+WELS_ASM_FUNC_END
+
+WELS_ASM_FUNC_BEGIN DyadicBilinearQuarterDownsampler_neon
+    stmdb sp!, {r4-r8, lr}      // six registers = 24 bytes pushed, so the stack arguments start at [sp, #24]
+    // 1:4 downsample. r0 = destination, r1 = destination stride, r2 = source, r3 = source stride.
+    // Load the two stack-passed arguments
+	ldr  r4, [sp, #24]  // r4 = source width in bytes
+	ldr  r5, [sp, #28]	// r5 = source height; converted to the output row count below
+
+	// r6 / r8 = start of the current source / destination row; lr = source bytes consumed in the current row
+	mov r6, r2
+	mov r8, r0
+	mov lr, #0
+	lsr r5, #2              // r5 = source height / 4 = number of output rows
+
+	// Save the 16 bytes at dst + dst_stride * rows (just past the last output row); they are written back after the loop.
+	mla  r7, r1, r5, r0
+	vld1.32 {q15}, [r7]     // NOTE(review): presumably guards against the 16-byte stores overrunning - confirm
+
+	add r7, r2, r3          // r7 = second source row of the pair
+	// Main loop: every pass reads 64 bytes from each of two adjacent source rows and writes 16 output bytes
+comp_ds_bilinear_quarter_loop0:
+    // vld2.16 splits 16-bit units: first q register = byte pairs (4k, 4k+1), second = byte pairs (4k+2, 4k+3) (unused)
+	vld2.16 {q0, q1}, [r2]!             // row 0, bytes 0..31
+    vld2.16 {q2, q3}, [r2]!             // row 0, bytes 32..63
+	vld2.16 {q8, q9}, [r7]!             // row 1, bytes 0..31
+    vld2.16 {q10, q11}, [r7]!           // row 1, bytes 32..63
+    // Horizontal step, per row: (col[4k] + col[4k+1] + 1) >> 1, held in 16-bit lanes
+    vpaddl.u8 q0, q0
+    vpaddl.u8 q2, q2
+    vpaddl.u8 q8, q8
+    vpaddl.u8 q10, q10
+    vrshr.u16 q0, #1
+    vrshr.u16 q2, #1
+    vrshr.u16 q8, #1
+    vrshr.u16 q10, #1
+    // Vertical step (row0 + row1 + 1) >> 1, then narrow to bytes and store 16 output pixels
+    vrhadd.u16 q0, q8
+    vrhadd.u16 q2, q10
+    vmovn.u16 d0, q0
+    vmovn.u16 d1, q2
+    vst1.8 {q0}, [r0]!
+    // Row bookkeeping: the conditional (cs) instructions only run once lr >= r4, i.e. when the row is finished
+    add lr, #64
+    cmp lr, r4
+    movcs lr, #0
+    addcs r6, r3, lsl #2        // r6 += 4 * source stride
+    movcs r2, r6
+    addcs r7, r2, r3
+    addcs r8, r1                // next destination row
+    movcs r0, r8
+    subscs r5, #1               // one output row done; sets Z when none remain
+    bne	comp_ds_bilinear_quarter_loop0      // mid-row: Z is clear from cmp (lr < r4); row end: Z comes from subs
+
+	// Restore the 16 bytes saved before the loop (r0 == dst + dst_stride * rows here)
+	vst1.32 {q15}, [r0]
+
+    ldmia sp!, {r4-r8,lr}       // the return itself is presumably emitted by WELS_ASM_FUNC_END (macro not shown here)
+WELS_ASM_FUNC_END
+
 #endif
--- a/codec/processing/src/arm64/down_sample_aarch64_neon.S
+++ b/codec/processing/src/arm64/down_sample_aarch64_neon.S
@ -84,7 +84,6 @@ comp_ds_bilinear_loop0:

 WELS_ASM_AARCH64_FUNC_END

-
 WELS_ASM_AARCH64_FUNC_BEGIN DyadicBilinearDownsamplerWidthx32_AArch64_neon
    sub     w9, w3, w4
    sub     w1, w1, w4, lsr #1
@ -123,6 +122,113 @@ comp_ds_bilinear_w_x32_loop1:
    cbnz    w5, comp_ds_bilinear_w_x32_loop0
 WELS_ASM_AARCH64_FUNC_END

+WELS_ASM_AARCH64_FUNC_BEGIN DyadicBilinearOneThirdDownsampler_AArch64_neon
+    // 1:3 downsample. x0 = dst, w1 = dst stride, x2 = src, w3 = src stride, w4 = src width, w5 = output row count.
+    // x6 / x8 = start of the current source / destination row; w9 = source bytes consumed in the current row
+    mov x6, x2
+    mov x8, x0
+    mov w9, #0
+
+    // Save the 16 bytes at dst + dst_stride * rows (just past the last output row); they are written back after the loop
+    smaddl  x7, w1, w5, x0
+    ld1 {v16.16b}, [x7]         // NOTE(review): presumably guards against the 16-byte stores overrunning - confirm
+
+    add x7, x2, w3, sxtw        // x7 = second source row of the pair
+    // Main loop: every pass reads 48 bytes from each of two adjacent source rows and writes 16 output bytes
+comp_ds_bilinear_onethird_loop0:
+    // ld3 de-interleaves by three: first register = columns 3k, second = columns 3k+1, third = columns 3k+2 (unused)
+    ld3     {v0.16b, v1.16b, v2.16b}, [x2], #48
+    ld3     {v4.16b, v5.16b, v6.16b}, [x7], #48
+    // Horizontal step, per row: (col[3k] + col[3k+1] + 1) >> 1 in 16-bit lanes; v2/v6 are reused as scratch
+    uaddl   v2.8h, v0.8b, v1.8b
+    uaddl2  v3.8h, v0.16b, v1.16b
+    uaddl   v6.8h, v4.8b, v5.8b
+    uaddl2  v7.8h, v4.16b, v5.16b
+    urshr   v2.8h, v2.8h, #1
+    urshr   v3.8h, v3.8h, #1
+    urshr   v6.8h, v6.8h, #1
+    urshr   v7.8h, v7.8h, #1
+    // Vertical step (row0 + row1 + 1) >> 1, then narrow to bytes and store 16 output pixels
+    urhadd  v0.8h, v2.8h, v6.8h
+    urhadd  v1.8h, v3.8h, v7.8h
+    xtn     v0.8b, v0.8h
+    xtn     v1.8b, v1.8h
+    st1     {v0.8b,v1.8b}, [x0], #16
+
+    add     w9, w9, #48
+    // Stay in the current row while w9 < w4 (unsigned compare)
+    cmp     w9, w4
+    b.cc    comp_ds_bilinear_onethird_loop0
+    // Row finished: step the source by three rows and the destination by one row
+    mov     w9, #0
+    add     x6, x6, w3, sxtw #1
+    add     x6, x6, w3, sxtw
+    mov     x2, x6
+    add     x7, x2, w3, sxtw
+    add     x8, x8, w1, sxtw
+    mov     x0, x8
+    sub     w5, w5, #1
+
+    cbnz    w5, comp_ds_bilinear_onethird_loop0
+
+    // Restore the 16 bytes saved before the loop (x0 == dst + dst_stride * rows here)
+    st1     {v16.16b}, [x0]
+WELS_ASM_AARCH64_FUNC_END
+
+WELS_ASM_AARCH64_FUNC_BEGIN DyadicBilinearQuarterDownsampler_AArch64_neon
+    // 1:4 downsample. x0 = dst, w1 = dst stride, x2 = src, w3 = src stride, w4 = src width, w5 = src height.
+    mov x6, x2                  // x6 = start of the current source row
+    mov x8, x0                  // x8 = start of the current destination row
+    mov w9, #0                  // w9 = source bytes consumed in the current row
+    lsr w5, w5, #2              // w5 = source height / 4 = number of output rows
+
+    // Save the 16 bytes at dst + dst_stride * rows (just past the last output row); they are written back after the loop
+    smaddl  x7, w1, w5, x0
+    ld1 {v16.16b}, [x7]         // NOTE(review): presumably guards against the 16-byte stores overrunning - confirm
+
+    add x7, x2, w3, sxtw        // x7 = second source row of the pair
+    // Main loop: every pass reads 64 bytes from each of two adjacent source rows and writes 16 output bytes
+comp_ds_bilinear_quarter_loop0:
+    // ld2 on .8h splits 16-bit units: first register = byte pairs (4k, 4k+1), second = byte pairs (4k+2, 4k+3) (unused)
+    ld2     {v0.8h, v1.8h}, [x2], #32
+    ld2     {v2.8h, v3.8h}, [x2], #32
+    ld2     {v4.8h, v5.8h}, [x7], #32
+    ld2     {v6.8h, v7.8h}, [x7], #32
+    // Horizontal step, per row: (col[4k] + col[4k+1] + 1) >> 1 in 16-bit lanes; v1/v5 are overwritten with the second half
+    uaddlp  v0.8h, v0.16b
+    uaddlp  v1.8h, v2.16b
+    uaddlp  v4.8h, v4.16b
+    uaddlp  v5.8h, v6.16b
+    urshr   v0.8h, v0.8h, #1
+    urshr   v1.8h, v1.8h, #1
+    urshr   v4.8h, v4.8h, #1
+    urshr   v5.8h, v5.8h, #1
+    // Vertical step (row0 + row1 + 1) >> 1, then narrow to bytes and store 16 output pixels
+    urhadd  v0.8h, v0.8h, v4.8h
+    urhadd  v1.8h, v1.8h, v5.8h
+    xtn     v0.8b, v0.8h
+    xtn     v1.8b, v1.8h
+    st1     {v0.8b,v1.8b}, [x0], #16
+
+    add     w9, w9, #64
+    // Stay in the current row while w9 < w4 (unsigned compare)
+    cmp     w9, w4
+    b.cc    comp_ds_bilinear_quarter_loop0
+    // Row finished: step the source by four rows and the destination by one row
+    mov     w9, #0
+    add     x6, x6, w3, sxtw #2
+    mov     x2, x6
+    add     x7, x2, w3, sxtw
+    add     x8, x8, w1, sxtw
+    mov     x0, x8
+    sub     w5, w5, #1
+
+    cbnz    w5, comp_ds_bilinear_quarter_loop0
+
+    // Restore the 16 bytes saved before the loop (x0 == dst + dst_stride * rows here)
+    st1     {v16.16b}, [x0]
+WELS_ASM_AARCH64_FUNC_END
+
 WELS_ASM_AARCH64_FUNC_BEGIN GeneralBilinearAccurateDownsampler_AArch64_neon
    mov     w10, #32767
    and     w8, w6, w10
--- a/codec/processing/src/downsample/downsample.cpp
+++ b/codec/processing/src/downsample/downsample.cpp
@ -53,6 +53,8 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc,  int
  sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_c;
  sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_c;
  sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_c;
+  sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_c;
+  sDownsampleFunc.pfQuarterDownsampler  = DyadicBilinearQuarterDownsampler_c;
  sDownsampleFunc.pfGeneralRatioChroma  = GeneralBilinearAccurateDownsampler_c;
  sDownsampleFunc.pfGeneralRatioLuma    = GeneralBilinearFastDownsampler_c;
 #if defined(X86_ASM)
@ -60,6 +62,7 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc,  int
    sDownsampleFunc.pfHalfAverage[0]    = DyadicBilinearDownsamplerWidthx32_sse;
    sDownsampleFunc.pfHalfAverage[1]    = DyadicBilinearDownsamplerWidthx16_sse;
    sDownsampleFunc.pfHalfAverage[2]    = DyadicBilinearDownsamplerWidthx8_sse;
+    sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_sse;
  }
  if (iCpuFlag & WELS_CPU_SSE2) {
    sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_sse2;
@ -68,10 +71,14 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc,  int
  if (iCpuFlag & WELS_CPU_SSSE3) {
    sDownsampleFunc.pfHalfAverage[0]    = DyadicBilinearDownsamplerWidthx32_ssse3;
    sDownsampleFunc.pfHalfAverage[1]    = DyadicBilinearDownsamplerWidthx16_ssse3;
+    sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_ssse3;
+    sDownsampleFunc.pfQuarterDownsampler  = DyadicBilinearQuarterDownsampler_ssse3;
  }
  if (iCpuFlag & WELS_CPU_SSE41) {
    sDownsampleFunc.pfHalfAverage[0]    = DyadicBilinearDownsamplerWidthx32_sse4;
    sDownsampleFunc.pfHalfAverage[1]    = DyadicBilinearDownsamplerWidthx16_sse4;
+    sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_sse4;
+    sDownsampleFunc.pfQuarterDownsampler  = DyadicBilinearQuarterDownsampler_sse4;
  }
 #endif//X86_ASM

@ -81,6 +88,8 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc,  int
    sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_neon;
    sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_neon;
    sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_neon;
+    sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_neon;
+    sDownsampleFunc.pfQuarterDownsampler  = DyadicBilinearQuarterDownsampler_neon;
    sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_neon;
    sDownsampleFunc.pfGeneralRatioLuma   = GeneralBilinearAccurateDownsamplerWrap_neon;
  }
@ -92,6 +101,8 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc,  int
    sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_AArch64_neon;
    sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_AArch64_neon;
    sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_AArch64_neon;
+    sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_AArch64_neon;
+    sDownsampleFunc.pfQuarterDownsampler  = DyadicBilinearQuarterDownsampler_AArch64_neon;
    sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_AArch64_neon;
    sDownsampleFunc.pfGeneralRatioLuma   = GeneralBilinearAccurateDownsamplerWrap_AArch64_neon;
  }
@ -124,6 +135,28 @@ EResult CDownsampling::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pDs
        (uint8_t*)pSrcPixMap->pPixel[1], pSrcPixMap->iStride[1], iSrcWidthUV, iSrcHeightUV);
    m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2],
        (uint8_t*)pSrcPixMap->pPixel[2], pSrcPixMap->iStride[2], iSrcWidthUV, iSrcHeightUV);
+  } else if ((iSrcWidthY >> 2) == iDstWidthY && (iSrcHeightY >> 2) == iDstHeightY) {
+
+    m_pfDownsample.pfQuarterDownsampler ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0],
+                                         (uint8_t*)pSrcPixMap->pPixel[0], pSrcPixMap->iStride[0], iSrcWidthY, iSrcHeightY);
+
+    m_pfDownsample.pfQuarterDownsampler ((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1],
+                                         (uint8_t*)pSrcPixMap->pPixel[1], pSrcPixMap->iStride[1], iSrcWidthUV, iSrcHeightUV);
+
+    m_pfDownsample.pfQuarterDownsampler ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2],
+                                         (uint8_t*)pSrcPixMap->pPixel[2], pSrcPixMap->iStride[2], iSrcWidthUV, iSrcHeightUV);
+
+  } else if ((iSrcWidthY / 3) == iDstWidthY && (iSrcHeightY / 3) == iDstHeightY) {
+
+    m_pfDownsample.pfOneThirdDownsampler ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0],
+                                          (uint8_t*)pSrcPixMap->pPixel[0], pSrcPixMap->iStride[0], iSrcWidthY, iDstHeightY);
+
+    m_pfDownsample.pfOneThirdDownsampler ((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1],
+                                          (uint8_t*)pSrcPixMap->pPixel[1], pSrcPixMap->iStride[1], iSrcWidthUV, iDstHeightUV);
+
+    m_pfDownsample.pfOneThirdDownsampler ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2],
+                                          (uint8_t*)pSrcPixMap->pPixel[2], pSrcPixMap->iStride[2], iSrcWidthUV, iDstHeightUV);
+
  } else {
    m_pfDownsample.pfGeneralRatioLuma ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0], iDstWidthY, iDstHeightY,
                                       (uint8_t*)pSrcPixMap->pPixel[0], pSrcPixMap->iStride[0], iSrcWidthY, iSrcHeightY);
--- a/codec/processing/src/downsample/downsample.h
+++ b/codec/processing/src/downsample/downsample.h
@ -54,20 +54,29 @@ typedef void (HalveDownsampleFunc) (uint8_t* pDst, const int32_t kiDstStride,
                                    uint8_t* pSrc, const int32_t kiSrcStride,
                                    const int32_t kiSrcWidth, const int32_t kiSrcHeight);

+typedef void (SpecificDownsampleFunc) (uint8_t* pDst, const int32_t kiDstStride,
+                                       uint8_t* pSrc, const int32_t kiSrcStride,
+                                       const int32_t kiSrcWidth, const int32_t kiHeight);
+
 typedef void (GeneralDownsampleFunc) (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
                                      const int32_t kiDstHeight,
                                      uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight);

 typedef HalveDownsampleFunc*    PHalveDownsampleFunc;
+typedef SpecificDownsampleFunc* PSpecificDownsampleFunc;
 typedef GeneralDownsampleFunc*  PGeneralDownsampleFunc;

-HalveDownsampleFunc   DyadicBilinearDownsampler_c;
+HalveDownsampleFunc		DyadicBilinearDownsampler_c;
 GeneralDownsampleFunc GeneralBilinearFastDownsampler_c;
 GeneralDownsampleFunc GeneralBilinearAccurateDownsampler_c;
+SpecificDownsampleFunc  DyadicBilinearOneThirdDownsampler_c;
+SpecificDownsampleFunc	DyadicBilinearQuarterDownsampler_c;

 typedef struct {
  // align_index: 0 = x32; 1 = x16; 2 = x8; 3 = common case left;
  PHalveDownsampleFunc          pfHalfAverage[4];
+  PSpecificDownsampleFunc       pfOneThirdDownsampler;  // 1:3 path; CDownsampling::Process passes the destination height as the last argument
+  PSpecificDownsampleFunc       pfQuarterDownsampler;   // 1:4 path; CDownsampling::Process passes the source height as the last argument
  PGeneralDownsampleFunc        pfGeneralRatioLuma;
  PGeneralDownsampleFunc        pfGeneralRatioChroma;
 } SDownsampleFuncs;
@ -93,10 +102,19 @@ HalveDownsampleFunc     DyadicBilinearDownsamplerWidthx32_sse4;
 GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_sse2;
 GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse2;

+SpecificDownsampleFunc  DyadicBilinearOneThirdDownsampler_ssse3;
+SpecificDownsampleFunc  DyadicBilinearOneThirdDownsampler_sse4;
+SpecificDownsampleFunc  DyadicBilinearQuarterDownsampler_sse;
+SpecificDownsampleFunc  DyadicBilinearQuarterDownsampler_ssse3;
+SpecificDownsampleFunc  DyadicBilinearQuarterDownsampler_sse4;
+
 void GeneralBilinearFastDownsampler_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
-    const int32_t kiDstHeight, uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY);
+    const int32_t kiDstHeight, uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX,
+    const uint32_t kuiScaleY);
 void GeneralBilinearAccurateDownsampler_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
-    const int32_t kiDstHeight, uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY);
+    const int32_t kiDstHeight, uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX,
+    const uint32_t kuiScaleY);
+
 WELSVP_EXTERN_C_END
 #endif

@ -109,6 +127,10 @@ HalveDownsampleFunc     DyadicBilinearDownsamplerWidthx32_neon;

 GeneralDownsampleFunc   GeneralBilinearAccurateDownsamplerWrap_neon;

+SpecificDownsampleFunc  DyadicBilinearOneThirdDownsampler_neon;
+
+SpecificDownsampleFunc  DyadicBilinearQuarterDownsampler_neon;
+
 void GeneralBilinearAccurateDownsampler_neon (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
    const int32_t kiDstHeight,
    uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY);
@ -125,8 +147,13 @@ HalveDownsampleFunc     DyadicBilinearDownsamplerWidthx32_AArch64_neon;

 GeneralDownsampleFunc   GeneralBilinearAccurateDownsamplerWrap_AArch64_neon;

-void GeneralBilinearAccurateDownsampler_AArch64_neon (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight,
-                                                      uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY);
+SpecificDownsampleFunc  DyadicBilinearOneThirdDownsampler_AArch64_neon;
+
+SpecificDownsampleFunc  DyadicBilinearQuarterDownsampler_AArch64_neon;
+
+void GeneralBilinearAccurateDownsampler_AArch64_neon (uint8_t* pDst, const int32_t kiDstStride,
+    const int32_t kiDstWidth, const int32_t kiDstHeight,
+    uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY);

 WELSVP_EXTERN_C_END
 #endif
--- a/codec/processing/src/downsample/downsamplefuncs.cpp
+++ b/codec/processing/src/downsample/downsamplefuncs.cpp
@ -68,6 +68,53 @@ void DyadicBilinearDownsampler_c (uint8_t* pDst, const int32_t kiDstStride,
  }
 }

+void DyadicBilinearQuarterDownsampler_c (uint8_t* pDst, const int32_t kiDstStride,   // destination plane and its row stride
+    uint8_t* pSrc, const int32_t kiSrcStride,                                         // source plane and its row stride
+    const int32_t kiSrcWidth, const int32_t kiSrcHeight)                              // source dimensions; both are divided by 4 below
+// 1:4 downsample of one 8-bit plane: each output pixel is built from the top-left 2x2 pixels of a 4x4 source block.
+{
+  uint8_t* pDstLine     = pDst;
+  uint8_t* pSrcLine     = pSrc;
+  const int32_t kiSrcStridex4   = kiSrcStride << 2;   // the source advances four rows per output row
+  const int32_t kiDstWidth      = kiSrcWidth  >> 2;
+  const int32_t kiDstHeight     = kiSrcHeight >> 2;
+
+  for (int32_t j = 0; j < kiDstHeight; j ++) {
+    for (int32_t i = 0; i < kiDstWidth; i ++) {
+      const int32_t kiSrcX = i << 2;
+      const int32_t kiTempRow1 = (pSrcLine[kiSrcX] + pSrcLine[kiSrcX + 1] + 1) >> 1;   // rounded horizontal average, first row
+      const int32_t kiTempRow2 = (pSrcLine[kiSrcX + kiSrcStride] + pSrcLine[kiSrcX + kiSrcStride + 1] + 1) >> 1;   // second row
+      // Rounded vertical average of the two row results (two-stage rounding rather than one sum of four)
+      pDstLine[i] = (uint8_t) ((kiTempRow1 + kiTempRow2 + 1) >> 1);
+    }
+    pDstLine    += kiDstStride;
+    pSrcLine    += kiSrcStridex4;
+  }
+}
+
+void DyadicBilinearOneThirdDownsampler_c (uint8_t* pDst, const int32_t kiDstStride,  // destination plane and its row stride
+    uint8_t* pSrc, const int32_t kiSrcStride,                                         // source plane and its row stride
+    const int32_t kiSrcWidth, const int32_t kiDstHeight)                              // source width, but DESTINATION height (used unscaled)
+// 1:3 downsample of one 8-bit plane: each output pixel is built from the top-left 2x2 pixels of a 3x3 source block.
+{
+  uint8_t* pDstLine     = pDst;
+  uint8_t* pSrcLine     = pSrc;
+  const int32_t kiSrcStridex3   = kiSrcStride * 3;    // the source advances three rows per output row
+  const int32_t kiDstWidth      = kiSrcWidth / 3;
+
+  for (int32_t j = 0; j < kiDstHeight; j ++) {
+    for (int32_t i = 0; i < kiDstWidth; i ++) {
+      const int32_t kiSrcX = i * 3;
+      const int32_t kiTempRow1 = (pSrcLine[kiSrcX] + pSrcLine[kiSrcX + 1] + 1) >> 1;   // rounded horizontal average, first row
+      const int32_t kiTempRow2 = (pSrcLine[kiSrcX + kiSrcStride] + pSrcLine[kiSrcX + kiSrcStride + 1] + 1) >> 1;   // second row
+      // Rounded vertical average of the two row results (two-stage rounding rather than one sum of four)
+      pDstLine[i] = (uint8_t) ((kiTempRow1 + kiTempRow2 + 1) >> 1);
+    }
+    pDstLine    += kiDstStride;
+    pSrcLine    += kiSrcStridex3;
+  }
+}
+
 void GeneralBilinearFastDownsampler_c (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
                                       const int32_t kiDstHeight,
                                       uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
--- a/codec/processing/src/x86/downsample_bilinear.asm
+++ b/codec/processing/src/x86/downsample_bilinear.asm
@ -67,6 +67,22 @@ shufb_mask_high:
 add_extra_half:
    dd 16384,0,0,0

+shufb_mask_quarter:            ; pshufb-format mask (80h zeroes a lane): bytes 0,4,8,12 -> lanes 0-3, bytes 1,5,9,13 -> lanes 8-11; users are outside this excerpt
+db 00h, 04h, 08h, 0ch, 80h, 80h, 80h, 80h, 01h, 05h, 09h, 0dh, 80h, 80h, 80h, 80h
+; 1:3 masks for pshufb (80h zeroes a lane). "low" masks gather source columns 3k from three consecutive 16-byte loads.
+shufb_mask_onethird_low_1:     ; 1st 16 bytes: offsets 0,3,..,15 -> lanes 0-5
+db 00h, 03h, 06h, 09h, 0ch, 0fh, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h
+shufb_mask_onethird_low_2:     ; 2nd 16 bytes: offsets 2,5,..,14 -> lanes 6-10
+db 80h, 80h, 80h, 80h, 80h, 80h, 02h, 05h, 08h, 0bh, 0eh, 80h, 80h, 80h, 80h, 80h
+shufb_mask_onethird_low_3:     ; 3rd 16 bytes: offsets 1,4,..,13 -> lanes 11-15
+db 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 01h, 04h, 07h, 0ah, 0dh
+; "high" masks gather source columns 3k+1 from the same three loads. All masks are read with movdqa, so they must be 16-byte aligned.
+shufb_mask_onethird_high_1:    ; 1st 16 bytes: offsets 1,4,..,13 -> lanes 0-4
+db 01h, 04h, 07h, 0ah, 0dh, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h
+shufb_mask_onethird_high_2:    ; 2nd 16 bytes: offsets 0,3,..,15 -> lanes 5-10
+db 80h, 80h, 80h, 80h, 80h, 00h, 03h, 06h, 09h, 0ch, 0fh, 80h, 80h, 80h, 80h, 80h
+shufb_mask_onethird_high_3:    ; 3rd 16 bytes: offsets 2,5,..,14 -> lanes 11-15
+db 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 02h, 05h, 08h, 0bh, 0eh

 ;***********************************************************************
 ; Code
@ -1896,3 +1912,686 @@ FAST_LAST_ROW_END:
    pop     r12
    ret
 %endif
+
+;***********************************************************************
+;   void DyadicBilinearOneThirdDownsampler_ssse3(    unsigned char* pDst, const int iDstStride,
+;                   unsigned char* pSrc, const int iSrcStride,
+;                   const int iSrcWidth, const int iDstHeight );   the last argument is used as the output row count
+;***********************************************************************
+WELS_EXTERN DyadicBilinearOneThirdDownsampler_ssse3
+%ifdef X86_32
+    push r6                ; r6 is used as scratch below and is saved only in X86_32 builds
+    %assign push_num 1
+%else
+    %assign push_num 0
+%endif
+    LOAD_6_PARA            ; macro defined outside this excerpt; presumably brings the six arguments into r0..r5
+    PUSH_XMM 8             ; xmm0..xmm7 are used below (macro defined outside this excerpt)
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    SIGN_EXTENSION r4, r4d
+    SIGN_EXTENSION r5, r5d
+
+%ifndef X86_32
+    push r12
+    mov r12, r4            ; keep the source width in r12; r4 is consumed as the per-row counter
+%endif
+
+    mov r6, r1             ;Save the tail for the unaligned size: r6 = pDst + iDstStride * row count
+    imul r6, r5
+    add r6, r0
+    movdqa xmm7, [r6]      ; 16 bytes just past the last output row, restored after the loops (aligned load)
+
+.yloops_onethird_sse3:
+%ifdef X86_32
+    mov r4, arg5           ; reload the per-row counter (presumably arg5 = iSrcWidth; macro not shown)
+%else
+    mov r4, r12
+%endif
+
+    mov r6, r0        ;save base address
+    ; each loop = source bandwidth: 48 bytes
+.xloops_onethird_sse3:
+    ; 1st part horizontal loop: x48 bytes
+    ;               mem  hi<-       ->lo
+    ;1st Line Src:  xmm0: F * e E * d D * c C * b B * a A
+    ;               xmm2: k K * j J * i I * h H * g G * f
+    ;               xmm2: * p P * o O * n N * m M * l L *
+    ;
+    ;2nd Line Src:  xmm2: F' *  e' E' *  d' D' *  c' C' *  b' B' *  a' A'
+    ;               xmm1: k' K' *  j' J' *  i' I' *  h' H' *  g' G' *  f'
+    ;               xmm1: *  p' P' *  o' O' *  n' N' *  m' M' *  l' L' *
+    ;=> target:
+    ;: P O N M L K J I H G F E D C B A
+    ;: p o n m l k j i h g f e d c b a
+    ;: P' ..                          A'
+    ;: p' ..                          a'
+
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ;1st line (upper-case = columns 3k, lower-case = columns 3k+1, * = columns 3k+2, dropped)
+    movdqa xmm0, [r2]                         ;F * e E * d D * c C * b B * a A
+    movdqa xmm1, xmm0
+    movdqa xmm5, [shufb_mask_onethird_low_1]    ; masks are reloaded on every pass
+    movdqa xmm6, [shufb_mask_onethird_high_1]
+    pshufb xmm0, xmm5                           ;0 0 0 0 0 0 0 0 0 0 F E D C B A -> xmm0
+    pshufb xmm1, xmm6                           ;0 0 0 0 0 0 0 0 0 0 0 e d c b a -> xmm1
+
+    movdqa xmm2, [r2+16]                      ;k K * j J * i I * h H * g G * f
+    movdqa xmm3, xmm2
+    movdqa xmm5, [shufb_mask_onethird_low_2]
+    movdqa xmm6, [shufb_mask_onethird_high_2]
+    pshufb xmm2, xmm5                           ;0 0 0 0 0 K J I H G 0 0 0 0 0 0 -> xmm2
+    pshufb xmm3, xmm6                           ;0 0 0 0 0 k j i h g f 0 0 0 0 0 -> xmm3
+
+    paddusb xmm0, xmm2                          ;0 0 0 0 0 K J I H G F E D C B A -> xmm0 (lanes are disjoint, so the add just merges)
+    paddusb xmm1, xmm3                          ;0 0 0 0 0 k j i h g f e d c b a -> xmm1
+
+    movdqa xmm2, [r2+32]                      ;* p P * o O * n N * m M * l L *
+    movdqa xmm3, xmm2
+    movdqa xmm5, [shufb_mask_onethird_low_3]
+    movdqa xmm6, [shufb_mask_onethird_high_3]
+    pshufb xmm2, xmm5                           ;P O N M L 0 0 0 0 0 0 0 0 0 0 0 -> xmm2
+    pshufb xmm3, xmm6                           ;p o n m l 0 0 0 0 0 0 0 0 0 0 0 -> xmm3
+
+    paddusb xmm0, xmm2                          ;P O N M L K J I H G F E D C B A -> xmm0
+    paddusb xmm1, xmm3                          ;p o n m l k j i h g f e d c b a -> xmm1
+    pavgb xmm0, xmm1                            ;1st line average                -> xmm0
+
+    ;2nd line (one source stride below the 1st)
+    movdqa xmm2, [r2+r3]                      ;F' *  e' E' *  d' D' *  c' C' *  b' B' *  a' A'
+    movdqa xmm3, xmm2
+    movdqa xmm5, [shufb_mask_onethird_low_1]
+    movdqa xmm6, [shufb_mask_onethird_high_1]
+    pshufb xmm2, xmm5                           ;0 0 0 0 0 0 0 0 0 0 F' E' D' C' B' A' -> xmm2
+    pshufb xmm3, xmm6                           ;0 0 0 0 0 0 0 0 0 0 0  e' d' c' b' a' -> xmm3
+
+    movdqa xmm1, [r2+r3+16]                   ;k' K' *  j' J' *  i' I' *  h' H' *  g' G' *  f'
+    movdqa xmm4, xmm1
+    movdqa xmm5, [shufb_mask_onethird_low_2]
+    movdqa xmm6, [shufb_mask_onethird_high_2]
+    pshufb xmm1, xmm5                           ;0 0 0 0 0 K' J' I' H' G' 0  0 0 0 0 0 -> xmm1
+    pshufb xmm4, xmm6                           ;0 0 0 0 0 k' j' i' h' g' f' 0 0 0 0 0 -> xmm4
+
+    paddusb xmm2, xmm1                          ;0 0 0 0 0 K' J' I' H' G' F' E' D' C' B' A' -> xmm2
+    paddusb xmm3, xmm4                          ;0 0 0 0 0 k' j' i' h' g' f' e' d' c' b' a' -> xmm3
+
+    movdqa xmm1, [r2+r3+32]                   ; *  p' P' *  o' O' *  n' N' *  m' M' *  l' L' *
+    movdqa xmm4, xmm1
+    movdqa xmm5, [shufb_mask_onethird_low_3]
+    movdqa xmm6, [shufb_mask_onethird_high_3]
+    pshufb xmm1, xmm5                           ;P' O' N' M' L' 0 0 0 0 0 0 0 0 0 0 0 -> xmm1
+    pshufb xmm4, xmm6                           ;p' o' n' m' l' 0 0 0 0 0 0 0 0 0 0 0 -> xmm4
+
+    paddusb xmm2, xmm1                          ;P' O' N' M' L' K' J' I' H' G' F' E' D' C' B' A' -> xmm2
+    paddusb xmm3, xmm4                          ;p' o' n' m' l' k' j' i' h' g' f' e' d' c' b' a' -> xmm3
+    pavgb xmm2, xmm3                            ;2nd line average                                -> xmm2
+
+    pavgb xmm0, xmm2                            ; bytes-average(1st line , 2nd line )
+
+    ; write pDst
+    movdqa [r0], xmm0                           ;write result in dst (aligned 16-byte store)
+
+    ; next 48 source bytes / 16 destination bytes
+    lea r2, [r2+48]                             ;current src address
+    lea r0, [r0+16]                             ;current dst address
+
+    sub r4, 48                                  ;xloops counter
+    cmp r4, 0
+    jg near .xloops_onethird_sse3               ;signed test: r4 goes negative when the width is not a multiple of 48
+
+    sub r6, r0                                  ;offset = base address - current address (negative dst bytes written in this row)
+    lea r2, [r2+2*r3]                           ;
+    lea r2, [r2+r3]                             ;
+    lea r2, [r2+2*r6]                           ;current line + 3 lines
+    lea r2, [r2+r6]                             ;src rewinds by 3 * dst bytes written, i.e. back to the row start
+    lea r0, [r0+r1]
+    lea r0, [r0+r6]                             ;current dst line + 1 line
+
+    dec r5                                      ;one output row done
+    jg near .yloops_onethird_sse3
+
+    movdqa [r0], xmm7                           ;restore the tail for the unaligned size
+
+%ifndef X86_32
+    pop r12
+%endif
+
+    POP_XMM
+    LOAD_6_PARA_POP
+%ifdef X86_32
+    pop r6
+%endif
+    ret
+
+;***********************************************************************
+;   void DyadicBilinearOneThirdDownsampler_sse4(    unsigned char* pDst, const int iDstStride,
+;                   unsigned char* pSrc, const int iSrcStride,
+;                   const int iSrcWidth, const int iDstHeight );   the last argument is used as the output row count
+;***********************************************************************
+WELS_EXTERN DyadicBilinearOneThirdDownsampler_sse4
+%ifdef X86_32
+    push r6                ; r6 is used as scratch below and is saved only in X86_32 builds
+    %assign push_num 1
+%else
+    %assign push_num 0
+%endif
+    LOAD_6_PARA            ; macro defined outside this excerpt; presumably brings the six arguments into r0..r5
+    PUSH_XMM 8             ; xmm0..xmm7 are used below (macro defined outside this excerpt)
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    SIGN_EXTENSION r4, r4d
+    SIGN_EXTENSION r5, r5d
+
+%ifndef X86_32
+    push r12
+    mov r12, r4            ; keep the source width in r12; r4 is consumed as the per-row counter
+%endif
+
+    mov r6, r1             ;Save the tail for the unaligned size: r6 = pDst + iDstStride * row count
+    imul r6, r5
+    add r6, r0
+    movdqa xmm7, [r6]      ; 16 bytes just past the last output row, restored after the loops (aligned load)
+
+.yloops_onethird_sse4:
+%ifdef X86_32
+    mov r4, arg5           ; reload the per-row counter (presumably arg5 = iSrcWidth; macro not shown)
+%else
+    mov r4, r12
+%endif
+
+    mov r6, r0        ;save base address
+    ; each loop = source bandwidth: 48 bytes
+.xloops_onethird_sse4:
+    ; 1st part horizontal loop: x48 bytes
+    ;               mem  hi<-       ->lo
+    ;1st Line Src:  xmm0: F * e E * d D * c C * b B * a A
+    ;               xmm2: k K * j J * i I * h H * g G * f
+    ;               xmm2: * p P * o O * n N * m M * l L *
+    ;
+    ;2nd Line Src:  xmm2: F' *  e' E' *  d' D' *  c' C' *  b' B' *  a' A'
+    ;               xmm1: k' K' *  j' J' *  i' I' *  h' H' *  g' G' *  f'
+    ;               xmm1: *  p' P' *  o' O' *  n' N' *  m' M' *  l' L' *
+    ;=> target:
+    ;: P O N M L K J I H G F E D C B A
+    ;: p o n m l k j i h g f e d c b a
+    ;: P' ..                          A'
+    ;: p' ..                          a'
+
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ;1st line (upper-case = columns 3k, lower-case = columns 3k+1, * = columns 3k+2, dropped)
+    movntdqa xmm0, [r2]                         ;F * e E * d D * c C * b B * a A (source is read with non-temporal loads)
+    movdqa xmm1, xmm0
+    movdqa xmm5, [shufb_mask_onethird_low_1]    ; masks are reloaded on every pass
+    movdqa xmm6, [shufb_mask_onethird_high_1]
+    pshufb xmm0, xmm5                           ;0 0 0 0 0 0 0 0 0 0 F E D C B A -> xmm0
+    pshufb xmm1, xmm6                           ;0 0 0 0 0 0 0 0 0 0 0 e d c b a -> xmm1
+
+    movntdqa xmm2, [r2+16]                      ;k K * j J * i I * h H * g G * f
+    movdqa xmm3, xmm2
+    movdqa xmm5, [shufb_mask_onethird_low_2]
+    movdqa xmm6, [shufb_mask_onethird_high_2]
+    pshufb xmm2, xmm5                           ;0 0 0 0 0 K J I H G 0 0 0 0 0 0 -> xmm2
+    pshufb xmm3, xmm6                           ;0 0 0 0 0 k j i h g f 0 0 0 0 0 -> xmm3
+
+    paddusb xmm0, xmm2                          ;0 0 0 0 0 K J I H G F E D C B A -> xmm0 (lanes are disjoint, so the add just merges)
+    paddusb xmm1, xmm3                          ;0 0 0 0 0 k j i h g f e d c b a -> xmm1
+
+    movntdqa xmm2, [r2+32]                      ;* p P * o O * n N * m M * l L *
+    movdqa xmm3, xmm2
+    movdqa xmm5, [shufb_mask_onethird_low_3]
+    movdqa xmm6, [shufb_mask_onethird_high_3]
+    pshufb xmm2, xmm5                           ;P O N M L 0 0 0 0 0 0 0 0 0 0 0 -> xmm2
+    pshufb xmm3, xmm6                           ;p o n m l 0 0 0 0 0 0 0 0 0 0 0 -> xmm3
+
+    paddusb xmm0, xmm2                          ;P O N M L K J I H G F E D C B A -> xmm0
+    paddusb xmm1, xmm3                          ;p o n m l k j i h g f e d c b a -> xmm1
+    pavgb xmm0, xmm1                            ;1st line average                -> xmm0
+
+    ;2nd line (one source stride below the 1st)
+    movntdqa xmm2, [r2+r3]                      ;F' *  e' E' *  d' D' *  c' C' *  b' B' *  a' A'
+    movdqa xmm3, xmm2
+    movdqa xmm5, [shufb_mask_onethird_low_1]
+    movdqa xmm6, [shufb_mask_onethird_high_1]
+    pshufb xmm2, xmm5                           ;0 0 0 0 0 0 0 0 0 0 F' E' D' C' B' A' -> xmm2
+    pshufb xmm3, xmm6                           ;0 0 0 0 0 0 0 0 0 0 0  e' d' c' b' a' -> xmm3
+
+    movntdqa xmm1, [r2+r3+16]                   ;k' K' *  j' J' *  i' I' *  h' H' *  g' G' *  f'
+    movdqa xmm4, xmm1
+    movdqa xmm5, [shufb_mask_onethird_low_2]
+    movdqa xmm6, [shufb_mask_onethird_high_2]
+    pshufb xmm1, xmm5                           ;0 0 0 0 0 K' J' I' H' G' 0  0 0 0 0 0 -> xmm1
+    pshufb xmm4, xmm6                           ;0 0 0 0 0 k' j' i' h' g' f' 0 0 0 0 0 -> xmm4
+
+    paddusb xmm2, xmm1                          ;0 0 0 0 0 K' J' I' H' G' F' E' D' C' B' A' -> xmm2
+    paddusb xmm3, xmm4                          ;0 0 0 0 0 k' j' i' h' g' f' e' d' c' b' a' -> xmm3
+
+    movntdqa xmm1, [r2+r3+32]                   ; *  p' P' *  o' O' *  n' N' *  m' M' *  l' L' *
+    movdqa xmm4, xmm1
+    movdqa xmm5, [shufb_mask_onethird_low_3]
+    movdqa xmm6, [shufb_mask_onethird_high_3]
+    pshufb xmm1, xmm5                           ;P' O' N' M' L' 0 0 0 0 0 0 0 0 0 0 0 -> xmm1
+    pshufb xmm4, xmm6                           ;p' o' n' m' l' 0 0 0 0 0 0 0 0 0 0 0 -> xmm4
+
+    paddusb xmm2, xmm1                          ;P' O' N' M' L' K' J' I' H' G' F' E' D' C' B' A' -> xmm2
+    paddusb xmm3, xmm4                          ;p' o' n' m' l' k' j' i' h' g' f' e' d' c' b' a' -> xmm3
+    pavgb xmm2, xmm3                            ;2nd line average                                -> xmm2
+
+    pavgb xmm0, xmm2                            ; bytes-average(1st line , 2nd line )
+
+    ; write pDst
+    movdqa [r0], xmm0                           ;write result in dst (aligned 16-byte store)
+
+    ; next 48 source bytes / 16 destination bytes
+    lea r2, [r2+48]                             ;current src address
+    lea r0, [r0+16]                             ;current dst address
+
+    sub r4, 48                                  ;xloops counter
+    cmp r4, 0
+    jg near .xloops_onethird_sse4               ;signed test: r4 goes negative when the width is not a multiple of 48
+
+    sub r6, r0                                  ;offset = base address - current address (negative dst bytes written in this row)
+    lea r2, [r2+2*r3]                           ;
+    lea r2, [r2+r3]                             ;
+    lea r2, [r2+2*r6]                           ;current line + 3 lines
+    lea r2, [r2+r6]                             ;src rewinds by 3 * dst bytes written, i.e. back to the row start
+    lea r0, [r0+r1]
+    lea r0, [r0+r6]                             ;current dst line + 1 line
+
+    dec r5                                      ;one output row done
+    jg near .yloops_onethird_sse4
+
+    movdqa [r0], xmm7                           ;restore the tail for the unaligned size
+
+%ifndef X86_32
+    pop r12
+%endif
+
+    POP_XMM
+    LOAD_6_PARA_POP
+%ifdef X86_32
+    pop r6
+%endif
+    ret
+
+;***********************************************************************
+;   void DyadicBilinearQuarterDownsampler_sse( unsigned char* pDst, const int iDstStride,
+;                   unsigned char* pSrc, const int iSrcStride,
+;                   const int iSrcWidth, const int iSrcHeight );
+;
+;   1:4 downsampler built on 64-bit MMX registers (pshufw / pavgb).
+;   Every pass of the inner loop reads 32 bytes from each of two adjacent
+;   source rows and stores 8 destination bytes. Out of every four source
+;   bytes only the first two take part, so one output byte is
+;       pavgb( pavgb(row0[4x], row0[4x+1]), pavgb(row1[4x], row1[4x+1]) )
+;   After each output row pSrc advances by 4 * iSrcStride and pDst by
+;   iDstStride; the row counter starts at (iSrcHeight >> 2).
+;
+;   Both loops are bottom-tested, so one 32-byte column step and one row are
+;   always processed, whatever the sizes are.
+;
+;   The 8 bytes at pDst + iDstStride * (iSrcHeight >> 2) are read before the
+;   loops and written back afterwards. They are parked in mm7 (the loops only
+;   use mm0-mm6) rather than in xmm7: "movq xmm, m64" is an SSE2 instruction,
+;   and xmm7 is callee-saved under the Microsoft x64 convention while this
+;   prologue has no PUSH_XMM / POP_XMM pair to preserve it.
+;***********************************************************************
+WELS_EXTERN DyadicBilinearQuarterDownsampler_sse
+%ifdef X86_32
+    push r6
+    %assign push_num 1
+%else
+    %assign push_num 0
+%endif
+    LOAD_6_PARA
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    SIGN_EXTENSION r4, r4d
+    SIGN_EXTENSION r5, r5d
+
+%ifndef X86_32
+    push r12
+    mov r12, r4            ; keep iSrcWidth; r4 is consumed as the column counter
+%endif
+    sar r5, $02            ; row counter = iSrcHeight >> 2
+
+    mov r6, r1             ; r6 = pDst + iDstStride * (iSrcHeight >> 2)
+    imul r6, r5
+    add r6, r0
+    movq mm7, [r6]         ; park the 8 trailing bytes in mm7 (unused by the loops)
+
+.yloops_quarter_sse:
+%ifdef X86_32
+    mov r4, arg5           ; reload iSrcWidth from the stack
+%else
+    mov r4, r12            ; reload iSrcWidth
+%endif
+
+    mov r6, r0             ; remember where this destination row starts
+    ; each loop = source bandwidth: 32 bytes
+.xloops_quarter_sse:
+    ; 1st part of the horizontal step: source bytes 0..15 of both rows
+    ;               mem  hi<-       ->lo
+    ;1st Line Src:  mm0: d D c C b B a A    mm1: h H g G f F e E
+    ;2nd Line Src:  mm2: l L k K j J i I    mm3: p P o O n N m M
+    ;
+    ;=> target: the byte pairs (A,a) (C,c) (E,e) (G,g) of the 1st line and
+    ;           (I,i) (K,k) (M,m) (O,o) of the 2nd line; pairs b/d/f/h and
+    ;           j/l/n/p are dropped
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    movq mm0, [r2]         ; 1st pSrc line
+    movq mm1, [r2+8]       ; 1st pSrc line + 8
+    movq mm2, [r2+r3]     ; 2nd pSrc line
+    movq mm3, [r2+r3+8]   ; 2nd pSrc line + 8
+
+    pshufw mm0, mm0, 0d8h    ; x X x X c C a A
+    pshufw mm1, mm1, 0d8h    ; x X x X g G e E
+    pshufw mm2, mm2, 0d8h    ; x X x X k K i I
+    pshufw mm3, mm3, 0d8h    ; x X x X o O m M
+
+    punpckldq mm0, mm1       ; g G e E c C a A
+    punpckldq mm2, mm3       ; o O m M k K i I
+
+    ; split mm0 / mm2 into "upper-case" and "lower-case" bytes
+    pshufw mm4, mm0, 0d8h       ;g G c C e E a A
+    pshufw mm5, mm4, 04eh       ;e E a A g G c C
+    punpcklbw mm4, mm5          ;g e G E c a C A  -> mm4
+    pshufw mm4, mm4, 0d8h       ;g e c a G E C A  -> mm4
+
+    pshufw mm5, mm2, 0d8h       ;o O k K m M i I
+    pshufw mm6, mm5, 04eh       ;m M i I o O k K
+    punpcklbw mm5, mm6          ;o m O M k i K I
+    pshufw mm5, mm5, 0d8h       ;o m k i O M K I  -> mm5
+
+    ; move each half into the low dword; mm6 only supplies don't-care high bytes
+    movq mm0, mm4
+    punpckldq mm0, mm6          ;x x x x G E C A
+    punpckhdq mm4, mm6          ;x x x x g e c a
+
+    movq mm1, mm5
+    punpckldq mm1, mm6          ;x x x x O M K I
+    punpckhdq mm5, mm6          ;x x x x o m k i
+
+    ; rounded averages; only the low 4 bytes of each register are meaningful
+    pavgb mm0, mm4      ; (A+a+1)>>1, (C+c+1)>>1, (E+e+1)>>1, (G+g+1)>>1 : temp_row1
+    pavgb mm1, mm5      ; (I+i+1)>>1, (K+k+1)>>1, (M+m+1)>>1, (O+o+1)>>1 : temp_row2
+    pavgb mm0, mm1      ; (temp_row1+temp_row2+1)>>1, kept in mm0 until the 2nd part is done
+
+    ; 2nd part of the horizontal step: source bytes 16..31, same sequence
+    movq mm1, [r2+16]      ; 1st pSrc line + 16
+    movq mm2, [r2+24]      ; 1st pSrc line + 24
+    movq mm3, [r2+r3+16]  ; 2nd pSrc line + 16
+    movq mm4, [r2+r3+24]  ; 2nd pSrc line + 24
+
+    pshufw mm1, mm1, 0d8h
+    pshufw mm2, mm2, 0d8h
+    pshufw mm3, mm3, 0d8h
+    pshufw mm4, mm4, 0d8h
+
+    punpckldq mm1, mm2
+    punpckldq mm3, mm4
+
+    ; split mm1 / mm3 exactly as mm0 / mm2 above
+    pshufw mm4, mm1, 0d8h
+    pshufw mm5, mm4, 04eh
+    punpcklbw mm4, mm5
+    pshufw mm4, mm4, 0d8h
+
+    pshufw mm5, mm3, 0d8h
+    pshufw mm6, mm5, 04eh
+    punpcklbw mm5, mm6
+    pshufw mm5, mm5, 0d8h
+
+    ; move each half into the low dword
+    movq mm2, mm4
+    punpckldq mm2, mm6
+    punpckhdq mm4, mm6
+
+    movq mm3, mm5
+    punpckldq mm3, mm6
+    punpckhdq mm5, mm6
+
+    ; rounded averages for the 2nd part
+    pavgb mm2, mm4      ; 1st line pair averages : temp_row1
+    pavgb mm3, mm5      ; 2nd line pair averages : temp_row2
+    pavgb mm2, mm3      ; (temp_row1+temp_row2+1)>>1
+
+    movd [r0  ], mm0       ; 4 output bytes from the 1st part
+    movd [r0+4], mm2       ; 4 output bytes from the 2nd part
+
+    ; next 32 source bytes / 8 destination bytes
+    lea r2, [r2+32]
+    lea r0, [r0+8]
+
+    sub r4, 32
+    test r4, r4            ; same flags for jg as "cmp r4, 0"
+    jg near .xloops_quarter_sse
+
+    sub  r6, r0            ; r6 = row start - current dst = -(bytes stored on this row)
+    ; next line
+    lea r2, [r2+4*r3]    ; four source lines down ...
+    lea r2, [r2+4*r6]    ; ... and back to the first column (4 source bytes per stored byte)
+    lea r0, [r0+r1]      ; one destination line down ...
+    lea r0, [r0+r6]      ; ... and back to the first column
+
+    dec r5
+    jg near .yloops_quarter_sse
+
+    movq [r0], mm7       ; write the saved trailing bytes back (r0 is the address saved from)
+
+    WELSEMMS             ; leave MMX state only after the last mm7 access
+%ifndef X86_32
+    pop r12
+%endif
+    LOAD_6_PARA_POP
+%ifdef X86_32
+    pop r6
+%endif
+    ret
+
+;***********************************************************************
+;   void DyadicBilinearQuarterDownsampler_ssse3(   unsigned char* pDst, const int iDstStride,
+;                   unsigned char* pSrc, const int iSrcStride,
+;                   const int iSrcWidth, const int iSrcHeight );
+;
+;   1:4 downsampler, SSSE3 flavour. One column step loads 32 bytes from each
+;   of two adjacent source rows with movdqa (so both row addresses must be
+;   16-byte aligned), gathers bytes with pshufb using shufb_mask_quarter
+;   (defined elsewhere in this file), averages with pavgb and stores 8 bytes.
+;   After a row, pSrc moves down 4 * iSrcStride and pDst moves down iDstStride.
+;   Both loops are bottom-tested, so they always run at least once.
+;
+;   xmm6 holds the shuffle mask; xmm7 holds the 8 bytes found at
+;   pDst + iDstStride * (iSrcHeight >> 2), which are written back on exit.
+;***********************************************************************
+WELS_EXTERN DyadicBilinearQuarterDownsampler_ssse3
+%ifdef X86_32
+    push r6
+    %assign push_num 1
+%else
+    %assign push_num 0
+%endif
+    LOAD_6_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    SIGN_EXTENSION r4, r4d
+    SIGN_EXTENSION r5, r5d
+
+%ifndef X86_32
+    push r12
+    mov r12, r4                     ; r12 = iSrcWidth, reloaded into r4 for every row
+%endif
+    sar r5, 2                       ; r5 = row counter = iSrcHeight >> 2
+
+    ; xmm7 = 8 bytes at pDst + iDstStride * (iSrcHeight >> 2)
+    mov r6, r5
+    imul r6, r1
+    add r6, r0
+    movq xmm7, [r6]
+
+    movdqa xmm6, [shufb_mask_quarter]
+
+.row_loop_quarter_ssse3:
+%ifdef X86_32
+    mov r4, arg5                    ; r4 = iSrcWidth (column counter)
+%else
+    mov r4, r12                     ; r4 = iSrcWidth (column counter)
+%endif
+    mov r6, r0                      ; r6 = start of the current destination row
+
+.col_loop_quarter_ssse3:
+    ; Byte naming, hi <- lo, for one 16-byte load: h H g G f F e E d D c C b B a A
+    ; (the second load of a row continues with p P ... i I).
+    ; Expected pshufb result per load:  0 0 0 0 g e c a 0 0 0 0 G E C A
+
+    ; ---- upper source row -------------------------------------------------
+    movdqa xmm0, [r2]
+    pshufb xmm0, xmm6               ; 0 0 0 0 g e c a 0 0 0 0 G E C A
+    movdqa xmm1, [r2+16]
+    pshufb xmm1, xmm6               ; 0 0 0 0 o m k i 0 0 0 0 O M K I
+    movdqa xmm4, xmm0
+    punpckldq xmm0, xmm1            ; 0 0 0 0 0 0 0 0 O M K I G E C A
+    punpckhdq xmm4, xmm1            ; 0 0 0 0 0 0 0 0 o m k i g e c a
+    pavgb xmm0, xmm4                ; horizontal pair averages, upper row
+
+    ; ---- lower source row -------------------------------------------------
+    movdqa xmm2, [r2+r3]
+    pshufb xmm2, xmm6
+    movdqa xmm3, [r2+r3+16]
+    pshufb xmm3, xmm6
+    movdqa xmm5, xmm2
+    punpckldq xmm2, xmm3            ; 0 0 0 0 0 0 0 0 O M K I G E C A
+    punpckhdq xmm5, xmm3            ; 0 0 0 0 0 0 0 0 o m k i g e c a
+    pavgb xmm2, xmm5                ; horizontal pair averages, lower row
+
+    ; ---- combine both rows and store 8 output bytes -----------------------
+    pavgb xmm0, xmm2
+    movq [r0], xmm0
+
+    add r2, 32                      ; 32 source bytes consumed
+    add r0, 8                       ; 8 destination bytes produced
+
+    sub r4, 32
+    test r4, r4
+    jg near .col_loop_quarter_ssse3
+
+    ; ---- move to the next output row --------------------------------------
+    sub r6, r0                      ; r6 = -(bytes stored on this row)
+    lea r2, [r2+4*r3]               ; pSrc: down four lines ...
+    lea r2, [r2+4*r6]               ; ... and back to column 0
+    add r0, r1                      ; pDst: down one line ...
+    add r0, r6                      ; ... and back to column 0
+
+    sub r5, 1
+    jg near .row_loop_quarter_ssse3
+
+    movq [r0], xmm7                 ; put the saved trailing bytes back
+
+%ifndef X86_32
+    pop r12
+%endif
+
+    POP_XMM
+    LOAD_6_PARA_POP
+%ifdef X86_32
+    pop r6
+%endif
+    ret
+
+;***********************************************************************
+;   void DyadicBilinearQuarterDownsampler_sse4(    unsigned char* pDst, const int iDstStride,
+;                   unsigned char* pSrc, const int iSrcStride,
+;                   const int iSrcWidth, const int iSrcHeight );
+;
+;   1:4 downsampler, SSE4.1 flavour. Same arithmetic as the SSSE3 routine, but
+;   the four 16-byte source loads per column step use movntdqa (which needs
+;   16-byte aligned addresses). Bytes are gathered with pshufb through
+;   shufb_mask_quarter (defined elsewhere in this file), averaged with pavgb,
+;   and 8 bytes are stored per step. After a row, pSrc moves down
+;   4 * iSrcStride and pDst moves down iDstStride. Both loops are
+;   bottom-tested, so they always run at least once.
+;
+;   xmm6 holds the shuffle mask; xmm7 holds the 8 bytes found at
+;   pDst + iDstStride * (iSrcHeight >> 2), which are written back on exit.
+;***********************************************************************
+WELS_EXTERN DyadicBilinearQuarterDownsampler_sse4
+%ifdef X86_32
+    push r6
+    %assign push_num 1
+%else
+    %assign push_num 0
+%endif
+    LOAD_6_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    SIGN_EXTENSION r4, r4d
+    SIGN_EXTENSION r5, r5d
+
+%ifndef X86_32
+    push r12
+    mov r12, r4                     ; r12 = iSrcWidth, reloaded into r4 for every row
+%endif
+    sar r5, 2                       ; r5 = row counter = iSrcHeight >> 2
+
+    ; xmm7 = 8 bytes at pDst + iDstStride * (iSrcHeight >> 2)
+    mov r6, r5
+    imul r6, r1
+    add r6, r0
+    movq xmm7, [r6]
+
+    movdqa xmm6, [shufb_mask_quarter]
+
+.row_loop_quarter_sse4:
+%ifdef X86_32
+    mov r4, arg5                    ; r4 = iSrcWidth (column counter)
+%else
+    mov r4, r12                     ; r4 = iSrcWidth (column counter)
+%endif
+    mov r6, r0                      ; r6 = start of the current destination row
+
+.col_loop_quarter_sse4:
+    ; Byte naming, hi <- lo, for one 16-byte load: h H g G f F e E d D c C b B a A
+    ; (the second load of a row continues with p P ... i I).
+    ; Expected pshufb result per load:  0 0 0 0 g e c a 0 0 0 0 G E C A
+
+    ; ---- upper source row -------------------------------------------------
+    movntdqa xmm0, [r2]
+    pshufb xmm0, xmm6               ; 0 0 0 0 g e c a 0 0 0 0 G E C A
+    movntdqa xmm1, [r2+16]
+    pshufb xmm1, xmm6               ; 0 0 0 0 o m k i 0 0 0 0 O M K I
+    movdqa xmm4, xmm0
+    punpckldq xmm0, xmm1            ; 0 0 0 0 0 0 0 0 O M K I G E C A
+    punpckhdq xmm4, xmm1            ; 0 0 0 0 0 0 0 0 o m k i g e c a
+    pavgb xmm0, xmm4                ; horizontal pair averages, upper row
+
+    ; ---- lower source row -------------------------------------------------
+    movntdqa xmm2, [r2+r3]
+    pshufb xmm2, xmm6
+    movntdqa xmm3, [r2+r3+16]
+    pshufb xmm3, xmm6
+    movdqa xmm5, xmm2
+    punpckldq xmm2, xmm3            ; 0 0 0 0 0 0 0 0 O M K I G E C A
+    punpckhdq xmm5, xmm3            ; 0 0 0 0 0 0 0 0 o m k i g e c a
+    pavgb xmm2, xmm5                ; horizontal pair averages, lower row
+
+    ; ---- combine both rows and store 8 output bytes -----------------------
+    pavgb xmm0, xmm2
+    movq [r0], xmm0
+
+    add r2, 32                      ; 32 source bytes consumed
+    add r0, 8                       ; 8 destination bytes produced
+
+    sub r4, 32
+    test r4, r4
+    jg near .col_loop_quarter_sse4
+
+    ; ---- move to the next output row --------------------------------------
+    sub r6, r0                      ; r6 = -(bytes stored on this row)
+    lea r2, [r2+4*r3]               ; pSrc: down four lines ...
+    lea r2, [r2+4*r6]               ; ... and back to column 0 (4 source bytes per stored byte)
+    add r0, r1                      ; pDst: down one line ...
+    add r0, r6                      ; ... and back to column 0
+
+    sub r5, 1
+    jg near .row_loop_quarter_sse4
+
+    movq [r0], xmm7                 ; put the saved trailing bytes back
+
+%ifndef X86_32
+    pop r12
+%endif
+
+    POP_XMM
+    LOAD_6_PARA_POP
+%ifdef X86_32
+    pop r6
+%endif
+    ret
+
--- a/test/api/encode_decode_api_test.cpp
+++ b/test/api/encode_decode_api_test.cpp
@ -2512,7 +2512,7 @@ const uint32_t kiHeight = 96; //DO NOT CHANGE!
 const uint32_t kiFrameRate = 12; //DO NOT CHANGE!
 const uint32_t kiFrameNum = 100; //DO NOT CHANGE!
 const char* pHashStr[] = { //DO NOT CHANGE!
-  "058076b265686fc85b2b99cf7a53106f216f16c3",
+  "585663f78cadb70d9c9f179b9b53b90ffddf3178",
  "f350001c333902029800bd291fbed915a4bdf19a",
  "eb9d853b7daec03052c4850027ac94adc84c3a7e"
 };
--- a/test/api/encoder_test.cpp
+++ b/test/api/encoder_test.cpp
@ -131,7 +131,7 @@ static const EncodeFileParam kFileParamArray[] = {
  },
  {
    "res/Cisco_Absolute_Power_1280x720_30fps.yuv",
-    "a4707845cacc437fb52010eb020fca6d4bc1102d", CAMERA_VIDEO_REAL_TIME, 1280, 720, 30.0f, SM_SINGLE_SLICE, false, 4, false, false, false
+    "2b5965c752e1f722592c3ce9a1eb82445c9dbaa3", CAMERA_VIDEO_REAL_TIME, 1280, 720, 30.0f, SM_SINGLE_SLICE, false, 4, false, false, false
  },
  // the following values may be adjusted for times since we start tuning the strategy
  {
--- a/test/processing/ProcessUT_DownSample.cpp
+++ b/test/processing/ProcessUT_DownSample.cpp
@ -199,6 +199,79 @@ TEST (DownSampleTest, func) { \
  } \
 }

+// Defines the DownSampleTest case `func`: runs `func` and
+// DyadicBilinearOneThirdDownsampler_c on identically filled random buffers
+// (480x30 source, stride 560) and requires every byte of the 160x10 output
+// area to match. With a non-zero ASM the case returns without testing when
+// the detected CPU features contain none of CPUFLAGS.
+#define GENERATE_DyadicBilinearOneThirdDownsampler_UT(func, ASM, CPUFLAGS) \
+TEST (DownSampleTest, func) { \
+  if (ASM) { \
+    int32_t iCpuCores = 0; \
+    const uint32_t uiDetectedFlags = WelsCPUFeatureDetect (&iCpuCores); \
+    if ((uiDetectedFlags & CPUFLAGS) == 0) \
+      return; \
+  } \
+  /* "_c" = reference C path, "_a" = implementation under test */ \
+  ENFORCE_STACK_ALIGN_1D (uint8_t, dst_c, 50000, 16); \
+  ENFORCE_STACK_ALIGN_1D (uint8_t, src_c, 50000, 16); \
+  ENFORCE_STACK_ALIGN_1D (uint8_t, dst_a, 50000, 16); \
+  ENFORCE_STACK_ALIGN_1D (uint8_t, src_a, 50000, 16); \
+  const int dst_stride_c = 560, dst_stride_a = 560; \
+  const int src_stride_c = 560, src_stride_a = 560; \
+  const int src_width_c = 480, src_width_a = 480; \
+  const int src_height_c = 30, src_height_a = 30; \
+  /* one rand() for the destination byte, then one for the source byte */ \
+  for (int iByte = 0; iByte < 50000; ++iByte) { \
+    const uint8_t uiDstFill = static_cast<uint8_t> (rand() % 256); \
+    dst_a[iByte] = uiDstFill; \
+    dst_c[iByte] = uiDstFill; \
+    const uint8_t uiSrcFill = static_cast<uint8_t> (rand() % 256); \
+    src_a[iByte] = uiSrcFill; \
+    src_c[iByte] = uiSrcFill; \
+  } \
+  /* the last argument is the output height, i.e. source height / 3 */ \
+  const int out_rows = src_height_c / 3; \
+  const int out_cols = src_width_c / 3; \
+  DyadicBilinearOneThirdDownsampler_c (dst_c, dst_stride_c, src_c, src_stride_c, src_width_c, out_rows); \
+  func (dst_a, dst_stride_a, src_a, src_stride_a, src_width_a, src_height_a / 3); \
+  for (int j = 0; j < out_rows; j++) { \
+    for (int m = 0; m < out_cols; m++) { \
+      ASSERT_EQ (dst_c[m + j * dst_stride_c], dst_a[m + j * dst_stride_a]); \
+    } \
+  } \
+}
+
+// Defines the DownSampleTest case `func`: runs `func` and
+// DyadicBilinearQuarterDownsampler_c on identically filled random buffers and
+// requires every byte of the (width >> 2) x (height >> 2) output area to
+// match. With a non-zero ASM the case returns without testing when the
+// detected CPU features contain none of CPUFLAGS.
+//
+// Geometry: the source is 512x80 with a 560-byte stride. The width is kept a
+// multiple of 32 (the SIMD column step) and must not exceed the stride,
+// otherwise consecutive source rows would overlap; stride * height must also
+// stay within the 50000-byte buffers.
+#define GENERATE_DyadicBilinearQuarterDownsampler_UT(func, ASM, CPUFLAGS) \
+TEST (DownSampleTest, func) { \
+  if (ASM) {\
+    int32_t iCpuCores = 0; \
+    uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores); \
+    if (0 == (m_uiCpuFeatureFlag & CPUFLAGS)) \
+    return; \
+  } \
+  /* "_c" = reference C path, "_a" = implementation under test */ \
+  ENFORCE_STACK_ALIGN_1D (uint8_t, dst_c, 50000, 16); \
+  ENFORCE_STACK_ALIGN_1D (uint8_t, src_c, 50000, 16); \
+  int dst_stride_c; \
+  int src_stride_c; \
+  int src_width_c; \
+  int src_height_c; \
+  ENFORCE_STACK_ALIGN_1D (uint8_t, dst_a, 50000, 16); \
+  ENFORCE_STACK_ALIGN_1D (uint8_t, src_a, 50000, 16); \
+  int dst_stride_a; \
+  int src_stride_a; \
+  int src_width_a; \
+  int src_height_a; \
+  dst_stride_c = dst_stride_a = 560; \
+  src_stride_c = src_stride_a = 560; \
+  src_width_c = src_width_a = 512; /* <= stride, multiple of 32 */ \
+  src_height_c = src_height_a = 80; \
+  for (int j = 0; j < 50000; j++) { \
+    dst_c[j] = dst_a[j] = rand() % 256; \
+    src_c[j] = src_a[j] = rand() % 256; \
+  } \
+  DyadicBilinearQuarterDownsampler_c (dst_c, dst_stride_c, src_c, src_stride_c, src_width_c, src_height_c); \
+  func (dst_a, dst_stride_a, src_a, src_stride_a, src_width_a, src_height_a); \
+  for (int j = 0; j < (src_height_c >> 2); j++) { \
+    for (int m = 0; m < (src_width_c >> 2); m++) { \
+      ASSERT_EQ (dst_c[m + j * dst_stride_c], dst_a[m + j * dst_stride_a]); \
+    } \
+  } \
+}
 #define GENERATE_GeneralBilinearDownsampler_UT(func, ref, ASM, CPUFLAGS) \
 TEST (DownSampleTest, func) { \
  if (ASM) {\
@ -259,6 +332,13 @@ GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx16_ssse3,
 GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx32_sse4, 1, WELS_CPU_SSE41)
 GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx16_sse4, 1, WELS_CPU_SSE41)

+// One test case per SSE-family implementation of the 1:3 and 1:4 downsamplers.
+// ASM is 1, so each case returns early when the detected CPU features lack the listed flag.
+GENERATE_DyadicBilinearOneThirdDownsampler_UT (DyadicBilinearOneThirdDownsampler_ssse3, 1, WELS_CPU_SSSE3)
+GENERATE_DyadicBilinearOneThirdDownsampler_UT (DyadicBilinearOneThirdDownsampler_sse4, 1, WELS_CPU_SSE41)
+
+GENERATE_DyadicBilinearQuarterDownsampler_UT (DyadicBilinearQuarterDownsampler_sse, 1, WELS_CPU_SSE)
+GENERATE_DyadicBilinearQuarterDownsampler_UT (DyadicBilinearQuarterDownsampler_ssse3, 1, WELS_CPU_SSSE3)
+GENERATE_DyadicBilinearQuarterDownsampler_UT (DyadicBilinearQuarterDownsampler_sse4, 1, WELS_CPU_SSE41)
+
 GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearFastDownsamplerWrap_sse2, GeneralBilinearFastDownsampler_ref, 1,
                                        WELS_CPU_SSE2)
 GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_sse2,
@ -269,6 +349,10 @@ GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_s
 GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx32_neon, 1, WELS_CPU_NEON)
 GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsampler_neon, 1, WELS_CPU_NEON)

+// NEON implementations of the 1:3 and 1:4 downsamplers, each checked against the C reference.
+GENERATE_DyadicBilinearOneThirdDownsampler_UT (DyadicBilinearOneThirdDownsampler_neon, 1, WELS_CPU_NEON)
+
+GENERATE_DyadicBilinearQuarterDownsampler_UT (DyadicBilinearQuarterDownsampler_neon, 1, WELS_CPU_NEON)
+
 GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_neon,
                                        GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_NEON)
 #endif
@ -277,6 +361,10 @@ GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_n
 GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx32_AArch64_neon, 1, WELS_CPU_NEON)
 GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsampler_AArch64_neon, 1, WELS_CPU_NEON)

+// AArch64 NEON implementations of the 1:3 and 1:4 downsamplers, each checked against the C reference.
+GENERATE_DyadicBilinearOneThirdDownsampler_UT (DyadicBilinearOneThirdDownsampler_AArch64_neon, 1, WELS_CPU_NEON)
+
+GENERATE_DyadicBilinearQuarterDownsampler_UT (DyadicBilinearQuarterDownsampler_AArch64_neon, 1, WELS_CPU_NEON)
+
 GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_AArch64_neon,
                                        GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_NEON)
 #endif