Merge pull request #2489 from saamas/processing-dyadic-bilinear-downsample-optimizations

[Processing] DyadicBilinearDownsample optimizations
2016-06-12 10:02:55 +08:00 · 2016-06-12 10:02:55 +08:00 · 4b6f037020
commit 4b6f037020
parent 94c94ca3b1 fe4a47a979
8 changed files with 144 additions and 431 deletions
--- a/codec/common/x86/asm_inc.asm
+++ b/codec/common/x86/asm_inc.asm
@ -485,7 +485,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits ; Mark the stack as non-
 %endmacro

 %macro WELS_EXTERN 1
-    ALIGN 16
+    ALIGN 16, nop
    %ifdef PREFIX
        global _%1
        %define %1 _%1
--- a/codec/processing/src/common/util.h
+++ b/codec/processing/src/common/util.h
@ -83,10 +83,6 @@ WELSVP_NAMESPACE_BEGIN
 #define WELS_CLAMP(x, minv, maxv)  WELS_MIN(WELS_MAX(x, minv), maxv)

 #define ALIGNBYTES         (16)       /* Worst case is requiring alignment to an 16 byte boundary */
-#define WELS_ALIGN(iInput)   ((iInput+(ALIGNMENT-1)) & ~(ALIGNMENT-1))
-#define WELS_ALIGN2(iInput)  ((iInput+1) & ~1)
-#define WELS_ALIGN4(iInput)  ((iInput+3) & ~3)
-#define WELS_ALIGN8(iInput)  ((iInput+7) & ~7)

 #define WelsCastFromPointer(p)      (reinterpret_cast<intptr_t>(p))
 #define WelsStaticCast(type, p)  (static_cast<type>(p))
--- a/codec/processing/src/downsample/downsample.cpp
+++ b/codec/processing/src/downsample/downsample.cpp
@ -32,6 +32,7 @@

 #include "downsample.h"
 #include "cpu.h"
+#include <cassert>

 WELSVP_NAMESPACE_BEGIN
 #define MAX_SAMPLE_WIDTH 1920
@ -75,20 +76,18 @@ void CDownsampling::FreeSampleBuffer() {
    WelsFree (m_pSampleBuffer[i][2]);
  }
 }
+
 void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc,  int32_t iCpuFlag) {
-  sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsampler_c;
-  sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_c;
-  sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_c;
-  sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_c;
+  sDownsampleFunc.pfHalfAverageWidthx32 = DyadicBilinearDownsampler_c;
+  sDownsampleFunc.pfHalfAverageWidthx16 = DyadicBilinearDownsampler_c;
  sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_c;
  sDownsampleFunc.pfQuarterDownsampler  = DyadicBilinearQuarterDownsampler_c;
  sDownsampleFunc.pfGeneralRatioChroma  = GeneralBilinearAccurateDownsampler_c;
  sDownsampleFunc.pfGeneralRatioLuma    = GeneralBilinearFastDownsampler_c;
 #if defined(X86_ASM)
  if (iCpuFlag & WELS_CPU_SSE) {
-    sDownsampleFunc.pfHalfAverage[0]    = DyadicBilinearDownsamplerWidthx32_sse;
-    sDownsampleFunc.pfHalfAverage[1]    = DyadicBilinearDownsamplerWidthx16_sse;
-    sDownsampleFunc.pfHalfAverage[2]    = DyadicBilinearDownsamplerWidthx8_sse;
+    sDownsampleFunc.pfHalfAverageWidthx32 = DyadicBilinearDownsamplerWidthx32_sse;
+    sDownsampleFunc.pfHalfAverageWidthx16 = DyadicBilinearDownsamplerWidthx16_sse;
    sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_sse;
  }
  if (iCpuFlag & WELS_CPU_SSE2) {
@ -96,15 +95,13 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc,  int
    sDownsampleFunc.pfGeneralRatioLuma   = GeneralBilinearFastDownsamplerWrap_sse2;
  }
  if (iCpuFlag & WELS_CPU_SSSE3) {
-    sDownsampleFunc.pfHalfAverage[0]    = DyadicBilinearDownsamplerWidthx32_ssse3;
-    sDownsampleFunc.pfHalfAverage[1]    = DyadicBilinearDownsamplerWidthx16_ssse3;
+    sDownsampleFunc.pfHalfAverageWidthx32 = DyadicBilinearDownsamplerWidthx32_ssse3;
+    sDownsampleFunc.pfHalfAverageWidthx16 = DyadicBilinearDownsamplerWidthx16_ssse3;
    sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_ssse3;
    sDownsampleFunc.pfQuarterDownsampler  = DyadicBilinearQuarterDownsampler_ssse3;
    sDownsampleFunc.pfGeneralRatioLuma    = GeneralBilinearFastDownsamplerWrap_ssse3;
  }
  if (iCpuFlag & WELS_CPU_SSE41) {
-    sDownsampleFunc.pfHalfAverage[0]    = DyadicBilinearDownsamplerWidthx32_sse4;
-    sDownsampleFunc.pfHalfAverage[1]    = DyadicBilinearDownsamplerWidthx16_sse4;
    sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_sse4;
    sDownsampleFunc.pfQuarterDownsampler  = DyadicBilinearQuarterDownsampler_sse4;
    sDownsampleFunc.pfGeneralRatioChroma  = GeneralBilinearAccurateDownsamplerWrap_sse41;
@ -117,10 +114,8 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc,  int

 #if defined(HAVE_NEON)
  if (iCpuFlag & WELS_CPU_NEON) {
-    sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_neon;
-    sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_neon;
-    sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_neon;
-    sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_neon;
+    sDownsampleFunc.pfHalfAverageWidthx32 = DyadicBilinearDownsamplerWidthx32_neon;
+    sDownsampleFunc.pfHalfAverageWidthx16 = DyadicBilinearDownsampler_neon;
    sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_neon;
    sDownsampleFunc.pfQuarterDownsampler  = DyadicBilinearQuarterDownsampler_neon;
    sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_neon;
@ -130,10 +125,8 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc,  int

 #if defined(HAVE_NEON_AARCH64)
  if (iCpuFlag & WELS_CPU_NEON) {
-    sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_AArch64_neon;
-    sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_AArch64_neon;
-    sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_AArch64_neon;
-    sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_AArch64_neon;
+    sDownsampleFunc.pfHalfAverageWidthx32 = DyadicBilinearDownsamplerWidthx32_AArch64_neon;
+    sDownsampleFunc.pfHalfAverageWidthx16 = DyadicBilinearDownsampler_AArch64_neon;
    sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_AArch64_neon;
    sDownsampleFunc.pfQuarterDownsampler  = DyadicBilinearQuarterDownsampler_AArch64_neon;
    sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_AArch64_neon;
@ -159,14 +152,11 @@ EResult CDownsampling::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pDs
  if (iSrcWidthY > MAX_SAMPLE_WIDTH || iSrcHeightY > MAX_SAMPLE_HEIGHT || m_bNoSampleBuffer) {
    if ((iSrcWidthY >> 1) == iDstWidthY && (iSrcHeightY >> 1) == iDstHeightY) {
      // use half average functions
-      uint8_t iAlignIndex = GetAlignedIndex (iSrcWidthY);
-      m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0],
+      DownsampleHalfAverage ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0],
          (uint8_t*)pSrcPixMap->pPixel[0], pSrcPixMap->iStride[0], iSrcWidthY, iSrcHeightY);
-
-      iAlignIndex = GetAlignedIndex (iSrcWidthUV);
-      m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1],
+      DownsampleHalfAverage ((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1],
          (uint8_t*)pSrcPixMap->pPixel[1], pSrcPixMap->iStride[1], iSrcWidthUV, iSrcHeightUV);
-      m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2],
+      DownsampleHalfAverage ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2],
          (uint8_t*)pSrcPixMap->pPixel[2], pSrcPixMap->iStride[2], iSrcWidthUV, iSrcHeightUV);
    } else if ((iSrcWidthY >> 2) == iDstWidthY && (iSrcHeightY >> 2) == iDstHeightY) {

@ -223,29 +213,23 @@ EResult CDownsampling::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pDs
    do {
      if ((iHalfSrcWidth == iDstWidthY) && (iHalfSrcHeight == iDstHeightY)) { //end
        // use half average functions
-        uint8_t iAlignIndex = GetAlignedIndex (iSrcWidthY);
-        m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0],
+        DownsampleHalfAverage ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0],
            (uint8_t*)pSrcY, iSrcStrideY, iSrcWidthY, iSrcHeightY);
-
-        iAlignIndex = GetAlignedIndex (iSrcWidthUV);
-        m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1],
+        DownsampleHalfAverage ((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1],
            (uint8_t*)pSrcU, iSrcStrideU, iSrcWidthUV, iSrcHeightUV);
-        m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2],
+        DownsampleHalfAverage ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2],
            (uint8_t*)pSrcV, iSrcStrideV, iSrcWidthUV, iSrcHeightUV);
        break;
      } else if (((iHalfSrcWidth >> 1) >= iDstWidthY) && ((iHalfSrcHeight >> 1) >= iDstHeightY)) {
        // use half average functions
-        iDstStrideY = iHalfSrcWidth;
-        iDstStrideU = iHalfSrcWidth >> 1;
-        iDstStrideV = iHalfSrcWidth >> 1;
-        uint8_t iAlignIndex = GetAlignedIndex (iSrcWidthY);
-        m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstY, iDstStrideY,
+        iDstStrideY = WELS_ALIGN (iHalfSrcWidth, 32);
+        iDstStrideU = WELS_ALIGN (iHalfSrcWidth >> 1, 32);
+        iDstStrideV = WELS_ALIGN (iHalfSrcWidth >> 1, 32);
+        DownsampleHalfAverage ((uint8_t*)pDstY, iDstStrideY,
            (uint8_t*)pSrcY, iSrcStrideY, iSrcWidthY, iSrcHeightY);
-
-        iAlignIndex = GetAlignedIndex (iSrcWidthUV);
-        m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstU, iDstStrideU,
+        DownsampleHalfAverage ((uint8_t*)pDstU, iDstStrideU,
            (uint8_t*)pSrcU, iSrcStrideU, iSrcWidthUV, iSrcHeightUV);
-        m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstV, iDstStrideV,
+        DownsampleHalfAverage ((uint8_t*)pDstV, iDstStrideV,
            (uint8_t*)pSrcV, iSrcStrideV, iSrcWidthUV, iSrcHeightUV);

        pSrcY = (uint8_t*)pDstY;
@ -258,9 +242,9 @@ EResult CDownsampling::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pDs
        iSrcHeightY = iHalfSrcHeight;
        iSrcHeightUV = iHalfSrcHeight >> 1;

-        iSrcStrideY = iSrcWidthY;
-        iSrcStrideU = iSrcWidthUV;
-        iSrcStrideV = iSrcWidthUV;
+        iSrcStrideY = iDstStrideY;
+        iSrcStrideU = iDstStrideU;
+        iSrcStrideV = iDstStrideV;

        iHalfSrcWidth >>= 1;
        iHalfSrcHeight >>= 1;
@ -286,17 +270,18 @@ EResult CDownsampling::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pDs
  return RET_SUCCESS;
 }

-int32_t CDownsampling::GetAlignedIndex (const int32_t kiSrcWidth) {
-  int32_t iAlignIndex;
-  if ((kiSrcWidth & 0x1f) == 0)         // x32
-    iAlignIndex = 0;
-  else if ((kiSrcWidth & 0x0f) == 0)    // x16
-    iAlignIndex = 1;
-  else if ((kiSrcWidth & 0x07) == 0)    // x8
-    iAlignIndex = 2;
-  else
-    iAlignIndex = 3;
-  return iAlignIndex;
+void CDownsampling::DownsampleHalfAverage (uint8_t* pDst, int32_t iDstStride, // Picks the x32 or x16 half-average kernel from the source stride and runs it on one plane.
+        uint8_t* pSrc, int32_t iSrcStride, int32_t iSrcWidth, int32_t iSrcHeight) { // The width handed to the kernel is first made even, then passed through WELS_ALIGN.
+  if ((iSrcStride & 31) == 0) { // source stride is a multiple of 32: use the x32 kernel
+    assert ((iDstStride & 15) == 0); // debug-only check: destination stride must be a multiple of 16 on this path
+    m_pfDownsample.pfHalfAverageWidthx32 (pDst, iDstStride,
+        pSrc, iSrcStride, WELS_ALIGN (iSrcWidth & ~1, 32), iSrcHeight); // NOTE(review): WELS_ALIGN presumably rounds up to a multiple of 32, so the kernel may touch columns past iSrcWidth -- confirm stride padding covers that
+  } else { // any other source stride falls back to the x16 kernel
+    assert ((iSrcStride & 15) == 0); // debug-only check: source stride must be a multiple of 16
+    assert ((iDstStride &  7) == 0); // debug-only check: destination stride must be a multiple of 8
+    m_pfDownsample.pfHalfAverageWidthx16 (pDst, iDstStride,
+        pSrc, iSrcStride, WELS_ALIGN (iSrcWidth & ~1, 16), iSrcHeight); // NOTE(review): same presumed round-up, to a multiple of 16 here -- confirm
+  }
 }


--- a/codec/processing/src/downsample/downsample.h
+++ b/codec/processing/src/downsample/downsample.h
@ -73,8 +73,8 @@ SpecificDownsampleFunc  DyadicBilinearOneThirdDownsampler_c;
 SpecificDownsampleFunc  DyadicBilinearQuarterDownsampler_c;

 typedef struct {
-  // align_index: 0 = x32; 1 = x16; 2 = x8; 3 = common case left;
-  PHalveDownsampleFunc          pfHalfAverage[4];
+  PHalveDownsampleFunc          pfHalfAverageWidthx32;
+  PHalveDownsampleFunc          pfHalfAverageWidthx16;
  PSpecificDownsampleFunc       pfOneThirdDownsampler;
  PSpecificDownsampleFunc       pfQuarterDownsampler;
  PGeneralDownsampleFunc        pfGeneralRatioLuma;
@ -94,10 +94,6 @@ HalveDownsampleFunc     DyadicBilinearDownsamplerWidthx32_sse;
 HalveDownsampleFunc     DyadicBilinearDownsamplerWidthx16_ssse3;
 // iSrcWidth= x32 pixels
 HalveDownsampleFunc     DyadicBilinearDownsamplerWidthx32_ssse3;
-// iSrcWidth= x16 pixels
-HalveDownsampleFunc     DyadicBilinearDownsamplerWidthx16_sse4;
-// iSrcWidth= x32 pixels
-HalveDownsampleFunc     DyadicBilinearDownsamplerWidthx32_sse4;

 GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_sse2;
 GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse2;
@ -185,7 +181,8 @@ class CDownsampling : public IStrategy {
 private:
  void InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int32_t iCpuFlag);

-  int32_t GetAlignedIndex (const int32_t kiSrcWidth);
+  void DownsampleHalfAverage (uint8_t* pDst, int32_t iDstStride,
+      uint8_t* pSrc, int32_t iSrcStride, int32_t iSrcWidth, int32_t iSrcHeight);
  bool AllocateSampleBuffer();
  void FreeSampleBuffer();
 private:
--- a/codec/processing/src/x86/downsample_bilinear.asm
+++ b/codec/processing/src/x86/downsample_bilinear.asm
@ -40,6 +40,10 @@
 ;*************************************************************************/
 %include "asm_inc.asm"

+%ifdef __NASM_VER__
+    %use smartalign
+%endif
+
 ;***********************************************************************
 ; Macros and other preprocessor constants
 ;***********************************************************************
@ -471,24 +475,12 @@ WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse



-; got about 50% improvement over DyadicBilinearDownsamplerWidthx32_sse
 ;***********************************************************************
 ;   void DyadicBilinearDownsamplerWidthx32_ssse3(   unsigned char* pDst, const int iDstStride,
 ;                   unsigned char* pSrc, const int iSrcStride,
 ;                   const int iSrcWidth, const int iSrcHeight );
 ;***********************************************************************
 WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
-    ;push ebx
-    ;push edx
-    ;push esi
-    ;push edi
-    ;push ebp
-
-    ;mov edi, [esp+24]   ; pDst
-    ;mov edx, [esp+28]   ; iDstStride
-    ;mov esi, [esp+32]   ; pSrc
-    ;mov ecx, [esp+36]   ; iSrcStride
-    ;mov ebp, [esp+44]   ; iSrcHeight
 %ifdef X86_32
    push r6
    %assign push_num 1
@ -496,7 +488,7 @@ WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
    %assign push_num 0
 %endif
    LOAD_6_PARA
-    PUSH_XMM 8
+    PUSH_XMM 4
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
@ -508,96 +500,44 @@ WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
 %endif
    sar r5, $01            ; iSrcHeight >> 1

-    movdqa xmm7, [shufb_mask_low]   ; mask low
-    movdqa xmm6, [shufb_mask_high]  ; mask high
+    WELS_DB1 xmm3
+    WELS_Zero xmm2
+    sar r4, $01            ; iSrcWidth >> 1
+    add r0, r4             ; pDst += iSrcWidth >> 1

 .yloops4:
-    ;mov eax, [esp+40]   ; iSrcWidth
-    ;sar eax, $01            ; iSrcWidth >> 1
-    ;mov ebx, eax        ; iDstWidth restored at ebx
-    ;sar eax, $04            ; (iSrcWidth >> 1) / 16     ; loop count = num_of_mb
-    ;neg ebx             ; - (iSrcWidth >> 1)
 %ifdef X86_32
    mov r4, arg5
 %else
    mov r4, r12
 %endif
    sar r4, $01            ; iSrcWidth >> 1
-    mov r6, r4        ; iDstWidth restored at ebx
-    sar r4, $04            ; (iSrcWidth >> 1) / 16     ; loop count = num_of_mb
-    neg r6             ; - (iSrcWidth >> 1)
+    neg r4                 ; -(iSrcWidth >> 1)
+    mov r6, r4
+    align 16
    ; each loop = source bandwidth: 32 bytes
 .xloops4:
-    ; 1st part horizonal loop: x16 bytes
-    ;               mem  hi<-       ->lo
-    ;1st Line Src:  xmm0: h H g G f F e E d D c C b B a A
-    ;               xmm1: p P o O n N m M l L k K j J i I
-    ;2nd Line Src:  xmm2: h H g G f F e E d D c C b B a A
-    ;               xmm3: p P o O n N m M l L k K j J i I
-    ;=> target:
-    ;: P O N M L K J I H G F E D C B A
-    ;: p o n m l k j i h g f e d c b a
-    ;: P ..                          A
-    ;: p ..                          a
-
-    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-    movdqa xmm0, [r2]          ; 1st_src_line
-    movdqa xmm1, [r2+16]       ; 1st_src_line + 16
-    movdqa xmm2, [r2+r3]      ; 2nd_src_line
-    movdqa xmm3, [r2+r3+16]   ; 2nd_src_line + 16
-
-    ; packing & avg
-    movdqa xmm4, xmm0           ; h H g G f F e E d D c C b B a A
-    pshufb xmm0, xmm7           ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-    pshufb xmm4, xmm6           ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-    ; another implementation for xmm4 high bits
-;   psubb xmm4, xmm0            ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-;   psrlw xmm4, 8               ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-    pavgb xmm0, xmm4
-
-    movdqa xmm5, xmm1
-    pshufb xmm1, xmm7
-    pshufb xmm5, xmm6
-;   psubb xmm5, xmm1
-;   psrlw xmm5, 8
-    pavgb xmm1, xmm5
-
-    movdqa xmm4, xmm2
-    pshufb xmm2, xmm7
-    pshufb xmm4, xmm6
-;   psubb xmm4, xmm2
-;   psrlw xmm4, 8
-    pavgb xmm2, xmm4
-
-    movdqa xmm5, xmm3
-    pshufb xmm3, xmm7
-    pshufb xmm5, xmm6
-;   psubb xmm5, xmm3
-;   psrlw xmm5, 8
-    pavgb xmm3, xmm5
-
-    packuswb xmm0, xmm1
-    packuswb xmm2, xmm3
-    pavgb xmm0, xmm2
-
-    ; write pDst
-    movdqa [r0], xmm0
-
-    ; next SMB
-    lea r2, [r2+32]
-    lea r0, [r0+16]
-
-    dec r4
-    jg near .xloops4
+    movdqa xmm0, [r2+r3]
+    movdqa xmm1, [r2+r3+16]
+    pavgb  xmm0, [r2]          ; avg vertical pixels 0-15
+    pavgb  xmm1, [r2+16]       ; avg vertical pixels 16-31
+    add r2, 32                 ; pSrc += 32
+    pmaddubsw xmm0, xmm3       ; pairwise horizontal sum neighboring pixels 0-15
+    pmaddubsw xmm1, xmm3       ; pairwise horizontal sum neighboring pixels 16-31
+    pavgw xmm0, xmm2           ; (sum + 1) >> 1
+    pavgw xmm1, xmm2           ; (sum + 1) >> 1
+    packuswb xmm0, xmm1        ; pack words to bytes
+    movdqa [r0+r4], xmm0       ; store results
+    add r4, 16
+    jl .xloops4

    ; next line
    lea r2, [r2+2*r3]    ; next end of lines
    lea r2, [r2+2*r6]    ; reset to base 0 [- 2 * iDstWidth]
    lea r0, [r0+r1]
-    lea r0, [r0+r6]      ; reset to base 0 [- iDstWidth]

-    dec r5
-    jg near .yloops4
+    sub r5, 1
+    jg .yloops4

 %ifndef X86_32
    pop r12
@ -623,7 +563,7 @@ WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
    %assign push_num 0
 %endif
    LOAD_6_PARA
-    PUSH_XMM 6
+    PUSH_XMM 4
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
@ -634,8 +574,11 @@ WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
    mov r12, r4
 %endif
    sar r5, $01            ; iSrcHeight >> 1
-    movdqa xmm5, [shufb_mask_low]   ; mask low
-    movdqa xmm4, [shufb_mask_high]  ; mask high
+    WELS_DB1 xmm3
+    WELS_Zero xmm2
+    add r2, r4             ; pSrc += iSrcWidth
+    sar r4, $01            ; iSrcWidth >> 1
+    add r0, r4             ; pDst += iSrcWidth >> 1

 .yloops5:
 %ifdef X86_32
@ -644,279 +587,26 @@ WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
    mov r4, r12
 %endif
    sar r4, $01            ; iSrcWidth >> 1
-    mov r6, r4        ; iDstWidth restored at ebx
-    sar r4, $03            ; (iSrcWidth >> 1) / 8     ; loop count = num_of_mb
-    neg r6             ; - (iSrcWidth >> 1)
+    neg r4                 ; -(iSrcWidth >> 1)
+    lea r6, [r2+r3]        ; pSrc + iSrcStride
+    align 16
    ; each loop = source bandwidth: 16 bytes
 .xloops5:
-    ; horizonal loop: x16 bytes by source
-    ;               mem  hi<-       ->lo
-    ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
-    ;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I
-    ;=> target:
-    ;: H G F E D C B A, P O N M L K J I
-    ;: h g f e d c b a, p o n m l k j i
-
-    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-    movdqa xmm0, [r2]          ; 1st_src_line
-    movdqa xmm1, [r2+r3]      ; 2nd_src_line
-
-    ; packing & avg
-    movdqa xmm2, xmm0           ; h H g G f F e E d D c C b B a A
-    pshufb xmm0, xmm5           ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-    pshufb xmm2, xmm4           ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-    ; another implementation for xmm2 high bits
-;   psubb xmm2, xmm0            ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-;   psrlw xmm2, 8               ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-    pavgb xmm0, xmm2
-
-    movdqa xmm3, xmm1
-    pshufb xmm1, xmm5
-    pshufb xmm3, xmm4
-;   psubb xmm3, xmm1
-;   psrlw xmm3, 8
-    pavgb xmm1, xmm3
-
-    pavgb xmm0, xmm1
-    packuswb xmm0, xmm1
-
-    ; write pDst
-    movq [r0], xmm0
-
-    ; next SMB
-    lea r2, [r2+16]
-    lea r0, [r0+8]
-
-    dec r4
-    jg near .xloops5
-
-    lea r2, [r2+2*r3]    ; next end of lines
-    lea r2, [r2+2*r6]    ; reset to base 0 [- 2 * iDstWidth]
-    lea r0, [r0+r1]
-    lea r0, [r0+r6]      ; reset to base 0 [- iDstWidth]
-
-    dec r5
-    jg near .yloops5
-
-%ifndef X86_32
-    pop r12
-%endif
-
-    POP_XMM
-    LOAD_6_PARA_POP
-%ifdef X86_32
-    pop r6
-%endif
-    ret
-
-; got about 65% improvement over DyadicBilinearDownsamplerWidthx32_sse
-;***********************************************************************
-;   void DyadicBilinearDownsamplerWidthx32_sse4(    unsigned char* pDst, const int iDstStride,
-;                   unsigned char* pSrc, const int iSrcStride,
-;                   const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4
-%ifdef X86_32
-    push r6
-    %assign push_num 1
-%else
-    %assign push_num 0
-%endif
-    LOAD_6_PARA
-    PUSH_XMM 8
-    SIGN_EXTENSION r1, r1d
-    SIGN_EXTENSION r3, r3d
-    SIGN_EXTENSION r4, r4d
-    SIGN_EXTENSION r5, r5d
-
-%ifndef X86_32
-    push r12
-    mov r12, r4
-%endif
-    sar r5, $01            ; iSrcHeight >> 1
-
-    movdqa xmm7, [shufb_mask_low]   ; mask low
-    movdqa xmm6, [shufb_mask_high]  ; mask high
-
-.yloops6:
-%ifdef X86_32
-    mov r4, arg5
-%else
-    mov r4, r12
-%endif
-    sar r4, $01            ; iSrcWidth >> 1
-    mov r6, r4        ; iDstWidth restored at ebx
-    sar r4, $04            ; (iSrcWidth >> 1) / 16     ; loop count = num_of_mb
-    neg r6             ; - (iSrcWidth >> 1)
-    ; each loop = source bandwidth: 32 bytes
-.xloops6:
-    ; 1st part horizonal loop: x16 bytes
-    ;               mem  hi<-       ->lo
-    ;1st Line Src:  xmm0: h H g G f F e E d D c C b B a A
-    ;               xmm1: p P o O n N m M l L k K j J i I
-    ;2nd Line Src:  xmm2: h H g G f F e E d D c C b B a A
-    ;               xmm3: p P o O n N m M l L k K j J i I
-    ;=> target:
-    ;: P O N M L K J I H G F E D C B A
-    ;: p o n m l k j i h g f e d c b a
-    ;: P ..                          A
-    ;: p ..                          a
-
-    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-    movntdqa xmm0, [r2]            ; 1st_src_line
-    movntdqa xmm1, [r2+16]     ; 1st_src_line + 16
-    movntdqa xmm2, [r2+r3]        ; 2nd_src_line
-    movntdqa xmm3, [r2+r3+16] ; 2nd_src_line + 16
-
-    ; packing & avg
-    movdqa xmm4, xmm0           ; h H g G f F e E d D c C b B a A
-    pshufb xmm0, xmm7           ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-    pshufb xmm4, xmm6           ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-;   psubb xmm4, xmm0            ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-;   psrlw xmm4, 8               ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-    pavgb xmm0, xmm4
-
-    movdqa xmm5, xmm1
-    pshufb xmm1, xmm7
-    pshufb xmm5, xmm6
-;   psubb xmm5, xmm1
-;   psrlw xmm5, 8
-    pavgb xmm1, xmm5
-
-    movdqa xmm4, xmm2
-    pshufb xmm2, xmm7
-    pshufb xmm4, xmm6
-;   psubb xmm4, xmm2
-;   psrlw xmm4, 8
-    pavgb xmm2, xmm4
-
-    movdqa xmm5, xmm3
-    pshufb xmm3, xmm7
-    pshufb xmm5, xmm6
-;   psubb xmm5, xmm3
-;   psrlw xmm5, 8
-    pavgb xmm3, xmm5
-
-    packuswb xmm0, xmm1
-    packuswb xmm2, xmm3
-    pavgb xmm0, xmm2
-
-    ; write pDst
-    movdqa [r0], xmm0
-
-    ; next SMB
-    lea r2, [r2+32]
-    lea r0, [r0+16]
-
-    dec r4
-    jg near .xloops6
-
-    lea r2, [r2+2*r3]    ; next end of lines
-    lea r2, [r2+2*r6]    ; reset to base 0 [- 2 * iDstWidth]
-    lea r0, [r0+r1]
-    lea r0, [r0+r6]      ; reset to base 0 [- iDstWidth]
-
-    dec r5
-    jg near .yloops6
-
-%ifndef X86_32
-    pop r12
-%endif
-
-    POP_XMM
-    LOAD_6_PARA_POP
-%ifdef X86_32
-    pop r6
-%endif
-    ret
-
-;***********************************************************************
-;   void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride,
-;                     unsigned char* pSrc, const int iSrcStride,
-;                     const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4
-%ifdef X86_32
-    push r6
-    %assign push_num 1
-%else
-    %assign push_num 0
-%endif
-    LOAD_6_PARA
-    PUSH_XMM 6
-    SIGN_EXTENSION r1, r1d
-    SIGN_EXTENSION r3, r3d
-    SIGN_EXTENSION r4, r4d
-    SIGN_EXTENSION r5, r5d
-
-%ifndef X86_32
-    push r12
-    mov r12, r4
-%endif
-    sar r5, $01            ; iSrcHeight >> 1
-    movdqa xmm5, [shufb_mask_low]   ; mask low
-    movdqa xmm4, [shufb_mask_high]  ; mask high
-
-.yloops7:
-%ifdef X86_32
-    mov r4, arg5
-%else
-    mov r4, r12
-%endif
-    sar r4, $01            ; iSrcWidth >> 1
-    mov r6, r4        ; iDstWidth restored at ebx
-    sar r4, $03            ; (iSrcWidth >> 1) / 8     ; loop count = num_of_mb
-    neg r6             ; - (iSrcWidth >> 1)
-    ; each loop = source bandwidth: 16 bytes
-.xloops7:
-    ; horizonal loop: x16 bytes by source
-    ;               mem  hi<-       ->lo
-    ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
-    ;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I
-    ;=> target:
-    ;: H G F E D C B A, P O N M L K J I
-    ;: h g f e d c b a, p o n m l k j i
-
-    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-    movntdqa xmm0, [r2]            ; 1st_src_line
-    movntdqa xmm1, [r2+r3]        ; 2nd_src_line
-
-    ; packing & avg
-    movdqa xmm2, xmm0           ; h H g G f F e E d D c C b B a A
-    pshufb xmm0, xmm5           ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-    pshufb xmm2, xmm4           ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-;   psubb xmm2, xmm0            ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-;   psrlw xmm2, 8               ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-    pavgb xmm0, xmm2
-
-    movdqa xmm3, xmm1
-    pshufb xmm1, xmm5
-    pshufb xmm3, xmm4
-;   psubb xmm3, xmm1
-;   psrlw xmm3, 8
-    pavgb xmm1, xmm3
-
-    pavgb xmm0, xmm1
-    packuswb xmm0, xmm1
-
-    ; write pDst
-    movq [r0], xmm0
-
-    ; next SMB
-    lea r2, [r2+16]
-    lea r0, [r0+8]
-
-    dec r4
-    jg near .xloops7
+    movdqa xmm0, [r2+2*r4]
+    pavgb  xmm0, [r6+2*r4]     ; avg vertical pixels
+    pmaddubsw xmm0, xmm3       ; pairwise horizontal sum neighboring pixels
+    pavgw xmm0, xmm2           ; (sum + 1) >> 1
+    packuswb xmm0, xmm0        ; pack words to bytes
+    movlps [r0+r4], xmm0       ; store results
+    add r4, 8
+    jl .xloops5

    ; next line
    lea r2, [r2+2*r3]    ; next end of lines
-    lea r2, [r2+2*r6]    ; reset to base 0 [- 2 * iDstWidth]
    lea r0, [r0+r1]
-    lea r0, [r0+r6]      ; reset to base 0 [- iDstWidth]

-    dec r5
-    jg near .yloops7
+    sub r5, 1
+    jg .yloops5

 %ifndef X86_32
    pop r12
--- a/test/api/decode_api_test.cpp
+++ b/test/api/decode_api_test.cpp
@ -759,9 +759,17 @@ const uint32_t kiHeight = 96; //DO NOT CHANGE!
 const uint32_t kiFrameRate = 12; //DO NOT CHANGE!
 const uint32_t kiFrameNum = 100; //DO NOT CHANGE!
 const char* pHashStr[] = { //DO NOT CHANGE!
+// X86_ASM downsampling routines average vertically first, as opposed to
+// horizontally first, which results in different output.
+#ifdef X86_ASM
+  "244eebcb51f4c2a56e83fc5da3373cad9ec0e1e5",
+  "bbad99ef99e37b34bcb4f09a7ec4d144375f6be7",
+  "809f97e836650624d92f0b8e200a6ab25f810d6f"
+#else
  "9c4e6146b29bac5d5d4be3c5bbab9c072dcb3f3f",
  "f350001c333902029800bd291fbed915a4bdf19a",
  "eb9d853b7daec03052c4850027ac94adc84c3a7e"
+#endif
 };

 class DecodeParseAPI : public ::testing::TestWithParam<EncodeDecodeFileParamBase>, public EncodeDecodeTestBase {
--- a/test/api/encoder_test.cpp
+++ b/test/api/encoder_test.cpp
@ -123,7 +123,14 @@ static const EncodeFileParam kFileParamArray[] = {
  },
  {
    "res/CiscoVT2people_320x192_12fps.yuv",
-    "73156dfc1dc45924349b5b79f8debcac13d7231d", CAMERA_VIDEO_REAL_TIME, 320, 192, 12.0f, SM_SINGLE_SLICE, false, 2, false, false, false
+// X86_ASM downsampling routines average vertically first, as opposed to
+// horizontally first, which results in different output.
+#ifdef X86_ASM
+    "a5341d588b769809c1f1d983e5a0fcef7362f3ad",
+#else
+    "73156dfc1dc45924349b5b79f8debcac13d7231d",
+#endif
+    CAMERA_VIDEO_REAL_TIME, 320, 192, 12.0f, SM_SINGLE_SLICE, false, 2, false, false, false
  },
  {
    "res/Cisco_Absolute_Power_1280x720_30fps.yuv",
@ -131,7 +138,14 @@ static const EncodeFileParam kFileParamArray[] = {
  },
  {
    "res/Cisco_Absolute_Power_1280x720_30fps.yuv",
-    "3943145545a2bd27a642b2045d4e3dbae55c6870", CAMERA_VIDEO_REAL_TIME, 1280, 720, 30.0f, SM_SINGLE_SLICE, false, 4, false, false, false
+// X86_ASM downsampling routines average vertically first, as opposed to
+// horizontally first, which results in different output.
+#ifdef X86_ASM
+    "ec9d776a7d92cf0f6640065aee8af2450af0e993",
+#else
+    "3943145545a2bd27a642b2045d4e3dbae55c6870",
+#endif
+    CAMERA_VIDEO_REAL_TIME, 1280, 720, 30.0f, SM_SINGLE_SLICE, false, 4, false, false, false
  },
  // the following values may be adjusted for times since we start tuning the strategy
  {
--- a/test/processing/ProcessUT_DownSample.cpp
+++ b/test/processing/ProcessUT_DownSample.cpp
@ -30,6 +30,27 @@ void DyadicBilinearDownsampler_ref (uint8_t* pDst, const int32_t kiDstStride,
  }
 }

+void DyadicBilinearDownsampler2_ref (uint8_t* pDst, const int32_t kiDstStride, // Test reference for 2:1 downsampling: each 2x2 source block is averaged vertically first, then horizontally.
+                                     const uint8_t* pSrc, const int32_t kiSrcStride,
+                                     const int32_t kiSrcWidth, const int32_t kiSrcHeight) { // Writes (kiSrcWidth >> 1) x (kiSrcHeight >> 1) bytes into pDst.
+  uint8_t* pDstLine = pDst;
+  const uint8_t* pSrcLine1 = pSrc; // upper row of the current source row pair
+  const uint8_t* pSrcLine2 = pSrc + kiSrcStride; // lower row of the current source row pair
+  const int32_t kiDstWidth  = kiSrcWidth >> 1;
+  const int32_t kiDstHeight = kiSrcHeight >> 1;
+
+  for (int32_t j = 0; j < kiDstHeight; j++) {
+    for (int32_t i = 0; i < kiDstWidth; i++) {
+      const int32_t kiTempCol1 = (pSrcLine1[2 * i + 0] + pSrcLine2[2 * i + 0] + 1) >> 1; // rounded vertical average of the left column
+      const int32_t kiTempCol2 = (pSrcLine1[2 * i + 1] + pSrcLine2[2 * i + 1] + 1) >> 1; // rounded vertical average of the right column
+      pDstLine[i] = (uint8_t) ((kiTempCol1 + kiTempCol2 + 1) >> 1); // rounded average of the two column averages
+    }
+    pDstLine += kiDstStride;
+    pSrcLine1 += 2 * kiSrcStride; // both source pointers advance by two rows per output row
+    pSrcLine2 += 2 * kiSrcStride;
+  }
+}
+
 void GeneralBilinearFastDownsampler_ref (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
    const int32_t kiDstHeight,
    uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
@ -162,7 +183,7 @@ void GeneralBilinearAccurateDownsampler_ref (uint8_t* pDst, const int32_t kiDstS
  }
 }

-#define GENERATE_DyadicBilinearDownsampler_UT(func, ASM, CPUFLAGS) \
+#define GENERATE_DyadicBilinearDownsampler_UT_with_ref(func, ASM, CPUFLAGS, ref_func) \
 TEST (DownSampleTest, func) { \
  if (ASM) {\
    int32_t iCpuCores = 0; \
@ -190,7 +211,7 @@ TEST (DownSampleTest, func) { \
    dst_c[j] = dst_a[j] = rand() % 256; \
    src_c[j] = src_a[j] = rand() % 256; \
  } \
-  DyadicBilinearDownsampler_ref (dst_c, dst_stride_c, src_c, src_stride_c, src_width_c, src_height_c); \
+  ref_func (dst_c, dst_stride_c, src_c, src_stride_c, src_width_c, src_height_c); \
  func (dst_a, dst_stride_a, src_a, src_stride_a, src_width_a, src_height_a); \
  for (int j = 0; j < (src_height_c >> 1); j++) { \
    for (int m = 0; m < (src_width_c >> 1); m++) { \
@ -199,6 +220,11 @@ TEST (DownSampleTest, func) { \
  } \
 }

+#define GENERATE_DyadicBilinearDownsampler_UT(func, ASM, CPUFLAGS) \
+  GENERATE_DyadicBilinearDownsampler_UT_with_ref(func, ASM, CPUFLAGS, DyadicBilinearDownsampler_ref) // generates the test with DyadicBilinearDownsampler_ref as the expected-output function
+#define GENERATE_DyadicBilinearDownsampler2_UT(func, ASM, CPUFLAGS) \
+  GENERATE_DyadicBilinearDownsampler_UT_with_ref(func, ASM, CPUFLAGS, DyadicBilinearDownsampler2_ref) // generates the test with DyadicBilinearDownsampler2_ref (vertical-first averaging) as the expected-output function
+
 #define GENERATE_DyadicBilinearOneThirdDownsampler_UT(func, ASM, CPUFLAGS) \
 TEST (DownSampleTest, func) { \
  if (ASM) {\
@ -328,11 +354,8 @@ GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx32_sse, 1,
 GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx16_sse, 1, WELS_CPU_SSE)
 GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx8_sse, 1, WELS_CPU_SSE)

-GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx32_ssse3, 1, WELS_CPU_SSSE3)
-GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx16_ssse3, 1, WELS_CPU_SSSE3)
-
-GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx32_sse4, 1, WELS_CPU_SSE41)
-GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx16_sse4, 1, WELS_CPU_SSE41)
+GENERATE_DyadicBilinearDownsampler2_UT (DyadicBilinearDownsamplerWidthx32_ssse3, 1, WELS_CPU_SSSE3)
+GENERATE_DyadicBilinearDownsampler2_UT (DyadicBilinearDownsamplerWidthx16_ssse3, 1, WELS_CPU_SSSE3)

 GENERATE_DyadicBilinearOneThirdDownsampler_UT (DyadicBilinearOneThirdDownsampler_ssse3, 1, WELS_CPU_SSSE3)
 GENERATE_DyadicBilinearOneThirdDownsampler_UT (DyadicBilinearOneThirdDownsampler_sse4, 1, WELS_CPU_SSE41)