[Processing/x86] DyadicBilinearDownsample optimizations

Average vertically before horizontally; horizontal averaging is more expensive. Doing the vertical averaging first reduces the number of horizontal averages by half. Use pmaddubsw and pavgw to do the horizontal averaging for a slight performance improvement. Minor tweaks. Improve the SSSE3 dyadic downsample routines and drop the SSE4 routines. The non-temporal loads used in the SSE4 routines do nothing for cache-backed memory, as far as I know. Adjust tests because averaging vertically first gives slightly different output. ~2.39x speedup for the widthx32 routine on Haswell when not memory-bound. ~2.20x speedup for the widthx16 routine on Haswell when not memory-bound. Note that the widthx16 routine can be unrolled for further speedup.
2016-06-01 23:45:44 +02:00 · 2016-06-01 23:45:44 +02:00 · 8a0af4a3f2
commit 8a0af4a3f2
parent 7cbb75eac6
7 changed files with 95 additions and 372 deletions
--- a/codec/common/x86/asm_inc.asm
+++ b/codec/common/x86/asm_inc.asm
@ -485,7 +485,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits ; Mark the stack as non-
 %endmacro

 %macro WELS_EXTERN 1
-    ALIGN 16
+    ALIGN 16, nop
    %ifdef PREFIX
        global _%1
        %define %1 _%1
--- a/codec/processing/src/downsample/downsample.cpp
+++ b/codec/processing/src/downsample/downsample.cpp
@ -102,8 +102,6 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc,  int
    sDownsampleFunc.pfGeneralRatioLuma    = GeneralBilinearFastDownsamplerWrap_ssse3;
  }
  if (iCpuFlag & WELS_CPU_SSE41) {
-    sDownsampleFunc.pfHalfAverageWidthx32 = DyadicBilinearDownsamplerWidthx32_sse4;
-    sDownsampleFunc.pfHalfAverageWidthx16 = DyadicBilinearDownsamplerWidthx16_sse4;
    sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_sse4;
    sDownsampleFunc.pfQuarterDownsampler  = DyadicBilinearQuarterDownsampler_sse4;
    sDownsampleFunc.pfGeneralRatioChroma  = GeneralBilinearAccurateDownsamplerWrap_sse41;
--- a/codec/processing/src/downsample/downsample.h
+++ b/codec/processing/src/downsample/downsample.h
@ -94,10 +94,6 @@ HalveDownsampleFunc     DyadicBilinearDownsamplerWidthx32_sse;
 HalveDownsampleFunc     DyadicBilinearDownsamplerWidthx16_ssse3;
 // iSrcWidth= x32 pixels
 HalveDownsampleFunc     DyadicBilinearDownsamplerWidthx32_ssse3;
-// iSrcWidth= x16 pixels
-HalveDownsampleFunc     DyadicBilinearDownsamplerWidthx16_sse4;
-// iSrcWidth= x32 pixels
-HalveDownsampleFunc     DyadicBilinearDownsamplerWidthx32_sse4;

 GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_sse2;
 GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse2;
--- a/codec/processing/src/x86/downsample_bilinear.asm
+++ b/codec/processing/src/x86/downsample_bilinear.asm
@ -40,6 +40,10 @@
 ;*************************************************************************/
 %include "asm_inc.asm"

+%ifdef __NASM_VER__
+    %use smartalign
+%endif
+
 ;***********************************************************************
 ; Macros and other preprocessor constants
 ;***********************************************************************
@ -471,24 +475,12 @@ WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse



-; got about 50% improvement over DyadicBilinearDownsamplerWidthx32_sse
 ;***********************************************************************
 ;   void DyadicBilinearDownsamplerWidthx32_ssse3(   unsigned char* pDst, const int iDstStride,
 ;                   unsigned char* pSrc, const int iSrcStride,
 ;                   const int iSrcWidth, const int iSrcHeight );
 ;***********************************************************************
 WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
-    ;push ebx
-    ;push edx
-    ;push esi
-    ;push edi
-    ;push ebp
-
-    ;mov edi, [esp+24]   ; pDst
-    ;mov edx, [esp+28]   ; iDstStride
-    ;mov esi, [esp+32]   ; pSrc
-    ;mov ecx, [esp+36]   ; iSrcStride
-    ;mov ebp, [esp+44]   ; iSrcHeight
 %ifdef X86_32
    push r6
    %assign push_num 1
@ -496,7 +488,7 @@ WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
    %assign push_num 0
 %endif
    LOAD_6_PARA
-    PUSH_XMM 8
+    PUSH_XMM 4
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
@ -508,96 +500,44 @@ WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
 %endif
    sar r5, $01            ; iSrcHeight >> 1

-    movdqa xmm7, [shufb_mask_low]   ; mask low
-    movdqa xmm6, [shufb_mask_high]  ; mask high
+    WELS_DB1 xmm3          ; presumably sets every byte of xmm3 to 1 (macro defined outside this view; confirm); used as the pmaddubsw multiplier in .xloops4
+    WELS_Zero xmm2         ; presumably clears xmm2 (macro defined outside this view; confirm); used as the pavgw operand in .xloops4
+    sar r4, $01            ; iSrcWidth >> 1
+    add r0, r4             ; pDst += iSrcWidth >> 1, so the inner loop can index the row with a negative offset counting up to 0

 .yloops4:
-    ;mov eax, [esp+40]   ; iSrcWidth
-    ;sar eax, $01            ; iSrcWidth >> 1
-    ;mov ebx, eax        ; iDstWidth restored at ebx
-    ;sar eax, $04            ; (iSrcWidth >> 1) / 16     ; loop count = num_of_mb
-    ;neg ebx             ; - (iSrcWidth >> 1)
 %ifdef X86_32
    mov r4, arg5
 %else
    mov r4, r12
 %endif
    sar r4, $01            ; iSrcWidth >> 1
-    mov r6, r4        ; iDstWidth restored at ebx
-    sar r4, $04            ; (iSrcWidth >> 1) / 16     ; loop count = num_of_mb
-    neg r6             ; - (iSrcWidth >> 1)
+    neg r4                 ; -(iSrcWidth >> 1)
+    mov r6, r4
+    align 16
    ; each loop = source bandwidth: 32 bytes
 .xloops4:
-    ; 1st part horizonal loop: x16 bytes
-    ;               mem  hi<-       ->lo
-    ;1st Line Src:  xmm0: h H g G f F e E d D c C b B a A
-    ;               xmm1: p P o O n N m M l L k K j J i I
-    ;2nd Line Src:  xmm2: h H g G f F e E d D c C b B a A
-    ;               xmm3: p P o O n N m M l L k K j J i I
-    ;=> target:
-    ;: P O N M L K J I H G F E D C B A
-    ;: p o n m l k j i h g f e d c b a
-    ;: P ..                          A
-    ;: p ..                          a
-
-    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-    movdqa xmm0, [r2]          ; 1st_src_line
-    movdqa xmm1, [r2+16]       ; 1st_src_line + 16
-    movdqa xmm2, [r2+r3]      ; 2nd_src_line
-    movdqa xmm3, [r2+r3+16]   ; 2nd_src_line + 16
-
-    ; packing & avg
-    movdqa xmm4, xmm0           ; h H g G f F e E d D c C b B a A
-    pshufb xmm0, xmm7           ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-    pshufb xmm4, xmm6           ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-    ; another implementation for xmm4 high bits
-;   psubb xmm4, xmm0            ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-;   psrlw xmm4, 8               ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-    pavgb xmm0, xmm4
-
-    movdqa xmm5, xmm1
-    pshufb xmm1, xmm7
-    pshufb xmm5, xmm6
-;   psubb xmm5, xmm1
-;   psrlw xmm5, 8
-    pavgb xmm1, xmm5
-
-    movdqa xmm4, xmm2
-    pshufb xmm2, xmm7
-    pshufb xmm4, xmm6
-;   psubb xmm4, xmm2
-;   psrlw xmm4, 8
-    pavgb xmm2, xmm4
-
-    movdqa xmm5, xmm3
-    pshufb xmm3, xmm7
-    pshufb xmm5, xmm6
-;   psubb xmm5, xmm3
-;   psrlw xmm5, 8
-    pavgb xmm3, xmm5
-
-    packuswb xmm0, xmm1
-    packuswb xmm2, xmm3
-    pavgb xmm0, xmm2
-
-    ; write pDst
-    movdqa [r0], xmm0
-
-    ; next SMB
-    lea r2, [r2+32]
-    lea r0, [r0+16]
-
-    dec r4
-    jg near .xloops4
+    movdqa xmm0, [r2+r3]       ; aligned load: row at pSrc + r3 (the second source line), pixels 0-15
+    movdqa xmm1, [r2+r3+16]    ; aligned load: same row, pixels 16-31
+    pavgb  xmm0, [r2]          ; rounded byte average with the first source line, pixels 0-15
+    pavgb  xmm1, [r2+16]       ; rounded byte average with the first source line, pixels 16-31
+    add r2, 32                 ; pSrc += 32 (source bytes consumed per iteration)
+    pmaddubsw xmm0, xmm3       ; multiply bytes by xmm3 and add adjacent pairs into words: horizontal sum of neighboring pixels 0-15
+    pmaddubsw xmm1, xmm3       ; same for pixels 16-31
+    pavgw xmm0, xmm2           ; word average against xmm2: (sum + 1) >> 1 when xmm2 is zero
+    pavgw xmm1, xmm2           ; (sum + 1) >> 1
+    packuswb xmm0, xmm1        ; pack the two word vectors into 16 result bytes
+    movdqa [r0+r4], xmm0       ; aligned store of 16 output pixels; r0 is the pre-advanced pDst, r4 the negative offset
+    add r4, 16                 ; step the offset by the 16 bytes just written
+    jl .xloops4                ; repeat while the offset from the add above is still negative

    ; next line
    lea r2, [r2+2*r3]    ; next end of lines
    lea r2, [r2+2*r6]    ; reset to base 0 [- 2 * iDstWidth]
    lea r0, [r0+r1]
-    lea r0, [r0+r6]      ; reset to base 0 [- iDstWidth]

-    dec r5
-    jg near .yloops4
+    sub r5, 1
+    jg .yloops4

 %ifndef X86_32
    pop r12
@ -623,7 +563,7 @@ WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
    %assign push_num 0
 %endif
    LOAD_6_PARA
-    PUSH_XMM 6
+    PUSH_XMM 4
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
@ -634,8 +574,11 @@ WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
    mov r12, r4
 %endif
    sar r5, $01            ; iSrcHeight >> 1
-    movdqa xmm5, [shufb_mask_low]   ; mask low
-    movdqa xmm4, [shufb_mask_high]  ; mask high
+    WELS_DB1 xmm3          ; presumably sets every byte of xmm3 to 1 (macro defined outside this view; confirm); used as the pmaddubsw multiplier in .xloops5
+    WELS_Zero xmm2         ; presumably clears xmm2 (macro defined outside this view; confirm); used as the pavgw operand in .xloops5
+    add r2, r4             ; pSrc += iSrcWidth, so the inner loop can address source pixels as [r2 + 2*negative offset]
+    sar r4, $01            ; iSrcWidth >> 1
+    add r0, r4             ; pDst += iSrcWidth >> 1, so the inner loop can address output as [r0 + negative offset]

 .yloops5:
 %ifdef X86_32
@ -644,279 +587,26 @@ WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
    mov r4, r12
 %endif
    sar r4, $01            ; iSrcWidth >> 1
-    mov r6, r4        ; iDstWidth restored at ebx
-    sar r4, $03            ; (iSrcWidth >> 1) / 8     ; loop count = num_of_mb
-    neg r6             ; - (iSrcWidth >> 1)
+    neg r4                 ; -(iSrcWidth >> 1)
+    lea r6, [r2+r3]        ; pSrc + iSrcStride
+    align 16
    ; each loop = source bandwidth: 16 bytes
 .xloops5:
-    ; horizonal loop: x16 bytes by source
-    ;               mem  hi<-       ->lo
-    ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
-    ;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I
-    ;=> target:
-    ;: H G F E D C B A, P O N M L K J I
-    ;: h g f e d c b a, p o n m l k j i
-
-    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-    movdqa xmm0, [r2]          ; 1st_src_line
-    movdqa xmm1, [r2+r3]      ; 2nd_src_line
-
-    ; packing & avg
-    movdqa xmm2, xmm0           ; h H g G f F e E d D c C b B a A
-    pshufb xmm0, xmm5           ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-    pshufb xmm2, xmm4           ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-    ; another implementation for xmm2 high bits
-;   psubb xmm2, xmm0            ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-;   psrlw xmm2, 8               ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-    pavgb xmm0, xmm2
-
-    movdqa xmm3, xmm1
-    pshufb xmm1, xmm5
-    pshufb xmm3, xmm4
-;   psubb xmm3, xmm1
-;   psrlw xmm3, 8
-    pavgb xmm1, xmm3
-
-    pavgb xmm0, xmm1
-    packuswb xmm0, xmm1
-
-    ; write pDst
-    movq [r0], xmm0
-
-    ; next SMB
-    lea r2, [r2+16]
-    lea r0, [r0+8]
-
-    dec r4
-    jg near .xloops5
-
-    lea r2, [r2+2*r3]    ; next end of lines
-    lea r2, [r2+2*r6]    ; reset to base 0 [- 2 * iDstWidth]
-    lea r0, [r0+r1]
-    lea r0, [r0+r6]      ; reset to base 0 [- iDstWidth]
-
-    dec r5
-    jg near .yloops5
-
-%ifndef X86_32
-    pop r12
-%endif
-
-    POP_XMM
-    LOAD_6_PARA_POP
-%ifdef X86_32
-    pop r6
-%endif
-    ret
-
-; got about 65% improvement over DyadicBilinearDownsamplerWidthx32_sse
-;***********************************************************************
-;   void DyadicBilinearDownsamplerWidthx32_sse4(    unsigned char* pDst, const int iDstStride,
-;                   unsigned char* pSrc, const int iSrcStride,
-;                   const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4
-%ifdef X86_32
-    push r6
-    %assign push_num 1
-%else
-    %assign push_num 0
-%endif
-    LOAD_6_PARA
-    PUSH_XMM 8
-    SIGN_EXTENSION r1, r1d
-    SIGN_EXTENSION r3, r3d
-    SIGN_EXTENSION r4, r4d
-    SIGN_EXTENSION r5, r5d
-
-%ifndef X86_32
-    push r12
-    mov r12, r4
-%endif
-    sar r5, $01            ; iSrcHeight >> 1
-
-    movdqa xmm7, [shufb_mask_low]   ; mask low
-    movdqa xmm6, [shufb_mask_high]  ; mask high
-
-.yloops6:
-%ifdef X86_32
-    mov r4, arg5
-%else
-    mov r4, r12
-%endif
-    sar r4, $01            ; iSrcWidth >> 1
-    mov r6, r4        ; iDstWidth restored at ebx
-    sar r4, $04            ; (iSrcWidth >> 1) / 16     ; loop count = num_of_mb
-    neg r6             ; - (iSrcWidth >> 1)
-    ; each loop = source bandwidth: 32 bytes
-.xloops6:
-    ; 1st part horizonal loop: x16 bytes
-    ;               mem  hi<-       ->lo
-    ;1st Line Src:  xmm0: h H g G f F e E d D c C b B a A
-    ;               xmm1: p P o O n N m M l L k K j J i I
-    ;2nd Line Src:  xmm2: h H g G f F e E d D c C b B a A
-    ;               xmm3: p P o O n N m M l L k K j J i I
-    ;=> target:
-    ;: P O N M L K J I H G F E D C B A
-    ;: p o n m l k j i h g f e d c b a
-    ;: P ..                          A
-    ;: p ..                          a
-
-    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-    movntdqa xmm0, [r2]            ; 1st_src_line
-    movntdqa xmm1, [r2+16]     ; 1st_src_line + 16
-    movntdqa xmm2, [r2+r3]        ; 2nd_src_line
-    movntdqa xmm3, [r2+r3+16] ; 2nd_src_line + 16
-
-    ; packing & avg
-    movdqa xmm4, xmm0           ; h H g G f F e E d D c C b B a A
-    pshufb xmm0, xmm7           ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-    pshufb xmm4, xmm6           ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-;   psubb xmm4, xmm0            ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-;   psrlw xmm4, 8               ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-    pavgb xmm0, xmm4
-
-    movdqa xmm5, xmm1
-    pshufb xmm1, xmm7
-    pshufb xmm5, xmm6
-;   psubb xmm5, xmm1
-;   psrlw xmm5, 8
-    pavgb xmm1, xmm5
-
-    movdqa xmm4, xmm2
-    pshufb xmm2, xmm7
-    pshufb xmm4, xmm6
-;   psubb xmm4, xmm2
-;   psrlw xmm4, 8
-    pavgb xmm2, xmm4
-
-    movdqa xmm5, xmm3
-    pshufb xmm3, xmm7
-    pshufb xmm5, xmm6
-;   psubb xmm5, xmm3
-;   psrlw xmm5, 8
-    pavgb xmm3, xmm5
-
-    packuswb xmm0, xmm1
-    packuswb xmm2, xmm3
-    pavgb xmm0, xmm2
-
-    ; write pDst
-    movdqa [r0], xmm0
-
-    ; next SMB
-    lea r2, [r2+32]
-    lea r0, [r0+16]
-
-    dec r4
-    jg near .xloops6
-
-    lea r2, [r2+2*r3]    ; next end of lines
-    lea r2, [r2+2*r6]    ; reset to base 0 [- 2 * iDstWidth]
-    lea r0, [r0+r1]
-    lea r0, [r0+r6]      ; reset to base 0 [- iDstWidth]
-
-    dec r5
-    jg near .yloops6
-
-%ifndef X86_32
-    pop r12
-%endif
-
-    POP_XMM
-    LOAD_6_PARA_POP
-%ifdef X86_32
-    pop r6
-%endif
-    ret
-
-;***********************************************************************
-;   void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride,
-;                     unsigned char* pSrc, const int iSrcStride,
-;                     const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4
-%ifdef X86_32
-    push r6
-    %assign push_num 1
-%else
-    %assign push_num 0
-%endif
-    LOAD_6_PARA
-    PUSH_XMM 6
-    SIGN_EXTENSION r1, r1d
-    SIGN_EXTENSION r3, r3d
-    SIGN_EXTENSION r4, r4d
-    SIGN_EXTENSION r5, r5d
-
-%ifndef X86_32
-    push r12
-    mov r12, r4
-%endif
-    sar r5, $01            ; iSrcHeight >> 1
-    movdqa xmm5, [shufb_mask_low]   ; mask low
-    movdqa xmm4, [shufb_mask_high]  ; mask high
-
-.yloops7:
-%ifdef X86_32
-    mov r4, arg5
-%else
-    mov r4, r12
-%endif
-    sar r4, $01            ; iSrcWidth >> 1
-    mov r6, r4        ; iDstWidth restored at ebx
-    sar r4, $03            ; (iSrcWidth >> 1) / 8     ; loop count = num_of_mb
-    neg r6             ; - (iSrcWidth >> 1)
-    ; each loop = source bandwidth: 16 bytes
-.xloops7:
-    ; horizonal loop: x16 bytes by source
-    ;               mem  hi<-       ->lo
-    ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
-    ;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I
-    ;=> target:
-    ;: H G F E D C B A, P O N M L K J I
-    ;: h g f e d c b a, p o n m l k j i
-
-    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-    movntdqa xmm0, [r2]            ; 1st_src_line
-    movntdqa xmm1, [r2+r3]        ; 2nd_src_line
-
-    ; packing & avg
-    movdqa xmm2, xmm0           ; h H g G f F e E d D c C b B a A
-    pshufb xmm0, xmm5           ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-    pshufb xmm2, xmm4           ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-;   psubb xmm2, xmm0            ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-;   psrlw xmm2, 8               ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-    pavgb xmm0, xmm2
-
-    movdqa xmm3, xmm1
-    pshufb xmm1, xmm5
-    pshufb xmm3, xmm4
-;   psubb xmm3, xmm1
-;   psrlw xmm3, 8
-    pavgb xmm1, xmm3
-
-    pavgb xmm0, xmm1
-    packuswb xmm0, xmm1
-
-    ; write pDst
-    movq [r0], xmm0
-
-    ; next SMB
-    lea r2, [r2+16]
-    lea r0, [r0+8]
-
-    dec r4
-    jg near .xloops7
+    movdqa xmm0, [r2+2*r4]     ; aligned load of 16 pixels; r2 is the pre-advanced pSrc and 2*r4 the negative source offset
+    pavgb  xmm0, [r6+2*r4]     ; rounded byte average with the row at r6 (= r2 + r3), i.e. avg vertical pixels
+    pmaddubsw xmm0, xmm3       ; multiply bytes by xmm3 and add adjacent pairs into words: horizontal sum of neighboring pixels
+    pavgw xmm0, xmm2           ; word average against xmm2: (sum + 1) >> 1 when xmm2 is zero
+    packuswb xmm0, xmm0        ; pack words to bytes; the low 8 bytes hold the 8 results
+    movlps [r0+r4], xmm0       ; store the low 8 bytes; r0 is the pre-advanced pDst, r4 the negative offset
+    add r4, 8                  ; step the offset by the 8 bytes just written
+    jl .xloops5                ; repeat while the offset from the add above is still negative

    ; next line
    lea r2, [r2+2*r3]    ; next end of lines
-    lea r2, [r2+2*r6]    ; reset to base 0 [- 2 * iDstWidth]
    lea r0, [r0+r1]
-    lea r0, [r0+r6]      ; reset to base 0 [- iDstWidth]

-    dec r5
-    jg near .yloops7
+    sub r5, 1
+    jg .yloops5

 %ifndef X86_32
    pop r12
--- a/test/api/decode_api_test.cpp
+++ b/test/api/decode_api_test.cpp
@ -759,9 +759,15 @@ const uint32_t kiHeight = 96; //DO NOT CHANGE!
 const uint32_t kiFrameRate = 12; //DO NOT CHANGE!
 const uint32_t kiFrameNum = 100; //DO NOT CHANGE!
 const char* pHashStr[] = { //DO NOT CHANGE!
+#ifdef X86_ASM
+  "244eebcb51f4c2a56e83fc5da3373cad9ec0e1e5",
+  "bbad99ef99e37b34bcb4f09a7ec4d144375f6be7",
+  "809f97e836650624d92f0b8e200a6ab25f810d6f"
+#else
  "9c4e6146b29bac5d5d4be3c5bbab9c072dcb3f3f",
  "f350001c333902029800bd291fbed915a4bdf19a",
  "eb9d853b7daec03052c4850027ac94adc84c3a7e"
+#endif
 };

 class DecodeParseAPI : public ::testing::TestWithParam<EncodeDecodeFileParamBase>, public EncodeDecodeTestBase {
--- a/test/api/encoder_test.cpp
+++ b/test/api/encoder_test.cpp
@ -123,7 +123,12 @@ static const EncodeFileParam kFileParamArray[] = {
  },
  {
    "res/CiscoVT2people_320x192_12fps.yuv",
-    "73156dfc1dc45924349b5b79f8debcac13d7231d", CAMERA_VIDEO_REAL_TIME, 320, 192, 12.0f, SM_SINGLE_SLICE, false, 2, false, false, false
+#ifdef X86_ASM
+    "a5341d588b769809c1f1d983e5a0fcef7362f3ad",
+#else
+    "73156dfc1dc45924349b5b79f8debcac13d7231d",
+#endif
+    CAMERA_VIDEO_REAL_TIME, 320, 192, 12.0f, SM_SINGLE_SLICE, false, 2, false, false, false
  },
  {
    "res/Cisco_Absolute_Power_1280x720_30fps.yuv",
@ -131,7 +136,12 @@ static const EncodeFileParam kFileParamArray[] = {
  },
  {
    "res/Cisco_Absolute_Power_1280x720_30fps.yuv",
-    "3943145545a2bd27a642b2045d4e3dbae55c6870", CAMERA_VIDEO_REAL_TIME, 1280, 720, 30.0f, SM_SINGLE_SLICE, false, 4, false, false, false
+#ifdef X86_ASM
+    "ec9d776a7d92cf0f6640065aee8af2450af0e993",
+#else
+    "3943145545a2bd27a642b2045d4e3dbae55c6870",
+#endif
+    CAMERA_VIDEO_REAL_TIME, 1280, 720, 30.0f, SM_SINGLE_SLICE, false, 4, false, false, false
  },
  // the following values may be adjusted for times since we start tuning the strategy
  {
--- a/test/processing/ProcessUT_DownSample.cpp
+++ b/test/processing/ProcessUT_DownSample.cpp
@ -30,6 +30,27 @@ void DyadicBilinearDownsampler_ref (uint8_t* pDst, const int32_t kiDstStride,
  }
 }

+void DyadicBilinearDownsampler2_ref (uint8_t* pDst, const int32_t kiDstStride, // Reference 2:1 downsample: average each 2x2 block vertically first, then horizontally.
+                                     const uint8_t* pSrc, const int32_t kiSrcStride, // Strides are applied as element offsets between rows.
+                                     const int32_t kiSrcWidth, const int32_t kiSrcHeight) { // Output is (kiSrcWidth >> 1) x (kiSrcHeight >> 1).
+  uint8_t* pDstLine = pDst; // current output row
+  const uint8_t* pSrcLine1 = pSrc; // upper row of the current source row pair
+  const uint8_t* pSrcLine2 = pSrc + kiSrcStride; // lower row of the current source row pair
+  const int32_t kiDstWidth  = kiSrcWidth >> 1;
+  const int32_t kiDstHeight = kiSrcHeight >> 1;
+
+  for (int32_t j = 0; j < kiDstHeight; j++) {
+    for (int32_t i = 0; i < kiDstWidth; i++) {
+      const int32_t kiTempCol1 = (pSrcLine1[2 * i + 0] + pSrcLine2[2 * i + 0] + 1) >> 1; // rounded vertical average, left column
+      const int32_t kiTempCol2 = (pSrcLine1[2 * i + 1] + pSrcLine2[2 * i + 1] + 1) >> 1; // rounded vertical average, right column
+      pDstLine[i] = (uint8_t) ((kiTempCol1 + kiTempCol2 + 1) >> 1); // rounded horizontal average of the two column averages
+    }
+    pDstLine += kiDstStride; // next output row
+    pSrcLine1 += 2 * kiSrcStride; // advance both source pointers by two rows
+    pSrcLine2 += 2 * kiSrcStride;
+  }
+}
+
 void GeneralBilinearFastDownsampler_ref (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
    const int32_t kiDstHeight,
    uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
@ -162,7 +183,7 @@ void GeneralBilinearAccurateDownsampler_ref (uint8_t* pDst, const int32_t kiDstS
  }
 }

-#define GENERATE_DyadicBilinearDownsampler_UT(func, ASM, CPUFLAGS) \
+#define GENERATE_DyadicBilinearDownsampler_UT_with_ref(func, ASM, CPUFLAGS, ref_func) \
 TEST (DownSampleTest, func) { \
  if (ASM) {\
    int32_t iCpuCores = 0; \
@ -190,7 +211,7 @@ TEST (DownSampleTest, func) { \
    dst_c[j] = dst_a[j] = rand() % 256; \
    src_c[j] = src_a[j] = rand() % 256; \
  } \
-  DyadicBilinearDownsampler_ref (dst_c, dst_stride_c, src_c, src_stride_c, src_width_c, src_height_c); \
+  ref_func (dst_c, dst_stride_c, src_c, src_stride_c, src_width_c, src_height_c); \
  func (dst_a, dst_stride_a, src_a, src_stride_a, src_width_a, src_height_a); \
  for (int j = 0; j < (src_height_c >> 1); j++) { \
    for (int m = 0; m < (src_width_c >> 1); m++) { \
@ -199,6 +220,11 @@ TEST (DownSampleTest, func) { \
  } \
 }

+#define GENERATE_DyadicBilinearDownsampler_UT(func, ASM, CPUFLAGS) \
+  GENERATE_DyadicBilinearDownsampler_UT_with_ref(func, ASM, CPUFLAGS, DyadicBilinearDownsampler_ref)
+#define GENERATE_DyadicBilinearDownsampler2_UT(func, ASM, CPUFLAGS) \
+  GENERATE_DyadicBilinearDownsampler_UT_with_ref(func, ASM, CPUFLAGS, DyadicBilinearDownsampler2_ref)
+
 #define GENERATE_DyadicBilinearOneThirdDownsampler_UT(func, ASM, CPUFLAGS) \
 TEST (DownSampleTest, func) { \
  if (ASM) {\
@ -328,11 +354,8 @@ GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx32_sse, 1,
 GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx16_sse, 1, WELS_CPU_SSE)
 GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx8_sse, 1, WELS_CPU_SSE)

-GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx32_ssse3, 1, WELS_CPU_SSSE3)
-GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx16_ssse3, 1, WELS_CPU_SSSE3)
-
-GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx32_sse4, 1, WELS_CPU_SSE41)
-GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx16_sse4, 1, WELS_CPU_SSE41)
+GENERATE_DyadicBilinearDownsampler2_UT (DyadicBilinearDownsamplerWidthx32_ssse3, 1, WELS_CPU_SSSE3)
+GENERATE_DyadicBilinearDownsampler2_UT (DyadicBilinearDownsamplerWidthx16_ssse3, 1, WELS_CPU_SSSE3)

 GENERATE_DyadicBilinearOneThirdDownsampler_UT (DyadicBilinearOneThirdDownsampler_ssse3, 1, WELS_CPU_SSSE3)
 GENERATE_DyadicBilinearOneThirdDownsampler_UT (DyadicBilinearOneThirdDownsampler_sse4, 1, WELS_CPU_SSE41)