[Processing/x86] Add an SSE4.1 implementation of GeneralBilinearAccurateDownsample

Keep track of relative pixel offsets and utilize pshufb to efficiently
extract relevant pixels for horizontal scaling ratios <= 4.

Fall back to a generic approach for ratios > 4.

The use of blendps makes this require SSE4.1. The pshufb path can be
backported to SSSE3 and the generic path to SSE2 for a minor reduction
in performance by replacing blendps and preceding instructions with an
equivalent sequence.

The implementation assumes that data beyond the end of each line,
before the next line begins, can be dirtied — which, as far as I can
tell, is safe with the current usage of these routines.

Speedup is ~5.32x/~4.25x (32-bit/64-bit) for horizontal ratios <= 2,
~5.06x/~3.97x for ratios within (2, 4], and ~3.93x/~3.13x for ratios
> 4 when not memory-bound on Haswell as compared with the current SSE2
implementation.
This commit is contained in:
Sindre Aamås 2016-05-23 13:02:21 +02:00
parent 1995e03d91
commit b1013095b1
5 changed files with 455 additions and 0 deletions

View File

@ -107,6 +107,7 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_sse4;
sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_sse4;
sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_sse4;
sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_sse41;
}
#endif//X86_ASM

View File

@ -102,6 +102,7 @@ HalveDownsampleFunc DyadicBilinearDownsamplerWidthx32_sse4;
GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_sse2;
GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse2;
GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_ssse3;
GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse41;
SpecificDownsampleFunc DyadicBilinearOneThirdDownsampler_ssse3;
SpecificDownsampleFunc DyadicBilinearOneThirdDownsampler_sse4;
@ -118,6 +119,9 @@ void GeneralBilinearAccurateDownsampler_sse2 (uint8_t* pDst, const int32_t kiDst
void GeneralBilinearFastDownsampler_ssse3 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
uint32_t uiScaleY);
void GeneralBilinearAccurateDownsampler_sse41 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
uint32_t uiScaleY);
WELSVP_EXTERN_C_END
#endif

View File

@ -283,6 +283,7 @@ static void GeneralBilinearDownsamplerWrap (uint8_t* pDst, const int32_t kiDstSt
DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (sse2)
DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (sse2)
DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (ssse3)
DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (sse41)
#endif //X86_ASM
#ifdef HAVE_NEON

View File

@ -2766,6 +2766,166 @@ WELS_EXTERN DyadicBilinearQuarterDownsampler_sse4
paddw xmm_xfrac1, xmm_xfrac_inc
%endmacro
; Step 8 packed x positions by one increment, propagating carries from
; the 15-bit word fractions (%2) into the byte integer parts (%1).
; A fraction overflowing into bit 15 shows up as a "negative" signed
; word, which pcmpgtw against zero turns into a -1 mask; subtracting
; that mask adds the carry to the integer bytes.
; xpos_int=%1 xpos_frac=%2 inc_int=%3 inc_frac=%4 7FFFh=%5 tmp=%6
%macro SSE2_BilinearIncXposw 6
pxor %6, %6 ; tmp = 0
paddw %2, %4 ; frac += inc_frac (may overflow into bit 15)
pcmpgtw %6, %2 ; tmp word = 0xFFFF where 0 > frac, i.e. where a carry occurred
paddb %1, %3 ; int += inc_int
psubb %1, %6 ; add carry
pand %2, %5 ; frac &= 0x7FFF: back to 15 fraction bits
%endmacro
; Expand 8 words of 15-bit fractions into two registers of interleaved
; (0x7FFF - frac, frac) word pairs, the weight layout pmaddwd needs to
; blend each pair of neighboring source pixels in one instruction.
; outl=%1 outh=%2 in=%3 7FFFh=%4
%macro SSE2_UnpckXFracw 4
movdqa %1, %3
pxor %1, %4 ; frac ^ 0x7FFF == 0x7FFF - frac, since frac <= 0x7FFF
movdqa %2, %1
punpcklwd %1, %3 ; low 4 pairs: [0x7FFF - frac, frac]
punpckhwd %2, %3 ; high 4 pairs: [0x7FFF - frac, frac]
%endmacro
; Accurate vertical blend of 4 packed dwords:
; per lane, compute (data0 * frac0 + data1 * frac1) >> 29 with full
; 64-bit intermediates. pmuludq only multiplies the even dword lanes,
; so the odd lanes are first swapped into even position with pshufd and
; processed in parallel; the two half-results are then shifted so each
; lands in its own dword slot and merged with blendps (the SSE4.1
; dependency of this routine).
; res>>29=%1 data0=%2 data1=%3 frac0=%4 frac1=%5 tmp=%6
%macro SSE41_LinearAccurateInterpolateVerticalDwords 6
pshufd %1, %2, 10110001b ; odd dwords of data0 -> even lanes
pshufd %6, %3, 10110001b ; odd dwords of data1 -> even lanes
pmuludq %1, %4 ; odd lanes: data0 * frac0 (64-bit products)
pmuludq %6, %5 ; odd lanes: data1 * frac1
paddq %1, %6 ; odd-lane qword sums
pmuludq %2, %4 ; even lanes: data0 * frac0
pmuludq %3, %5 ; even lanes: data1 * frac1
paddq %2, %3 ; even-lane qword sums
psllq %1, 3 ; odd: (sum >> 29) positioned in the high dword of each qword
psrlq %2, 29 ; even: sum >> 29 positioned in the low dword of each qword
blendps %1, %2, 0101b ; low dwords from %2, high dwords from %1
%endmacro
; Emit 8 bilinear output pixels for horizontal ratios <= 2: the 8
; outputs then span at most 16 source bytes, so one unaligned 16-byte
; load per row plus a pshufb gather (indexed by the integer x offsets,
; rebased to the load address) fetches every needed pixel pair.
; Horizontal weights are 15-bit, vertical weights likewise (set up per
; row outside this macro); >> 29 in the vertical blend plus the final
; pavgw against zero implement a rounding >> 30 overall.
%macro SSE41_BilinearAccurateDownsample2xOrLess_8px 0
movdqa xmm_tmp0, xmm_xpos_int
pshufb xmm_tmp0, xmm_0 ; broadcast first integer offset byte
psubb xmm_xpos_int, xmm_tmp0 ; offsets relative to the 16-byte load below
SSE2_UnpckXFracw xmm_tmp0, xmm_tmp1, xmm_xpos_frac, xmm_7fff ; interleaved (1-f, f) weights
mov r_tmp0, i_xpos
lea i_xpos, [i_xpos + 8 * i_scalex] ; advance x by 8 output pixels
shr r_tmp0, 16 ; 16.16 -> integer source byte offset
movdqu xmm_tmp4, [p_src_row0 + r_tmp0]
pshufb xmm_tmp4, xmm_xpos_int ; gather 8 pixel pairs from row 0
movdqa xmm_tmp5, xmm_tmp4
punpcklbw xmm_tmp4, xmm_0 ; zero-extend bytes -> words
punpckhbw xmm_tmp5, xmm_0
pmaddwd xmm_tmp4, xmm_tmp0 ; horizontal blend, pixels 0-3 as dwords
pmaddwd xmm_tmp5, xmm_tmp1 ; pixels 4-7
movdqu xmm_tmp2, [p_src_row1 + r_tmp0]
pshufb xmm_tmp2, xmm_xpos_int ; same gather from row 1
movdqa xmm_tmp3, xmm_tmp2
punpcklbw xmm_tmp2, xmm_0
punpckhbw xmm_tmp3, xmm_0
pmaddwd xmm_tmp2, xmm_tmp0
pmaddwd xmm_tmp3, xmm_tmp1
SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp0, xmm_tmp4, xmm_tmp2, xmm_yfrac0, xmm_yfrac1, xmm_tmp1
SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp1, xmm_tmp5, xmm_tmp3, xmm_yfrac0, xmm_yfrac1, xmm_tmp2
packssdw xmm_tmp0, xmm_tmp1 ; dwords -> words
pavgw xmm_tmp0, xmm_0 ; (x + 1) >> 1: round away the last fraction bit
packuswb xmm_tmp0, xmm_tmp0 ; words -> bytes
movlps [p_dst], xmm_tmp0 ; store 8 output pixels
add p_dst, 8
SSE2_BilinearIncXposw xmm_xpos_int, xmm_xpos_frac, xmm_xpos_int_inc, xmm_xpos_frac_inc, xmm_7fff, xmm_tmp0
%endmacro
; Emit 8 bilinear output pixels for horizontal ratios <= 4: the 8
; outputs can span up to 32 source bytes, so two 16-byte loads per row
; are used (at i_xpos and i_xpos + 4 * i_scalex), each covering 4
; output pixels. The integer offsets are rebased per half via the
; shufb_0000000088888888 broadcast (byte 0 for the low half, byte 8 for
; the high half). Interleaving the offset bytes with 0x80 (db80h_128)
; builds pshufb controls that gather AND zero-extend to words in one
; step, since an index with the top bit set writes a zero byte.
%macro SSE41_BilinearAccurateDownsample4xOrLess_8px 0
movdqa xmm_tmp0, xmm_xpos_int
pshufb xmm_tmp0, [shufb_0000000088888888] ; per-half base offset
psubb xmm_xpos_int, xmm_tmp0 ; offsets relative to each half's load
SSE2_UnpckXFracw xmm_tmp0, xmm_tmp1, xmm_xpos_frac, xmm_7fff ; interleaved (1-f, f) weights
mov r_tmp0, i_xpos
shr r_tmp0, 16 ; byte offset for pixels 0-3
movdqa xmm_tmp3, xmm_xpos_int
punpcklbw xmm_tmp3, [db80h_128] ; gather+zero-extend control, low half
movdqu xmm_tmp4, [p_src_row0 + r_tmp0]
movdqu xmm_tmp2, [p_src_row1 + r_tmp0]
lea r_tmp0, [i_xpos + 4 * i_scalex] ; byte offset for pixels 4-7
lea i_xpos, [i_xpos + 8 * i_scalex] ; advance x by 8 output pixels
shr r_tmp0, 16
pshufb xmm_tmp4, xmm_tmp3 ; row 0 pixel pairs as words
pshufb xmm_tmp2, xmm_tmp3 ; row 1 pixel pairs as words
pmaddwd xmm_tmp4, xmm_tmp0 ; horizontal blend, pixels 0-3
pmaddwd xmm_tmp2, xmm_tmp0
SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp0, xmm_tmp4, xmm_tmp2, xmm_yfrac0, xmm_yfrac1, xmm_tmp3
movdqa xmm_tmp2, xmm_xpos_int
punpckhbw xmm_tmp2, [db80h_128] ; gather+zero-extend control, high half
movdqu xmm_tmp4, [p_src_row0 + r_tmp0]
movdqu xmm_tmp3, [p_src_row1 + r_tmp0]
pshufb xmm_tmp4, xmm_tmp2
pshufb xmm_tmp3, xmm_tmp2
pmaddwd xmm_tmp4, xmm_tmp1 ; horizontal blend, pixels 4-7
pmaddwd xmm_tmp3, xmm_tmp1
SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp1, xmm_tmp4, xmm_tmp3, xmm_yfrac0, xmm_yfrac1, xmm_tmp2
packssdw xmm_tmp0, xmm_tmp1 ; dwords -> words
pavgw xmm_tmp0, xmm_0 ; round away the last fraction bit
packuswb xmm_tmp0, xmm_tmp0
movlps [p_dst], xmm_tmp0 ; store 8 output pixels
add p_dst, 8
SSE2_BilinearIncXposw xmm_xpos_int, xmm_xpos_frac, xmm_xpos_int_inc, xmm_xpos_frac_inc, xmm_7fff, xmm_tmp0
%endmacro
; Emit 8 bilinear output pixels for arbitrary horizontal ratios > 4:
; pixel pairs are gathered one at a time with movd/pinsrw from scalar
; x positions, 4 pixels per half. Each pinsrw reads a 16-bit pair
; (pixel and its right neighbor); the initial movd's upper two bytes
; are overwritten by the slot-1 pinsrw. Pixel 3 of each half is
; addressed as (i_xpos already advanced by 4 steps) - i_scalex because
; base + 3*index is not an encodable lea form.
; xmm_xfrac0/xmm_xfrac1 hold interleaved (0x7FFF - f, f) weight pairs
; for pixels 0-3 / 4-7 and are stepped at the bottom by the interleaved
; (-inc, inc) increment, re-wrapped to 15 bits by the pand.
%macro SSE41_GeneralBilinearAccurateDownsample_8px 0
mov r_tmp0, i_xpos
shr r_tmp0, 16
movd xmm_tmp4, [p_src_row0 + r_tmp0] ; pair 0, row 0 (bytes 2-3 replaced below)
movd xmm_tmp2, [p_src_row1 + r_tmp0] ; pair 0, row 1
lea r_tmp0, [i_xpos + 1 * i_scalex]
shr r_tmp0, 16
pinsrw xmm_tmp4, [p_src_row0 + r_tmp0], 1 ; pair 1
pinsrw xmm_tmp2, [p_src_row1 + r_tmp0], 1
lea r_tmp0, [i_xpos + 2 * i_scalex]
lea i_xpos, [i_xpos + 4 * i_scalex] ; advance x by 4 output pixels
shr r_tmp0, 16
pinsrw xmm_tmp4, [p_src_row0 + r_tmp0], 2 ; pair 2
pinsrw xmm_tmp2, [p_src_row1 + r_tmp0], 2
mov r_tmp0, i_xpos
sub r_tmp0, i_scalex ; old i_xpos + 3 * i_scalex
shr r_tmp0, 16
pinsrw xmm_tmp4, [p_src_row0 + r_tmp0], 3 ; pair 3
pinsrw xmm_tmp2, [p_src_row1 + r_tmp0], 3
punpcklbw xmm_tmp4, xmm_0 ; zero-extend bytes -> words
punpcklbw xmm_tmp2, xmm_0
pmaddwd xmm_tmp4, xmm_xfrac0 ; horizontal blend, pixels 0-3
pmaddwd xmm_tmp2, xmm_xfrac0
SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp0, xmm_tmp4, xmm_tmp2, xmm_yfrac0, xmm_yfrac1, xmm_tmp3
mov r_tmp0, i_xpos
shr r_tmp0, 16
movd xmm_tmp4, [p_src_row0 + r_tmp0] ; pair 4
movd xmm_tmp3, [p_src_row1 + r_tmp0]
lea r_tmp0, [i_xpos + 1 * i_scalex]
shr r_tmp0, 16
pinsrw xmm_tmp4, [p_src_row0 + r_tmp0], 1 ; pair 5
pinsrw xmm_tmp3, [p_src_row1 + r_tmp0], 1
lea r_tmp0, [i_xpos + 2 * i_scalex]
lea i_xpos, [i_xpos + 4 * i_scalex] ; advance x by the remaining 4 pixels
shr r_tmp0, 16
pinsrw xmm_tmp4, [p_src_row0 + r_tmp0], 2 ; pair 6
pinsrw xmm_tmp3, [p_src_row1 + r_tmp0], 2
mov r_tmp0, i_xpos
sub r_tmp0, i_scalex
shr r_tmp0, 16
pinsrw xmm_tmp4, [p_src_row0 + r_tmp0], 3 ; pair 7
pinsrw xmm_tmp3, [p_src_row1 + r_tmp0], 3
punpcklbw xmm_tmp4, xmm_0
punpcklbw xmm_tmp3, xmm_0
pmaddwd xmm_tmp4, xmm_xfrac1 ; horizontal blend, pixels 4-7
pmaddwd xmm_tmp3, xmm_xfrac1
SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp1, xmm_tmp4, xmm_tmp3, xmm_yfrac0, xmm_yfrac1, xmm_tmp2
packssdw xmm_tmp0, xmm_tmp1 ; dwords -> words
pavgw xmm_tmp0, xmm_0 ; round away the last fraction bit
packuswb xmm_tmp0, xmm_tmp0
movlps [p_dst], xmm_tmp0 ; store 8 output pixels
add p_dst, 8
paddw xmm_xfrac0, xmm_xfrac_inc ; step weight pairs by (-inc, inc)
paddw xmm_xfrac1, xmm_xfrac_inc
pand xmm_xfrac0, xmm_7fff ; wrap to 15 bits
pand xmm_xfrac1, xmm_7fff
%endmacro
; downsample_8px_macro=%1 b_fast=%2
%macro SSE2_GeneralBilinearDownsampler_loop 2
%%height:
@ -3116,3 +3276,290 @@ WELS_EXTERN GeneralBilinearFastDownsampler_ssse3
%undef xmm_xfrac0_begin
%undef xmm_xfrac1_begin
%undef xmm_xfrac_inc
;**************************************************************************************************************
;void GeneralBilinearAccurateDownsampler_sse41 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
; int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
; uint32_t uiScaleY);
;
; General accurate bilinear downsampler, SSE4.1. x positions are kept
; in 16.16 fixed point (after the uiScaleX doubling below); fractional
; blend weights are reduced to 15 bits so pmaddwd can be used on signed
; words. Dispatches to one of three per-row kernels depending on the
; horizontal ratio (<= 2, <= 4, or generic). May write up to a full
; 8-pixel group past iDstWidth on each line (callers must tolerate
; dirtying between lines, per the commit message).
;**************************************************************************************************************
WELS_EXTERN GeneralBilinearAccurateDownsampler_sse41
%assign push_num 0
%ifndef X86_32
; Save the callee-saved GPRs used below (SysV and Win64).
push r12
push r13
push rbx
push rbp
%assign push_num 4
%ifdef WIN64
; rdi/rsi are callee-saved on Win64 only.
push rdi
push rsi
%assign push_num push_num + 2
%endif
%endif
LOAD_7_PARA
PUSH_XMM 16 ; preserves xmm6-xmm15 where the ABI requires it (Win64)
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r2, r2d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r5, r5d
ZERO_EXTENSION r6d
sub r1, r2 ; dst_stride - dst_width
add r6, r6 ; 2 * scalex
%ifdef X86_32
; Only 8 xmm regs on x86_32: spill scalar state and several vector
; constants to an aligned stack frame (8 dwords + 8 xmm slots).
movd xmm0, arg8 ; grab uiScaleY before esp moves
movd xmm1, esp ; stash original esp
and esp, -16 ; 16-byte align for movdqa spills
sub esp, 8 * 4 + 8 * 16
movd [esp], xmm1 ; [esp] = original esp, restored in .done
%define p_dst r0
%define i_dst_stride_less_width [esp + 1 * 4]
%define i_dst_width [esp + 2 * 4]
%define i_dst_height dword [esp + 3 * 4]
%define p_src [esp + 4 * 4]
%define i_src_stride [esp + 5 * 4]
%define i_scalex r6
%define i_scalexd r6d
%define i_scaleyd [esp + 6 * 4]
%define i_xpos r2
%define i_ypos dword [esp + 7 * 4]
%define i_yposd dword [esp + 7 * 4]
%define p_src_row0 r3
%define p_src_row1 r4
%define i_width_cnt r5
%define r_tmp0 r1
%define r_tmp0b r1b
%define xmm_xpos_frac xmm1
%define xmm_xpos_frac_inc [esp + 8 * 4]
%define xmm_xpos_int xmm3
%define xmm_xpos_int_inc [esp + 8 * 4 + 1 * 16]
%define xmm_yfrac0 [esp + 8 * 4 + 2 * 16]
%define xmm_yfrac1 [esp + 8 * 4 + 3 * 16]
%define xmm_tmp0 xmm7
%define xmm_tmp1 xmm0
%define xmm_tmp2 xmm2
%define xmm_tmp3 xmm4
%define xmm_tmp4 xmm5
%define xmm_tmp5 xmm6
%define xmm_0 [esp + 8 * 4 + 4 * 16]
%define xmm_7fff [esp + 8 * 4 + 5 * 16]
%define xmm_xpos_int_begin [esp + 8 * 4 + 6 * 16]
%define xmm_xpos_frac_begin [esp + 8 * 4 + 7 * 16]
; Spill the scalar parameters and materialize the 0 and 0x7FFF word
; constants into their stack slots.
mov i_dst_stride_less_width, r1
mov i_dst_width, r2
mov i_dst_height, r3
mov p_src, r4
mov i_src_stride, r5
movd i_scaleyd, xmm0
pxor xmm_tmp5, xmm_tmp5
movdqa xmm_0, xmm_tmp5
pcmpeqw xmm_tmp5, xmm_tmp5
psrlw xmm_tmp5, 1 ; all words = 0x7FFF
movdqa xmm_7fff, xmm_tmp5
%else
; 64-bit: everything lives in registers.
%define p_dst r0
%define i_dst_stride_less_width r1
%define i_dst_width r2
%define i_dst_height r3
%define p_src r4
%define i_src_stride r5
%define i_scalex r6
%define i_scalexd r6d
%define i_scaleyd dword arg8d
%define i_xpos r12
%define i_ypos r13
%define i_yposd r13d
%define p_src_row0 rbp
%ifdef WIN64
%define p_src_row1 rsi
%define i_width_cnt rdi
%else
%define p_src_row1 r11
%define i_width_cnt rax
%endif
%define r_tmp0 rbx
%define r_tmp0b bl
%define xmm_0 xmm0
%define xmm_xpos_frac xmm1
%define xmm_xpos_frac_inc xmm8
%define xmm_xpos_int xmm3
%define xmm_xpos_int_inc xmm10
%define xmm_yfrac0 xmm11
%define xmm_yfrac1 xmm12
%define xmm_tmp0 xmm7
%define xmm_tmp1 xmm2
%define xmm_tmp2 xmm9
%define xmm_tmp3 xmm4
%define xmm_tmp4 xmm5
%define xmm_tmp5 xmm6
%define xmm_7fff xmm13
%define xmm_xpos_int_begin xmm14
%define xmm_xpos_frac_begin xmm15
pxor xmm_0, xmm_0
pcmpeqw xmm_7fff, xmm_7fff
psrlw xmm_7fff, 1 ; all words = 0x7FFF
%endif
; All rows but the last are bilinear; the last row is handled by
; .final_row. NOTE(review): if iDstHeight == 1 we branch to .final_row
; before i_ypos is initialized (that happens just below), so the source
; row selection there would be indeterminate — presumably callers never
; pass iDstHeight <= 1 for this to matter; confirm against the wrapper.
sub i_dst_height, 1
je .final_row
jl .done
mov i_ypos, 1 << 14 ; y = 0.5 in 1.15 fixed point
; Build the initial per-lane x positions: dword lane i of
; xpos_frac/tmp0 becomes i * scalex + 0x8000 for i = 0..7
; (half-pixel rounding offset in 16.16).
movd xmm_xpos_frac, i_scalexd
pshufd xmm_xpos_frac, xmm_xpos_frac, 0 ; broadcast scalex
movdqa xmm_tmp0, xmm_xpos_frac
pslld xmm_tmp0, 2 ; tmp0 lanes = 4 * scalex
pslldq xmm_xpos_frac, 4 ; frac lanes = [0, s, s, s]
paddd xmm_tmp0, xmm_xpos_frac
movdqa xmm_tmp1, xmm_xpos_frac
pslldq xmm_tmp1, 4 ; [0, 0, s, s]
paddd xmm_xpos_frac, xmm_tmp1
paddd xmm_tmp0, xmm_tmp1
pslldq xmm_tmp1, 4 ; [0, 0, 0, s]
paddd xmm_xpos_frac, xmm_tmp1 ; [0s, 1s, 2s, 3s]
paddd xmm_tmp0, xmm_tmp1 ; [4s, 5s, 6s, 7s]
pcmpeqw xmm_tmp1, xmm_tmp1
psrld xmm_tmp1, 31
pslld xmm_tmp1, 15 ; all dwords = 0x8000
paddd xmm_xpos_frac, xmm_tmp1 ; + rounding offset
paddd xmm_tmp0, xmm_tmp1
; Split into byte integer parts (packed as pairs (b, b+1), the two
; source pixels blended per output) and 15-bit word fractions.
movdqa xmm_xpos_int, xmm_xpos_frac
movdqa xmm_tmp1, xmm_tmp0
psrld xmm_xpos_int, 16
psrld xmm_tmp1, 16
packssdw xmm_xpos_int, xmm_tmp1
packuswb xmm_xpos_int, xmm_xpos_int ; 8 integer offsets as bytes
movdqa xmm_tmp1, xmm_xpos_int
pcmpeqw xmm_tmp2, xmm_tmp2
psubb xmm_tmp1, xmm_tmp2 ; tmp1 = offsets + 1
punpcklbw xmm_xpos_int, xmm_tmp1 ; interleave (b, b+1) pairs
pslld xmm_xpos_frac, 16
pslld xmm_tmp0, 16
psrad xmm_xpos_frac, 16
psrad xmm_tmp0, 16 ; keep low 16 bits, sign-extended for packssdw
packssdw xmm_xpos_frac, xmm_tmp0
psrlw xmm_xpos_frac, 1 ; 15-bit fractions
; Per-iteration increments: 8 * scalex, split the same way into a
; broadcast 15-bit word fraction and a broadcast integer byte.
movd xmm_tmp0, i_scalexd
pslld xmm_tmp0, 3 ; 8 * scalex
movdqa xmm_tmp1, xmm_tmp0
punpcklwd xmm_tmp0, xmm_tmp0
pshufd xmm_tmp0, xmm_tmp0, 0 ; broadcast low word
psrlw xmm_tmp0, 1 ; 15-bit fraction increment
movdqa xmm_xpos_frac_inc, xmm_tmp0
psrld xmm_tmp1, 16
pxor xmm_tmp2, xmm_tmp2
pshufb xmm_tmp1, xmm_tmp2 ; broadcast integer increment byte
movdqa xmm_xpos_int_inc, xmm_tmp1
; Keep pristine copies for resetting x at the start of each row.
movdqa xmm_xpos_int_begin, xmm_xpos_int
movdqa xmm_xpos_frac_begin, xmm_xpos_frac
; Dispatch on the horizontal ratio (i_scalex is 2 * uiScaleX).
cmp i_scalex, 4 << 16
ja .scalex_above4
cmp i_scalex, 2 << 16
ja .scalex_above2_beloweq4
SSE2_GeneralBilinearDownsampler_loop SSE41_BilinearAccurateDownsample2xOrLess_8px, 0
jmp .final_row
%ifdef X86_32
; The remaining paths need one fewer temporary, so on x86_32 repurpose
; xmm_tmp5 to hold yfrac0 in a register instead of its stack slot.
%undef xmm_yfrac0
%xdefine xmm_yfrac0 xmm_tmp5
%undef xmm_tmp5
%endif
.scalex_above2_beloweq4:
SSE2_GeneralBilinearDownsampler_loop SSE41_BilinearAccurateDownsample4xOrLess_8px, 0
jmp .final_row
.scalex_above4:
; Generic path: x positions are recomputed from scalar i_xpos inside
; the kernel, so the xpos vector registers are repurposed to hold the
; interleaved (0x7FFF - f, f) weight pairs and their increment.
%xdefine xmm_xfrac0 xmm_xpos_frac
%xdefine xmm_xfrac1 xmm_xpos_int
%xdefine xmm_xfrac0_begin xmm_xpos_int_begin
%xdefine xmm_xfrac1_begin xmm_xpos_frac_begin
%xdefine xmm_xfrac_inc xmm_xpos_frac_inc
%undef xmm_xpos_int
%undef xmm_xpos_frac
%undef xmm_xpos_int_begin
%undef xmm_xpos_frac_begin
%undef xmm_xpos_int_inc
%undef xmm_xpos_frac_inc
SSE2_UnpckXFracw xmm_tmp0, xmm_xfrac1, xmm_xfrac0, xmm_7fff
movdqa xmm_xfrac0, xmm_tmp0
movdqa xmm_xfrac0_begin, xmm_xfrac0
movdqa xmm_xfrac1_begin, xmm_xfrac1
; Weight-pair increment = interleaved (-inc, inc): -1 * inc via pmullw
; with all-ones, then interleave with +inc.
pcmpeqw xmm_tmp0, xmm_tmp0
pmullw xmm_tmp0, xmm_xfrac_inc
punpcklwd xmm_tmp0, xmm_xfrac_inc
movdqa xmm_xfrac_inc, xmm_tmp0
SSE2_GeneralBilinearDownsampler_loop SSE41_GeneralBilinearAccurateDownsample_8px, 0
.final_row:
; Last destination row: no row below to blend with, so sample the
; nearest source pixel (x rounded via the 0.5 start offset), matching
; the existing SSE2 implementation.
mov p_src_row0, i_ypos
shr p_src_row0, 15 ; 1.15 y -> integer row
imul p_src_row0, i_src_stride
add p_src_row0, p_src
mov i_xpos, 1 << 15 ; x = 0.5 in 16.16 (i_scalex is doubled)
mov i_width_cnt, i_dst_width
.final_row_width:
mov r_tmp0, i_xpos
shr r_tmp0, 16
movzx r_tmp0, byte [p_src_row0 + r_tmp0]
mov [p_dst], r_tmp0b
add p_dst, 1
add i_xpos, i_scalex
sub i_width_cnt, 1
jg .final_row_width
.done:
%ifdef X86_32
mov esp, [esp] ; restore original esp saved in the prologue
%endif
POP_XMM
LOAD_7_PARA_POP
%ifndef X86_32
%ifdef WIN64
pop rsi
pop rdi
%endif
pop rbp
pop rbx
pop r13
pop r12
%endif
ret
%undef p_dst
%undef i_dst_stride_less_width
%undef i_dst_width
%undef i_dst_height
%undef p_src
%undef i_src_stride
%undef i_scalex
%undef i_scalexd
%undef i_scaleyd
%undef i_xpos
%undef i_ypos
%undef i_yposd
%undef p_src_row0
%undef p_src_row1
%undef i_width_cnt
%undef r_tmp0
%undef r_tmp0b
%undef xmm_0
%undef xmm_xpos_frac
%undef xmm_xpos_frac_inc
%undef xmm_xpos_int
%undef xmm_xpos_int_inc
%undef xmm_yfrac0
%undef xmm_yfrac1
%undef xmm_tmp0
%undef xmm_tmp1
%undef xmm_tmp2
%undef xmm_tmp3
%undef xmm_tmp4
%undef xmm_tmp5
%undef xmm_7fff
%undef xmm_xpos_int_begin
%undef xmm_xpos_frac_begin
%undef xmm_xfrac0
%undef xmm_xfrac1
%undef xmm_xfrac0_begin
%undef xmm_xfrac1_begin
%undef xmm_xfrac_inc

View File

@ -345,6 +345,8 @@ GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_s
GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_SSE2)
GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearFastDownsamplerWrap_ssse3, GeneralBilinearFastDownsampler_ref, 1,
WELS_CPU_SSSE3)
GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_sse41,
GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_SSE41)
#endif
#if defined(HAVE_NEON)