[Processing/x86] Add an AVX2 implementation of GeneralBilinearFastDownsample
Keep track of relative pixel offsets and use pshufb to efficiently extract the relevant pixels for horizontal scaling ratios <= 8. Because pshufb does not cross 128-bit lanes, the overhead of address calculations and loads is relatively greater than in an SSSE3 implementation. Fall back to a generic approach for ratios > 8.

The implementation assumes that data beyond the end of each line, before the next line begins, can be dirtied, which AFAICT is safe with the current usage of these routines.

Speedup is ~10.42x/~5.23x (32-bit/64-bit) for horizontal ratios <= 2, ~9.49x/~4.64x for ratios within (2, 4], ~6.43x/~3.18x for ratios within (4, 8], and ~5.42x/~2.50x for ratios > 8 when not memory-bound on Haswell, as compared with the current SSE2 implementation.
Commit: b43e58a366
Parent: b1013095b1
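For reference, the fixed-point scheme the assembly below vectorizes is equivalent to this scalar sketch. It is illustrative only and not part of the commit: the function and variable names are made up, and the real routines special-case the final row and final column instead of reading past them as this sketch would.

#include <stdint.h>

/* Scalar model of the "fast" bilinear math: x positions advance in 16.16 fixed
   point starting at half a pel, y positions in 1.15; per-tap weights are the
   high 16 bits of the fraction products (pmulhuw), accumulated as in pmaddwd
   and rounded out by psrld 14 + pavgw. */
static void BilinearFastDownsampleScalar (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
                                          int32_t iDstHeight, const uint8_t* pSrc, int32_t iSrcStride,
                                          uint32_t uiScaleX, uint32_t uiScaleY) {
  uint32_t uiYpos = 1u << 14;                                    /* half a pel, 1.15 */
  for (int32_t j = 0; j < iDstHeight; ++j, uiYpos += uiScaleY, pDst += iDstStride) {
    const uint8_t* pRow0 = pSrc + (uiYpos >> 15) * iSrcStride;
    const uint8_t* pRow1 = pRow0 + iSrcStride;
    const uint32_t uiFy = uiYpos & 0x7FFF, uiCy = uiFy ^ 0x7FFF; /* yfrac1, yfrac0 */
    uint32_t uiXpos = 1u << 15;                                  /* half a pel, 16.16 */
    for (int32_t i = 0; i < iDstWidth; ++i, uiXpos += uiScaleX) {
      const uint32_t k = uiXpos >> 16;
      const uint32_t uiFx = uiXpos & 0xFFFF, uiCx = uiFx ^ 0xFFFF;     /* AVX2_UnpckXFrac */
      const uint32_t w00 = (uiCx * uiCy) >> 16, w01 = (uiFx * uiCy) >> 16; /* row 0 taps */
      const uint32_t w10 = (uiCx * uiFy) >> 16, w11 = (uiFx * uiFy) >> 16; /* row 1 taps */
      const uint32_t uiAcc = w00 * pRow0[k] + w01 * pRow0[k + 1]
                           + w10 * pRow1[k] + w11 * pRow1[k + 1];
      pDst[i] = (uint8_t) (((uiAcc >> 14) + 1) >> 1);            /* pack with rounding */
    }
  }
}

The four weights sum to roughly 1 << 15, so shifting out 14 bits and then averaging against zero yields an 8-bit result while keeping one extra bit for the final pavgw rounding.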
@@ -109,6 +109,9 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int
sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_sse4;
sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_sse41;
}
if (iCpuFlag & WELS_CPU_AVX2) {
sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearFastDownsamplerWrap_avx2;
}
#endif//X86_ASM

#if defined(HAVE_NEON)
@@ -103,6 +103,7 @@ GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_sse2;
GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse2;
GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_ssse3;
GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse41;
GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_avx2;

SpecificDownsampleFunc DyadicBilinearOneThirdDownsampler_ssse3;
SpecificDownsampleFunc DyadicBilinearOneThirdDownsampler_sse4;
@@ -122,6 +123,9 @@ void GeneralBilinearFastDownsampler_ssse3 (uint8_t* pDst, int32_t iDstStride, in
void GeneralBilinearAccurateDownsampler_sse41 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
uint32_t uiScaleY);
void GeneralBilinearFastDownsampler_avx2 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
uint32_t uiScaleY);

WELSVP_EXTERN_C_END
#endif
@@ -284,6 +284,7 @@ DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (sse2)
DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (sse2)
DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (ssse3)
DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (sse41)
DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (avx2)
#endif //X86_ASM

#ifdef HAVE_NEON
@@ -53,15 +53,15 @@
; Local Data (Read Only)
;***********************************************************************

-SECTION .rodata align=16
+SECTION .rodata align=32

;***********************************************************************
; Various memory constants (trigonometric values or rounding values)
;***********************************************************************

-ALIGN 16
-db80h_128:
-times 16 db 80h
+ALIGN 32
+db80h_256:
+times 32 db 80h
shufb_0000000088888888:
times 8 db 0
times 8 db 8
@@ -2682,7 +2682,7 @@ WELS_EXTERN DyadicBilinearQuarterDownsampler_sse4
movdqu xmm_tmp3, [p_src_row0 + r_tmp0]
movdqu xmm_tmp4, [p_src_row1 + r_tmp0]
movdqa xmm_tmp2, xmm_xpos_int
-punpcklbw xmm_tmp2, [db80h_128]
+punpcklbw xmm_tmp2, [db80h_256]
pshufb xmm_tmp3, xmm_tmp2
pshufb xmm_tmp4, xmm_tmp2
SSE2_BilinearFastCalcXYFrac xmm_tmp0, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
@@ -2695,7 +2695,7 @@ WELS_EXTERN DyadicBilinearQuarterDownsampler_sse4
movdqu xmm_tmp3, [p_src_row0 + r_tmp0]
movdqu xmm_tmp4, [p_src_row1 + r_tmp0]
movdqa xmm_tmp2, xmm_xpos_int
-punpckhbw xmm_tmp2, [db80h_128]
+punpckhbw xmm_tmp2, [db80h_256]
pshufb xmm_tmp3, xmm_tmp2
pshufb xmm_tmp4, xmm_tmp2
SSE2_BilinearFastCalcXYFrac xmm_tmp1, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
@@ -2840,7 +2840,7 @@ WELS_EXTERN DyadicBilinearQuarterDownsampler_sse4
mov r_tmp0, i_xpos
shr r_tmp0, 16
movdqa xmm_tmp3, xmm_xpos_int
-punpcklbw xmm_tmp3, [db80h_128]
+punpcklbw xmm_tmp3, [db80h_256]
movdqu xmm_tmp4, [p_src_row0 + r_tmp0]
movdqu xmm_tmp2, [p_src_row1 + r_tmp0]
lea r_tmp0, [i_xpos + 4 * i_scalex]
@@ -2852,7 +2852,7 @@ WELS_EXTERN DyadicBilinearQuarterDownsampler_sse4
pmaddwd xmm_tmp2, xmm_tmp0
SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp0, xmm_tmp4, xmm_tmp2, xmm_yfrac0, xmm_yfrac1, xmm_tmp3
movdqa xmm_tmp2, xmm_xpos_int
-punpckhbw xmm_tmp2, [db80h_128]
+punpckhbw xmm_tmp2, [db80h_256]
movdqu xmm_tmp4, [p_src_row0 + r_tmp0]
movdqu xmm_tmp3, [p_src_row1 + r_tmp0]
pshufb xmm_tmp4, xmm_tmp2
@@ -3563,3 +3563,694 @@ WELS_EXTERN GeneralBilinearAccurateDownsampler_sse41
%undef xmm_xfrac0_begin
%undef xmm_xfrac1_begin
%undef xmm_xfrac_inc

; xpos_int=%1 xpos_frac=%2 inc_int+1=%3 inc_frac=%4 tmp=%5
%macro AVX2_BilinearIncXposuw 5
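; xpos_frac holds 16 per-pixel 16-bit x fractions; xpos_int holds the matching
; (left, right) source byte offsets as byte pairs for vpshufb. The unsigned
; saturating add detects carry out of each fraction: where the saturated and
; wrapping sums match, no carry occurred and the compare yields FFFFh (-1 per
; byte), cancelling the +1 baked into inc_int+1.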
vpaddusw %5, %2, %4
vpaddw %2, %2, %4
vpcmpeqw %5, %5, %2
vpaddb %1, %1, %3
vpaddb %1, %1, %5 ; subtract 1 if no carry
%endmacro

; outl=%1 outh=%2 in=%3 FFFFh/7FFFh=%4
%macro AVX2_UnpckXFrac 4
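; XORing the fraction f with FFFFh (fast) or 7FFFh (accurate) gives its
; complement, so the unpacks emit (1-f, f) word pairs ready for vpmaddwd
; against (left, right) pixel pairs.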
vpxor %1, %3, %4
vpunpckhwd %2, %1, %3
vpunpcklwd %1, %1, %3
%endmacro

; out0=%1 out1=%2 xfrac=%3 yfrac0=%4 yfrac1=%5
%macro AVX2_BilinearFastCalcXYFrac 5
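; vpmulhuw keeps the high 16 bits of the fraction products, folding the
; horizontal and vertical weights into one 15-bit weight per tap: out0 weights
; row 0, out1 weights row 1.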
vpmulhuw %2, %3, %5
vpmulhuw %1, %3, %4
%endmacro

; [in:dwordsl out:bytes] dwordsh=%2 zero=%3
%macro AVX2_BilinearFastPackDwordsToBytes 3
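; The dword sums carry 15 weight bits: shift out 14, let vpavgw against zero
; round away the last one, and pack to bytes (duplicated within each lane).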
vpsrld %1, %1, 14
vpsrld %2, %2, 14
vpackssdw %1, %1, %2
vpavgw %1, %1, %3
vpackuswb %1, %1, %1
%endmacro

%macro AVX2_BilinearFastDownsample2xOrLess_16px 0
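; 16 output pixels per iteration. For ratios <= 2x, 8 consecutive outputs span
; at most 16 source bytes, so one unaligned 16-byte load per row feeds each
; 128-bit lane; the byte offsets are first rebased to the lane's leading pixel
; (vpshufb broadcast of byte 0) so they stay within vpshufb's in-lane range.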
vpshufb ymm_tmp0, ymm_xpos_int, ymm_0
vpsubb ymm_xpos_int, ymm_xpos_int, ymm_tmp0
AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_ffff
mov r_tmp0, i_xpos
shr r_tmp0, 16
vmovdqu xmm_tmp4, [p_src_row0 + r_tmp0]
vmovdqu xmm_tmp5, [p_src_row1 + r_tmp0]
lea r_tmp0, [i_xpos + 4 * i_scalex2]
lea i_xpos, [i_xpos + 8 * i_scalex2]
shr r_tmp0, 16
vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
vinserti128 ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1
vpshufb ymm_tmp4, ymm_tmp4, ymm_xpos_int
vpshufb ymm_tmp5, ymm_tmp5, ymm_xpos_int
AVX2_BilinearFastCalcXYFrac ymm_tmp0, ymm_tmp2, ymm_tmp0, ymm_yfrac0, ymm_yfrac1
vpunpcklbw ymm_tmp3, ymm_tmp4, ymm_0
vpmaddwd ymm_tmp0, ymm_tmp0, ymm_tmp3
vpunpcklbw ymm_tmp3, ymm_tmp5, ymm_0
vpmaddwd ymm_tmp2, ymm_tmp2, ymm_tmp3
vpaddd ymm_tmp0, ymm_tmp0, ymm_tmp2
AVX2_BilinearFastCalcXYFrac ymm_tmp1, ymm_tmp3, ymm_tmp1, ymm_yfrac0, ymm_yfrac1
vpunpckhbw ymm_tmp2, ymm_tmp4, ymm_0
vpmaddwd ymm_tmp1, ymm_tmp1, ymm_tmp2
vpunpckhbw ymm_tmp2, ymm_tmp5, ymm_0
vpmaddwd ymm_tmp3, ymm_tmp3, ymm_tmp2
vpaddd ymm_tmp1, ymm_tmp1, ymm_tmp3
AVX2_BilinearFastPackDwordsToBytes ymm_tmp0, ymm_tmp1, ymm_0
vmovlps [p_dst], xmm_tmp0
vextracti128 [p_dst + 8], ymm_tmp0, 1
add p_dst, 16
AVX2_BilinearIncXposuw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_tmp0
%endmacro

%macro AVX2_BilinearFastDownsample4xOrLess_16px 0
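; For ratios <= 4x, each group of 4 outputs spans at most 16 source bytes, so
; offsets are rebased per group (bytes 0/8 of each lane) and each row takes
; four 16-byte loads per 16 outputs. vpunpcklbw/vpunpckhbw with ymm_ffff
; interleaves each offset byte with FFh, so the vpshufb both gathers and
; zero-extends to words (indices with the high bit set write zero).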
vbroadcasti128 ymm_tmp0, [shufb_0000000088888888]
vpshufb ymm_tmp0, ymm_xpos_int, ymm_tmp0
vpsubb ymm_xpos_int, ymm_xpos_int, ymm_tmp0
AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_ffff
mov r_tmp0, i_xpos
shr r_tmp0, 16
vmovdqu xmm_tmp4, [p_src_row0 + r_tmp0]
vmovdqu xmm_tmp3, [p_src_row1 + r_tmp0]
lea r_tmp0, [i_xpos + 4 * i_scalex2]
shr r_tmp0, 16
vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
vinserti128 ymm_tmp3, ymm_tmp3, [p_src_row1 + r_tmp0], 1
lea r_tmp0, [i_xpos + 2 * i_scalex2]
lea i_xpos, [r_tmp0 + 4 * i_scalex2]
shr r_tmp0, 16
vpunpcklbw ymm_tmp2, ymm_xpos_int, ymm_ffff
vpshufb ymm_tmp4, ymm_tmp4, ymm_tmp2
vpshufb ymm_tmp3, ymm_tmp3, ymm_tmp2
AVX2_BilinearFastCalcXYFrac ymm_tmp0, ymm_tmp2, ymm_tmp0, ymm_yfrac0, ymm_yfrac1
vpmaddwd ymm_tmp0, ymm_tmp0, ymm_tmp4
vpmaddwd ymm_tmp2, ymm_tmp2, ymm_tmp3
vpaddd ymm_tmp0, ymm_tmp0, ymm_tmp2
vmovdqu xmm_tmp4, [p_src_row0 + r_tmp0]
vmovdqu xmm_tmp3, [p_src_row1 + r_tmp0]
mov r_tmp0, i_xpos
lea i_xpos, [i_xpos + 2 * i_scalex2]
shr r_tmp0, 16
vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
vinserti128 ymm_tmp3, ymm_tmp3, [p_src_row1 + r_tmp0], 1
vpunpckhbw ymm_tmp2, ymm_xpos_int, ymm_ffff
vpshufb ymm_tmp4, ymm_tmp4, ymm_tmp2
vpshufb ymm_tmp3, ymm_tmp3, ymm_tmp2
AVX2_BilinearFastCalcXYFrac ymm_tmp1, ymm_tmp2, ymm_tmp1, ymm_yfrac0, ymm_yfrac1
vpmaddwd ymm_tmp1, ymm_tmp1, ymm_tmp4
vpmaddwd ymm_tmp2, ymm_tmp2, ymm_tmp3
vpaddd ymm_tmp1, ymm_tmp1, ymm_tmp2
AVX2_BilinearFastPackDwordsToBytes ymm_tmp0, ymm_tmp1, ymm_0
vmovlps [p_dst], xmm_tmp0
vextracti128 [p_dst + 8], ymm_tmp0, 1
add p_dst, 16
AVX2_BilinearIncXposuw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_tmp0
%endmacro

%macro AVX2_BilinearFastDownsample8xOrLess_16px 0
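; For ratios <= 8x, offsets are rebased per pair of outputs
; (shufb_000044448888CCCC duplicates bytes 0/4/8/12 of each lane), and eight
; 16-byte loads per row are merged pairwise with vpblendd ahead of the
; in-lane vpshufb.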
vbroadcasti128 ymm_tmp0, [shufb_000044448888CCCC]
vpshufb ymm_tmp0, ymm_xpos_int, ymm_tmp0
vpsubb ymm_xpos_int, ymm_xpos_int, ymm_tmp0
mov r_tmp0, i_xpos
shr r_tmp0, 16
vmovdqu xmm_tmp4, [p_src_row0 + r_tmp0]
vmovdqu xmm_tmp5, [p_src_row1 + r_tmp0]
lea r_tmp0, [i_xpos + 4 * i_scalex2]
add i_xpos, i_scalex2
shr r_tmp0, 16
vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
vinserti128 ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1
mov r_tmp0, i_xpos
shr r_tmp0, 16
vmovdqu xmm_tmp0, [p_src_row0 + r_tmp0]
vmovdqu xmm_tmp1, [p_src_row1 + r_tmp0]
lea r_tmp0, [i_xpos + 4 * i_scalex2]
add i_xpos, i_scalex2
shr r_tmp0, 16
vinserti128 ymm_tmp0, ymm_tmp0, [p_src_row0 + r_tmp0], 1
vinserti128 ymm_tmp1, ymm_tmp1, [p_src_row1 + r_tmp0], 1
vpunpcklbw ymm_tmp3, ymm_xpos_int, ymm_ffff
vpshufb ymm_tmp4, ymm_tmp4, ymm_tmp3
vpshufb ymm_tmp5, ymm_tmp5, ymm_tmp3
vpshufb ymm_tmp0, ymm_tmp0, ymm_tmp3
vpshufb ymm_tmp1, ymm_tmp1, ymm_tmp3
vpblendd ymm_tmp4, ymm_tmp4, ymm_tmp0, 11001100b
vpblendd ymm_tmp5, ymm_tmp5, ymm_tmp1, 11001100b
AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_ffff
AVX2_BilinearFastCalcXYFrac ymm_tmp0, ymm_tmp2, ymm_tmp0, ymm_yfrac0, ymm_yfrac1
vpmaddwd ymm_tmp0, ymm_tmp0, ymm_tmp4
vpmaddwd ymm_tmp2, ymm_tmp2, ymm_tmp5
vpaddd ymm_tmp0, ymm_tmp0, ymm_tmp2
mov r_tmp0, i_xpos
shr r_tmp0, 16
vmovdqu xmm_tmp4, [p_src_row0 + r_tmp0]
vmovdqu xmm_tmp5, [p_src_row1 + r_tmp0]
lea r_tmp0, [i_xpos + 4 * i_scalex2]
add i_xpos, i_scalex2
shr r_tmp0, 16
vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
vinserti128 ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1
mov r_tmp0, i_xpos
lea i_xpos, [i_xpos + 4 * i_scalex2]
shr r_tmp0, 16
vmovdqu xmm_tmp2, [p_src_row0 + r_tmp0]
vmovdqu xmm_tmp3, [p_src_row1 + r_tmp0]
mov r_tmp0, i_xpos
add i_xpos, i_scalex2
shr r_tmp0, 16
vinserti128 ymm_tmp2, ymm_tmp2, [p_src_row0 + r_tmp0], 1
vinserti128 ymm_tmp3, ymm_tmp3, [p_src_row1 + r_tmp0], 1
vpshufb ymm_tmp4, ymm_tmp4, ymm_xpos_int
vpshufb ymm_tmp5, ymm_tmp5, ymm_xpos_int
vpshufb ymm_tmp2, ymm_tmp2, ymm_xpos_int
vpshufb ymm_tmp3, ymm_tmp3, ymm_xpos_int
vpblendd ymm_tmp4, ymm_tmp4, ymm_tmp2, 10001000b
vpblendd ymm_tmp5, ymm_tmp5, ymm_tmp3, 10001000b
vpunpckhbw ymm_tmp4, ymm_tmp4, ymm_0
vpunpckhbw ymm_tmp5, ymm_tmp5, ymm_0
AVX2_BilinearFastCalcXYFrac ymm_tmp1, ymm_tmp3, ymm_tmp1, ymm_yfrac0, ymm_yfrac1
vpmaddwd ymm_tmp1, ymm_tmp1, ymm_tmp4
vpmaddwd ymm_tmp3, ymm_tmp3, ymm_tmp5
vpaddd ymm_tmp1, ymm_tmp1, ymm_tmp3
AVX2_BilinearFastPackDwordsToBytes ymm_tmp0, ymm_tmp1, ymm_0
vmovlps [p_dst], xmm_tmp0
vextracti128 [p_dst + 8], ymm_tmp0, 1
add p_dst, 16
AVX2_BilinearIncXposuw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_tmp0
%endmacro

%macro AVX2_GeneralBilinearFastDownsample_16px 0
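; Generic path for ratios > 8x: adjacent outputs are too far apart for block
; loads, so each (left, right) pixel pair is fetched individually via
; vpbroadcastd/vpinsrw and merged with blends; the x weights are kept directly
; in ymm_xfrac0/1 and stepped by ymm_xfrac_inc.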
mov r_tmp0, i_xpos
shr r_tmp0, 16
vpbroadcastd ymm_tmp4, [p_src_row0 + r_tmp0]
vpbroadcastd ymm_tmp5, [p_src_row1 + r_tmp0]
lea r_tmp0, [i_xpos + 1 * i_scalex]
shr r_tmp0, 16
vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0]
vpunpcklwd ymm_tmp4, ymm_tmp4, ymm_tmp0
vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0]
vpunpcklwd ymm_tmp5, ymm_tmp5, ymm_tmp0
lea r_tmp0, [i_xpos + 2 * i_scalex]
lea i_xpos, [i_xpos + 4 * i_scalex]
shr r_tmp0, 16
vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0]
vpblendd ymm_tmp4, ymm_tmp4, ymm_tmp0, 00100010b
vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0]
vpblendd ymm_tmp5, ymm_tmp5, ymm_tmp0, 00100010b
mov r_tmp0, i_xpos
sub r_tmp0, i_scalex
shr r_tmp0, 16
vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0 - 2]
vpblendw ymm_tmp4, ymm_tmp4, ymm_tmp0, 1000b
vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0 - 2]
vpblendw ymm_tmp5, ymm_tmp5, ymm_tmp0, 1000b
mov r_tmp0, i_xpos
shr r_tmp0, 16
vpbroadcastd ymm_tmp2, [p_src_row0 + r_tmp0]
vpbroadcastd ymm_tmp3, [p_src_row1 + r_tmp0]
lea r_tmp0, [i_xpos + 1 * i_scalex]
shr r_tmp0, 16
vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0]
vpunpcklwd ymm_tmp2, ymm_tmp2, ymm_tmp0
vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0]
vpunpcklwd ymm_tmp3, ymm_tmp3, ymm_tmp0
lea r_tmp0, [i_xpos + 2 * i_scalex]
lea i_xpos, [i_xpos + 4 * i_scalex]
shr r_tmp0, 16
vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0]
vpblendd ymm_tmp2, ymm_tmp2, ymm_tmp0, 00100010b
vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0]
vpblendd ymm_tmp3, ymm_tmp3, ymm_tmp0, 00100010b
mov r_tmp0, i_xpos
sub r_tmp0, i_scalex
shr r_tmp0, 16
vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0 - 2]
vpblendw ymm_tmp2, ymm_tmp2, ymm_tmp0, 1000b
vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0 - 2]
vpblendw ymm_tmp3, ymm_tmp3, ymm_tmp0, 1000b
mov r_tmp0, i_xpos
shr r_tmp0, 16
vmovd xmm_tmp0, [p_src_row0 + r_tmp0]
vmovd xmm_tmp1, [p_src_row1 + r_tmp0]
lea r_tmp0, [i_xpos + i_scalex]
shr r_tmp0, 16
vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 1
vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 1
lea r_tmp0, [i_xpos + 2 * i_scalex]
lea i_xpos, [i_xpos + 4 * i_scalex]
shr r_tmp0, 16
vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 2
vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 2
mov r_tmp0, i_xpos
sub r_tmp0, i_scalex
shr r_tmp0, 16
vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 3
vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 3
vpblendd ymm_tmp4, ymm_tmp4, ymm_tmp0, 00001111b
vpblendd ymm_tmp5, ymm_tmp5, ymm_tmp1, 00001111b
mov r_tmp0, i_xpos
shr r_tmp0, 16
vmovd xmm_tmp0, [p_src_row0 + r_tmp0]
vmovd xmm_tmp1, [p_src_row1 + r_tmp0]
lea r_tmp0, [i_xpos + i_scalex]
shr r_tmp0, 16
vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 1
vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 1
lea r_tmp0, [i_xpos + 2 * i_scalex]
lea i_xpos, [i_xpos + 4 * i_scalex]
shr r_tmp0, 16
vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 2
vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 2
mov r_tmp0, i_xpos
sub r_tmp0, i_scalex
shr r_tmp0, 16
vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 3
vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 3
vpblendd ymm_tmp2, ymm_tmp2, ymm_tmp0, 00001111b
vpblendd ymm_tmp3, ymm_tmp3, ymm_tmp1, 00001111b
vpunpcklbw ymm_tmp4, ymm_tmp4, ymm_0
vpunpcklbw ymm_tmp5, ymm_tmp5, ymm_0
AVX2_BilinearFastCalcXYFrac ymm_tmp0, ymm_tmp1, ymm_xfrac0, ymm_yfrac0, ymm_yfrac1
vpmaddwd ymm_tmp0, ymm_tmp0, ymm_tmp4
vpmaddwd ymm_tmp1, ymm_tmp1, ymm_tmp5
vpaddd ymm_tmp0, ymm_tmp0, ymm_tmp1
vpunpcklbw ymm_tmp4, ymm_tmp2, ymm_0
vpunpcklbw ymm_tmp5, ymm_tmp3, ymm_0
AVX2_BilinearFastCalcXYFrac ymm_tmp1, ymm_tmp2, ymm_xfrac1, ymm_yfrac0, ymm_yfrac1
vpmaddwd ymm_tmp1, ymm_tmp1, ymm_tmp4
vpmaddwd ymm_tmp2, ymm_tmp2, ymm_tmp5
vpaddd ymm_tmp1, ymm_tmp1, ymm_tmp2
AVX2_BilinearFastPackDwordsToBytes ymm_tmp0, ymm_tmp1, ymm_0
vpermq ymm_tmp0, ymm_tmp0, 0010b
vmovdqu [p_dst], xmm_tmp0
add p_dst, 16
vpaddw ymm_xfrac0, ymm_xfrac0, ymm_xfrac_inc
vpaddw ymm_xfrac1, ymm_xfrac1, ymm_xfrac_inc
%endmacro

; downsample_16px_macro=%1 b_fast=%2
%macro AVX2_GeneralBilinearDownsampler_loop 2
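; Per-row driver: derives the two source rows from the 15-bit y position,
; splats yfrac1 and its complement yfrac0 (xor with 7FFFh when b_fast, else the
; low 15 dword bits), runs the 16-pixel macro across the row, and computes the
; last pixel of each row separately as a plain point sample from row 0. The
; 16-byte stores may run past iDstWidth within the stride, which is the
; dirtying noted in the commit message.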
%%height:
mov p_src_row0, i_ypos
shr p_src_row0, 15
imul p_src_row0, i_src_stride
add p_src_row0, p_src
mov p_src_row1, p_src_row0
add p_src_row1, i_src_stride
%ifdef X86_32
%if %2
vpbroadcastw ymm_tmp1, i_ypos
vpsllw ymm_tmp1, ymm_tmp1, 1
vpsrlw ymm_tmp1, ymm_tmp1, 1
vpcmpeqw ymm_tmp0, ymm_tmp0, ymm_tmp0
vpsrlw ymm_tmp0, ymm_tmp0, 1
%else
vpbroadcastd ymm_tmp1, i_ypos
vpslld ymm_tmp1, ymm_tmp1, 17
vpsrld ymm_tmp1, ymm_tmp1, 17
vpcmpeqw ymm_tmp0, ymm_tmp0, ymm_tmp0
vpsrld ymm_tmp0, ymm_tmp0, 17
%endif
vpxor ymm_tmp0, ymm_tmp0, ymm_tmp1
vmovdqa ymm_yfrac0, ymm_tmp0
vmovdqa ymm_yfrac1, ymm_tmp1
%else
vmovd xmm_tmp0, i_yposd
vpbroadcastw ymm_yfrac1, xmm_tmp0
%if %2
vpsllw ymm_yfrac1, ymm_yfrac1, 1
vpsrlw ymm_yfrac1, ymm_yfrac1, 1
vpcmpeqw ymm_yfrac0, ymm_yfrac0, ymm_yfrac0
vpsrlw ymm_yfrac0, ymm_yfrac0, 1
%else
vpslld ymm_yfrac1, ymm_yfrac1, 17
vpsrld ymm_yfrac1, ymm_yfrac1, 17
vpcmpeqw ymm_yfrac0, ymm_yfrac0, ymm_yfrac0
vpsrld ymm_yfrac0, ymm_yfrac0, 17
%endif
vpxor ymm_yfrac0, ymm_yfrac0, ymm_yfrac1
%endif

mov i_xpos, 1 << 15
mov i_width_cnt, i_dst_width
sub i_width_cnt, 1

%ifdef ymm_xpos_int
vmovdqa ymm_xpos_int, ymm_xpos_int_begin
vmovdqa ymm_xpos_frac, ymm_xpos_frac_begin
%else
vmovdqa ymm_xfrac0, ymm_xfrac0_begin
vmovdqa ymm_xfrac1, ymm_xfrac1_begin
%endif

%%width:
%1
sub i_width_cnt, 16
jg %%width

lea p_dst, [p_dst + i_width_cnt + 1]
%ifdef i_scalex2
mov r_tmp0, i_scalex2
shr r_tmp0, 1
imul i_width_cnt, r_tmp0
%else
imul i_width_cnt, i_scalex
%endif
add i_xpos, i_width_cnt
shr i_xpos, 16
movzx r_tmp0, byte [p_src_row0 + i_xpos]
mov [p_dst - 1], r_tmp0b
%ifdef X86_32
mov r_tmp0, i_scaleyd
add i_yposd, r_tmp0
%else
add i_yposd, i_scaleyd
%endif
add p_dst, i_dst_stride_less_width
sub i_dst_height, 1
jg %%height
%endmacro

;**************************************************************************************************************
;void GeneralBilinearFastDownsampler_avx2 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
;                                          int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
;                                          uint32_t uiScaleY);
;
;**************************************************************************************************************

WELS_EXTERN GeneralBilinearFastDownsampler_avx2
%assign push_num 0
%ifndef X86_32
push r12
push r13
push rbx
push rbp
%assign push_num 4
%ifdef WIN64
push rdi
push rsi
%assign push_num push_num + 2
%endif
%endif
LOAD_7_PARA
PUSH_XMM 16
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r2, r2d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r5, r5d
ZERO_EXTENSION r6d
sub r1, r2 ; dst_stride - dst_width
%ifdef X86_32
vmovd xmm0, arg8
vmovd xmm1, esp
and esp, -32
sub esp, 8 * 4 + 8 * 32
vmovd [esp], xmm1
%define p_dst r0
%define i_dst_stride_less_width [esp + 1 * 4]
%define i_dst_width [esp + 2 * 4]
%define i_dst_height dword [esp + 3 * 4]
%define p_src [esp + 4 * 4]
%define i_src_stride [esp + 5 * 4]
%define i_scalex r6
%define i_scalexd r6d
%define i_scaleyd [esp + 6 * 4]
%define i_xpos r2
%define i_ypos [esp + 7 * 4]
%define i_yposd dword [esp + 7 * 4]
%define p_src_row0 r3
%define p_src_row1 r4
%define i_width_cnt r5
%define r_tmp0 r1
%define r_tmp0b r1b
%define ymm_xpos_frac ymm1
%define ymm_xpos_frac_inc [esp + 8 * 4]
%define ymm_xpos_int ymm3
%define ymm_xpos_int_inc [esp + 8 * 4 + 1 * 32]
%define ymm_yfrac0 [esp + 8 * 4 + 2 * 32]
%define ymm_yfrac1 [esp + 8 * 4 + 3 * 32]
%define xmm_tmp0 xmm7
%define ymm_tmp0 ymm7
%define xmm_tmp1 xmm0
%define ymm_tmp1 ymm0
%define xmm_tmp2 xmm2
%define ymm_tmp2 ymm2
%define xmm_tmp3 xmm4
%define ymm_tmp3 ymm4
%define xmm_tmp4 xmm5
%define ymm_tmp4 ymm5
%define xmm_tmp5 xmm6
%define ymm_tmp5 ymm6
%define ymm_0 [esp + 8 * 4 + 4 * 32]
%define ymm_ffff [esp + 8 * 4 + 5 * 32]
%define ymm_xpos_int_begin [esp + 8 * 4 + 6 * 32]
%define ymm_xpos_frac_begin [esp + 8 * 4 + 7 * 32]
mov i_dst_stride_less_width, r1
mov i_dst_width, r2
mov i_dst_height, r3
mov p_src, r4
mov i_src_stride, r5
vmovd i_scaleyd, xmm0
vpxor xmm0, xmm0, xmm0
vmovdqa ymm_0, ymm0
vpcmpeqw ymm_tmp0, ymm_tmp0, ymm_tmp0
vmovdqa ymm_ffff, ymm_tmp0
%else
%define p_dst r0
%define i_dst_stride_less_width r1
%define i_dst_width r2
%define i_dst_height r3
%define p_src r4
%define i_src_stride r5
%define i_scalex r6
%define i_scalexd r6d
%define i_scaleyd dword arg8d
%define i_xpos r12
%define i_ypos r13
%define i_yposd r13d
%define p_src_row0 rbp
%ifdef WIN64
%define p_src_row1 rsi
%define i_width_cnt rdi
%else
%define p_src_row1 r11
%define i_width_cnt rax
%endif
%define r_tmp0 rbx
%define r_tmp0b bl
%define ymm_0 ymm0
%define ymm_xpos_frac ymm1
%define ymm_xpos_frac_inc ymm2
%define ymm_xpos_int ymm3
%define ymm_xpos_int_inc ymm4
%define ymm_yfrac0 ymm5
%define ymm_yfrac1 ymm6
%define xmm_tmp0 xmm7
%define ymm_tmp0 ymm7
%define xmm_tmp1 xmm8
%define ymm_tmp1 ymm8
%define xmm_tmp2 xmm9
%define ymm_tmp2 ymm9
%define xmm_tmp3 xmm10
%define ymm_tmp3 ymm10
%define xmm_tmp4 xmm11
%define ymm_tmp4 ymm11
%define xmm_tmp5 xmm12
%define ymm_tmp5 ymm12
%define ymm_ffff ymm13
%define ymm_xpos_int_begin ymm14
%define ymm_xpos_frac_begin ymm15
vpxor ymm_0, ymm_0, ymm_0
vpcmpeqw ymm_ffff, ymm_ffff, ymm_ffff
%endif

sub i_dst_height, 1
je .final_row
jl .done

mov i_yposd, 1 << 14
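; Build the 16 initial x positions xpos[k] = (1 << 15) + k * uiScaleX with
; dword blends, then split them into pshufb-ready (int, int+1) byte pairs and
; packed 16-bit fractions; the per-iteration increments are 16 * uiScaleX in
; the same two formats.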
vmovd xmm_tmp0, i_scalexd
vpbroadcastd ymm_tmp0, xmm_tmp0
vpslld ymm_tmp1, ymm_tmp0, 2
vpslld ymm_tmp2, ymm_tmp0, 3
vpaddd ymm_tmp3, ymm_tmp1, ymm_tmp2
vpxor ymm_tmp4, ymm_tmp4, ymm_tmp4
vpblendd ymm_tmp1, ymm_tmp4, ymm_tmp1, 11110000b
vpblendd ymm_tmp2, ymm_tmp2, ymm_tmp3, 11110000b
vpaddd ymm_tmp3, ymm_tmp0, ymm_tmp0
vpblendd ymm_tmp3, ymm_tmp4, ymm_tmp3, 11001100b
vpblendd ymm_tmp0, ymm_tmp4, ymm_tmp0, 10101010b
vpaddd ymm_tmp0, ymm_tmp3, ymm_tmp0
vpaddd ymm_tmp1, ymm_tmp1, ymm_tmp0
vpaddd ymm_tmp2, ymm_tmp2, ymm_tmp0
vpcmpeqw ymm_tmp3, ymm_tmp3, ymm_tmp3
vpsrld ymm_tmp3, ymm_tmp3, 31
vpslld ymm_tmp3, ymm_tmp3, 15
vpaddd ymm_tmp1, ymm_tmp1, ymm_tmp3
vpaddd ymm_tmp2, ymm_tmp2, ymm_tmp3
vpsrld ymm_xpos_int, ymm_tmp1, 16
vpsrld ymm_tmp0, ymm_tmp2, 16
vpackssdw ymm_xpos_int, ymm_xpos_int, ymm_tmp0
vpermq ymm_xpos_int, ymm_xpos_int, 11011000b
vpackuswb ymm_xpos_int, ymm_xpos_int, ymm_xpos_int
vpcmpeqw ymm_tmp3, ymm_tmp3, ymm_tmp3
vpsubb ymm_tmp0, ymm_xpos_int, ymm_tmp3
vpunpcklbw ymm_xpos_int, ymm_xpos_int, ymm_tmp0
vpslld ymm_tmp1, ymm_tmp1, 16
vpsrld ymm_tmp1, ymm_tmp1, 16
vpslld ymm_tmp2, ymm_tmp2, 16
vpsrld ymm_tmp2, ymm_tmp2, 16
vpackusdw ymm_xpos_frac, ymm_tmp1, ymm_tmp2
vpermq ymm_xpos_frac, ymm_xpos_frac, 11011000b
vmovd xmm_tmp0, i_scalexd
vpslld xmm_tmp0, xmm_tmp0, 4
vpbroadcastw ymm_tmp1, xmm_tmp0
vmovdqa ymm_xpos_frac_inc, ymm_tmp1
vpsrld xmm_tmp0, xmm_tmp0, 16
vpsubw ymm_tmp0, ymm_tmp0, ymm_tmp3
vpbroadcastb ymm_tmp0, xmm_tmp0
vmovdqa ymm_xpos_int_inc, ymm_tmp0
vmovdqa ymm_xpos_int_begin, ymm_xpos_int
vmovdqa ymm_xpos_frac_begin, ymm_xpos_frac

cmp i_scalex, 4 << 16
ja .scalex_above4
cmp i_scalex, 2 << 16
ja .scalex_above2_beloweq4
add i_scalex, i_scalex
%xdefine i_scalex2 i_scalex
%undef i_scalex
AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearFastDownsample2xOrLess_16px, 1
shr i_scalex2, 1
%xdefine i_scalex i_scalex2
%undef i_scalex2
jmp .final_row
.scalex_above2_beloweq4:
add i_scalex, i_scalex
%xdefine i_scalex2 i_scalex
%undef i_scalex
AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearFastDownsample4xOrLess_16px, 1
shr i_scalex2, 1
%xdefine i_scalex i_scalex2
%undef i_scalex2
jmp .final_row
.scalex_above4:
cmp i_scalex, 8 << 16
ja .scalex_above8
add i_scalex, i_scalex
%xdefine i_scalex2 i_scalex
%undef i_scalex
AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearFastDownsample8xOrLess_16px, 1
shr i_scalex2, 1
%xdefine i_scalex i_scalex2
%undef i_scalex2
jmp .final_row
.scalex_above8:
%xdefine ymm_xfrac0 ymm_xpos_frac
%xdefine ymm_xfrac1 ymm_xpos_int
%xdefine ymm_xfrac0_begin ymm_xpos_int_begin
%xdefine ymm_xfrac1_begin ymm_xpos_frac_begin
%xdefine ymm_xfrac_inc ymm_xpos_frac_inc
%undef ymm_xpos_int
%undef ymm_xpos_frac
%undef ymm_xpos_int_begin
%undef ymm_xpos_frac_begin
%undef ymm_xpos_int_inc
%undef ymm_xpos_frac_inc
AVX2_UnpckXFrac ymm_tmp0, ymm_xfrac1, ymm_xfrac0, ymm_ffff
vpermq ymm_xfrac0, ymm_tmp0, 01001110b
vpermq ymm_xfrac1, ymm_xfrac1, 01001110b
vmovdqa ymm_xfrac0_begin, ymm_xfrac0
vmovdqa ymm_xfrac1_begin, ymm_xfrac1
vpcmpeqw ymm_tmp0, ymm_tmp0, ymm_tmp0
vpmullw ymm_tmp0, ymm_tmp0, ymm_xfrac_inc
vpunpcklwd ymm_tmp0, ymm_tmp0, ymm_xfrac_inc
vmovdqa ymm_xfrac_inc, ymm_tmp0
AVX2_GeneralBilinearDownsampler_loop AVX2_GeneralBilinearFastDownsample_16px, 1

.final_row:
mov p_src_row0, i_ypos
shr p_src_row0, 15
imul p_src_row0, i_src_stride
add p_src_row0, p_src
mov i_xpos, 1 << 15
mov i_width_cnt, i_dst_width

.final_row_width:
mov r_tmp0, i_xpos
shr r_tmp0, 16
movzx r_tmp0, byte [p_src_row0 + r_tmp0]
mov [p_dst], r_tmp0b
add p_dst, 1
add i_xpos, i_scalex
sub i_width_cnt, 1
jg .final_row_width

.done:
vzeroupper
%ifdef X86_32
mov esp, [esp]
%endif
POP_XMM
LOAD_7_PARA_POP
%ifndef X86_32
%ifdef WIN64
pop rsi
pop rdi
%endif
pop rbp
pop rbx
pop r13
pop r12
%endif
ret
%undef p_dst
%undef i_dst_stride_less_width
%undef i_dst_width
%undef i_dst_height
%undef p_src
%undef i_src_stride
%undef i_scalex
%undef i_scalexd
%undef i_scaleyd
%undef i_xpos
%undef i_ypos
%undef i_yposd
%undef p_src_row0
%undef p_src_row1
%undef i_width_cnt
%undef r_tmp0
%undef r_tmp0b
%undef ymm_xpos_frac
%undef ymm_xpos_frac_inc
%undef ymm_xpos_int
%undef ymm_xpos_int_inc
%undef ymm_yfrac0
%undef ymm_yfrac1
%undef xmm_tmp0
%undef ymm_tmp0
%undef xmm_tmp1
%undef ymm_tmp1
%undef xmm_tmp2
%undef ymm_tmp2
%undef xmm_tmp3
%undef ymm_tmp3
%undef xmm_tmp4
%undef ymm_tmp4
%undef xmm_tmp5
%undef ymm_tmp5
%undef ymm_ffff
%undef ymm_0
%undef ymm_xpos_int_begin
%undef ymm_xpos_frac_begin
%undef ymm_xfrac0
%undef ymm_xfrac1
%undef ymm_xfrac0_begin
%undef ymm_xfrac1_begin
%undef ymm_xfrac_inc

@@ -347,6 +347,8 @@ GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearFastDownsamplerWrap_ssse3
WELS_CPU_SSSE3)
GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_sse41,
GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_SSE41)
GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearFastDownsamplerWrap_avx2, GeneralBilinearFastDownsampler_ref, 1,
WELS_CPU_AVX2)
#endif

#if defined(HAVE_NEON)