Merge pull request #2489 from saamas/processing-dyadic-bilinear-downsample-optimizations
[Processing] DyadicBilinearDownsample optimizations
This commit is contained in:
commit
4b6f037020
@ -485,7 +485,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits ; Mark the stack as non-
|
||||
%endmacro
|
||||
|
||||
%macro WELS_EXTERN 1
|
||||
ALIGN 16
|
||||
ALIGN 16, nop
|
||||
%ifdef PREFIX
|
||||
global _%1
|
||||
%define %1 _%1
|
||||
|
@ -83,10 +83,6 @@ WELSVP_NAMESPACE_BEGIN
|
||||
#define WELS_CLAMP(x, minv, maxv) WELS_MIN(WELS_MAX(x, minv), maxv)
|
||||
|
||||
#define ALIGNBYTES (16) /* Worst case is requiring alignment to an 16 byte boundary */
|
||||
#define WELS_ALIGN(iInput) ((iInput+(ALIGNMENT-1)) & ~(ALIGNMENT-1))
|
||||
#define WELS_ALIGN2(iInput) ((iInput+1) & ~1)
|
||||
#define WELS_ALIGN4(iInput) ((iInput+3) & ~3)
|
||||
#define WELS_ALIGN8(iInput) ((iInput+7) & ~7)
|
||||
|
||||
#define WelsCastFromPointer(p) (reinterpret_cast<intptr_t>(p))
|
||||
#define WelsStaticCast(type, p) (static_cast<type>(p))
|
||||
|
@ -32,6 +32,7 @@
|
||||
|
||||
#include "downsample.h"
|
||||
#include "cpu.h"
|
||||
#include <cassert>
|
||||
|
||||
WELSVP_NAMESPACE_BEGIN
|
||||
#define MAX_SAMPLE_WIDTH 1920
|
||||
@ -75,20 +76,18 @@ void CDownsampling::FreeSampleBuffer() {
|
||||
WelsFree (m_pSampleBuffer[i][2]);
|
||||
}
|
||||
}
|
||||
|
||||
void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int32_t iCpuFlag) {
|
||||
sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsampler_c;
|
||||
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_c;
|
||||
sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_c;
|
||||
sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_c;
|
||||
sDownsampleFunc.pfHalfAverageWidthx32 = DyadicBilinearDownsampler_c;
|
||||
sDownsampleFunc.pfHalfAverageWidthx16 = DyadicBilinearDownsampler_c;
|
||||
sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_c;
|
||||
sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_c;
|
||||
sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsampler_c;
|
||||
sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearFastDownsampler_c;
|
||||
#if defined(X86_ASM)
|
||||
if (iCpuFlag & WELS_CPU_SSE) {
|
||||
sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_sse;
|
||||
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_sse;
|
||||
sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsamplerWidthx8_sse;
|
||||
sDownsampleFunc.pfHalfAverageWidthx32 = DyadicBilinearDownsamplerWidthx32_sse;
|
||||
sDownsampleFunc.pfHalfAverageWidthx16 = DyadicBilinearDownsamplerWidthx16_sse;
|
||||
sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_sse;
|
||||
}
|
||||
if (iCpuFlag & WELS_CPU_SSE2) {
|
||||
@ -96,15 +95,13 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int
|
||||
sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearFastDownsamplerWrap_sse2;
|
||||
}
|
||||
if (iCpuFlag & WELS_CPU_SSSE3) {
|
||||
sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_ssse3;
|
||||
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_ssse3;
|
||||
sDownsampleFunc.pfHalfAverageWidthx32 = DyadicBilinearDownsamplerWidthx32_ssse3;
|
||||
sDownsampleFunc.pfHalfAverageWidthx16 = DyadicBilinearDownsamplerWidthx16_ssse3;
|
||||
sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_ssse3;
|
||||
sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_ssse3;
|
||||
sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearFastDownsamplerWrap_ssse3;
|
||||
}
|
||||
if (iCpuFlag & WELS_CPU_SSE41) {
|
||||
sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_sse4;
|
||||
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_sse4;
|
||||
sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_sse4;
|
||||
sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_sse4;
|
||||
sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_sse41;
|
||||
@ -117,10 +114,8 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int
|
||||
|
||||
#if defined(HAVE_NEON)
|
||||
if (iCpuFlag & WELS_CPU_NEON) {
|
||||
sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_neon;
|
||||
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_neon;
|
||||
sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_neon;
|
||||
sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_neon;
|
||||
sDownsampleFunc.pfHalfAverageWidthx32 = DyadicBilinearDownsamplerWidthx32_neon;
|
||||
sDownsampleFunc.pfHalfAverageWidthx16 = DyadicBilinearDownsampler_neon;
|
||||
sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_neon;
|
||||
sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_neon;
|
||||
sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_neon;
|
||||
@ -130,10 +125,8 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int
|
||||
|
||||
#if defined(HAVE_NEON_AARCH64)
|
||||
if (iCpuFlag & WELS_CPU_NEON) {
|
||||
sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_AArch64_neon;
|
||||
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_AArch64_neon;
|
||||
sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_AArch64_neon;
|
||||
sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_AArch64_neon;
|
||||
sDownsampleFunc.pfHalfAverageWidthx32 = DyadicBilinearDownsamplerWidthx32_AArch64_neon;
|
||||
sDownsampleFunc.pfHalfAverageWidthx16 = DyadicBilinearDownsampler_AArch64_neon;
|
||||
sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_AArch64_neon;
|
||||
sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_AArch64_neon;
|
||||
sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_AArch64_neon;
|
||||
@ -159,14 +152,11 @@ EResult CDownsampling::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pDs
|
||||
if (iSrcWidthY > MAX_SAMPLE_WIDTH || iSrcHeightY > MAX_SAMPLE_HEIGHT || m_bNoSampleBuffer) {
|
||||
if ((iSrcWidthY >> 1) == iDstWidthY && (iSrcHeightY >> 1) == iDstHeightY) {
|
||||
// use half average functions
|
||||
uint8_t iAlignIndex = GetAlignedIndex (iSrcWidthY);
|
||||
m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0],
|
||||
DownsampleHalfAverage ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0],
|
||||
(uint8_t*)pSrcPixMap->pPixel[0], pSrcPixMap->iStride[0], iSrcWidthY, iSrcHeightY);
|
||||
|
||||
iAlignIndex = GetAlignedIndex (iSrcWidthUV);
|
||||
m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1],
|
||||
DownsampleHalfAverage ((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1],
|
||||
(uint8_t*)pSrcPixMap->pPixel[1], pSrcPixMap->iStride[1], iSrcWidthUV, iSrcHeightUV);
|
||||
m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2],
|
||||
DownsampleHalfAverage ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2],
|
||||
(uint8_t*)pSrcPixMap->pPixel[2], pSrcPixMap->iStride[2], iSrcWidthUV, iSrcHeightUV);
|
||||
} else if ((iSrcWidthY >> 2) == iDstWidthY && (iSrcHeightY >> 2) == iDstHeightY) {
|
||||
|
||||
@ -223,29 +213,23 @@ EResult CDownsampling::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pDs
|
||||
do {
|
||||
if ((iHalfSrcWidth == iDstWidthY) && (iHalfSrcHeight == iDstHeightY)) { //end
|
||||
// use half average functions
|
||||
uint8_t iAlignIndex = GetAlignedIndex (iSrcWidthY);
|
||||
m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0],
|
||||
DownsampleHalfAverage ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0],
|
||||
(uint8_t*)pSrcY, iSrcStrideY, iSrcWidthY, iSrcHeightY);
|
||||
|
||||
iAlignIndex = GetAlignedIndex (iSrcWidthUV);
|
||||
m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1],
|
||||
DownsampleHalfAverage ((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1],
|
||||
(uint8_t*)pSrcU, iSrcStrideU, iSrcWidthUV, iSrcHeightUV);
|
||||
m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2],
|
||||
DownsampleHalfAverage ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2],
|
||||
(uint8_t*)pSrcV, iSrcStrideV, iSrcWidthUV, iSrcHeightUV);
|
||||
break;
|
||||
} else if (((iHalfSrcWidth >> 1) >= iDstWidthY) && ((iHalfSrcHeight >> 1) >= iDstHeightY)) {
|
||||
// use half average functions
|
||||
iDstStrideY = iHalfSrcWidth;
|
||||
iDstStrideU = iHalfSrcWidth >> 1;
|
||||
iDstStrideV = iHalfSrcWidth >> 1;
|
||||
uint8_t iAlignIndex = GetAlignedIndex (iSrcWidthY);
|
||||
m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstY, iDstStrideY,
|
||||
iDstStrideY = WELS_ALIGN (iHalfSrcWidth, 32);
|
||||
iDstStrideU = WELS_ALIGN (iHalfSrcWidth >> 1, 32);
|
||||
iDstStrideV = WELS_ALIGN (iHalfSrcWidth >> 1, 32);
|
||||
DownsampleHalfAverage ((uint8_t*)pDstY, iDstStrideY,
|
||||
(uint8_t*)pSrcY, iSrcStrideY, iSrcWidthY, iSrcHeightY);
|
||||
|
||||
iAlignIndex = GetAlignedIndex (iSrcWidthUV);
|
||||
m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstU, iDstStrideU,
|
||||
DownsampleHalfAverage ((uint8_t*)pDstU, iDstStrideU,
|
||||
(uint8_t*)pSrcU, iSrcStrideU, iSrcWidthUV, iSrcHeightUV);
|
||||
m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstV, iDstStrideV,
|
||||
DownsampleHalfAverage ((uint8_t*)pDstV, iDstStrideV,
|
||||
(uint8_t*)pSrcV, iSrcStrideV, iSrcWidthUV, iSrcHeightUV);
|
||||
|
||||
pSrcY = (uint8_t*)pDstY;
|
||||
@ -258,9 +242,9 @@ EResult CDownsampling::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pDs
|
||||
iSrcHeightY = iHalfSrcHeight;
|
||||
iSrcHeightUV = iHalfSrcHeight >> 1;
|
||||
|
||||
iSrcStrideY = iSrcWidthY;
|
||||
iSrcStrideU = iSrcWidthUV;
|
||||
iSrcStrideV = iSrcWidthUV;
|
||||
iSrcStrideY = iDstStrideY;
|
||||
iSrcStrideU = iDstStrideU;
|
||||
iSrcStrideV = iDstStrideV;
|
||||
|
||||
iHalfSrcWidth >>= 1;
|
||||
iHalfSrcHeight >>= 1;
|
||||
@ -286,17 +270,18 @@ EResult CDownsampling::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pDs
|
||||
return RET_SUCCESS;
|
||||
}
|
||||
|
||||
int32_t CDownsampling::GetAlignedIndex (const int32_t kiSrcWidth) {
|
||||
int32_t iAlignIndex;
|
||||
if ((kiSrcWidth & 0x1f) == 0) // x32
|
||||
iAlignIndex = 0;
|
||||
else if ((kiSrcWidth & 0x0f) == 0) // x16
|
||||
iAlignIndex = 1;
|
||||
else if ((kiSrcWidth & 0x07) == 0) // x8
|
||||
iAlignIndex = 2;
|
||||
else
|
||||
iAlignIndex = 3;
|
||||
return iAlignIndex;
|
||||
// Halves one plane by dispatching to the x32 or the x16 half-average routine,
// selected by the alignment of the source stride.
//   pDst, iDstStride      - destination plane and its stride
//   pSrc, iSrcStride      - source plane and its stride
//   iSrcWidth, iSrcHeight - source plane dimensions
// The width handed to the routine is iSrcWidth with its low bit cleared and
// then passed through WELS_ALIGN(.., 32 or 16); presumably that rounds it up
// to the next multiple, so the routine may touch bytes beyond iSrcWidth within
// the stride -- TODO confirm against the two-argument WELS_ALIGN definition.
// The stride requirements are enforced only by assert, i.e. they are not
// checked when NDEBUG is defined.
void CDownsampling::DownsampleHalfAverage (uint8_t* pDst, int32_t iDstStride,
|
||||
uint8_t* pSrc, int32_t iSrcStride, int32_t iSrcWidth, int32_t iSrcHeight) {
|
||||
// Source stride is a multiple of 32: use the routine registered for x32 widths.
if ((iSrcStride & 31) == 0) {
|
||||
// The x32 path additionally expects a destination stride that is a multiple of 16.
assert ((iDstStride & 15) == 0);
|
||||
m_pfDownsample.pfHalfAverageWidthx32 (pDst, iDstStride,
|
||||
pSrc, iSrcStride, WELS_ALIGN (iSrcWidth & ~1, 32), iSrcHeight);
|
||||
} else {
|
||||
// Otherwise the source stride must be a multiple of 16 and the destination
// stride a multiple of 8; there is no further fallback for other strides.
assert ((iSrcStride & 15) == 0);
|
||||
assert ((iDstStride & 7) == 0);
|
||||
m_pfDownsample.pfHalfAverageWidthx16 (pDst, iDstStride,
|
||||
pSrc, iSrcStride, WELS_ALIGN (iSrcWidth & ~1, 16), iSrcHeight);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -73,8 +73,8 @@ SpecificDownsampleFunc DyadicBilinearOneThirdDownsampler_c;
|
||||
SpecificDownsampleFunc DyadicBilinearQuarterDownsampler_c;
|
||||
|
||||
typedef struct {
|
||||
// align_index: 0 = x32; 1 = x16; 2 = x8; 3 = common case left;
|
||||
PHalveDownsampleFunc pfHalfAverage[4];
|
||||
PHalveDownsampleFunc pfHalfAverageWidthx32;
|
||||
PHalveDownsampleFunc pfHalfAverageWidthx16;
|
||||
PSpecificDownsampleFunc pfOneThirdDownsampler;
|
||||
PSpecificDownsampleFunc pfQuarterDownsampler;
|
||||
PGeneralDownsampleFunc pfGeneralRatioLuma;
|
||||
@ -94,10 +94,6 @@ HalveDownsampleFunc DyadicBilinearDownsamplerWidthx32_sse;
|
||||
HalveDownsampleFunc DyadicBilinearDownsamplerWidthx16_ssse3;
|
||||
// iSrcWidth= x32 pixels
|
||||
HalveDownsampleFunc DyadicBilinearDownsamplerWidthx32_ssse3;
|
||||
// iSrcWidth= x16 pixels
|
||||
HalveDownsampleFunc DyadicBilinearDownsamplerWidthx16_sse4;
|
||||
// iSrcWidth= x32 pixels
|
||||
HalveDownsampleFunc DyadicBilinearDownsamplerWidthx32_sse4;
|
||||
|
||||
GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_sse2;
|
||||
GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse2;
|
||||
@ -185,7 +181,8 @@ class CDownsampling : public IStrategy {
|
||||
private:
|
||||
void InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int32_t iCpuFlag);
|
||||
|
||||
int32_t GetAlignedIndex (const int32_t kiSrcWidth);
|
||||
void DownsampleHalfAverage (uint8_t* pDst, int32_t iDstStride,
|
||||
uint8_t* pSrc, int32_t iSrcStride, int32_t iSrcWidth, int32_t iSrcHeight);
|
||||
bool AllocateSampleBuffer();
|
||||
void FreeSampleBuffer();
|
||||
private:
|
||||
|
@ -40,6 +40,10 @@
|
||||
;*************************************************************************/
|
||||
%include "asm_inc.asm"
|
||||
|
||||
%ifdef __NASM_VER__
|
||||
%use smartalign
|
||||
%endif
|
||||
|
||||
;***********************************************************************
|
||||
; Macros and other preprocessor constants
|
||||
;***********************************************************************
|
||||
@ -471,24 +475,12 @@ WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse
|
||||
|
||||
|
||||
|
||||
; got about 50% improvement over DyadicBilinearDownsamplerWidthx32_sse
|
||||
;***********************************************************************
|
||||
; void DyadicBilinearDownsamplerWidthx32_ssse3( unsigned char* pDst, const int iDstStride,
|
||||
; unsigned char* pSrc, const int iSrcStride,
|
||||
; const int iSrcWidth, const int iSrcHeight );
|
||||
;***********************************************************************
|
||||
WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
|
||||
;push ebx
|
||||
;push edx
|
||||
;push esi
|
||||
;push edi
|
||||
;push ebp
|
||||
|
||||
;mov edi, [esp+24] ; pDst
|
||||
;mov edx, [esp+28] ; iDstStride
|
||||
;mov esi, [esp+32] ; pSrc
|
||||
;mov ecx, [esp+36] ; iSrcStride
|
||||
;mov ebp, [esp+44] ; iSrcHeight
|
||||
%ifdef X86_32
|
||||
push r6
|
||||
%assign push_num 1
|
||||
@ -496,7 +488,7 @@ WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
|
||||
%assign push_num 0
|
||||
%endif
|
||||
LOAD_6_PARA
|
||||
PUSH_XMM 8
|
||||
PUSH_XMM 4
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r3, r3d
|
||||
SIGN_EXTENSION r4, r4d
|
||||
@ -508,96 +500,44 @@ WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
|
||||
%endif
|
||||
sar r5, $01 ; iSrcHeight >> 1
|
||||
|
||||
movdqa xmm7, [shufb_mask_low] ; mask low
|
||||
movdqa xmm6, [shufb_mask_high] ; mask high
|
||||
WELS_DB1 xmm3
|
||||
WELS_Zero xmm2
|
||||
sar r4, $01 ; iSrcWidth >> 1
|
||||
add r0, r4 ; pDst += iSrcWidth >> 1
|
||||
|
||||
.yloops4:
|
||||
;mov eax, [esp+40] ; iSrcWidth
|
||||
;sar eax, $01 ; iSrcWidth >> 1
|
||||
;mov ebx, eax ; iDstWidth restored at ebx
|
||||
;sar eax, $04 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
|
||||
;neg ebx ; - (iSrcWidth >> 1)
|
||||
%ifdef X86_32
|
||||
mov r4, arg5
|
||||
%else
|
||||
mov r4, r12
|
||||
%endif
|
||||
sar r4, $01 ; iSrcWidth >> 1
|
||||
mov r6, r4 ; iDstWidth restored at ebx
|
||||
sar r4, $04 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
|
||||
neg r6 ; - (iSrcWidth >> 1)
|
||||
neg r4 ; -(iSrcWidth >> 1)
|
||||
mov r6, r4
|
||||
align 16
|
||||
; each loop = source bandwidth: 32 bytes
|
||||
.xloops4:
|
||||
; 1st part horizontal loop: x16 bytes
|
||||
; mem hi<- ->lo
|
||||
;1st Line Src: xmm0: h H g G f F e E d D c C b B a A
|
||||
; xmm1: p P o O n N m M l L k K j J i I
|
||||
;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A
|
||||
; xmm3: p P o O n N m M l L k K j J i I
|
||||
;=> target:
|
||||
;: P O N M L K J I H G F E D C B A
|
||||
;: p o n m l k j i h g f e d c b a
|
||||
;: P .. A
|
||||
;: p .. a
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
movdqa xmm0, [r2] ; 1st_src_line
|
||||
movdqa xmm1, [r2+16] ; 1st_src_line + 16
|
||||
movdqa xmm2, [r2+r3] ; 2nd_src_line
|
||||
movdqa xmm3, [r2+r3+16] ; 2nd_src_line + 16
|
||||
|
||||
; packing & avg
|
||||
movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A
|
||||
pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
|
||||
pshufb xmm4, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
|
||||
; another implementation for xmm4 high bits
|
||||
; psubb xmm4, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
|
||||
; psrlw xmm4, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
|
||||
pavgb xmm0, xmm4
|
||||
|
||||
movdqa xmm5, xmm1
|
||||
pshufb xmm1, xmm7
|
||||
pshufb xmm5, xmm6
|
||||
; psubb xmm5, xmm1
|
||||
; psrlw xmm5, 8
|
||||
pavgb xmm1, xmm5
|
||||
|
||||
movdqa xmm4, xmm2
|
||||
pshufb xmm2, xmm7
|
||||
pshufb xmm4, xmm6
|
||||
; psubb xmm4, xmm2
|
||||
; psrlw xmm4, 8
|
||||
pavgb xmm2, xmm4
|
||||
|
||||
movdqa xmm5, xmm3
|
||||
pshufb xmm3, xmm7
|
||||
pshufb xmm5, xmm6
|
||||
; psubb xmm5, xmm3
|
||||
; psrlw xmm5, 8
|
||||
pavgb xmm3, xmm5
|
||||
|
||||
packuswb xmm0, xmm1
|
||||
packuswb xmm2, xmm3
|
||||
pavgb xmm0, xmm2
|
||||
|
||||
; write pDst
|
||||
movdqa [r0], xmm0
|
||||
|
||||
; next SMB
|
||||
lea r2, [r2+32]
|
||||
lea r0, [r0+16]
|
||||
|
||||
dec r4
|
||||
jg near .xloops4
|
||||
movdqa xmm0, [r2+r3]
|
||||
movdqa xmm1, [r2+r3+16]
|
||||
pavgb xmm0, [r2] ; avg vertical pixels 0-15
|
||||
pavgb xmm1, [r2+16] ; avg vertical pixels 16-31
|
||||
add r2, 32 ; pSrc += 32
|
||||
pmaddubsw xmm0, xmm3 ; pairwise horizontal sum neighboring pixels 0-15
|
||||
pmaddubsw xmm1, xmm3 ; pairwise horizontal sum neighboring pixels 16-31
|
||||
pavgw xmm0, xmm2 ; (sum + 1) >> 1
|
||||
pavgw xmm1, xmm2 ; (sum + 1) >> 1
|
||||
packuswb xmm0, xmm1 ; pack words to bytes
|
||||
movdqa [r0+r4], xmm0 ; store results
|
||||
add r4, 16
|
||||
jl .xloops4
|
||||
|
||||
; next line
|
||||
lea r2, [r2+2*r3] ; next end of lines
|
||||
lea r2, [r2+2*r6] ; reset to base 0 [- 2 * iDstWidth]
|
||||
lea r0, [r0+r1]
|
||||
lea r0, [r0+r6] ; reset to base 0 [- iDstWidth]
|
||||
|
||||
dec r5
|
||||
jg near .yloops4
|
||||
sub r5, 1
|
||||
jg .yloops4
|
||||
|
||||
%ifndef X86_32
|
||||
pop r12
|
||||
@ -623,7 +563,7 @@ WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
|
||||
%assign push_num 0
|
||||
%endif
|
||||
LOAD_6_PARA
|
||||
PUSH_XMM 6
|
||||
PUSH_XMM 4
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r3, r3d
|
||||
SIGN_EXTENSION r4, r4d
|
||||
@ -634,8 +574,11 @@ WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
|
||||
mov r12, r4
|
||||
%endif
|
||||
sar r5, $01 ; iSrcHeight >> 1
|
||||
movdqa xmm5, [shufb_mask_low] ; mask low
|
||||
movdqa xmm4, [shufb_mask_high] ; mask high
|
||||
WELS_DB1 xmm3
|
||||
WELS_Zero xmm2
|
||||
add r2, r4 ; pSrc += iSrcWidth
|
||||
sar r4, $01 ; iSrcWidth >> 1
|
||||
add r0, r4 ; pDst += iSrcWidth >> 1
|
||||
|
||||
.yloops5:
|
||||
%ifdef X86_32
|
||||
@ -644,279 +587,26 @@ WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
|
||||
mov r4, r12
|
||||
%endif
|
||||
sar r4, $01 ; iSrcWidth >> 1
|
||||
mov r6, r4 ; iDstWidth restored at ebx
|
||||
sar r4, $03 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
|
||||
neg r6 ; - (iSrcWidth >> 1)
|
||||
neg r4 ; -(iSrcWidth >> 1)
|
||||
lea r6, [r2+r3] ; pSrc + iSrcStride
|
||||
align 16
|
||||
; each loop = source bandwidth: 16 bytes
|
||||
.xloops5:
|
||||
; horizontal loop: x16 bytes by source
|
||||
; mem hi<- ->lo
|
||||
;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
|
||||
;2nd line pSrc: xmm1: p P o O n N m M l L k K j J i I
|
||||
;=> target:
|
||||
;: H G F E D C B A, P O N M L K J I
|
||||
;: h g f e d c b a, p o n m l k j i
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
movdqa xmm0, [r2] ; 1st_src_line
|
||||
movdqa xmm1, [r2+r3] ; 2nd_src_line
|
||||
|
||||
; packing & avg
|
||||
movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A
|
||||
pshufb xmm0, xmm5 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
|
||||
pshufb xmm2, xmm4 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
|
||||
; another implementation for xmm2 high bits
|
||||
; psubb xmm2, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
|
||||
; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
|
||||
pavgb xmm0, xmm2
|
||||
|
||||
movdqa xmm3, xmm1
|
||||
pshufb xmm1, xmm5
|
||||
pshufb xmm3, xmm4
|
||||
; psubb xmm3, xmm1
|
||||
; psrlw xmm3, 8
|
||||
pavgb xmm1, xmm3
|
||||
|
||||
pavgb xmm0, xmm1
|
||||
packuswb xmm0, xmm1
|
||||
|
||||
; write pDst
|
||||
movq [r0], xmm0
|
||||
|
||||
; next SMB
|
||||
lea r2, [r2+16]
|
||||
lea r0, [r0+8]
|
||||
|
||||
dec r4
|
||||
jg near .xloops5
|
||||
|
||||
lea r2, [r2+2*r3] ; next end of lines
|
||||
lea r2, [r2+2*r6] ; reset to base 0 [- 2 * iDstWidth]
|
||||
lea r0, [r0+r1]
|
||||
lea r0, [r0+r6] ; reset to base 0 [- iDstWidth]
|
||||
|
||||
dec r5
|
||||
jg near .yloops5
|
||||
|
||||
%ifndef X86_32
|
||||
pop r12
|
||||
%endif
|
||||
|
||||
POP_XMM
|
||||
LOAD_6_PARA_POP
|
||||
%ifdef X86_32
|
||||
pop r6
|
||||
%endif
|
||||
ret
|
||||
|
||||
; got about 65% improvement over DyadicBilinearDownsamplerWidthx32_sse
|
||||
;***********************************************************************
|
||||
; void DyadicBilinearDownsamplerWidthx32_sse4( unsigned char* pDst, const int iDstStride,
|
||||
; unsigned char* pSrc, const int iSrcStride,
|
||||
; const int iSrcWidth, const int iSrcHeight );
|
||||
;***********************************************************************
|
||||
WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4
|
||||
%ifdef X86_32
|
||||
push r6
|
||||
%assign push_num 1
|
||||
%else
|
||||
%assign push_num 0
|
||||
%endif
|
||||
LOAD_6_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r3, r3d
|
||||
SIGN_EXTENSION r4, r4d
|
||||
SIGN_EXTENSION r5, r5d
|
||||
|
||||
%ifndef X86_32
|
||||
push r12
|
||||
mov r12, r4
|
||||
%endif
|
||||
sar r5, $01 ; iSrcHeight >> 1
|
||||
|
||||
movdqa xmm7, [shufb_mask_low] ; mask low
|
||||
movdqa xmm6, [shufb_mask_high] ; mask high
|
||||
|
||||
.yloops6:
|
||||
%ifdef X86_32
|
||||
mov r4, arg5
|
||||
%else
|
||||
mov r4, r12
|
||||
%endif
|
||||
sar r4, $01 ; iSrcWidth >> 1
|
||||
mov r6, r4 ; iDstWidth restored at ebx
|
||||
sar r4, $04 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
|
||||
neg r6 ; - (iSrcWidth >> 1)
|
||||
; each loop = source bandwidth: 32 bytes
|
||||
.xloops6:
|
||||
; 1st part horizontal loop: x16 bytes
|
||||
; mem hi<- ->lo
|
||||
;1st Line Src: xmm0: h H g G f F e E d D c C b B a A
|
||||
; xmm1: p P o O n N m M l L k K j J i I
|
||||
;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A
|
||||
; xmm3: p P o O n N m M l L k K j J i I
|
||||
;=> target:
|
||||
;: P O N M L K J I H G F E D C B A
|
||||
;: p o n m l k j i h g f e d c b a
|
||||
;: P .. A
|
||||
;: p .. a
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
movntdqa xmm0, [r2] ; 1st_src_line
|
||||
movntdqa xmm1, [r2+16] ; 1st_src_line + 16
|
||||
movntdqa xmm2, [r2+r3] ; 2nd_src_line
|
||||
movntdqa xmm3, [r2+r3+16] ; 2nd_src_line + 16
|
||||
|
||||
; packing & avg
|
||||
movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A
|
||||
pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
|
||||
pshufb xmm4, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
|
||||
; psubb xmm4, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
|
||||
; psrlw xmm4, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
|
||||
pavgb xmm0, xmm4
|
||||
|
||||
movdqa xmm5, xmm1
|
||||
pshufb xmm1, xmm7
|
||||
pshufb xmm5, xmm6
|
||||
; psubb xmm5, xmm1
|
||||
; psrlw xmm5, 8
|
||||
pavgb xmm1, xmm5
|
||||
|
||||
movdqa xmm4, xmm2
|
||||
pshufb xmm2, xmm7
|
||||
pshufb xmm4, xmm6
|
||||
; psubb xmm4, xmm2
|
||||
; psrlw xmm4, 8
|
||||
pavgb xmm2, xmm4
|
||||
|
||||
movdqa xmm5, xmm3
|
||||
pshufb xmm3, xmm7
|
||||
pshufb xmm5, xmm6
|
||||
; psubb xmm5, xmm3
|
||||
; psrlw xmm5, 8
|
||||
pavgb xmm3, xmm5
|
||||
|
||||
packuswb xmm0, xmm1
|
||||
packuswb xmm2, xmm3
|
||||
pavgb xmm0, xmm2
|
||||
|
||||
; write pDst
|
||||
movdqa [r0], xmm0
|
||||
|
||||
; next SMB
|
||||
lea r2, [r2+32]
|
||||
lea r0, [r0+16]
|
||||
|
||||
dec r4
|
||||
jg near .xloops6
|
||||
|
||||
lea r2, [r2+2*r3] ; next end of lines
|
||||
lea r2, [r2+2*r6] ; reset to base 0 [- 2 * iDstWidth]
|
||||
lea r0, [r0+r1]
|
||||
lea r0, [r0+r6] ; reset to base 0 [- iDstWidth]
|
||||
|
||||
dec r5
|
||||
jg near .yloops6
|
||||
|
||||
%ifndef X86_32
|
||||
pop r12
|
||||
%endif
|
||||
|
||||
POP_XMM
|
||||
LOAD_6_PARA_POP
|
||||
%ifdef X86_32
|
||||
pop r6
|
||||
%endif
|
||||
ret
|
||||
|
||||
;***********************************************************************
|
||||
; void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride,
|
||||
; unsigned char* pSrc, const int iSrcStride,
|
||||
; const int iSrcWidth, const int iSrcHeight );
|
||||
;***********************************************************************
|
||||
WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4
|
||||
%ifdef X86_32
|
||||
push r6
|
||||
%assign push_num 1
|
||||
%else
|
||||
%assign push_num 0
|
||||
%endif
|
||||
LOAD_6_PARA
|
||||
PUSH_XMM 6
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r3, r3d
|
||||
SIGN_EXTENSION r4, r4d
|
||||
SIGN_EXTENSION r5, r5d
|
||||
|
||||
%ifndef X86_32
|
||||
push r12
|
||||
mov r12, r4
|
||||
%endif
|
||||
sar r5, $01 ; iSrcHeight >> 1
|
||||
movdqa xmm5, [shufb_mask_low] ; mask low
|
||||
movdqa xmm4, [shufb_mask_high] ; mask high
|
||||
|
||||
.yloops7:
|
||||
%ifdef X86_32
|
||||
mov r4, arg5
|
||||
%else
|
||||
mov r4, r12
|
||||
%endif
|
||||
sar r4, $01 ; iSrcWidth >> 1
|
||||
mov r6, r4 ; iDstWidth restored at ebx
|
||||
sar r4, $03 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
|
||||
neg r6 ; - (iSrcWidth >> 1)
|
||||
; each loop = source bandwidth: 16 bytes
|
||||
.xloops7:
|
||||
; horizontal loop: x16 bytes by source
|
||||
; mem hi<- ->lo
|
||||
;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
|
||||
;2nd line pSrc: xmm1: p P o O n N m M l L k K j J i I
|
||||
;=> target:
|
||||
;: H G F E D C B A, P O N M L K J I
|
||||
;: h g f e d c b a, p o n m l k j i
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
movntdqa xmm0, [r2] ; 1st_src_line
|
||||
movntdqa xmm1, [r2+r3] ; 2nd_src_line
|
||||
|
||||
; packing & avg
|
||||
movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A
|
||||
pshufb xmm0, xmm5 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
|
||||
pshufb xmm2, xmm4 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
|
||||
; psubb xmm2, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
|
||||
; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
|
||||
pavgb xmm0, xmm2
|
||||
|
||||
movdqa xmm3, xmm1
|
||||
pshufb xmm1, xmm5
|
||||
pshufb xmm3, xmm4
|
||||
; psubb xmm3, xmm1
|
||||
; psrlw xmm3, 8
|
||||
pavgb xmm1, xmm3
|
||||
|
||||
pavgb xmm0, xmm1
|
||||
packuswb xmm0, xmm1
|
||||
|
||||
; write pDst
|
||||
movq [r0], xmm0
|
||||
|
||||
; next SMB
|
||||
lea r2, [r2+16]
|
||||
lea r0, [r0+8]
|
||||
|
||||
dec r4
|
||||
jg near .xloops7
|
||||
movdqa xmm0, [r2+2*r4]
|
||||
pavgb xmm0, [r6+2*r4] ; avg vertical pixels
|
||||
pmaddubsw xmm0, xmm3 ; pairwise horizontal sum neighboring pixels
|
||||
pavgw xmm0, xmm2 ; (sum + 1) >> 1
|
||||
packuswb xmm0, xmm0 ; pack words to bytes
|
||||
movlps [r0+r4], xmm0 ; store results
|
||||
add r4, 8
|
||||
jl .xloops5
|
||||
|
||||
; next line
|
||||
lea r2, [r2+2*r3] ; next end of lines
|
||||
lea r2, [r2+2*r6] ; reset to base 0 [- 2 * iDstWidth]
|
||||
lea r0, [r0+r1]
|
||||
lea r0, [r0+r6] ; reset to base 0 [- iDstWidth]
|
||||
|
||||
dec r5
|
||||
jg near .yloops7
|
||||
sub r5, 1
|
||||
jg .yloops5
|
||||
|
||||
%ifndef X86_32
|
||||
pop r12
|
||||
|
@ -759,9 +759,17 @@ const uint32_t kiHeight = 96; //DO NOT CHANGE!
|
||||
const uint32_t kiFrameRate = 12; //DO NOT CHANGE!
|
||||
const uint32_t kiFrameNum = 100; //DO NOT CHANGE!
|
||||
// Table of three expected digests, each 40 hex characters long (presumably
// SHA-1 of the produced output -- TODO confirm against the test that reads
// pHashStr). Which set of three is compiled in depends on X86_ASM.
const char* pHashStr[] = { //DO NOT CHANGE!
|
||||
// X86_ASM downsampling routines average vertically first, as opposed to
|
||||
// horizontally first, which results in different output.
|
||||
#ifdef X86_ASM
|
||||
"244eebcb51f4c2a56e83fc5da3373cad9ec0e1e5",
|
||||
"bbad99ef99e37b34bcb4f09a7ec4d144375f6be7",
|
||||
"809f97e836650624d92f0b8e200a6ab25f810d6f"
|
||||
#else
|
||||
"9c4e6146b29bac5d5d4be3c5bbab9c072dcb3f3f",
|
||||
"f350001c333902029800bd291fbed915a4bdf19a",
|
||||
"eb9d853b7daec03052c4850027ac94adc84c3a7e"
|
||||
#endif
|
||||
};
|
||||
|
||||
class DecodeParseAPI : public ::testing::TestWithParam<EncodeDecodeFileParamBase>, public EncodeDecodeTestBase {
|
||||
|
@ -123,7 +123,14 @@ static const EncodeFileParam kFileParamArray[] = {
|
||||
},
|
||||
{
|
||||
"res/CiscoVT2people_320x192_12fps.yuv",
|
||||
"73156dfc1dc45924349b5b79f8debcac13d7231d", CAMERA_VIDEO_REAL_TIME, 320, 192, 12.0f, SM_SINGLE_SLICE, false, 2, false, false, false
|
||||
// X86_ASM downsampling routines average vertically first, as opposed to
|
||||
// horizontally first, which results in different output.
|
||||
#ifdef X86_ASM
|
||||
"a5341d588b769809c1f1d983e5a0fcef7362f3ad",
|
||||
#else
|
||||
"73156dfc1dc45924349b5b79f8debcac13d7231d",
|
||||
#endif
|
||||
CAMERA_VIDEO_REAL_TIME, 320, 192, 12.0f, SM_SINGLE_SLICE, false, 2, false, false, false
|
||||
},
|
||||
{
|
||||
"res/Cisco_Absolute_Power_1280x720_30fps.yuv",
|
||||
@ -131,7 +138,14 @@ static const EncodeFileParam kFileParamArray[] = {
|
||||
},
|
||||
{
|
||||
"res/Cisco_Absolute_Power_1280x720_30fps.yuv",
|
||||
"3943145545a2bd27a642b2045d4e3dbae55c6870", CAMERA_VIDEO_REAL_TIME, 1280, 720, 30.0f, SM_SINGLE_SLICE, false, 4, false, false, false
|
||||
// X86_ASM downsampling routines average vertically first, as opposed to
|
||||
// horizontally first, which results in different output.
|
||||
#ifdef X86_ASM
|
||||
"ec9d776a7d92cf0f6640065aee8af2450af0e993",
|
||||
#else
|
||||
"3943145545a2bd27a642b2045d4e3dbae55c6870",
|
||||
#endif
|
||||
CAMERA_VIDEO_REAL_TIME, 1280, 720, 30.0f, SM_SINGLE_SLICE, false, 4, false, false, false
|
||||
},
|
||||
// the following values may be adjusted for times since we start tuning the strategy
|
||||
{
|
||||
|
@ -30,6 +30,27 @@ void DyadicBilinearDownsampler_ref (uint8_t* pDst, const int32_t kiDstStride,
|
||||
}
|
||||
}
|
||||
|
||||
// Reference half-size bilinear downsampler used by the unit tests. Each output
// pixel is built from a 2x2 block of source pixels: the two source columns are
// first averaged vertically with rounding, then the two column averages are
// averaged horizontally with rounding.
//   pDst, kiDstStride       - destination plane and its row pitch in bytes
//   pSrc, kiSrcStride       - source plane and its row pitch in bytes
//   kiSrcWidth, kiSrcHeight - source size; the output is (kiSrcWidth >> 1) by
//                             (kiSrcHeight >> 1), so an odd trailing column or
//                             row is not read
void DyadicBilinearDownsampler2_ref (uint8_t* pDst, const int32_t kiDstStride,
|
||||
const uint8_t* pSrc, const int32_t kiSrcStride,
|
||||
const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
|
||||
uint8_t* pDstLine = pDst;
|
||||
// pSrcLine1 / pSrcLine2 track the upper and lower row of the current row pair.
const uint8_t* pSrcLine1 = pSrc;
|
||||
const uint8_t* pSrcLine2 = pSrc + kiSrcStride;
|
||||
const int32_t kiDstWidth = kiSrcWidth >> 1;
|
||||
const int32_t kiDstHeight = kiSrcHeight >> 1;
|
||||
|
||||
for (int32_t j = 0; j < kiDstHeight; j++) {
|
||||
for (int32_t i = 0; i < kiDstWidth; i++) {
|
||||
// Vertical rounded average of the left column of the 2x2 block.
const int32_t kiTempCol1 = (pSrcLine1[2 * i + 0] + pSrcLine2[2 * i + 0] + 1) >> 1;
|
||||
// Vertical rounded average of the right column of the 2x2 block.
const int32_t kiTempCol2 = (pSrcLine1[2 * i + 1] + pSrcLine2[2 * i + 1] + 1) >> 1;
|
||||
// Horizontal rounded average of the two column results.
pDstLine[i] = (uint8_t) ((kiTempCol1 + kiTempCol2 + 1) >> 1);
|
||||
}
|
||||
// One destination row consumes two source rows.
pDstLine += kiDstStride;
|
||||
pSrcLine1 += 2 * kiSrcStride;
|
||||
pSrcLine2 += 2 * kiSrcStride;
|
||||
}
|
||||
}
|
||||
|
||||
void GeneralBilinearFastDownsampler_ref (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
|
||||
const int32_t kiDstHeight,
|
||||
uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
|
||||
@ -162,7 +183,7 @@ void GeneralBilinearAccurateDownsampler_ref (uint8_t* pDst, const int32_t kiDstS
|
||||
}
|
||||
}
|
||||
|
||||
#define GENERATE_DyadicBilinearDownsampler_UT(func, ASM, CPUFLAGS) \
|
||||
#define GENERATE_DyadicBilinearDownsampler_UT_with_ref(func, ASM, CPUFLAGS, ref_func) \
|
||||
TEST (DownSampleTest, func) { \
|
||||
if (ASM) {\
|
||||
int32_t iCpuCores = 0; \
|
||||
@ -190,7 +211,7 @@ TEST (DownSampleTest, func) { \
|
||||
dst_c[j] = dst_a[j] = rand() % 256; \
|
||||
src_c[j] = src_a[j] = rand() % 256; \
|
||||
} \
|
||||
DyadicBilinearDownsampler_ref (dst_c, dst_stride_c, src_c, src_stride_c, src_width_c, src_height_c); \
|
||||
ref_func (dst_c, dst_stride_c, src_c, src_stride_c, src_width_c, src_height_c); \
|
||||
func (dst_a, dst_stride_a, src_a, src_stride_a, src_width_a, src_height_a); \
|
||||
for (int j = 0; j < (src_height_c >> 1); j++) { \
|
||||
for (int m = 0; m < (src_width_c >> 1); m++) { \
|
||||
@ -199,6 +220,11 @@ TEST (DownSampleTest, func) { \
|
||||
} \
|
||||
}
|
||||
|
||||
// Generates a DownSampleTest case for func that is checked against
// DyadicBilinearDownsampler_ref.
#define GENERATE_DyadicBilinearDownsampler_UT(func, ASM, CPUFLAGS) \
|
||||
GENERATE_DyadicBilinearDownsampler_UT_with_ref(func, ASM, CPUFLAGS, DyadicBilinearDownsampler_ref)
|
||||
// Generates a DownSampleTest case for func that is checked against
// DyadicBilinearDownsampler2_ref, the reference that averages vertically
// before horizontally.
#define GENERATE_DyadicBilinearDownsampler2_UT(func, ASM, CPUFLAGS) \
|
||||
GENERATE_DyadicBilinearDownsampler_UT_with_ref(func, ASM, CPUFLAGS, DyadicBilinearDownsampler2_ref)
|
||||
|
||||
#define GENERATE_DyadicBilinearOneThirdDownsampler_UT(func, ASM, CPUFLAGS) \
|
||||
TEST (DownSampleTest, func) { \
|
||||
if (ASM) {\
|
||||
@ -328,11 +354,8 @@ GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx32_sse, 1,
|
||||
GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx16_sse, 1, WELS_CPU_SSE)
|
||||
GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx8_sse, 1, WELS_CPU_SSE)
|
||||
|
||||
GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx32_ssse3, 1, WELS_CPU_SSSE3)
|
||||
GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx16_ssse3, 1, WELS_CPU_SSSE3)
|
||||
|
||||
GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx32_sse4, 1, WELS_CPU_SSE41)
|
||||
GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx16_sse4, 1, WELS_CPU_SSE41)
|
||||
GENERATE_DyadicBilinearDownsampler2_UT (DyadicBilinearDownsamplerWidthx32_ssse3, 1, WELS_CPU_SSSE3)
|
||||
GENERATE_DyadicBilinearDownsampler2_UT (DyadicBilinearDownsamplerWidthx16_ssse3, 1, WELS_CPU_SSSE3)
|
||||
|
||||
GENERATE_DyadicBilinearOneThirdDownsampler_UT (DyadicBilinearOneThirdDownsampler_ssse3, 1, WELS_CPU_SSSE3)
|
||||
GENERATE_DyadicBilinearOneThirdDownsampler_UT (DyadicBilinearOneThirdDownsampler_sse4, 1, WELS_CPU_SSE41)
|
||||
|
Loading…
x
Reference in New Issue
Block a user