Add x86 64bit asm code for downsample

This commit is contained in:
zhiliang wang 2014-07-08 19:25:31 +08:00
parent ca61e286b4
commit ae12fbde1c
3 changed files with 1043 additions and 350 deletions

View File

@ -57,21 +57,21 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int
sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearFastDownsampler_c;
#if defined(X86_ASM)
if (iCpuFlag & WELS_CPU_SSE) {
/* sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_sse;
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_sse;
sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsamplerWidthx8_sse;*/
sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_sse;
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_sse;
sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsamplerWidthx8_sse;
}
if (iCpuFlag & WELS_CPU_SSE2) {
// sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_sse2;
// sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearFastDownsamplerWrap_sse2;
sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_sse2;
sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearFastDownsamplerWrap_sse2;
}
if (iCpuFlag & WELS_CPU_SSSE3) {
// sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_ssse3;
// sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_ssse3;
sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_ssse3;
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_ssse3;
}
if (iCpuFlag & WELS_CPU_SSE41) {
// sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_sse4;
// sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_sse4;
sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_sse4;
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_sse4;
}
#endif//X86_ASM

View File

@ -202,31 +202,31 @@ void GeneralBilinearAccurateDownsampler_c (uint8_t* pDst, const int32_t kiDstStr
#ifdef X86_ASM
//void GeneralBilinearFastDownsamplerWrap_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
// const int32_t kiDstHeight,
// uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
// const int32_t kiScaleBitWidth = 16, kiScaleBitHeight = 15;
// const uint32_t kuiScaleWidth = (1 << kiScaleBitWidth), kuiScaleHeight = (1 << kiScaleBitHeight);
//
// uint32_t uiScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiScaleWidth);
// uint32_t uiScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiScaleHeight);
//
// GeneralBilinearFastDownsampler_sse2 (pDst, kiDstStride, kiDstWidth, kiDstHeight,
// pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, uiScalex, uiScaley);
//}
//
//void GeneralBilinearAccurateDownsamplerWrap_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
// const int32_t kiDstHeight,
// uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
// const int32_t kiScaleBit = 15;
// const uint32_t kuiScale = (1 << kiScaleBit);
//
// uint32_t uiScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiScale);
// uint32_t uiScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiScale);
//
// GeneralBilinearAccurateDownsampler_sse2 (pDst, kiDstStride, kiDstWidth, kiDstHeight,
// pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, uiScalex, uiScaley);
//}
void GeneralBilinearFastDownsamplerWrap_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
const int32_t kiDstHeight,
uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
const int32_t kiScaleBitWidth = 16, kiScaleBitHeight = 15;
const uint32_t kuiScaleWidth = (1 << kiScaleBitWidth), kuiScaleHeight = (1 << kiScaleBitHeight);
uint32_t uiScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiScaleWidth);
uint32_t uiScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiScaleHeight);
GeneralBilinearFastDownsampler_sse2 (pDst, kiDstStride, kiDstWidth, kiDstHeight,
pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, uiScalex, uiScaley);
}
void GeneralBilinearAccurateDownsamplerWrap_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
const int32_t kiDstHeight,
uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
const int32_t kiScaleBit = 15;
const uint32_t kuiScale = (1 << kiScaleBit);
uint32_t uiScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiScale);
uint32_t uiScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiScale);
GeneralBilinearAccurateDownsampler_sse2 (pDst, kiDstStride, kiDstWidth, kiDstHeight,
pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, uiScalex, uiScaley);
}
#endif //X86_ASM
#ifdef HAVE_NEON

File diff suppressed because it is too large Load Diff