Add x86 64bit asm code for downsample

This commit is contained in:
zhiliang wang 2014-07-08 19:25:31 +08:00
parent ca61e286b4
commit ae12fbde1c
3 changed files with 1043 additions and 350 deletions

View File

@ -57,21 +57,21 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int
sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearFastDownsampler_c; sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearFastDownsampler_c;
#if defined(X86_ASM) #if defined(X86_ASM)
if (iCpuFlag & WELS_CPU_SSE) { if (iCpuFlag & WELS_CPU_SSE) {
/* sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_sse; sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_sse;
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_sse; sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_sse;
sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsamplerWidthx8_sse;*/ sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsamplerWidthx8_sse;
} }
if (iCpuFlag & WELS_CPU_SSE2) { if (iCpuFlag & WELS_CPU_SSE2) {
// sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_sse2; sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_sse2;
// sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearFastDownsamplerWrap_sse2; sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearFastDownsamplerWrap_sse2;
} }
if (iCpuFlag & WELS_CPU_SSSE3) { if (iCpuFlag & WELS_CPU_SSSE3) {
// sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_ssse3; sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_ssse3;
// sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_ssse3; sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_ssse3;
} }
if (iCpuFlag & WELS_CPU_SSE41) { if (iCpuFlag & WELS_CPU_SSE41) {
// sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_sse4; sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_sse4;
// sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_sse4; sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_sse4;
} }
#endif//X86_ASM #endif//X86_ASM

View File

@ -202,31 +202,31 @@ void GeneralBilinearAccurateDownsampler_c (uint8_t* pDst, const int32_t kiDstStr
#ifdef X86_ASM #ifdef X86_ASM
//void GeneralBilinearFastDownsamplerWrap_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, void GeneralBilinearFastDownsamplerWrap_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
// const int32_t kiDstHeight, const int32_t kiDstHeight,
// uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) { uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
// const int32_t kiScaleBitWidth = 16, kiScaleBitHeight = 15; const int32_t kiScaleBitWidth = 16, kiScaleBitHeight = 15;
// const uint32_t kuiScaleWidth = (1 << kiScaleBitWidth), kuiScaleHeight = (1 << kiScaleBitHeight); const uint32_t kuiScaleWidth = (1 << kiScaleBitWidth), kuiScaleHeight = (1 << kiScaleBitHeight);
//
// uint32_t uiScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiScaleWidth); uint32_t uiScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiScaleWidth);
// uint32_t uiScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiScaleHeight); uint32_t uiScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiScaleHeight);
//
// GeneralBilinearFastDownsampler_sse2 (pDst, kiDstStride, kiDstWidth, kiDstHeight, GeneralBilinearFastDownsampler_sse2 (pDst, kiDstStride, kiDstWidth, kiDstHeight,
// pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, uiScalex, uiScaley); pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, uiScalex, uiScaley);
//} }
//
//void GeneralBilinearAccurateDownsamplerWrap_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, void GeneralBilinearAccurateDownsamplerWrap_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
// const int32_t kiDstHeight, const int32_t kiDstHeight,
// uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) { uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
// const int32_t kiScaleBit = 15; const int32_t kiScaleBit = 15;
// const uint32_t kuiScale = (1 << kiScaleBit); const uint32_t kuiScale = (1 << kiScaleBit);
//
// uint32_t uiScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiScale); uint32_t uiScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiScale);
// uint32_t uiScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiScale); uint32_t uiScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiScale);
//
// GeneralBilinearAccurateDownsampler_sse2 (pDst, kiDstStride, kiDstWidth, kiDstHeight, GeneralBilinearAccurateDownsampler_sse2 (pDst, kiDstStride, kiDstWidth, kiDstHeight,
// pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, uiScalex, uiScaley); pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, uiScalex, uiScaley);
//} }
#endif //X86_ASM #endif //X86_ASM
#ifdef HAVE_NEON #ifdef HAVE_NEON

File diff suppressed because it is too large Load Diff