webrtc/common_video/vplib/main/source/scale_bilinear_yuv.cc

344 lines
12 KiB
C++

/*
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "scale_bilinear_yuv.h"
#include <string.h>
namespace webrtc
{
// 16.16 fixed point arithmetic: the upper 16 bits of a value hold the integer
// part and the lower 16 bits hold the fraction.
// Number of fractional bits.
const WebRtc_UWord32 kFractionBits = 16;
// Fixed-point representation of 1.0.
const WebRtc_UWord32 kFractionMax = 1 << kFractionBits;
// Mask that extracts the fractional part of a fixed-point value.
const WebRtc_UWord32 kFractionMask = kFractionMax - 1;
#if USE_MMX
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <mmintrin.h>
#endif
#endif
#if USE_SSE2
#include <emmintrin.h>
#endif
#if USE_SSE2
// FilterHorizontal blends two rows of the image into one output row using
// linear interpolation. SSE2 version; processes 16 pixels per iteration.
//
//   ybuf              - output row.
//   y0_ptr, y1_ptr    - the two input rows; each output pixel is
//                       (y0 * (256 - fraction) + y1 * fraction) >> 8.
//   source_width      - number of pixels to produce. The loop is a do/while
//                       over 16-byte chunks, so at least 16 bytes are always
//                       processed and the count is effectively rounded up to
//                       a multiple of 16; all three buffers must be large
//                       enough for that.
//   source_y_fraction - weight of the y1 row; the y0 row is weighted by
//                       256 - source_y_fraction.
//
// Loads and the store are all unaligned, so no buffer has to be 16-byte
// aligned.
static void FilterHorizontal(WebRtc_UWord8* ybuf,
                             const WebRtc_UWord8* y0_ptr,
                             const WebRtc_UWord8* y1_ptr,
                             WebRtc_UWord32 source_width,
                             WebRtc_UWord32 source_y_fraction)
{
    __m128i zero = _mm_setzero_si128();
    // Per-row weights broadcast to all eight 16-bit lanes.
    __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);
    __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);
    const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
    const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
    __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
    __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
    do
    {
        __m128i y0 = _mm_loadu_si128(y0_ptr128);
        __m128i y1 = _mm_loadu_si128(y1_ptr128);
        // Widen the 16 bytes of each row to two vectors of 16-bit lanes
        // (y2/y3 take the upper eight pixels, y0/y1 the lower eight).
        __m128i y2 = _mm_unpackhi_epi8(y0, zero);
        __m128i y3 = _mm_unpackhi_epi8(y1, zero);
        y0 = _mm_unpacklo_epi8(y0, zero);
        y1 = _mm_unpacklo_epi8(y1, zero);
        // Weight each row.
        y0 = _mm_mullo_epi16(y0, y0_fraction);
        y1 = _mm_mullo_epi16(y1, y1_fraction);
        y2 = _mm_mullo_epi16(y2, y0_fraction);
        y3 = _mm_mullo_epi16(y3, y1_fraction);
        // Sum the weighted rows and drop the 8 fractional bits.
        y0 = _mm_add_epi16(y0, y1);
        y2 = _mm_add_epi16(y2, y3);
        y0 = _mm_srli_epi16(y0, 8);
        y2 = _mm_srli_epi16(y2, 8);
        // Narrow back to 16 bytes.
        y0 = _mm_packus_epi16(y0, y2);
        // Unaligned store: a plain "*dest128 = y0" is an aligned store and
        // faults when ybuf is not 16-byte aligned, which a byte array from
        // new[] does not guarantee.
        _mm_storeu_si128(dest128++, y0);
        ++y0_ptr128;
        ++y1_ptr128;
    }
    while (dest128 < end128);
}
#elif USE_MMX
// MMX version does 8 pixels at a time
static void FilterHorizontal(WebRtc_UWord8* ybuf,
const WebRtc_UWord8* y0_ptr,
const WebRtc_UWord8* y1_ptr,
WebRtc_UWord32 source_width,
WebRtc_UWord32 source_y_fraction)
{
__m64 zero = _mm_setzero_si64();
__m64 y1_fraction = _mm_set1_pi16(source_y_fraction);
__m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);
const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
__m64* dest64 = reinterpret_cast<__m64*>(ybuf);
__m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
do
{
__m64 y0 = *y0_ptr64++;
__m64 y1 = *y1_ptr64++;
__m64 y2 = _mm_unpackhi_pi8(y0, zero);
__m64 y3 = _mm_unpackhi_pi8(y1, zero);
y0 = _mm_unpacklo_pi8(y0, zero);
y1 = _mm_unpacklo_pi8(y1, zero);
y0 = _mm_mullo_pi16(y0, y0_fraction);
y1 = _mm_mullo_pi16(y1, y1_fraction);
y2 = _mm_mullo_pi16(y2, y0_fraction);
y3 = _mm_mullo_pi16(y3, y1_fraction);
y0 = _mm_add_pi16(y0, y1);
y2 = _mm_add_pi16(y2, y3);
y0 = _mm_srli_pi16(y0, 8);
y2 = _mm_srli_pi16(y2, 8);
y0 = _mm_packs_pu16(y0, y2);
*dest64++ = y0;
}
while (dest64 < end64);
}
#else // no MMX or SSE2
// Portable fallback used when neither SSE2 nor MMX is enabled.
// Blends two input rows into one output row:
//     ybuf[i] = (y0_ptr[i] * (256 - source_y_fraction) +
//                y1_ptr[i] * source_y_fraction) >> 8
// Pixels are handled in groups of 8 to mimic the MMX code: the do/while always
// runs at least once and source_width is effectively rounded up to a multiple
// of 8, so all three buffers must be large enough for that.
static void FilterHorizontal(WebRtc_UWord8* ybuf,
                             const WebRtc_UWord8* y0_ptr,
                             const WebRtc_UWord8* y1_ptr,
                             WebRtc_UWord32 source_width,
                             WebRtc_UWord32 source_y_fraction)
{
    const WebRtc_UWord32 weight1 = source_y_fraction;
    const WebRtc_UWord32 weight0 = 256 - weight1;
    const WebRtc_UWord8* const stop = ybuf + source_width;
    do
    {
        // One group of 8 pixels.
        for (int k = 0; k < 8; ++k)
        {
            ybuf[k] = (y0_ptr[k] * weight0 + y1_ptr[k] * weight1) >> 8;
        }
        y0_ptr += 8;
        y1_ptr += 8;
        ybuf += 8;
    }
    while (ybuf < stop);
}
#endif
// Resamples one row to `width` output pixels by linear interpolation between
// neighbouring input pixels.
//
//   ybuf      - output row, `width` bytes are written.
//   y0_ptr    - input row. For every output pixel both y0_ptr[n] and
//               y0_ptr[n + 1] are read, where n is the integer part of the
//               current position, so one pixel beyond the last sampled
//               position must be readable.
//   width     - number of output pixels.
//   source_dx - step through the input per output pixel, 16.16 fixed point.
//
// The two weights are `frac` and `frac ^ 65535` (i.e. 65535 - frac).
static void FilterVertical(WebRtc_UWord8* ybuf,
                           const WebRtc_UWord8* y0_ptr,
                           WebRtc_UWord32 width,
                           WebRtc_UWord32 source_dx)
{
    WebRtc_UWord8* out = ybuf;
    WebRtc_UWord8* const outEnd = ybuf + width;
    WebRtc_UWord32 pos = 0;  // 16.16 position in the input row
    while (out != outEnd)
    {
        const WebRtc_UWord8* tap = y0_ptr + (pos >> 16);
        const WebRtc_UWord32 rightWeight = pos & 65535;
        const WebRtc_UWord32 leftWeight = rightWeight ^ 65535;
        *out++ = (rightWeight * tap[1] + leftWeight * tap[0]) >> 16;
        pos += source_dx;
    }
}
// Scales a planar frame (one full-resolution plane followed by two
// half-resolution chroma planes) to dstWidth x dstHeight using bilinear
// filtering.
//
// Input
//   - srcFrame  : tightly packed source planes. Rows are srcWidth bytes in the
//                 first plane (srcHeight rows) and (srcWidth + 1) / 2 bytes in
//                 the two chroma planes (srcHeight / 2 rows each).
//   - srcWidth, srcHeight : source dimensions in pixels.
//   - dstWidth, dstHeight : requested output dimensions in pixels.
//
// In/Out
//   - dstFrame  : any buffer it points to on entry is released with delete[]
//                 once the arguments have been validated. On success it
//                 points to a new[] buffer holding the tightly packed scaled
//                 planes (dstWidth bytes per row in the first plane,
//                 dstWidth / 2 bytes per row and dstHeight / 2 rows in the
//                 chroma planes). The caller owns it and must delete[] it.
//
// Return value  : dstHeight on success, -1 on invalid arguments or failed
//                 allocation of the destination buffer. dstFrame is left
//                 untouched when the arguments are rejected.
WebRtc_Word32
ScaleBilinear(const WebRtc_UWord8* srcFrame,
              WebRtc_UWord8*& dstFrame,
              WebRtc_UWord32 srcWidth,
              WebRtc_UWord32 srcHeight,
              WebRtc_UWord32 dstWidth,
              WebRtc_UWord32 dstHeight)
{
    // Reject arguments the scaler cannot handle:
    //  - every plane needs at least one row and one column on both sides,
    //    otherwise the scale factors below divide by zero or read from an
    //    empty chroma plane;
    //  - source dimensions must fit the integer part of the 16.16 fixed-point
    //    positions, otherwise the 32-bit arithmetic wraps.
    if (srcFrame == NULL ||
        srcWidth == 0 || srcHeight < 2 ||
        dstWidth < 2 || dstHeight < 2 ||
        srcWidth >= kFractionMax || srcHeight >= kFractionMax)
    {
        return -1;
    }

    // Setting source. Row strides are padded to multiples of 16 bytes so the
    // row filter can work on whole 16-byte chunks.
    const WebRtc_UWord32 srcStride = (srcWidth + 15) & ~15;
    const WebRtc_UWord32 srcUvStride = ((((srcStride + 1) >> 1) + 15) & ~15);
    const WebRtc_UWord32 srcUvHeight = srcHeight >> 1;
    const WebRtc_UWord32 srcStrideArray[3] = {srcStride,
                                              srcUvStride,
                                              srcUvStride
                                             };
    const WebRtc_UWord32 srcWidthArray[3] = {srcWidth,
                                             (srcWidth + 1) >> 1,
                                             (srcWidth + 1) >> 1
                                            };
    const WebRtc_UWord32 srcHeightArray[3] = {srcHeight,
                                              srcUvHeight,
                                              srcUvHeight
                                             };

    // If srcFrame isn't aligned to nice boundaries then copy it over
    // into another buffer whose rows are padded to the strides above.
    const WebRtc_UWord8* src = srcFrame;
    WebRtc_UWord8* srcTmp = NULL;
    if ((srcStride > srcWidth) || (srcUvStride > srcWidthArray[1]))
    {
        // Each plane occupies stride * height bytes. The chroma planes must
        // be sized with srcUvStride (not srcStride / 2): the padded chroma
        // stride can be larger than half the luma stride.
        srcTmp = new WebRtc_UWord8[srcStride * srcHeight +
                                   2 * srcUvStride * srcUvHeight];
        // The padded planes are laid out back to back, so the write pointer
        // simply carries on from one plane into the next.
        WebRtc_UWord8* padPtr = srcTmp;
        const WebRtc_UWord8* srcPtr = srcFrame;
        for (WebRtc_UWord32 p = 0; p < 3; p++)
        {
            for (WebRtc_UWord32 i = 0; i < srcHeightArray[p]; i++)
            {
                memcpy(padPtr, srcPtr, srcWidthArray[p]);
                padPtr += srcStrideArray[p];
                srcPtr += srcWidthArray[p];
            }
        }
        src = srcTmp;
    }

    // When no copy was made the strides equal the packed row widths, so the
    // same offsets describe the caller's buffer.
    const WebRtc_UWord8* srcPlaneArray[3];
    srcPlaneArray[0] = src;
    srcPlaneArray[1] = srcPlaneArray[0] + srcStride * srcHeight;
    srcPlaneArray[2] = srcPlaneArray[1] + srcUvStride * srcUvHeight;

    // Setting destination. Rows are padded to multiples of 32 bytes while
    // filtering; the result is repacked at the end if padding was added.
    const WebRtc_UWord32 dstStride = (dstWidth + 31) & ~31;
    const WebRtc_UWord32 dstUvStride = ((((dstStride + 1) >> 1) + 31) & ~31);
    const WebRtc_UWord32 dstUvPlaneSize = dstUvStride * ((dstHeight + 1) >> 1);
    const WebRtc_UWord32 dstRequiredSize = dstStride * dstHeight +
                                           2 * dstUvPlaneSize;

    delete [] dstFrame;
    dstFrame = new WebRtc_UWord8[dstRequiredSize];
    if (dstFrame == NULL)
    {
        // Do not leak the padded source copy on the error path.
        delete [] srcTmp;
        return -1;
    }

    // Filled element by element: an initializer list must not read the array
    // it is initializing.
    WebRtc_UWord8* dstPlaneArray[3];
    dstPlaneArray[0] = dstFrame;
    dstPlaneArray[1] = dstPlaneArray[0] + dstStride * dstHeight;
    dstPlaneArray[2] = dstPlaneArray[1] + dstUvPlaneSize;

    const WebRtc_UWord32 dstStrideArray[3] = {dstStride,
                                              dstUvStride,
                                              dstUvStride
                                             };
    const WebRtc_UWord32 dstWidthArray[3] = {dstWidth,
                                             dstWidth >> 1,
                                             dstWidth >> 1
                                            };

    for (WebRtc_UWord32 p = 0; p < 3; p++)
    {
        const WebRtc_UWord32 sh = srcHeightArray[p];
        const WebRtc_UWord32 dh = (p == 0) ? dstHeight : dstHeight >> 1;
        const WebRtc_UWord32 sw = srcWidthArray[p];
        WebRtc_UWord8* filteredBuf = dstPlaneArray[p];

        // Holds one source-width row between the two filter passes. One
        // extra byte leaves room for the replicated last pixel even when the
        // row width equals the stride.
        WebRtc_UWord8* intermediaryBuf =
            new WebRtc_UWord8[srcStrideArray[p] + 1];

        // Source step per destination row / column, 16.16 fixed point.
        const WebRtc_UWord32 hscale_fixed = (sh << kFractionBits) / dh;
        const WebRtc_UWord32 source_dx = sw * kFractionMax / dstWidthArray[p];

        // When the width is unchanged the row pass writes straight into the
        // destination and the second pass is skipped.
        const bool resampleWidth = (source_dx != kFractionMax);

        for (WebRtc_UWord32 h = 0; h < dh; ++h)
        {
            WebRtc_UWord8* horizontalFilteredBuf =
                resampleWidth ? intermediaryBuf : filteredBuf;

            // Position of this destination row in the source plane.
            WebRtc_UWord32 source_h_subpixel = (h * hscale_fixed);
            if (hscale_fixed >= (kFractionMax * 2))
            {
                // For 1/2 or less, center filter.
                source_h_subpixel += kFractionMax / 2;
            }
            const WebRtc_UWord32 source_h = source_h_subpixel >> kFractionBits;
            const WebRtc_UWord8* ptr_0 = srcPlaneArray[p] +
                                         source_h * srcStrideArray[p];
            const WebRtc_UWord8* ptr_1 = ptr_0 + srcStrideArray[p];

            // The row blend uses an 8-bit fraction.
            const WebRtc_UWord32 source_h_fraction =
                (source_h_subpixel & kFractionMask) >> 8;

            if (hscale_fixed != kFractionMax &&
                source_h_fraction && ((source_h + 1) < sh))
            {
                // Blend source rows source_h and source_h + 1.
                FilterHorizontal(horizontalFilteredBuf, ptr_0, ptr_1,
                                 sw, source_h_fraction);
            }
            else
            {
                // No blending needed (or no next row available): the output
                // row is source row source_h itself. ptr_1 must not be used
                // here; on the last source row it points past the plane.
                memcpy(horizontalFilteredBuf, ptr_0, sw);
            }

            // Second pass only if the width actually changes.
            if (resampleWidth)
            {
                // The resampler reads one pixel to the right of every
                // sampled position, so replicate the last pixel of the row
                // it is about to read.
                horizontalFilteredBuf[sw] = horizontalFilteredBuf[sw - 1];
                FilterVertical(filteredBuf, horizontalFilteredBuf,
                               dstWidthArray[p], source_dx);
            }
            filteredBuf += dstStrideArray[p];
        }
        delete [] intermediaryBuf;
    }
    delete [] srcTmp;

    // Filtered image was placed in an aligned buffer. If either the luma or
    // the chroma rows carry padding, copy the image over into a tightly
    // packed buffer.
    if ((dstStride > dstWidth) || (dstUvStride > dstWidthArray[1]))
    {
        WebRtc_UWord8* dstFinal =
            new WebRtc_UWord8[(dstWidth * dstHeight * 3) >> 1];
        WebRtc_UWord8* dstPtr = dstFinal;
        for (WebRtc_UWord32 p = 0; p < 3; p++)
        {
            const WebRtc_UWord8* srcPtr = dstPlaneArray[p];
            const WebRtc_UWord32 rows = (p == 0) ? dstHeight : dstHeight >> 1;
            for (WebRtc_UWord32 i = 0; i < rows; i++)
            {
                memcpy(dstPtr, srcPtr, dstWidthArray[p]);
                dstPtr += dstWidthArray[p];
                srcPtr += dstStrideArray[p];
            }
        }
        delete [] dstFrame;
        dstFrame = dstFinal;
    }
    return dstHeight;
}
} // namespace webrtc