344 lines
12 KiB
C++
344 lines
12 KiB
C++
/*
|
|
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
|
|
*
|
|
* Use of this source code is governed by a BSD-style license
|
|
* that can be found in the LICENSE file in the root of the source
|
|
* tree. An additional intellectual property rights grant can be found
|
|
* in the file PATENTS. All contributing project authors may
|
|
* be found in the AUTHORS file in the root of the source tree.
|
|
*/
|
|
|
|
#include "scale_bilinear_yuv.h"
|
|
#include <string.h>
|
|
|
|
namespace webrtc
|
|
{
|
|
// 16.16 fixed point arithmetic
|
|
const WebRtc_UWord32 kFractionBits = 16;
|
|
const WebRtc_UWord32 kFractionMax = 1 << kFractionBits;
|
|
const WebRtc_UWord32 kFractionMask = ((1 << kFractionBits) - 1);
|
|
|
|
#if USE_MMX
|
|
#if defined(_MSC_VER)
|
|
#include <intrin.h>
|
|
#else
|
|
#include <mmintrin.h>
|
|
#endif
|
|
#endif
|
|
|
|
#if USE_SSE2
|
|
#include <emmintrin.h>
|
|
#endif
|
|
|
|
#if USE_SSE2
|
|
// FilterHorizontal combines two rows of the image using linear interpolation.
|
|
// SSE2 version does 16 pixels at a time
|
|
|
|
static void FilterHorizontal(WebRtc_UWord8* ybuf,
|
|
const WebRtc_UWord8* y0_ptr,
|
|
const WebRtc_UWord8* y1_ptr,
|
|
WebRtc_UWord32 source_width,
|
|
WebRtc_UWord32 source_y_fraction)
|
|
{
|
|
__m128i zero = _mm_setzero_si128();
|
|
__m128i y1_fraction = _mm_set1_epi16(source_y_fraction);
|
|
__m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);
|
|
|
|
const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
|
|
const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
|
|
__m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
|
|
__m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
|
|
|
|
do
|
|
{
|
|
__m128i y0 = _mm_loadu_si128(y0_ptr128);
|
|
__m128i y1 = _mm_loadu_si128(y1_ptr128);
|
|
__m128i y2 = _mm_unpackhi_epi8(y0, zero);
|
|
__m128i y3 = _mm_unpackhi_epi8(y1, zero);
|
|
y0 = _mm_unpacklo_epi8(y0, zero);
|
|
y1 = _mm_unpacklo_epi8(y1, zero);
|
|
y0 = _mm_mullo_epi16(y0, y0_fraction);
|
|
y1 = _mm_mullo_epi16(y1, y1_fraction);
|
|
y2 = _mm_mullo_epi16(y2, y0_fraction);
|
|
y3 = _mm_mullo_epi16(y3, y1_fraction);
|
|
y0 = _mm_add_epi16(y0, y1);
|
|
y2 = _mm_add_epi16(y2, y3);
|
|
y0 = _mm_srli_epi16(y0, 8);
|
|
y2 = _mm_srli_epi16(y2, 8);
|
|
y0 = _mm_packus_epi16(y0, y2);
|
|
*dest128++ = y0;
|
|
++y0_ptr128;
|
|
++y1_ptr128;
|
|
}
|
|
while (dest128 < end128);
|
|
}
|
|
#elif USE_MMX
|
|
// MMX version does 8 pixels at a time
|
|
static void FilterHorizontal(WebRtc_UWord8* ybuf,
|
|
const WebRtc_UWord8* y0_ptr,
|
|
const WebRtc_UWord8* y1_ptr,
|
|
WebRtc_UWord32 source_width,
|
|
WebRtc_UWord32 source_y_fraction)
|
|
{
|
|
__m64 zero = _mm_setzero_si64();
|
|
__m64 y1_fraction = _mm_set1_pi16(source_y_fraction);
|
|
__m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);
|
|
|
|
const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
|
|
const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
|
|
__m64* dest64 = reinterpret_cast<__m64*>(ybuf);
|
|
__m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
|
|
|
|
do
|
|
{
|
|
__m64 y0 = *y0_ptr64++;
|
|
__m64 y1 = *y1_ptr64++;
|
|
__m64 y2 = _mm_unpackhi_pi8(y0, zero);
|
|
__m64 y3 = _mm_unpackhi_pi8(y1, zero);
|
|
y0 = _mm_unpacklo_pi8(y0, zero);
|
|
y1 = _mm_unpacklo_pi8(y1, zero);
|
|
y0 = _mm_mullo_pi16(y0, y0_fraction);
|
|
y1 = _mm_mullo_pi16(y1, y1_fraction);
|
|
y2 = _mm_mullo_pi16(y2, y0_fraction);
|
|
y3 = _mm_mullo_pi16(y3, y1_fraction);
|
|
y0 = _mm_add_pi16(y0, y1);
|
|
y2 = _mm_add_pi16(y2, y3);
|
|
y0 = _mm_srli_pi16(y0, 8);
|
|
y2 = _mm_srli_pi16(y2, 8);
|
|
y0 = _mm_packs_pu16(y0, y2);
|
|
*dest64++ = y0;
|
|
}
|
|
while (dest64 < end64);
|
|
}
|
|
#else // no MMX or SSE2
|
|
// C version does 8 at a time to mimic MMX code
|
|
static void FilterHorizontal(WebRtc_UWord8* ybuf,
|
|
const WebRtc_UWord8* y0_ptr,
|
|
const WebRtc_UWord8* y1_ptr,
|
|
WebRtc_UWord32 source_width,
|
|
WebRtc_UWord32 source_y_fraction)
|
|
{
|
|
WebRtc_UWord32 y1_fraction = source_y_fraction;
|
|
WebRtc_UWord32 y0_fraction = 256 - y1_fraction;
|
|
WebRtc_UWord8* end = ybuf + source_width;
|
|
do
|
|
{
|
|
ybuf[0] = (y0_ptr[0] * y0_fraction + y1_ptr[0] * y1_fraction) >> 8;
|
|
ybuf[1] = (y0_ptr[1] * y0_fraction + y1_ptr[1] * y1_fraction) >> 8;
|
|
ybuf[2] = (y0_ptr[2] * y0_fraction + y1_ptr[2] * y1_fraction) >> 8;
|
|
ybuf[3] = (y0_ptr[3] * y0_fraction + y1_ptr[3] * y1_fraction) >> 8;
|
|
ybuf[4] = (y0_ptr[4] * y0_fraction + y1_ptr[4] * y1_fraction) >> 8;
|
|
ybuf[5] = (y0_ptr[5] * y0_fraction + y1_ptr[5] * y1_fraction) >> 8;
|
|
ybuf[6] = (y0_ptr[6] * y0_fraction + y1_ptr[6] * y1_fraction) >> 8;
|
|
ybuf[7] = (y0_ptr[7] * y0_fraction + y1_ptr[7] * y1_fraction) >> 8;
|
|
y0_ptr += 8;
|
|
y1_ptr += 8;
|
|
ybuf += 8;
|
|
}
|
|
while (ybuf < end);
|
|
}
|
|
#endif
|
|
|
|
// Resamples a single row along its length with linear interpolation.
//
//   ybuf      - destination row, receives `width` samples.
//   y0_ptr    - source row. For every output sample the source sample at the
//               integer position and its right-hand neighbour are read, so the
//               row must stay readable one sample past the last position used.
//   width     - number of output samples.
//   source_dx - step through the source row per output sample, in 16.16
//               fixed point.
static void FilterVertical(WebRtc_UWord8* ybuf,
                           const WebRtc_UWord8* y0_ptr,
                           WebRtc_UWord32 width,
                           WebRtc_UWord32 source_dx)
{
    WebRtc_UWord32 pos = 0;  // 16.16 position inside the source row
    WebRtc_UWord8* out = ybuf;
    WebRtc_UWord8* const outEnd = ybuf + width;

    while (out != outEnd)
    {
        const WebRtc_UWord8* sample = y0_ptr + (pos >> 16);
        const WebRtc_UWord32 left = sample[0];
        const WebRtc_UWord32 right = sample[1];

        // The two weights add up to 65535: frac for the right-hand sample,
        // the remainder for the left-hand one.
        const WebRtc_UWord32 frac = pos & 65535;
        *out = (frac * right + (65535 - frac) * left) >> 16;

        ++out;
        pos += source_dx;
    }
}
|
|
|
|
|
|
// Scales a three-plane image (one full-size plane followed by two half-size
// planes) with bilinear interpolation.
//
//   srcFrame            - source image; the rows of all three planes are
//                         stored back to back without padding.
//   dstFrame            - in/out. A buffer passed in is released with
//                         delete [] and replaced by a buffer allocated with
//                         new [] that holds the scaled image, again with all
//                         rows stored back to back. The caller owns it.
//   srcWidth, srcHeight - size of the full-size source plane.
//   dstWidth, dstHeight - size of the full-size destination plane.
//
// The half-size planes are read as ((srcWidth + 1) >> 1) x (srcHeight >> 1)
// samples and written as (dstWidth >> 1) x (dstHeight >> 1) samples.
//
// Returns dstHeight on success. Returns -1, leaving dstFrame untouched, when
// srcFrame is NULL, srcWidth is 0, or srcHeight, dstWidth or dstHeight is
// smaller than 2. Also returns -1 if the destination buffer cannot be
// allocated.
WebRtc_Word32
ScaleBilinear(const WebRtc_UWord8* srcFrame,
              WebRtc_UWord8*& dstFrame,
              WebRtc_UWord32 srcWidth,
              WebRtc_UWord32 srcHeight,
              WebRtc_UWord32 dstWidth,
              WebRtc_UWord32 dstHeight)
{
    // The half-size planes need at least one row and one column on both
    // sides; anything smaller would divide by zero or read outside the source.
    if (srcFrame == NULL || srcWidth == 0 || srcHeight < 2 ||
        dstWidth < 2 || dstHeight < 2)
    {
        return -1;
    }

    // Setting source
    const WebRtc_UWord32 srcUvWidth = (srcWidth + 1) >> 1;
    const WebRtc_UWord32 srcUvHeight = srcHeight >> 1;
    const WebRtc_UWord32 srcStride = (srcWidth + 15) & ~15;
    const WebRtc_UWord32 srcUvStride = (((srcStride + 1) >> 1) + 15) & ~15;

    const WebRtc_UWord32 srcStrideArray[3] = {srcStride,
                                              srcUvStride,
                                              srcUvStride};
    const WebRtc_UWord32 srcWidthArray[3] = {srcWidth,
                                             srcUvWidth,
                                             srcUvWidth};
    const WebRtc_UWord32 srcHeightArray[3] = {srcHeight,
                                              srcUvHeight,
                                              srcUvHeight};

    const WebRtc_UWord8* src = srcFrame;
    WebRtc_UWord8* srcTmp = NULL;

    // If srcFrame isn't aligned to nice boundaries then copy it over
    // into another buffer.
    if ((srcStride > srcWidth) || (srcUvStride > srcUvWidth))
    {
        // Allocate a buffer that can accommodate the stride of every plane.
        // The half-size planes use srcUvStride, which can be larger than
        // srcStride / 2.
        srcTmp = new WebRtc_UWord8[srcStride*srcHeight +
                                   2*srcUvStride*srcUvHeight];

        // The padded planes follow each other directly, so one running
        // pointer per side is enough.
        WebRtc_UWord8* paddedRow = srcTmp;
        const WebRtc_UWord8* packedRow = srcFrame;

        for (WebRtc_UWord32 p = 0; p < 3; p++)
        {
            for (WebRtc_UWord32 i = 0; i < srcHeightArray[p]; i++)
            {
                memcpy(paddedRow, packedRow, srcWidthArray[p]);
                paddedRow += srcStrideArray[p];
                packedRow += srcWidthArray[p];
            }
        }
        src = srcTmp;
    }

    // When no copy was made the strides equal the widths, so the same
    // offsets describe both the packed and the padded layout.
    const WebRtc_UWord8* srcPlaneArray[3];
    srcPlaneArray[0] = src;
    srcPlaneArray[1] = srcPlaneArray[0] + srcStride*srcHeight;
    srcPlaneArray[2] = srcPlaneArray[1] + srcUvStride*srcUvHeight;

    // Setting destination
    const WebRtc_UWord32 dstUvWidth = dstWidth >> 1;
    const WebRtc_UWord32 dstUvHeight = dstHeight >> 1;
    const WebRtc_UWord32 dstStride = (dstWidth + 31) & ~31;
    const WebRtc_UWord32 dstUvStride = (((dstStride + 1) >> 1) + 31) & ~31;
    const WebRtc_UWord32 dstUvPlaneSize = dstUvStride*((dstHeight + 1) >> 1);

    delete [] dstFrame;
    dstFrame = NULL;

    dstFrame = new WebRtc_UWord8[dstStride*dstHeight + 2*dstUvPlaneSize];
    if (dstFrame == NULL)
    {
        delete [] srcTmp;
        return -1;
    }

    WebRtc_UWord8* dstPlaneArray[3];
    dstPlaneArray[0] = dstFrame;
    dstPlaneArray[1] = dstPlaneArray[0] + dstStride*dstHeight;
    dstPlaneArray[2] = dstPlaneArray[1] + dstUvPlaneSize;

    const WebRtc_UWord32 dstStrideArray[3] = {dstStride,
                                              dstUvStride,
                                              dstUvStride};
    const WebRtc_UWord32 dstWidthArray[3] = {dstWidth,
                                             dstUvWidth,
                                             dstUvWidth};
    const WebRtc_UWord32 dstHeightArray[3] = {dstHeight,
                                              dstUvHeight,
                                              dstUvHeight};

    for (WebRtc_UWord32 p = 0; p < 3; p++)
    {
        const WebRtc_UWord32 sw = srcWidthArray[p];
        const WebRtc_UWord32 sh = srcHeightArray[p];
        const WebRtc_UWord32 dw = dstWidthArray[p];
        const WebRtc_UWord32 dh = dstHeightArray[p];

        // Source step per destination row / column in 16.16 fixed point.
        const WebRtc_UWord32 hscale_fixed = (sh << kFractionBits) / dh;
        const WebRtc_UWord32 source_dx = sw*kFractionMax / dw;
        const bool resampleWidth = (source_dx != kFractionMax);

        // Scratch row used when the width changes. It holds one row at the
        // source stride plus one extra byte for the duplicated edge sample.
        WebRtc_UWord8* intermediaryBuf =
            resampleWidth ? new WebRtc_UWord8[srcStrideArray[p] + 1] : NULL;

        WebRtc_UWord8* filteredBuf = dstPlaneArray[p];

        for (WebRtc_UWord32 h = 0; h < dh; ++h)
        {
            // Rows go straight to the destination unless they still have to
            // be resampled along their width.
            WebRtc_UWord8* horizontalFilteredBuf =
                resampleWidth ? intermediaryBuf : filteredBuf;

            // Position of this destination row in the source, 16.16.
            WebRtc_UWord32 source_h_subpixel = (h * hscale_fixed);
            if (hscale_fixed >= (kFractionMax * 2))
            {
                // For 1/2 or less, center filter.
                source_h_subpixel += kFractionMax / 2;
            }

            const WebRtc_UWord32 source_h = source_h_subpixel >> kFractionBits;

            const WebRtc_UWord8* ptr_0 = srcPlaneArray[p] +
                                         source_h*srcStrideArray[p];
            const WebRtc_UWord8* ptr_1 = ptr_0 + srcStrideArray[p];

            // The row blend uses only the top 8 bits of the fraction.
            const WebRtc_UWord32 source_h_fraction =
                (source_h_subpixel & kFractionMask) >> 8;

            if (hscale_fixed != kFractionMax &&
                source_h_fraction && ((source_h + 1) < sh))
            {
                FilterHorizontal(horizontalFilteredBuf, ptr_0, ptr_1,
                                 sw, source_h_fraction);
            }
            else
            {
                // No blend needed (or no row below): take the row the
                // position falls on. ptr_1 may lie outside the plane here.
                memcpy(horizontalFilteredBuf, ptr_0, sw);
            }

            // Resample along the width only if necessary.
            if (resampleWidth)
            {
                // FilterVertical reads one sample to the right of the last
                // position it visits; give it a copy of the edge sample.
                horizontalFilteredBuf[sw] = horizontalFilteredBuf[sw - 1];
                FilterVertical(filteredBuf, horizontalFilteredBuf,
                               dw, source_dx);
            }

            filteredBuf += dstStrideArray[p];
        }

        delete [] intermediaryBuf;
    }

    delete [] srcTmp;

    // Filtered image was placed in an aligned buffer. If any plane carries
    // row padding, copy the image over into a tightly packed buffer.
    if ((dstStride > dstWidth) || (dstUvStride > dstUvWidth))
    {
        WebRtc_UWord8* dstFinal =
            new WebRtc_UWord8[(dstWidth*dstHeight*3) >> 1];
        WebRtc_UWord8* packedRow = dstFinal;

        for (WebRtc_UWord32 p = 0; p < 3; p++)
        {
            const WebRtc_UWord8* paddedRow = dstPlaneArray[p];

            for (WebRtc_UWord32 i = 0; i < dstHeightArray[p]; i++)
            {
                memcpy(packedRow, paddedRow, dstWidthArray[p]);
                packedRow += dstWidthArray[p];
                paddedRow += dstStrideArray[p];
            }
        }

        delete [] dstFrame;
        dstFrame = dstFinal;
    }

    return dstHeight;
}
|
|
|
|
} // namespace webrtc
|