Revert "libyuv: update to 2f101fdb"
Compile failures on the Linux platform.
BUG=webm:1253
This reverts commit aa81375d73.
Change-Id: Ibab2c4827bc21518dc03c6e9716b5015cff56fc7
--- a/third_party/libyuv/source/scale_win.cc
+++ b/third_party/libyuv/source/scale_win.cc
@@ -16,8 +16,9 @@ namespace libyuv {
 extern "C" {
 #endif
 
-// This module is for 32 bit Visual C x86 and clangcl
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+// This module is for Visual C x86.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
+    defined(_MSC_VER) && !defined(__clang__)
 
 // Offsets for source bytes 0 to 9
 static uvec8 kShuf0 =
@@ -95,8 +96,8 @@ static uvec16 kScaleAb2 =
 
 // Reads 32 pixels, throws half away and writes 16 pixels.
 __declspec(naked)
-void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) {
   __asm {
     mov eax, [esp + 4]        // src_ptr
                               // src_stride ignored
@@ -121,28 +122,31 @@ void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
 
 // Blends 32x1 rectangle to 16x1.
 __declspec(naked)
-void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width) {
   __asm {
     mov eax, [esp + 4]        // src_ptr
                               // src_stride
     mov edx, [esp + 12]       // dst_ptr
     mov ecx, [esp + 16]       // dst_width
 
-    pcmpeqb xmm4, xmm4        // constant 0x0101
-    psrlw xmm4, 15
-    packuswb xmm4, xmm4
-    pxor xmm5, xmm5           // constant 0
+    pcmpeqb xmm5, xmm5        // generate mask 0x00ff00ff
+    psrlw xmm5, 8
 
   wloop:
     movdqu xmm0, [eax]
     movdqu xmm1, [eax + 16]
     lea eax, [eax + 32]
-    pmaddubsw xmm0, xmm4      // horizontal add
-    pmaddubsw xmm1, xmm4
-    pavgw xmm0, xmm5          // (x + 1) / 2
-    pavgw xmm1, xmm5
+
+    movdqa xmm2, xmm0         // average columns (32 to 16 pixels)
+    psrlw xmm0, 8
+    movdqa xmm3, xmm1
+    psrlw xmm1, 8
+    pand xmm2, xmm5
+    pand xmm3, xmm5
+    pavgw xmm0, xmm2
+    pavgw xmm1, xmm3
     packuswb xmm0, xmm1
 
     movdqu [edx], xmm0
     lea edx, [edx + 16]
     sub ecx, 16
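The hunk above swaps two strategies for the same 2:1 horizontal average: the reverted SSSE3 code sums adjacent byte pairs with pmaddubsw against a 0x0101 constant and rounds with pavgw against zero, while the restored SSE2 code isolates even and odd bytes with a mask and a shift, then averages them with pavgw. A scalar C++ sketch of both paths (hypothetical helper names, for illustration only, not part of the patch):

    #include <cassert>
    #include <cstdint>

    // Reverted SSSE3 idea: pmaddubsw with 0x0101 adds byte pairs into
    // 16-bit lanes; pavgw with zero then computes (sum + 1) >> 1.
    static uint8_t AvgPairMadd(uint8_t a, uint8_t b) {
      uint16_t sum = uint16_t(a) + b;    // pmaddubsw xmm0, xmm4 (0x0101)
      return uint8_t((sum + 1) >> 1);    // pavgw xmm0, xmm5 (zero)
    }

    // Restored SSE2 idea: pand keeps the even bytes, psrlw 8 keeps the
    // odd bytes; pavgw averages the two lanes with the same rounding.
    static uint8_t AvgPairMask(uint8_t a, uint8_t b) {
      uint16_t even = a;                      // pand xmm2, xmm5 (0x00ff00ff)
      uint16_t odd = b;                       // psrlw xmm0, 8
      return uint8_t((even + odd + 1) >> 1);  // pavgw xmm0, xmm2
    }

    int main() {
      for (int a = 0; a < 256; ++a)
        for (int b = 0; b < 256; ++b)
          assert(AvgPairMadd(uint8_t(a), uint8_t(b)) ==
                 AvgPairMask(uint8_t(a), uint8_t(b)));
      return 0;
    }

Both paths round identically for this 2x1 case, so the revert changes instruction selection here rather than results.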
@@ -154,19 +158,16 @@ void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
 
 // Blends 32x2 rectangle to 16x1.
 __declspec(naked)
-void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width) {
   __asm {
     push esi
     mov eax, [esp + 4 + 4]    // src_ptr
     mov esi, [esp + 4 + 8]    // src_stride
     mov edx, [esp + 4 + 12]   // dst_ptr
     mov ecx, [esp + 4 + 16]   // dst_width
 
-    pcmpeqb xmm4, xmm4        // constant 0x0101
-    psrlw xmm4, 15
-    packuswb xmm4, xmm4
-    pxor xmm5, xmm5           // constant 0
+    pcmpeqb xmm5, xmm5        // generate mask 0x00ff00ff
+    psrlw xmm5, 8
 
   wloop:
     movdqu xmm0, [eax]
@@ -174,17 +175,19 @@ void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
     movdqu xmm2, [eax + esi]
     movdqu xmm3, [eax + esi + 16]
     lea eax, [eax + 32]
-    pmaddubsw xmm0, xmm4      // horizontal add
-    pmaddubsw xmm1, xmm4
-    pmaddubsw xmm2, xmm4
-    pmaddubsw xmm3, xmm4
-    paddw xmm0, xmm2          // vertical add
-    paddw xmm1, xmm3
-    psrlw xmm0, 1
-    psrlw xmm1, 1
-    pavgw xmm0, xmm5          // (x + 1) / 2
-    pavgw xmm1, xmm5
+    pavgb xmm0, xmm2          // average rows
+    pavgb xmm1, xmm3
+
+    movdqa xmm2, xmm0         // average columns (32 to 16 pixels)
+    psrlw xmm0, 8
+    movdqa xmm3, xmm1
+    psrlw xmm1, 8
+    pand xmm2, xmm5
+    pand xmm3, xmm5
+    pavgw xmm0, xmm2
+    pavgw xmm1, xmm3
     packuswb xmm0, xmm1
 
     movdqu [edx], xmm0
     lea edx, [edx + 16]
     sub ecx, 16
@@ -243,12 +246,14 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
     vmovdqu ymm0, [eax]
     vmovdqu ymm1, [eax + 32]
     lea eax, [eax + 64]
-    vpmaddubsw ymm0, ymm0, ymm4    // horizontal add
+
+    vpmaddubsw ymm0, ymm0, ymm4    // average horizontally
     vpmaddubsw ymm1, ymm1, ymm4
     vpavgw ymm0, ymm0, ymm5        // (x + 1) / 2
     vpavgw ymm1, ymm1, ymm5
     vpackuswb ymm0, ymm0, ymm1
     vpermq ymm0, ymm0, 0xd8        // unmutate vpackuswb
 
     vmovdqu [edx], ymm0
     lea edx, [edx + 32]
     sub ecx, 32
@@ -259,8 +264,6 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
   }
 }
 
-// For rounding, average = (sum + 2) / 4
-// becomes average((sum >> 1), 0)
 // Blends 64x2 rectangle to 32x1.
 __declspec(naked)
 void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
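The comment pair removed above documents the rounding identity the reverted Box code relied on: a rounded divide by 4, (sum + 2) / 4, can be built as a halving shift followed by pavgw against zero, i.e. average(sum >> 1, 0). A standalone check of that identity (illustrative C++, not part of the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      // pavgw computes (a + b + 1) >> 1; with b == 0 this rounds a up.
      // A 2x2 box sum of 8-bit pixels fits in 10 bits, so test 0..1020.
      for (uint32_t sum = 0; sum <= 1020; ++sum) {
        uint32_t via_avg = ((sum >> 1) + 1) >> 1;  // psrlw 1; pavgw with 0
        assert(via_avg == (sum + 2) >> 2);
      }
      return 0;
    }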
@@ -278,23 +281,19 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
     vpxor ymm5, ymm5, ymm5         // constant 0
 
   wloop:
-    vmovdqu ymm0, [eax]
+    vmovdqu ymm0, [eax]            // average rows
     vmovdqu ymm1, [eax + 32]
-    vmovdqu ymm2, [eax + esi]
-    vmovdqu ymm3, [eax + esi + 32]
+    vpavgb ymm0, ymm0, [eax + esi]
+    vpavgb ymm1, ymm1, [eax + esi + 32]
     lea eax, [eax + 64]
-    vpmaddubsw ymm0, ymm0, ymm4    // horizontal add
+
+    vpmaddubsw ymm0, ymm0, ymm4    // average horizontally
     vpmaddubsw ymm1, ymm1, ymm4
-    vpmaddubsw ymm2, ymm2, ymm4
-    vpmaddubsw ymm3, ymm3, ymm4
-    vpaddw ymm0, ymm0, ymm2        // vertical add
-    vpaddw ymm1, ymm1, ymm3
-    vpsrlw ymm0, ymm0, 1           // (x + 2) / 4 = (x / 2 + 1) / 2
-    vpsrlw ymm1, ymm1, 1
     vpavgw ymm0, ymm0, ymm5        // (x + 1) / 2
     vpavgw ymm1, ymm1, ymm5
     vpackuswb ymm0, ymm0, ymm1
     vpermq ymm0, ymm0, 0xd8        // unmutate vpackuswb
 
     vmovdqu [edx], ymm0
     lea edx, [edx + 32]
     sub ecx, 32
@@ -309,7 +308,7 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
 
 // Point samples 32 pixels to 8 pixels.
 __declspec(naked)
-void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
   __asm {
     mov eax, [esp + 4]        // src_ptr
@@ -340,7 +339,7 @@ void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
 
 // Blends 32x4 rectangle to 8x1.
 __declspec(naked)
-void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                             uint8* dst_ptr, int dst_width) {
   __asm {
     push esi
@@ -350,40 +349,42 @@ void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
     mov edx, [esp + 8 + 12]    // dst_ptr
     mov ecx, [esp + 8 + 16]    // dst_width
     lea edi, [esi + esi * 2]   // src_stride * 3
-    pcmpeqb xmm4, xmm4         // constant 0x0101
-    psrlw xmm4, 15
-    movdqa xmm5, xmm4
-    packuswb xmm4, xmm4
-    psllw xmm5, 3              // constant 0x0008
+    pcmpeqb xmm7, xmm7         // generate mask 0x00ff00ff
+    psrlw xmm7, 8
 
   wloop:
     movdqu xmm0, [eax]         // average rows
     movdqu xmm1, [eax + 16]
     movdqu xmm2, [eax + esi]
     movdqu xmm3, [eax + esi + 16]
-    pmaddubsw xmm0, xmm4       // horizontal add
-    pmaddubsw xmm1, xmm4
-    pmaddubsw xmm2, xmm4
-    pmaddubsw xmm3, xmm4
-    paddw xmm0, xmm2           // vertical add rows 0, 1
-    paddw xmm1, xmm3
+    pavgb xmm0, xmm2
+    pavgb xmm1, xmm3
     movdqu xmm2, [eax + esi * 2]
     movdqu xmm3, [eax + esi * 2 + 16]
-    pmaddubsw xmm2, xmm4
-    pmaddubsw xmm3, xmm4
-    paddw xmm0, xmm2           // add row 2
-    paddw xmm1, xmm3
-    movdqu xmm2, [eax + edi]
-    movdqu xmm3, [eax + edi + 16]
+    movdqu xmm4, [eax + edi]
+    movdqu xmm5, [eax + edi + 16]
     lea eax, [eax + 32]
-    pmaddubsw xmm2, xmm4
-    pmaddubsw xmm3, xmm4
-    paddw xmm0, xmm2           // add row 3
-    paddw xmm1, xmm3
-    phaddw xmm0, xmm1
-    paddw xmm0, xmm5           // + 8 for round
-    psrlw xmm0, 4              // /16 for average of 4 * 4
+    pavgb xmm2, xmm4
+    pavgb xmm3, xmm5
+    pavgb xmm0, xmm2
+    pavgb xmm1, xmm3
+
+    movdqa xmm2, xmm0          // average columns (32 to 16 pixels)
+    psrlw xmm0, 8
+    movdqa xmm3, xmm1
+    psrlw xmm1, 8
+    pand xmm2, xmm7
+    pand xmm3, xmm7
+    pavgw xmm0, xmm2
+    pavgw xmm1, xmm3
+    packuswb xmm0, xmm1
+
+    movdqa xmm2, xmm0          // average columns (16 to 8 pixels)
+    psrlw xmm0, 8
+    pand xmm2, xmm7
+    pavgw xmm0, xmm2
     packuswb xmm0, xmm0
 
     movq qword ptr [edx], xmm0
     lea edx, [edx + 8]
     sub ecx, 8
@@ -442,41 +443,37 @@ void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
     mov edx, [esp + 8 + 12]      // dst_ptr
     mov ecx, [esp + 8 + 16]      // dst_width
     lea edi, [esi + esi * 2]     // src_stride * 3
-    vpcmpeqb ymm4, ymm4, ymm4    // constant 0x0101
-    vpsrlw ymm4, ymm4, 15
-    vpsllw ymm5, ymm4, 3         // constant 0x0008
-    vpackuswb ymm4, ymm4, ymm4
+    vpcmpeqb ymm7, ymm7, ymm7    // generate mask 0x00ff00ff
+    vpsrlw ymm7, ymm7, 8
 
   wloop:
     vmovdqu ymm0, [eax]          // average rows
     vmovdqu ymm1, [eax + 32]
-    vmovdqu ymm2, [eax + esi]
-    vmovdqu ymm3, [eax + esi + 32]
-    vpmaddubsw ymm0, ymm0, ymm4  // horizontal add
-    vpmaddubsw ymm1, ymm1, ymm4
-    vpmaddubsw ymm2, ymm2, ymm4
-    vpmaddubsw ymm3, ymm3, ymm4
-    vpaddw ymm0, ymm0, ymm2      // vertical add rows 0, 1
-    vpaddw ymm1, ymm1, ymm3
+    vpavgb ymm0, ymm0, [eax + esi]
+    vpavgb ymm1, ymm1, [eax + esi + 32]
     vmovdqu ymm2, [eax + esi * 2]
    vmovdqu ymm3, [eax + esi * 2 + 32]
-    vpmaddubsw ymm2, ymm2, ymm4
-    vpmaddubsw ymm3, ymm3, ymm4
-    vpaddw ymm0, ymm0, ymm2      // add row 2
-    vpaddw ymm1, ymm1, ymm3
-    vmovdqu ymm2, [eax + edi]
-    vmovdqu ymm3, [eax + edi + 32]
-    lea eax, [eax + 64]
-    vpmaddubsw ymm2, ymm2, ymm4
-    vpmaddubsw ymm3, ymm3, ymm4
-    vpaddw ymm0, ymm0, ymm2      // add row 3
-    vpaddw ymm1, ymm1, ymm3
-    vphaddw ymm0, ymm0, ymm1     // mutates
-    vpermq ymm0, ymm0, 0xd8      // unmutate vphaddw
-    vpaddw ymm0, ymm0, ymm5      // + 8 for round
-    vpsrlw ymm0, ymm0, 4         // /32 for average of 4 * 4
+    vpavgb ymm2, ymm2, [eax + edi]
+    vpavgb ymm3, ymm3, [eax + edi + 32]
+    lea eax, [eax + 64]
+    vpavgb ymm0, ymm0, ymm2
+    vpavgb ymm1, ymm1, ymm3
+
+    vpand ymm2, ymm0, ymm7       // average columns (64 to 32 pixels)
+    vpand ymm3, ymm1, ymm7
+    vpsrlw ymm0, ymm0, 8
+    vpsrlw ymm1, ymm1, 8
+    vpavgw ymm0, ymm0, ymm2
+    vpavgw ymm1, ymm1, ymm3
+    vpackuswb ymm0, ymm0, ymm1
+    vpermq ymm0, ymm0, 0xd8      // unmutate vpackuswb
+
+    vpand ymm2, ymm0, ymm7       // average columns (32 to 16 pixels)
+    vpsrlw ymm0, ymm0, 8
+    vpavgw ymm0, ymm0, ymm2
     vpackuswb ymm0, ymm0, ymm0
     vpermq ymm0, ymm0, 0xd8      // unmutate vpackuswb
 
     vmovdqu [edx], xmm0
     lea edx, [edx + 16]
     sub ecx, 16
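In the hunk above, the reverted AVX2 code computed an exact rounded 4x4 mean (horizontal pmaddubsw sums, vertical adds, then + 8 and a shift by 4), while the restored code cascades vpavgb averages, rounding at each stage. A scalar sketch of the exact variant (hypothetical helper name, assumptions noted in comments):

    #include <cstdint>

    // Exact 4x4 box average as in the reverted path: sum 16 pixels,
    // add 8 (half the divisor) for rounding, shift right by 4.
    // Assumes p points at the top-left pixel of a 4x4 block.
    static uint8_t Box4x4(const uint8_t* p, int stride) {
      uint32_t sum = 0;
      for (int y = 0; y < 4; ++y)
        for (int x = 0; x < 4; ++x)
          sum += p[y * stride + x];
      return uint8_t((sum + 8) >> 4);  // vpaddw ymm0, ymm5 (+8); vpsrlw 4
    }

The cascaded-pavgb form may differ from this exact mean by a small rounding bias.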
@@ -502,9 +499,9 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                               // src_stride ignored
     mov edx, [esp + 12]       // dst_ptr
     mov ecx, [esp + 16]       // dst_width
-    movdqa xmm3, xmmword ptr kShuf0
-    movdqa xmm4, xmmword ptr kShuf1
-    movdqa xmm5, xmmword ptr kShuf2
+    movdqa xmm3, kShuf0
+    movdqa xmm4, kShuf1
+    movdqa xmm5, kShuf2
 
   wloop:
     movdqu xmm0, [eax]
@@ -551,12 +548,12 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
     mov esi, [esp + 4 + 8]     // src_stride
     mov edx, [esp + 4 + 12]    // dst_ptr
     mov ecx, [esp + 4 + 16]    // dst_width
-    movdqa xmm2, xmmword ptr kShuf01
-    movdqa xmm3, xmmword ptr kShuf11
-    movdqa xmm4, xmmword ptr kShuf21
-    movdqa xmm5, xmmword ptr kMadd01
-    movdqa xmm6, xmmword ptr kMadd11
-    movdqa xmm7, xmmword ptr kRound34
+    movdqa xmm2, kShuf01
+    movdqa xmm3, kShuf11
+    movdqa xmm4, kShuf21
+    movdqa xmm5, kMadd01
+    movdqa xmm6, kMadd11
+    movdqa xmm7, kRound34
 
   wloop:
     movdqu xmm0, [eax]         // pixels 0..7
@@ -582,7 +579,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
     lea eax, [eax + 32]
     pavgb xmm0, xmm1
     pshufb xmm0, xmm4
-    movdqa xmm1, xmmword ptr kMadd21
+    movdqa xmm1, kMadd21
     pmaddubsw xmm0, xmm1
     paddsw xmm0, xmm7
     psrlw xmm0, 2
@@ -608,12 +605,12 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
     mov esi, [esp + 4 + 8]     // src_stride
     mov edx, [esp + 4 + 12]    // dst_ptr
     mov ecx, [esp + 4 + 16]    // dst_width
-    movdqa xmm2, xmmword ptr kShuf01
-    movdqa xmm3, xmmword ptr kShuf11
-    movdqa xmm4, xmmword ptr kShuf21
-    movdqa xmm5, xmmword ptr kMadd01
-    movdqa xmm6, xmmword ptr kMadd11
-    movdqa xmm7, xmmword ptr kRound34
+    movdqa xmm2, kShuf01
+    movdqa xmm3, kShuf11
+    movdqa xmm4, kShuf21
+    movdqa xmm5, kMadd01
+    movdqa xmm6, kMadd11
+    movdqa xmm7, kRound34
 
   wloop:
     movdqu xmm0, [eax]         // pixels 0..7
@@ -642,7 +639,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
     pavgb xmm1, xmm0
     pavgb xmm0, xmm1
     pshufb xmm0, xmm4
-    movdqa xmm1, xmmword ptr kMadd21
+    movdqa xmm1, kMadd21
     pmaddubsw xmm0, xmm1
     paddsw xmm0, xmm7
     psrlw xmm0, 2
@@ -668,8 +665,8 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                               // src_stride ignored
     mov edx, [esp + 12]       // dst_ptr
     mov ecx, [esp + 16]       // dst_width
-    movdqa xmm4, xmmword ptr kShuf38a
-    movdqa xmm5, xmmword ptr kShuf38b
+    movdqa xmm4, kShuf38a
+    movdqa xmm5, kShuf38b
 
   xloop:
     movdqu xmm0, [eax]        // 16 pixels -> 0,1,2,3,4,5
@@ -701,9 +698,9 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
     mov esi, [esp + 4 + 8]     // src_stride
     mov edx, [esp + 4 + 12]    // dst_ptr
     mov ecx, [esp + 4 + 16]    // dst_width
-    movdqa xmm2, xmmword ptr kShufAc
-    movdqa xmm3, xmmword ptr kShufAc3
-    movdqa xmm4, xmmword ptr kScaleAc33
+    movdqa xmm2, kShufAc
+    movdqa xmm3, kShufAc3
+    movdqa xmm4, kScaleAc33
     pxor xmm5, xmm5
 
   xloop:
@@ -766,10 +763,10 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
     mov esi, [esp + 4 + 8]     // src_stride
     mov edx, [esp + 4 + 12]    // dst_ptr
     mov ecx, [esp + 4 + 16]    // dst_width
-    movdqa xmm2, xmmword ptr kShufAb0
-    movdqa xmm3, xmmword ptr kShufAb1
-    movdqa xmm4, xmmword ptr kShufAb2
-    movdqa xmm5, xmmword ptr kScaleAb2
+    movdqa xmm2, kShufAb0
+    movdqa xmm3, kShufAb1
+    movdqa xmm4, kShufAb2
+    movdqa xmm5, kScaleAb2
 
   xloop:
     movdqu xmm0, [eax]         // average 2 rows into xmm0
@@ -860,16 +857,6 @@ void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
 }
 #endif  // HAS_SCALEADDROW_AVX2
 
-// Constant for making pixels signed to avoid pmaddubsw
-// saturation.
-static uvec8 kFsub80 =
-  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
-
-// Constant for making pixels unsigned and adding .5 for rounding.
-static uvec16 kFadd40 =
-  { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };
-
 // Bilinear column filtering. SSSE3 version.
 __declspec(naked)
 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
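The constants deleted above implement a bias trick for pmaddubsw, which multiplies unsigned bytes by signed bytes with saturating pair sums: subtracting 0x80 from each pixel (kFsub80) keeps every weighted pair inside int16 range, and adding 0x4040 = 128*128 + 64 (kFadd40) afterwards undoes the bias and rounds before the shift by 7. A small C++ check of the arithmetic (a sketch under the assumption that the two filter weights sum to 128):

    #include <cassert>

    int main() {
      for (int a = 0; a < 256; ++a) {
        for (int b = 0; b < 256; ++b) {
          for (int f = 0; f <= 128; ++f) {
            int w0 = 128 - f, w1 = f;  // 7-bit fraction weights, sum 128
            // psubb kFsub80 biases pixels into signed range first.
            int biased = (a - 128) * w0 + (b - 128) * w1;
            assert(biased >= -32768 && biased <= 32767);  // no saturation
            // paddw kFadd40 restores the bias and adds .5; psrlw 7 scales.
            int result = (biased + 0x4040) >> 7;
            assert(result == (a * w0 + b * w1 + 64) >> 7);
          }
        }
      }
      return 0;
    }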
@@ -887,8 +874,6 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     movd xmm5, eax
     pcmpeqb xmm6, xmm6    // generate 0x007f for inverting fraction.
     psrlw xmm6, 9
-    pcmpeqb xmm7, xmm7    // generate 0x0001
-    psrlw xmm7, 15
     pextrw eax, xmm2, 1   // get x0 integer. preroll
     sub ecx, 2
     jl xloop29
@@ -911,22 +896,20 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     movd xmm4, ebx
     pshufb xmm1, xmm5     // 0011
     punpcklwd xmm0, xmm4
-    psubb xmm0, xmmword ptr kFsub80  // make pixels signed.
     pxor xmm1, xmm6       // 0..7f and 7f..0
     paddusb xmm1, xmm7    // +1 so 0..7f and 80..1
-    pmaddubsw xmm1, xmm0  // 16 bit, 2 pixels.
+    pmaddubsw xmm0, xmm1  // 16 bit, 2 pixels.
     pextrw eax, xmm2, 1   // get x0 integer. next iteration.
     pextrw edx, xmm2, 3   // get x1 integer. next iteration.
-    paddw xmm1, xmmword ptr kFadd40  // make pixels unsigned and round.
-    psrlw xmm1, 7         // 8.7 fixed point to low 8 bits.
-    packuswb xmm1, xmm1   // 8 bits, 2 pixels.
-    movd ebx, xmm1
+    psrlw xmm0, 7         // 8.7 fixed point to low 8 bits.
+    packuswb xmm0, xmm0   // 8 bits, 2 pixels.
+    movd ebx, xmm0
     mov [edi], bx
     lea edi, [edi + 2]
     sub ecx, 2            // 2 pixels
     jge xloop2
 
   xloop29:
 
     add ecx, 2 - 1
     jl xloop99
@@ -935,14 +918,11 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     movd xmm0, ebx
     psrlw xmm2, 9         // 7 bit fractions.
     pshufb xmm2, xmm5     // 0011
-    psubb xmm0, xmmword ptr kFsub80  // make pixels signed.
     pxor xmm2, xmm6       // 0..7f and 7f..0
     paddusb xmm2, xmm7    // +1 so 0..7f and 80..1
-    pmaddubsw xmm2, xmm0  // 16 bit
-    paddw xmm2, xmmword ptr kFadd40  // make pixels unsigned and round.
-    psrlw xmm2, 7         // 8.7 fixed point to low 8 bits.
-    packuswb xmm2, xmm2   // 8 bits
-    movd ebx, xmm2
+    pmaddubsw xmm0, xmm2  // 16 bit
+    psrlw xmm0, 7         // 8.7 fixed point to low 8 bits.
+    packuswb xmm0, xmm0   // 8 bits
+    movd ebx, xmm0
     mov [edi], bl
 
   xloop99:
@@ -1253,8 +1233,8 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
     mov ecx, [esp + 8 + 12]    // dst_width
     movd xmm2, [esp + 8 + 16]  // x
     movd xmm3, [esp + 8 + 20]  // dx
-    movdqa xmm4, xmmword ptr kShuffleColARGB
-    movdqa xmm5, xmmword ptr kShuffleFractions
+    movdqa xmm4, kShuffleColARGB
+    movdqa xmm5, kShuffleFractions
     pcmpeqb xmm6, xmm6         // generate 0x007f for inverting fraction.
     psrlw xmm6, 9
     pextrw eax, xmm2, 1        // get x0 integer. preroll