/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
static uvec8 kShuf2 =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };

// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
static uvec8 kShuf11 =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
static uvec8 kShuf21 =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };

// Rounding constant for the 3/4 box filters: + 2 before >> 2.
static vec16 kRound34 =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

static uvec8 kShuf38a =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

static uvec8 kShuf38b =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };

// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };

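// The kScaleAc33 and kScaleAb2 tables above turn box-filter division into a
// multiply: pmulhuw keeps the high 16 bits of a 16x16 unsigned product, so
// (sum * (65536 / n)) >> 16 approximates sum / n for n = 9, 6, 3 or 2.
// Sketch of the scalar equivalent for a 3x3 box sum:
//   uint16 avg = (uint16)((sum * (65536 / 9)) >> 16);  // ~ sum / 9
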
// Reads 32 pixels, throws half away and writes 16 pixels.
__declspec(naked)
void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    psrlw      xmm0, 8               // isolate odd pixels.
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    ret
  }
}

// Blends 32x1 rectangle to 16x1.
__declspec(naked)
void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width

    pcmpeqb    xmm4, xmm4            // constant 0x0101
    psrlw      xmm4, 15
    packuswb   xmm4, xmm4
    pxor       xmm5, xmm5            // constant 0

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    pmaddubsw  xmm0, xmm4      // horizontal add
    pmaddubsw  xmm1, xmm4
    pavgw      xmm0, xmm5      // (x + 1) / 2
    pavgw      xmm1, xmm5
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    ret
  }
}

// Blends 32x2 rectangle to 16x1.
__declspec(naked)
void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width

    pcmpeqb    xmm4, xmm4            // constant 0x0101
    psrlw      xmm4, 15
    packuswb   xmm4, xmm4
    pxor       xmm5, xmm5            // constant 0

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pmaddubsw  xmm0, xmm4      // horizontal add
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    paddw      xmm0, xmm2      // vertical add
    paddw      xmm1, xmm3
    psrlw      xmm0, 1
    psrlw      xmm1, 1
    pavgw      xmm0, xmm5      // (x + 1) / 2
    pavgw      xmm1, xmm5
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    pop        esi
    ret
  }
}

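// Scalar sketch of the 2x2 box filter above: pmaddubsw with the 0x0101
// constant adds horizontal byte pairs, paddw adds the two rows, and the
// psrlw+pavgw pair rounds the 4-pixel sum as (sum + 2) >> 2. The function
// name is illustrative only; libyuv's real C fallbacks live in
// scale_common.cc.
static __inline void ScaleRowDown2Box_SketchC(const uint8* src_ptr,
                                              ptrdiff_t src_stride,
                                              uint8* dst_ptr, int dst_width) {
  const uint8* s = src_ptr;               // row 0
  const uint8* t = src_ptr + src_stride;  // row 1
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
    s += 2;
    t += 2;
  }
}
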
#ifdef HAS_SCALEROWDOWN2_AVX2
// Reads 64 pixels, throws half away and writes 32 pixels.
__declspec(naked)
void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width

  wloop:
    vmovdqu     ymm0, [eax]
    vmovdqu     ymm1, [eax + 32]
    lea         eax,  [eax + 64]
    vpsrlw      ymm0, ymm0, 8        // isolate odd pixels.
    vpsrlw      ymm1, ymm1, 8
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8     // unmutate vpackuswb
    vmovdqu     [edx], ymm0
    lea         edx, [edx + 32]
    sub         ecx, 32
    jg          wloop

    vzeroupper
    ret
  }
}

// Blends 64x1 rectangle to 32x1.
__declspec(naked)
void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst_ptr, int dst_width) {
  __asm {
    mov         eax, [esp + 4]        // src_ptr
                                      // src_stride
    mov         edx, [esp + 12]       // dst_ptr
    mov         ecx, [esp + 16]       // dst_width

    vpcmpeqb    ymm4, ymm4, ymm4      // '1' constant, 8b
    vpsrlw      ymm4, ymm4, 15
    vpackuswb   ymm4, ymm4, ymm4
    vpxor       ymm5, ymm5, ymm5      // constant 0

  wloop:
    vmovdqu     ymm0, [eax]
    vmovdqu     ymm1, [eax + 32]
    lea         eax,  [eax + 64]
    vpmaddubsw  ymm0, ymm0, ymm4      // horizontal add
    vpmaddubsw  ymm1, ymm1, ymm4
    vpavgw      ymm0, ymm0, ymm5      // (x + 1) / 2
    vpavgw      ymm1, ymm1, ymm5
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
    vmovdqu     [edx], ymm0
    lea         edx, [edx + 32]
    sub         ecx, 32
    jg          wloop

    vzeroupper
    ret
  }
}

// For rounding, average = (sum + 2) / 4
// becomes average((sum >> 1), 0)
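// Why that is exact: write the 4-pixel sum as x = 4q + r with r in 0..3.
// Then (x + 2) >> 2 == q + (r >= 2), and pavgw of (x >> 1) with zero gives
// ((x >> 1) + 1) >> 1 == q + (r >= 2) as well, so no separate rounding
// constant is needed.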
// Blends 64x2 rectangle to 32x1.
__declspec(naked)
void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push        esi
    mov         eax, [esp + 4 + 4]    // src_ptr
    mov         esi, [esp + 4 + 8]    // src_stride
    mov         edx, [esp + 4 + 12]   // dst_ptr
    mov         ecx, [esp + 4 + 16]   // dst_width

    vpcmpeqb    ymm4, ymm4, ymm4      // '1' constant, 8b
    vpsrlw      ymm4, ymm4, 15
    vpackuswb   ymm4, ymm4, ymm4
    vpxor       ymm5, ymm5, ymm5      // constant 0

  wloop:
    vmovdqu     ymm0, [eax]
    vmovdqu     ymm1, [eax + 32]
    vmovdqu     ymm2, [eax + esi]
    vmovdqu     ymm3, [eax + esi + 32]
    lea         eax,  [eax + 64]
    vpmaddubsw  ymm0, ymm0, ymm4      // horizontal add
    vpmaddubsw  ymm1, ymm1, ymm4
    vpmaddubsw  ymm2, ymm2, ymm4
    vpmaddubsw  ymm3, ymm3, ymm4
    vpaddw      ymm0, ymm0, ymm2      // vertical add
    vpaddw      ymm1, ymm1, ymm3
    vpsrlw      ymm0, ymm0, 1         // (x + 2) / 4 = (x / 2 + 1) / 2
    vpsrlw      ymm1, ymm1, 1
    vpavgw      ymm0, ymm0, ymm5      // (x + 1) / 2
    vpavgw      ymm1, ymm1, ymm5
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
    vmovdqu     [edx], ymm0
    lea         edx, [edx + 32]
    sub         ecx, 32
    jg          wloop

    pop         esi
    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEROWDOWN2_AVX2

// Point samples 32 pixels to 8 pixels.
__declspec(naked)
void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff0000
    psrld      xmm5, 24
    pslld      xmm5, 16

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    pand       xmm0, xmm5
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    psrlw      xmm0, 8
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    sub        ecx, 8
    jg         wloop

    ret
  }
}

// Blends 32x4 rectangle to 8x1.
__declspec(naked)
void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_ptr
    mov        esi, [esp + 8 + 8]    // src_stride
    mov        edx, [esp + 8 + 12]   // dst_ptr
    mov        ecx, [esp + 8 + 16]   // dst_width
    lea        edi, [esi + esi * 2]  // src_stride * 3
    pcmpeqb    xmm4, xmm4            // constant 0x0101
    psrlw      xmm4, 15
    movdqa     xmm5, xmm4
    packuswb   xmm4, xmm4
    psllw      xmm5, 3               // constant 0x0008

  wloop:
    movdqu     xmm0, [eax]           // average rows
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    pmaddubsw  xmm0, xmm4      // horizontal add
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    paddw      xmm0, xmm2      // vertical add rows 0, 1
    paddw      xmm1, xmm3
    movdqu     xmm2, [eax + esi * 2]
    movdqu     xmm3, [eax + esi * 2 + 16]
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    paddw      xmm0, xmm2      // add row 2
    paddw      xmm1, xmm3
    movdqu     xmm2, [eax + edi]
    movdqu     xmm3, [eax + edi + 16]
    lea        eax, [eax + 32]
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    paddw      xmm0, xmm2      // add row 3
    paddw      xmm1, xmm3
    phaddw     xmm0, xmm1
    paddw      xmm0, xmm5      // + 8 for round
    psrlw      xmm0, 4         // /16 for average of 4 * 4
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    sub        ecx, 8
    jg         wloop

    pop        edi
    pop        esi
    ret
  }
}

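// Scalar equivalent of the 4x4 box filter above (sketch): each output byte
// is the rounded mean of a 4x4 block of source bytes,
//   dst[x] = (uint8)((sum_of_16_source_bytes + 8) >> 4);
// phaddw folds the per-pair column sums into 4x4 block sums before the
// + 8 round and shift.
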
#ifdef HAS_SCALEROWDOWN4_AVX2
// Point samples 64 pixels to 16 pixels.
__declspec(naked)
void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov         eax, [esp + 4]        // src_ptr
                                      // src_stride ignored
    mov         edx, [esp + 12]       // dst_ptr
    mov         ecx, [esp + 16]       // dst_width
    vpcmpeqb    ymm5, ymm5, ymm5      // generate mask 0x00ff0000
    vpsrld      ymm5, ymm5, 24
    vpslld      ymm5, ymm5, 16

  wloop:
    vmovdqu     ymm0, [eax]
    vmovdqu     ymm1, [eax + 32]
    lea         eax,  [eax + 64]
    vpand       ymm0, ymm0, ymm5
    vpand       ymm1, ymm1, ymm5
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
    vpsrlw      ymm0, ymm0, 8
    vpackuswb   ymm0, ymm0, ymm0
    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
    vmovdqu     [edx], xmm0
    lea         edx, [edx + 16]
    sub         ecx, 16
    jg          wloop

    vzeroupper
    ret
  }
}

// Blends 64x4 rectangle to 16x1.
__declspec(naked)
void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push        esi
    push        edi
    mov         eax, [esp + 8 + 4]    // src_ptr
    mov         esi, [esp + 8 + 8]    // src_stride
    mov         edx, [esp + 8 + 12]   // dst_ptr
    mov         ecx, [esp + 8 + 16]   // dst_width
    lea         edi, [esi + esi * 2]  // src_stride * 3
    vpcmpeqb    ymm4, ymm4, ymm4      // constant 0x0101
    vpsrlw      ymm4, ymm4, 15
    vpsllw      ymm5, ymm4, 3         // constant 0x0008
    vpackuswb   ymm4, ymm4, ymm4

  wloop:
    vmovdqu     ymm0, [eax]           // average rows
    vmovdqu     ymm1, [eax + 32]
    vmovdqu     ymm2, [eax + esi]
    vmovdqu     ymm3, [eax + esi + 32]
    vpmaddubsw  ymm0, ymm0, ymm4      // horizontal add
    vpmaddubsw  ymm1, ymm1, ymm4
    vpmaddubsw  ymm2, ymm2, ymm4
    vpmaddubsw  ymm3, ymm3, ymm4
    vpaddw      ymm0, ymm0, ymm2      // vertical add rows 0, 1
    vpaddw      ymm1, ymm1, ymm3
    vmovdqu     ymm2, [eax + esi * 2]
    vmovdqu     ymm3, [eax + esi * 2 + 32]
    vpmaddubsw  ymm2, ymm2, ymm4
    vpmaddubsw  ymm3, ymm3, ymm4
    vpaddw      ymm0, ymm0, ymm2      // add row 2
    vpaddw      ymm1, ymm1, ymm3
    vmovdqu     ymm2, [eax + edi]
    vmovdqu     ymm3, [eax + edi + 32]
    lea         eax,  [eax + 64]
    vpmaddubsw  ymm2, ymm2, ymm4
    vpmaddubsw  ymm3, ymm3, ymm4
    vpaddw      ymm0, ymm0, ymm2      // add row 3
    vpaddw      ymm1, ymm1, ymm3
    vphaddw     ymm0, ymm0, ymm1      // mutates
    vpermq      ymm0, ymm0, 0xd8      // unmutate vphaddw
    vpaddw      ymm0, ymm0, ymm5      // + 8 for round
    vpsrlw      ymm0, ymm0, 4         // /16 for average of 4 * 4
    vpackuswb   ymm0, ymm0, ymm0
    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
    vmovdqu     [edx], xmm0
    lea         edx, [edx + 16]
    sub         ecx, 16
    jg          wloop

    pop         edi
    pop         esi
    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEROWDOWN4_AVX2

// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

__declspec(naked)
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    movdqa     xmm3, xmmword ptr kShuf0
    movdqa     xmm4, xmmword ptr kShuf1
    movdqa     xmm5, xmmword ptr kShuf2

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    movdqa     xmm2, xmm1
    palignr    xmm1, xmm0, 8
    pshufb     xmm0, xmm3
    pshufb     xmm1, xmm4
    pshufb     xmm2, xmm5
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + 8], xmm1
    movq       qword ptr [edx + 16], xmm2
    lea        edx, [edx + 24]
    sub        ecx, 24
    jg         wloop

    ret
  }
}

// Blends 32x2 rectangle to 24x1
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Register usage:
// xmm0 src_row 0
// xmm1 src_row 1
// xmm2 shuf 0
// xmm3 shuf 1
// xmm4 shuf 2
// xmm5 madd 0
// xmm6 madd 1
// xmm7 kRound34

// Note that movdqa+palign may be better than movdqu.
__declspec(naked)
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, xmmword ptr kShuf01
    movdqa     xmm3, xmmword ptr kShuf11
    movdqa     xmm4, xmmword ptr kShuf21
    movdqa     xmm5, xmmword ptr kMadd01
    movdqa     xmm6, xmmword ptr kMadd11
    movdqa     xmm7, xmmword ptr kRound34

  wloop:
    movdqu     xmm0, [eax]           // pixels 0..7
    movdqu     xmm1, [eax + esi]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax + 8]       // pixels 8..15
    movdqu     xmm1, [eax + esi + 8]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 8], xmm0
    movdqu     xmm0, [eax + 16]      // pixels 16..23
    movdqu     xmm1, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, xmmword ptr kMadd21
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 16], xmm0
    lea        edx, [edx + 24]
    sub        ecx, 24
    jg         wloop

    pop        esi
    ret
  }
}

// Note that movdqa+palign may be better than movdqu.
__declspec(naked)
void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, xmmword ptr kShuf01
    movdqa     xmm3, xmmword ptr kShuf11
    movdqa     xmm4, xmmword ptr kShuf21
    movdqa     xmm5, xmmword ptr kMadd01
    movdqa     xmm6, xmmword ptr kMadd11
    movdqa     xmm7, xmmword ptr kRound34

  wloop:
    movdqu     xmm0, [eax]           // pixels 0..7
    movdqu     xmm1, [eax + esi]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax + 8]       // pixels 8..15
    movdqu     xmm1, [eax + esi + 8]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 8], xmm0
    movdqu     xmm0, [eax + 16]      // pixels 16..23
    movdqu     xmm1, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, xmmword ptr kMadd21
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 16], xmm0
    lea        edx, [edx + 24]
    sub        ecx, 24
    jg         wloop

    pop        esi
    ret
  }
}

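// How the 3/4 filters above weight pixels (sketch): each group of 4 source
// pixels a,b,c,d yields 3 destination pixels via the kShuf/kMadd pairs:
//   dst0 = (3 * a + 1 * b + 2) >> 2
//   dst1 = (2 * b + 2 * c + 2) >> 2
//   dst2 = (1 * c + 3 * d + 2) >> 2
// Vertically, the _1_Box variant averages the two rows once (weights
// 1/2, 1/2), while the _0_Box variant averages twice to approximate weights
// 3/4, 1/4 in favor of the nearer row.
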
// 3/8 point sampler

// Scale 32 pixels to 12
__declspec(naked)
void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    movdqa     xmm4, xmmword ptr kShuf38a
    movdqa     xmm5, xmmword ptr kShuf38b

  xloop:
    movdqu     xmm0, [eax]           // 16 pixels -> 0,1,2,3,4,5
    movdqu     xmm1, [eax + 16]      // 16 pixels -> 6,7,8,9,10,11
    lea        eax, [eax + 32]
    pshufb     xmm0, xmm4
    pshufb     xmm1, xmm5
    paddusb    xmm0, xmm1

    movq       qword ptr [edx], xmm0  // write 12 pixels
    movhlps    xmm1, xmm0
    movd       [edx + 8], xmm1
    lea        edx, [edx + 12]
    sub        ecx, 12
    jg         xloop

    ret
  }
}

// Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked)
void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, xmmword ptr kShufAc
    movdqa     xmm3, xmmword ptr kShufAc3
    movdqa     xmm4, xmmword ptr kScaleAc33
    pxor       xmm5, xmm5

  xloop:
    movdqu     xmm0, [eax]           // sum up 3 rows into xmm0/1
    movdqu     xmm6, [eax + esi]
    movhlps    xmm1, xmm0
    movhlps    xmm7, xmm6
    punpcklbw  xmm0, xmm5
    punpcklbw  xmm1, xmm5
    punpcklbw  xmm6, xmm5
    punpcklbw  xmm7, xmm5
    paddusw    xmm0, xmm6
    paddusw    xmm1, xmm7
    movdqu     xmm6, [eax + esi * 2]
    lea        eax, [eax + 16]
    movhlps    xmm7, xmm6
    punpcklbw  xmm6, xmm5
    punpcklbw  xmm7, xmm5
    paddusw    xmm0, xmm6
    paddusw    xmm1, xmm7

    movdqa     xmm6, xmm0            // 8 pixels -> 0,1,2 of xmm6
    psrldq     xmm0, 2
    paddusw    xmm6, xmm0
    psrldq     xmm0, 2
    paddusw    xmm6, xmm0
    pshufb     xmm6, xmm2

    movdqa     xmm7, xmm1            // 8 pixels -> 3,4,5 of xmm6
    psrldq     xmm1, 2
    paddusw    xmm7, xmm1
    psrldq     xmm1, 2
    paddusw    xmm7, xmm1
    pshufb     xmm7, xmm3
    paddusw    xmm6, xmm7

    pmulhuw    xmm6, xmm4            // divide by 9,9,6, 9,9,6
    packuswb   xmm6, xmm6

    movd       [edx], xmm6           // write 6 pixels
    psrlq      xmm6, 16
    movd       [edx + 2], xmm6
    lea        edx, [edx + 6]
    sub        ecx, 6
    jg         xloop

    pop        esi
    ret
  }
}

// Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked)
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, xmmword ptr kShufAb0
    movdqa     xmm3, xmmword ptr kShufAb1
    movdqa     xmm4, xmmword ptr kShufAb2
    movdqa     xmm5, xmmword ptr kScaleAb2

  xloop:
    movdqu     xmm0, [eax]           // average 2 rows into xmm0
    movdqu     xmm1, [eax + esi]
    lea        eax, [eax + 16]
    pavgb      xmm0, xmm1

    movdqa     xmm1, xmm0            // 16 pixels -> 0,1,2,3,4,5 of xmm1
    pshufb     xmm1, xmm2
    movdqa     xmm6, xmm0
    pshufb     xmm6, xmm3
    paddusw    xmm1, xmm6
    pshufb     xmm0, xmm4
    paddusw    xmm1, xmm0

    pmulhuw    xmm1, xmm5            // divide by 3,3,2, 3,3,2
    packuswb   xmm1, xmm1

    movd       [edx], xmm1           // write 6 pixels
    psrlq      xmm1, 16
    movd       [edx + 2], xmm1
    lea        edx, [edx + 6]
    sub        ecx, 6
    jg         xloop

    pop        esi
    ret
  }
}

// Reads 16 bytes and accumulates to 16 shorts at a time.
__declspec(naked)
void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  __asm {
    mov        eax, [esp + 4]   // src_ptr
    mov        edx, [esp + 8]   // dst_ptr
    mov        ecx, [esp + 12]  // src_width
    pxor       xmm5, xmm5

  // sum rows
  xloop:
    movdqu     xmm3, [eax]       // read 16 bytes
    lea        eax, [eax + 16]
    movdqu     xmm0, [edx]       // read 16 words from destination
    movdqu     xmm1, [edx + 16]
    movdqa     xmm2, xmm3
    punpcklbw  xmm2, xmm5
    punpckhbw  xmm3, xmm5
    paddusw    xmm0, xmm2        // sum 16 words
    paddusw    xmm1, xmm3
    movdqu     [edx], xmm0       // write 16 words to destination
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 16
    jg         xloop
    ret
  }
}

#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
__declspec(naked)
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  __asm {
    mov         eax, [esp + 4]   // src_ptr
    mov         edx, [esp + 8]   // dst_ptr
    mov         ecx, [esp + 12]  // src_width
    vpxor       ymm5, ymm5, ymm5

  // sum rows
  xloop:
    vmovdqu     ymm3, [eax]       // read 32 bytes
    lea         eax, [eax + 32]
    vpermq      ymm3, ymm3, 0xd8  // unmutate for vpunpck
    vpunpcklbw  ymm2, ymm3, ymm5
    vpunpckhbw  ymm3, ymm3, ymm5
    vpaddusw    ymm0, ymm2, [edx] // sum 16 words
    vpaddusw    ymm1, ymm3, [edx + 32]
    vmovdqu     [edx], ymm0       // write 32 words to destination
    vmovdqu     [edx + 32], ymm1
    lea         edx, [edx + 64]
    sub         ecx, 32
    jg          xloop

    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEADDROW_AVX2

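// Scalar equivalent of the row accumulators above (sketch): widen each
// source byte and add it into the 16-bit destination row with unsigned
// saturation, which is what paddusw/vpaddusw provide:
//   for (int i = 0; i < src_width; ++i) {
//     int sum = dst_ptr[i] + src_ptr[i];
//     dst_ptr[i] = (uint16)(sum > 65535 ? 65535 : sum);
//   }
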
// Constant for making pixels signed to avoid pmaddubsw
// saturation.
static uvec8 kFsub80 =
  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };

// Constant for making pixels unsigned and adding .5 for rounding.
static uvec16 kFadd40 =
  { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };

// Bilinear column filtering. SSSE3 version.
__declspec(naked)
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        edi, [esp + 12 + 4]    // dst_ptr
    mov        esi, [esp + 12 + 8]    // src_ptr
    mov        ecx, [esp + 12 + 12]   // dst_width
    movd       xmm2, [esp + 12 + 16]  // x
    movd       xmm3, [esp + 12 + 20]  // dx
    mov        eax, 0x04040000      // shuffle to line up fractions with pixel.
    movd       xmm5, eax
    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
    psrlw      xmm6, 9
    pcmpeqb    xmm7, xmm7           // generate 0x0001
    psrlw      xmm7, 15
    pextrw     eax, xmm2, 1         // get x0 integer. preroll
    sub        ecx, 2
    jl         xloop29

    movdqa     xmm0, xmm2           // x1 = x0 + dx
    paddd      xmm0, xmm3
    punpckldq  xmm2, xmm0           // x0 x1
    punpckldq  xmm3, xmm3           // dx dx
    paddd      xmm3, xmm3           // dx * 2, dx * 2
    pextrw     edx, xmm2, 3         // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa     xmm1, xmm2           // x0, x1 fractions.
    paddd      xmm2, xmm3           // x += dx
    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd       xmm0, ebx
    psrlw      xmm1, 9              // 7 bit fractions.
    movzx      ebx, word ptr [esi + edx]  // 2 source x1 pixels
    movd       xmm4, ebx
    pshufb     xmm1, xmm5           // 0011
    punpcklwd  xmm0, xmm4
    psubb      xmm0, xmmword ptr kFsub80  // make pixels signed.
    pxor       xmm1, xmm6           // 0..7f and 7f..0
    paddusb    xmm1, xmm7           // +1 so 0..7f and 80..1
    pmaddubsw  xmm1, xmm0           // 16 bit, 2 pixels.
    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
    paddw      xmm1, xmmword ptr kFadd40  // make pixels unsigned and round.
    psrlw      xmm1, 7              // 8.7 fixed point to low 8 bits.
    packuswb   xmm1, xmm1           // 8 bits, 2 pixels.
    movd       ebx, xmm1
    mov        [edi], bx
    lea        edi, [edi + 2]
    sub        ecx, 2               // 2 pixels
    jge        xloop2

 xloop29:
    add        ecx, 2 - 1
    jl         xloop99

    // 1 pixel remainder
    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd       xmm0, ebx
    psrlw      xmm2, 9              // 7 bit fractions.
    pshufb     xmm2, xmm5           // 0011
    psubb      xmm0, xmmword ptr kFsub80  // make pixels signed.
    pxor       xmm2, xmm6           // 0..7f and 7f..0
    paddusb    xmm2, xmm7           // +1 so 0..7f and 80..1
    pmaddubsw  xmm2, xmm0           // 16 bit
    paddw      xmm2, xmmword ptr kFadd40  // make pixels unsigned and round.
    psrlw      xmm2, 7              // 8.7 fixed point to low 8 bits.
    packuswb   xmm2, xmm2           // 8 bits
    movd       ebx, xmm2
    mov        [edi], bl

 xloop99:

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}

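// Scalar equivalent of the filter above (sketch): x is 16.16 fixed point and
// only the top 7 fraction bits are used:
//   int xi = x >> 16;
//   int f = (x >> 9) & 0x7f;
//   dst = (uint8)((src[xi] * (128 - f) + src[xi + 1] * f + 64) >> 7);
// kFsub80 biases pixels into signed range so pmaddubsw cannot saturate, and
// kFadd40 (0x4040) both undoes that bias (0x80 * 128 = 0x4000) and adds 0x40
// for round-to-nearest before the final >> 7.
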
// Reads 16 pixels, duplicates them and writes 32 pixels.
__declspec(naked)
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx) {
  __asm {
    mov        edx, [esp + 4]    // dst_ptr
    mov        eax, [esp + 8]    // src_ptr
    mov        ecx, [esp + 12]   // dst_width

  wloop:
    movdqu     xmm0, [eax]
    lea        eax,  [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm0
    punpckhbw  xmm1, xmm1
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         wloop

    ret
  }
}

// Reads 8 pixels, throws half away and writes 4 odd pixels (1, 3, 5, 7)
__declspec(naked)
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_argb
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_argb
    mov        ecx, [esp + 16]       // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    shufps     xmm0, xmm1, 0xdd
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    ret
  }
}

// Blends 8x1 rectangle to 4x1.
__declspec(naked)
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_argb
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_argb
    mov        ecx, [esp + 16]       // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    movdqa     xmm2, xmm0
    shufps     xmm0, xmm1, 0x88      // even pixels
    shufps     xmm2, xmm1, 0xdd      // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    ret
  }
}

// Blends 8x2 rectangle to 4x1.
__declspec(naked)
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_argb
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_argb
    mov        ecx, [esp + 4 + 16]   // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88      // even pixels
    shufps     xmm2, xmm1, 0xdd      // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    pop        esi
    ret
  }
}

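// The shufps immediates above select alternating ARGB pixels: 0x88 keeps
// dwords 0,2 of each source register (even pixels) and 0xdd keeps dwords
// 1,3 (odd pixels), so the following pavgb averages horizontal pixel pairs.
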
// Reads 4 pixels at a time.
__declspec(naked)
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push       ebx
    push       edi
    mov        eax, [esp + 8 + 4]    // src_argb
                                     // src_stride ignored
    mov        ebx, [esp + 8 + 12]   // src_stepx
    mov        edx, [esp + 8 + 16]   // dst_argb
    mov        ecx, [esp + 8 + 20]   // dst_width
    lea        ebx, [ebx * 4]
    lea        edi, [ebx + ebx * 2]

  wloop:
    movd       xmm0, [eax]
    movd       xmm1, [eax + ebx]
    punpckldq  xmm0, xmm1
    movd       xmm2, [eax + ebx * 2]
    movd       xmm3, [eax + edi]
    lea        eax,  [eax + ebx * 4]
    punpckldq  xmm2, xmm3
    punpcklqdq xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    pop        edi
    pop        ebx
    ret
  }
}

// Blends four 2x2 to 4x1.
__declspec(naked)
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        eax, [esp + 12 + 4]    // src_argb
    mov        esi, [esp + 12 + 8]    // src_stride
    mov        ebx, [esp + 12 + 12]   // src_stepx
    mov        edx, [esp + 12 + 16]   // dst_argb
    mov        ecx, [esp + 12 + 20]   // dst_width
    lea        esi, [eax + esi]       // row1 pointer
    lea        ebx, [ebx * 4]
    lea        edi, [ebx + ebx * 2]

  wloop:
    movq       xmm0, qword ptr [eax]  // row0 4 pairs
    movhps     xmm0, qword ptr [eax + ebx]
    movq       xmm1, qword ptr [eax + ebx * 2]
    movhps     xmm1, qword ptr [eax + edi]
    lea        eax,  [eax + ebx * 4]
    movq       xmm2, qword ptr [esi]  // row1 4 pairs
    movhps     xmm2, qword ptr [esi + ebx]
    movq       xmm3, qword ptr [esi + ebx * 2]
    movhps     xmm3, qword ptr [esi + edi]
    lea        esi,  [esi + ebx * 4]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88      // even pixels
    shufps     xmm2, xmm1, 0xdd      // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}

// Column scaling unfiltered. SSE2 version.
__declspec(naked)
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  __asm {
    push       edi
    push       esi
    mov        edi, [esp + 8 + 4]    // dst_argb
    mov        esi, [esp + 8 + 8]    // src_argb
    mov        ecx, [esp + 8 + 12]   // dst_width
    movd       xmm2, [esp + 8 + 16]  // x
    movd       xmm3, [esp + 8 + 20]  // dx

    pshufd     xmm2, xmm2, 0         // x0 x0 x0 x0
    pshufd     xmm0, xmm3, 0x11      // dx  0 dx  0
    paddd      xmm2, xmm0
    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 2
    pshufd     xmm0, xmm3, 0x05      // dx * 2, dx * 2, 0, 0
    paddd      xmm2, xmm0            // x3 x2 x1 x0
    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 4
    pshufd     xmm3, xmm3, 0         // dx * 4, dx * 4, dx * 4, dx * 4

    pextrw     eax, xmm2, 1          // get x0 integer.
    pextrw     edx, xmm2, 3          // get x1 integer.

    cmp        ecx, 0
    jle        xloop99
    sub        ecx, 4
    jl         xloop49

    // 4 Pixel loop.
 xloop4:
    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw     eax, xmm2, 5           // get x2 integer.
    pextrw     edx, xmm2, 7           // get x3 integer.
    paddd      xmm2, xmm3             // x += dx
    punpckldq  xmm0, xmm1             // x0 x1

    movd       xmm1, [esi + eax * 4]  // 1 source x2 pixels
    movd       xmm4, [esi + edx * 4]  // 1 source x3 pixels
    pextrw     eax, xmm2, 1           // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3           // get x1 integer. next iteration.
    punpckldq  xmm1, xmm4             // x2 x3
    punpcklqdq xmm0, xmm1             // x0 x1 x2 x3
    movdqu     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 4                 // 4 pixels
    jge        xloop4

 xloop49:
    test       ecx, 2
    je         xloop29

    // 2 pixels.
    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw     eax, xmm2, 5           // get x2 integer.
    punpckldq  xmm0, xmm1             // x0 x1

    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]

 xloop29:
    test       ecx, 1
    je         xloop99

    // 1 pixel.
    movd       xmm0, [esi + eax * 4]  // 1 source x2 pixels
    movd       dword ptr [edi], xmm0
 xloop99:

    pop        esi
    pop        edi
    ret
  }
}

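// Scalar equivalent (sketch): unfiltered column scaling is a gather of one
// 32-bit ARGB pixel per step of the 16.16 accumulator:
//   for (int i = 0; i < dst_width; ++i) {
//     dst[i] = src[x >> 16];  // dst/src viewed as uint32* (ARGB dwords)
//     x += dx;
//   }
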
// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
// TODO(fbarchard): Port to Neon

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

__declspec(naked)
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                               int dst_width, int x, int dx) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]    // dst_argb
    mov        esi, [esp + 8 + 8]    // src_argb
    mov        ecx, [esp + 8 + 12]   // dst_width
    movd       xmm2, [esp + 8 + 16]  // x
    movd       xmm3, [esp + 8 + 20]  // dx
    movdqa     xmm4, xmmword ptr kShuffleColARGB
    movdqa     xmm5, xmmword ptr kShuffleFractions
    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
    psrlw      xmm6, 9
    pextrw     eax, xmm2, 1         // get x0 integer. preroll
    sub        ecx, 2
    jl         xloop29

    movdqa     xmm0, xmm2           // x1 = x0 + dx
    paddd      xmm0, xmm3
    punpckldq  xmm2, xmm0           // x0 x1
    punpckldq  xmm3, xmm3           // dx dx
    paddd      xmm3, xmm3           // dx * 2, dx * 2
    pextrw     edx, xmm2, 3         // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa     xmm1, xmm2           // x0, x1 fractions.
    paddd      xmm2, xmm3           // x += dx
    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    psrlw      xmm1, 9              // 7 bit fractions.
    movhps     xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
    pshufb     xmm1, xmm5           // 0000000011111111
    pshufb     xmm0, xmm4           // arrange pixels into pairs
    pxor       xmm1, xmm6           // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm1           // argb_argb 16 bit, 2 pixels.
    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
    psrlw      xmm0, 7              // argb 8.7 fixed point to low 8 bits.
    packuswb   xmm0, xmm0           // argb_argb 8 bits, 2 pixels.
    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]
    sub        ecx, 2               // 2 pixels
    jge        xloop2

 xloop29:

    add        ecx, 2 - 1
    jl         xloop99

    // 1 pixel remainder
    psrlw      xmm2, 9              // 7 bit fractions.
    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    pshufb     xmm2, xmm5           // 00000000
    pshufb     xmm0, xmm4           // arrange pixels into pairs
    pxor       xmm2, xmm6           // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm2           // argb 16 bit, 1 pixel.
    psrlw      xmm0, 7
    packuswb   xmm0, xmm0           // argb 8 bits, 1 pixel.
    movd       [edi], xmm0

 xloop99:

    pop        edi
    pop        esi
    ret
  }
}

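// Note on the tables used above (sketch): kShuffleColARGB interleaves the
// two source pixels channel by channel (b0,b1, g0,g1, r0,r1, a0,a1) and
// kShuffleFractions broadcasts each pixel's 7-bit fraction across 8 bytes,
// so one pmaddubsw blends all four channels of both output pixels:
//   out_ch = (ch0 * (127 - f) + ch1 * f) >> 7   // per channel
// (unlike the luma version, no +1 is added to the inverted fraction here).
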
// Reads 4 pixels, duplicates them and writes 8 pixels.
__declspec(naked)
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                           int dst_width, int x, int dx) {
  __asm {
    mov        edx, [esp + 4]    // dst_argb
    mov        eax, [esp + 8]    // src_argb
    mov        ecx, [esp + 12]   // dst_width

  wloop:
    movdqu     xmm0, [eax]
    lea        eax,  [eax + 16]
    movdqa     xmm1, xmm0
    punpckldq  xmm0, xmm0
    punpckhdq  xmm1, xmm1
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         wloop

    ret
  }
}

// Divide num by div and return as 16.16 fixed point result.
__declspec(naked)
int FixedDiv_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]    // num
    cdq                          // extend num to 64 bits
    shld       edx, eax, 16      // 32.16
    shl        eax, 16
    idiv       dword ptr [esp + 8]
    ret
  }
}

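// Equivalent C (sketch): shld/shl form the 64-bit value num << 16 in
// edx:eax, which idiv then divides by div:
//   return (int)((((int64)num) << 16) / div);
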
// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
__declspec(naked)
int FixedDiv1_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]    // num
    mov        ecx, [esp + 8]    // denom
    cdq                          // extend num to 64 bits
    shld       edx, eax, 16      // 32.16
    shl        eax, 16
    sub        eax, 0x00010001
    sbb        edx, 0
    sub        ecx, 1
    idiv       ecx
    ret
  }
}
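
// Equivalent C (sketch), mirroring the subtract-then-divide sequence above:
//   return (int)(((((int64)num) << 16) - 0x00010001) / (div - 1));
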
#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif