vpx/vpx_scale/win32/scaleopt.c

/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */


/****************************************************************************
*
*   Module Title :     scaleopt.c
*
*   Description  :     Optimized scaling functions
*
****************************************************************************/
#include "pragmas.h"


/****************************************************************************
*  Module Statics
****************************************************************************/
/*
 * Fixed-point blend weights for the MMX scalers in this file.  Weights are
 * expressed as a fraction of 256 (51 ~ 1/5, 102 ~ 2/5, 154 ~ 3/5, 205 ~ 4/5):
 * the routines multiply zero-extended pixels by these words, add round_values
 * (128) and shift right by 8.  Each table is 8 bytes so that it can be loaded
 * into one MMX register with a single movq.
 */
__declspec(align(16)) const static unsigned short one_fifth[]  = { 51, 51, 51, 51 };
__declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102, 102 };
__declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 };
__declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 205, 205 };
/* Rounding term (half of 256) added before the >> 8. */
__declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };
/* Rounding term for the (a + b + 1) >> 1 average in the 1:2 scaler. */
__declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1};
/* Per-lane weights for the 4:5 horizontal scaler: const45_1 weights pixel i, const45_2 weights pixel i+1. */
__declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102,  51 };
__declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154, 205 };
/* Byte mask selecting byte 6 of a quadword; used by the 4:5 tail path to replicate the last pixel. */
__declspec(align(16)) const static unsigned char  mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0};
/* Per-lane weights for the 3:5 horizontal scaler (lanes multiply a,b,b,c by const35_1 and b,c,c,d by const35_2). */
__declspec(align(16)) const static unsigned short const35_2[] = { 154,  51, 205, 102 };
__declspec(align(16)) const static unsigned short const35_1[] = { 102, 205,  51, 154 };


#include "vpx_scale/vpxscale.h"
#include "vpx_mem/vpx_mem.h"

/****************************************************************************
 *
 *  ROUTINE       : horizontal_line_3_5_scale_mmx
 *
 *  INPUTS        : const unsigned char *source :
 *                  unsigned int source_width    :
 *                  unsigned char *dest         :
 *                  unsigned int dest_width      :
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 3 to 5 up-scaling of a horizontal line of pixels.
 *
 *  SPECIAL NOTES : None.
 *
 ****************************************************************************/
static
void horizontal_line_3_5_scale_mmx
(
  const unsigned char *source,
  unsigned int source_width,
  unsigned char *dest,
  unsigned int dest_width
) {
  /*
   * Expands every group of three source pixels (a, b, c) into five output
   * pixels, using d = the first pixel of the following group:
   *
   *   out0 = a
   *   out1 = (102*a + 154*b + 128) >> 8
   *   out2 = (205*b +  51*c + 128) >> 8
   *   out3 = ( 51*b + 205*c + 128) >> 8
   *   out4 = (154*c + 102*d + 128) >> 8
   *
   * The final group has no following pixel, so the exit path substitutes c
   * for d.  dest_width is not used.
   *
   * Notes grounded in the code below:
   *  - The loop body runs once before its bound is tested and the exit path
   *    always handles one more group, so at least two groups are processed
   *    whatever source_width is.
   *  - Each group is fetched with a 4-byte load, so the exit path reads one
   *    byte beyond the last group (the value is masked out, not used).
   *  - The loop bound is compared with jl, i.e. as signed values.
   *  - NOTE(review): no emms is issued here; presumably the caller restores
   *    the x87 state after the MMX work -- confirm.
   */
  (void) dest_width;

  __asm {

    push ebx

    mov         esi,    source
    mov         edi,    dest

    mov         ecx,    source_width
    lea         edx,    [esi+ecx-3];    // edx = start of the last 3-pixel group

    movq        mm5,    const35_1       // mm5 = 66 xx cd xx 33 xx 9a xx
    movq        mm6,    const35_2       // mm6 = 9a xx 33 xx cd xx 66 xx

    movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx
    pxor        mm7,    mm7             // mm7 = 0, used to zero-extend bytes to words

    horiz_line_3_5_loop:

    mov        eax,    DWORD PTR [esi] // eax = 00 01 02 03 (03 belongs to the next group)
    mov        ebx,    eax

    and         ebx,    0xffff00        // ebx = xx 01 02 xx
    mov         ecx,    eax             // ecx = 00 01 02 03

    and         eax,    0xffff0000      // eax = xx xx 02 03
    xor         ecx,    eax             // ecx = 00 01 xx xx

    shr         ebx,    8               // ebx = 01 02 xx xx
    or          eax,    ebx             // eax = 01 02 02 03

    shl         ebx,    16              // ebx = xx xx 01 02
    movd        mm1,    eax             // mm1 = 01 02 02 03 xx xx xx xx

    or          ebx,    ecx             // ebx = 00 01 01 02
    punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 03 xx

    movd        mm0,    ebx             // mm0 = 00 01 01 02
    pmullw      mm1,    mm6             // right-hand neighbours * const35_2

    punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
    pmullw      mm0,    mm5             // left-hand neighbours * const35_1

    mov         [edi],  ebx             // write out0 = 00 (bytes 1-3 are overwritten by the movd below)
    add         esi,    3               // next source group

    add         edi,    5               // next output group
    paddw       mm0,    mm1             // weighted sums

    paddw       mm0,    mm4             // + 128
    psrlw       mm0,    8               // >> 8

    cmp         esi,    edx             // flags consumed by the jl below (MMX ops leave them intact)
    packuswb    mm0,    mm7             // words -> bytes

    movd        DWORD Ptr [edi-4], mm0  // write out1..out4 of the group just processed
    jl          horiz_line_3_5_loop

// Exit: last group, pixel 02 stands in for the missing right-hand neighbour
    mov         eax,    DWORD PTR [esi] // eax = 00 01 02 03 (03 is past the last group and is discarded)
    mov         ebx,    eax

    and         ebx,    0xffff00        // ebx = xx 01 02 xx
    mov         ecx,    eax             // ecx = 00 01 02 03

    and         eax,    0xffff0000      // eax = xx xx 02 03
    xor         ecx,    eax             // ecx = 00 01 xx xx

    shr         ebx,    8               // ebx = 01 02 xx xx
    or          eax,    ebx             // eax = 01 02 02 03

    shl         eax,    8               // eax = xx 01 02 02
    and         eax,    0xffff0000      // eax = xx xx 02 02

    or          eax,    ebx             // eax = 01 02 02 02

    shl         ebx,    16              // ebx = xx xx 01 02
    movd        mm1,    eax             // mm1 = 01 02 02 02 xx xx xx xx

    or          ebx,    ecx             // ebx = 00 01 01 02
    punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 02 xx

    movd        mm0,    ebx             // mm0 = 00 01 01 02
    pmullw      mm1,    mm6             // right-hand neighbours * const35_2

    punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
    pmullw      mm0,    mm5             // left-hand neighbours * const35_1

    mov         [edi],  ebx             // write out0 = 00 (bytes 1-3 are overwritten by the movd below)
    paddw       mm0,    mm1             // weighted sums

    paddw       mm0,    mm4             // + 128
    psrlw       mm0,    8               // >> 8

    packuswb    mm0,    mm7             // words -> bytes
    movd        DWORD Ptr [edi+1], mm0  // write out1..out4

    pop ebx

  }

}


/****************************************************************************
 *
 *  ROUTINE       : horizontal_line_4_5_scale_mmx
 *
 *  INPUTS        : const unsigned char *source :
 *                  unsigned int source_width    :
 *                  unsigned char *dest         :
 *                  unsigned int dest_width      :
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 4 to 5 up-scaling of a horizontal line of pixels.
 *
 *  SPECIAL NOTES : None.
 *
 ****************************************************************************/
static
void horizontal_line_4_5_scale_mmx
(
  const unsigned char *source,
  unsigned int source_width,
  unsigned char *dest,
  unsigned int dest_width
) {
  /*
   * Expands every group of four source pixels (a, b, c, d) into five output
   * pixels, using e = the first pixel of the following group:
   *
   *   out0 = a
   *   out1 = ( 51*a + 205*b + 128) >> 8
   *   out2 = (102*b + 154*c + 128) >> 8
   *   out3 = (154*c + 102*d + 128) >> 8
   *   out4 = (205*d +  51*e + 128) >> 8
   *
   * Two groups (8 source pixels -> 10 output pixels) are handled per pass.
   * The exit path handles the final 8 pixels and replicates the last pixel
   * in place of the missing right-hand neighbour.  dest_width is not used.
   *
   * Notes grounded in the code below:
   *  - The loop body runs once before its bound is tested and the exit path
   *    always handles 8 more pixels, so at least 16 source pixels are
   *    processed whatever source_width is.
   *  - The loop reads 9 source bytes per pass ([esi] and [esi+1] quadwords).
   *  - The loop bound is compared with jl, i.e. as signed values.
   *  - NOTE(review): no emms is issued here; presumably the caller restores
   *    the x87 state after the MMX work -- confirm.
   */
  (void)dest_width;

  __asm {

    mov         esi,    source
    mov         edi,    dest

    mov         ecx,    source_width
    lea         edx,    [esi+ecx-8];    // edx = start of the last 8-pixel chunk

    movq        mm5,    const45_1       // mm5 = 33 xx 66 xx 9a xx cd xx
    movq        mm6,    const45_2       // mm6 = cd xx 9a xx 66 xx 33 xx

    movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx
    pxor        mm7,    mm7             // mm7 = 0, used to zero-extend bytes to words

    horiz_line_4_5_loop:

    movq        mm0,    QWORD PTR [esi]           // mm0 = 00 01 02 03 04 05 06 07
    movq        mm1,    QWORD PTR [esi+1];        // mm1 = 01 02 03 04 05 06 07 08

    movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
    movq        mm3,    mm1             // mm3 = 01 02 03 04 05 06 07 08

    movd        DWORD PTR [edi],  mm0             // write output 0 = src 00 (bytes 1-3 are overwritten below)
    punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx

    punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
    pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205

    pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
    punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx

    movd        DWORD PTR [edi+5], mm2            // write output 5 = src 04 (bytes 6-8 are overwritten below)
    pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205

    punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 08 xx
    pmullw      mm3,    mm6             // 05*205 06*154 07*102 08* 51

    paddw       mm0,    mm1             // weighted sums for the first group
    paddw       mm0,    mm4             // added round values

    psrlw       mm0,    8               // >> 8: outputs 1..4 as words
    packuswb    mm0,    mm7             // words -> bytes

    movd        DWORD PTR [edi+1], mm0  // write outputs 1 2 3 4
    add         edi,    10              // next 10 output pixels

    add         esi,    8               // next 8 source pixels
    paddw       mm2,    mm3             // weighted sums for the second group

    paddw       mm2,    mm4             // added round values
    cmp         esi,    edx             // flags consumed by the jl below (MMX ops leave them intact)

    psrlw       mm2,    8               // >> 8: outputs 6..9 as words
    packuswb    mm2,    mm7             // words -> bytes

    movd        DWORD PTR [edi-4], mm2 // write outputs 6 7 8 9 of the chunk just processed
    jl         horiz_line_4_5_loop

// Exit: last 8 pixels, pixel 07 stands in for the missing right-hand neighbour
    movq        mm0,    [esi]           // mm0 = 00 01 02 03 04 05 06 07
    movq        mm1,    mm0             // mm1 = 00 01 02 03 04 05 06 07

    movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
    psrlq       mm1,    8               // mm1 = 01 02 03 04 05 06 07 00

    movq        mm3,    mask45          // mm3 = 00 00 00 00 00 00 ff 00
    pand        mm3,    mm1             // mm3 = 00 00 00 00 00 00 07 00

    psllq       mm3,    8               // mm3 = 00 00 00 00 00 00 00 07
    por         mm1,    mm3             // mm1 = 01 02 03 04 05 06 07 07

    movq        mm3,    mm1             // mm3 = 01 02 03 04 05 06 07 07

    movd        DWORD PTR [edi],  mm0   // write output 0 = src 00 (bytes 1-3 are overwritten below)
    punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx

    punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
    pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205

    pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
    punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx

    movd        DWORD PTR [edi+5], mm2  // write output 5 = src 04 (bytes 6-8 are overwritten below)
    pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205

    punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 07 xx
    pmullw      mm3,    mm6             // 05*205 06*154 07*102 07* 51

    paddw       mm0,    mm1             // weighted sums for the first group
    paddw       mm0,    mm4             // added round values

    psrlw       mm0,    8               // >> 8: outputs 1..4 as words
    packuswb    mm0,    mm7             // outputs 1 2 3 4 as bytes

    movd        DWORD PTR [edi+1], mm0  // write outputs 1 2 3 4
    paddw       mm2,    mm3             // weighted sums for the second group

    paddw       mm2,    mm4             // added round values
    psrlw       mm2,    8               // >> 8: outputs 6..9 as words

    packuswb    mm2,    mm7             // words -> bytes
    movd        DWORD PTR [edi+6], mm2  // write outputs 6 7 8 9


  }
}

/****************************************************************************
 *
 *  ROUTINE       : vertical_band_4_5_scale_mmx
 *
 *  INPUTS        : unsigned char *dest    :
 *                  unsigned int dest_pitch :
 *                  unsigned int dest_width :
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 4 to 5 up-scaling of a 4 pixel high band of pixels.
 *
 *  SPECIAL NOTES : The routine uses the first line of the band below
 *                  the current band. The function also has a "C" only
 *                  version.
 *
 ****************************************************************************/
static
void vertical_band_4_5_scale_mmx
(
  unsigned char *dest,
  unsigned int dest_pitch,
  unsigned int dest_width
) {
  /*
   * In-place vertical 4:5 interpolation, 8 columns per pass.  On entry the
   * four source lines a, b, c, d are read from rows 0, 1, 2, 3 of dest and
   * the first line of the next band ("an") from row 5.  Rows 1..4 are then
   * overwritten with:
   *
   *   row 1 = ( 51*a + 205*b  + 128) >> 8
   *   row 2 = (102*b + 154*c  + 128) >> 8
   *   row 3 = (154*c + 102*d  + 128) >> 8
   *   row 4 = (205*d +  51*an + 128) >> 8
   *
   * Row 0 is left untouched.  Each source row is loaded before the row that
   * replaces it is stored, which is why loads and stores alternate below.
   *
   * The loop counter is tested after the body (sub/jg), so 8 columns are
   * always processed at least once and the width is effectively rounded up
   * to a multiple of 8.
   * NOTE(review): no emms is issued here; presumably the caller restores the
   * x87 state after the MMX work -- confirm.
   */
  __asm {

    mov         esi,    dest                    // Get the source and destination pointer
    mov         ecx,    dest_pitch               // Get the pitch size

    lea         edi,    [esi+ecx*2]             // two lines below
    add         edi,    ecx                     // three lines below

    pxor        mm7,    mm7                     // clear out mm7 (zero for unpacking)
    mov         edx,    dest_width               // Loop counter

    vs_4_5_loop:

    movq        mm0,    QWORD ptr [esi]         // src[0];
    movq        mm1,    QWORD ptr [esi+ecx]     // src[1];

    movq        mm2,    mm0                     // Make a copy
    punpcklbw   mm0,    mm7                     // unpack low to word

    movq        mm5,    one_fifth
    punpckhbw   mm2,    mm7                     // unpack high to word

    pmullw      mm0,    mm5                     // a * 1/5

    movq        mm3,    mm1                     // make a copy
    punpcklbw   mm1,    mm7                     // unpack low to word

    pmullw      mm2,    mm5                     // a * 1/5
    movq        mm6,    four_fifths               // constant 4/5

    movq        mm4,    mm1                     // copy of low b
    pmullw      mm4,    mm6                     // b * 4/5

    punpckhbw   mm3,    mm7                     // unpack high to word
    movq        mm5,    mm3                     // copy of high b

    pmullw      mm5,    mm6                     // b * 4/5
    paddw       mm0,    mm4                     // a * 1/5 + b * 4/5

    paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
    paddw       mm0,    round_values             // + 128

    paddw       mm2,    round_values             // + 128
    psrlw       mm0,    8

    psrlw       mm2,    8
    packuswb    mm0,    mm2                     // des [1]

    movq        QWORD ptr [esi+ecx], mm0        // write des[1]
    movq        mm0,    [esi+ecx*2]             // mm0 = src[2]

    // mm1, mm3 --- Src[1] (unpacked low/high)
    // mm0 --- Src[2]
    // mm7 for unpacking

    movq        mm5,    two_fifths
    movq        mm2,    mm0                     // make a copy

    pmullw      mm1,    mm5                     // b * 2/5
    movq        mm6,    three_fifths


    punpcklbw   mm0,    mm7                     // unpack low to word
    pmullw      mm3,    mm5                     // b * 2/5

    movq        mm4,    mm0                     // make copy of c
    punpckhbw   mm2,    mm7                     // unpack high to word

    pmullw      mm4,    mm6                     // c * 3/5
    movq        mm5,    mm2

    pmullw      mm5,    mm6                     // c * 3/5
    paddw       mm1,    mm4                     // b * 2/5 + c * 3/5

    paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
    paddw       mm1,    round_values             // + 128

    paddw       mm3,    round_values             // + 128
    psrlw       mm1,    8

    psrlw       mm3,    8
    packuswb    mm1,    mm3                     // des[2]

    movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
    movq        mm1,    [edi]                   // mm1=Src[3];

    // mm0, mm2 --- Src[2] (unpacked low/high)
    // mm1 --- Src[3]
    // mm6 --- 3/5
    // mm7 for unpacking

    pmullw      mm0,    mm6                     // c * 3/5
    movq        mm5,    two_fifths               // mm5 = 2/5

    movq        mm3,    mm1                     // make a copy
    pmullw      mm2,    mm6                     // c * 3/5

    punpcklbw   mm1,    mm7                     // unpack low
    movq        mm4,    mm1                     // make a copy

    punpckhbw   mm3,    mm7                     // unpack high
    pmullw      mm4,    mm5                     // d * 2/5

    movq        mm6,    mm3                     // make a copy
    pmullw      mm6,    mm5                     // d * 2/5

    paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
    paddw       mm2,    mm6                     // c * 3/5 + d * 2/5

    paddw       mm0,    round_values             // + 128
    paddw       mm2,    round_values             // + 128

    psrlw       mm0,    8
    psrlw       mm2,    8

    packuswb    mm0,    mm2                     // des[3]
    movq        QWORD ptr [edi], mm0            // write des[3]

    //  mm1, mm3 --- Src[3] (unpacked low/high)
    //  mm7 -- cleared for unpacking

    movq        mm0,    [edi+ecx*2]             // mm0, Src[0] of the next group (row 5)

    movq        mm5,    four_fifths              // mm5 = 4/5
    pmullw      mm1,    mm5                     // d * 4/5

    movq        mm6,    one_fifth                // mm6 = 1/5
    movq        mm2,    mm0                     // make a copy

    pmullw      mm3,    mm5                     // d * 4/5
    punpcklbw   mm0,    mm7                     // unpack low

    pmullw      mm0,    mm6                     // an * 1/5
    punpckhbw   mm2,    mm7                     // unpack high

    paddw       mm1,    mm0                     // d * 4/5 + an * 1/5
    pmullw      mm2,    mm6                     // an * 1/5

    paddw       mm3,    mm2                     // d * 4/5 + an * 1/5
    paddw       mm1,    round_values             // + 128

    paddw       mm3,    round_values             // + 128
    psrlw       mm1,    8

    psrlw       mm3,    8
    packuswb    mm1,    mm3                     // des[4]

    movq        QWORD ptr [edi+ecx], mm1        // write des[4] (row 4)

    add         edi,    8                       // next 8 columns
    add         esi,    8

    sub         edx,    8                       // 8 columns done
    jg         vs_4_5_loop
  }
}

/****************************************************************************
 *
 *  ROUTINE       : last_vertical_band_4_5_scale_mmx
 *
 *  INPUTS        : unsigned char *dest    :
 *                  unsigned int dest_pitch :
 *                  unsigned int dest_width :
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : None
 *
 *  FUNCTION      : 4 to 5 up-scaling of the last 4-pixel high band in an image.
 *
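 *  SPECIAL NOTES : Unlike vertical_band_4_5_scale_mmx, this routine does not
 *                  read a line from the band below the current band; the
 *                  fifth output line is a copy of the fourth source line.
 *                  The function also has a "C" only version.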
 *
 ****************************************************************************/
static
void last_vertical_band_4_5_scale_mmx
(
  unsigned char *dest,
  unsigned int dest_pitch,
  unsigned int dest_width
) {
  /*
   * In-place vertical 4:5 interpolation for a band with no band below it,
   * 8 columns per pass.  The four source lines a, b, c, d are read from
   * rows 0, 1, 2, 3 of dest; rows 1..4 are then overwritten with:
   *
   *   row 1 = ( 51*a + 205*b + 128) >> 8
   *   row 2 = (102*b + 154*c + 128) >> 8
   *   row 3 = (154*c + 102*d + 128) >> 8
   *   row 4 = d                          (plain copy, stored before row 3
   *                                       is overwritten)
   *
   * Row 0 is left untouched.
   *
   * The loop counter is tested after the body (sub/jg), so 8 columns are
   * always processed at least once and the width is effectively rounded up
   * to a multiple of 8.
   * NOTE(review): no emms is issued here; presumably the caller restores the
   * x87 state after the MMX work -- confirm.
   */
  __asm {
    mov         esi,    dest                    // Get the source and destination pointer
    mov         ecx,    dest_pitch               // Get the pitch size

    lea         edi,    [esi+ecx*2]             // two lines below
    add         edi,    ecx                     // three lines below

    pxor        mm7,    mm7                     // clear out mm7 (zero for unpacking)
    mov         edx,    dest_width               // Loop counter

    last_vs_4_5_loop:

    movq        mm0,    QWORD ptr [esi]         // src[0];
    movq        mm1,    QWORD ptr [esi+ecx]     // src[1];

    movq        mm2,    mm0                     // Make a copy
    punpcklbw   mm0,    mm7                     // unpack low to word

    movq        mm5,    one_fifth
    punpckhbw   mm2,    mm7                     // unpack high to word

    pmullw      mm0,    mm5                     // a * 1/5

    movq        mm3,    mm1                     // make a copy
    punpcklbw   mm1,    mm7                     // unpack low to word

    pmullw      mm2,    mm5                     // a * 1/5
    movq        mm6,    four_fifths               // constant 4/5

    movq        mm4,    mm1                     // copy of low b
    pmullw      mm4,    mm6                     // b * 4/5

    punpckhbw   mm3,    mm7                     // unpack high to word
    movq        mm5,    mm3                     // copy of high b

    pmullw      mm5,    mm6                     // b * 4/5
    paddw       mm0,    mm4                     // a * 1/5 + b * 4/5

    paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
    paddw       mm0,    round_values             // + 128

    paddw       mm2,    round_values             // + 128
    psrlw       mm0,    8

    psrlw       mm2,    8
    packuswb    mm0,    mm2                     // des [1]

    movq        QWORD ptr [esi+ecx], mm0        // write des[1]
    movq        mm0,    [esi+ecx*2]             // mm0 = src[2]

    // mm1, mm3 --- Src[1] (unpacked low/high)
    // mm0 --- Src[2]
    // mm7 for unpacking

    movq        mm5,    two_fifths
    movq        mm2,    mm0                     // make a copy

    pmullw      mm1,    mm5                     // b * 2/5
    movq        mm6,    three_fifths


    punpcklbw   mm0,    mm7                     // unpack low to word
    pmullw      mm3,    mm5                     // b * 2/5

    movq        mm4,    mm0                     // make copy of c
    punpckhbw   mm2,    mm7                     // unpack high to word

    pmullw      mm4,    mm6                     // c * 3/5
    movq        mm5,    mm2

    pmullw      mm5,    mm6                     // c * 3/5
    paddw       mm1,    mm4                     // b * 2/5 + c * 3/5

    paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
    paddw       mm1,    round_values             // + 128

    paddw       mm3,    round_values             // + 128
    psrlw       mm1,    8

    psrlw       mm3,    8
    packuswb    mm1,    mm3                     // des[2]

    movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
    movq        mm1,    [edi]                   // mm1=Src[3];

    movq        QWORD ptr [edi+ecx], mm1        // write des[4] = unmodified copy of Src[3]

    // mm0, mm2 --- Src[2] (unpacked low/high)
    // mm1 --- Src[3]
    // mm6 --- 3/5
    // mm7 for unpacking

    pmullw      mm0,    mm6                     // c * 3/5
    movq        mm5,    two_fifths               // mm5 = 2/5

    movq        mm3,    mm1                     // make a copy
    pmullw      mm2,    mm6                     // c * 3/5

    punpcklbw   mm1,    mm7                     // unpack low
    movq        mm4,    mm1                     // make a copy

    punpckhbw   mm3,    mm7                     // unpack high
    pmullw      mm4,    mm5                     // d * 2/5

    movq        mm6,    mm3                     // make a copy
    pmullw      mm6,    mm5                     // d * 2/5

    paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
    paddw       mm2,    mm6                     // c * 3/5 + d * 2/5

    paddw       mm0,    round_values             // + 128
    paddw       mm2,    round_values             // + 128

    psrlw       mm0,    8
    psrlw       mm2,    8

    packuswb    mm0,    mm2                     // des[3]
    movq        QWORD ptr [edi], mm0            // write des[3]

    //  mm1, mm3 --- Src[3] (unpacked low/high)
    //  mm7 -- still zero for the next pass
    add         edi,    8                       // next 8 columns
    add         esi,    8

    sub         edx,    8                       // 8 columns done
    jg          last_vs_4_5_loop
  }
}

/****************************************************************************
 *
 *  ROUTINE       : vertical_band_3_5_scale_mmx
 *
 *  INPUTS        : unsigned char *dest    :
 *                  unsigned int dest_pitch :
 *                  unsigned int dest_width :
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 3 to 5 up-scaling of a 3-pixel high band of pixels.
 *
 *  SPECIAL NOTES : The routine uses the first line of the band below
 *                  the current band. The function also has an "C" only
 *                  version.
 *
 ****************************************************************************/
static
void vertical_band_3_5_scale_mmx
(
  unsigned char *dest,
  unsigned int dest_pitch,
  unsigned int dest_width
) {
  /*
   * In-place vertical 3:5 interpolation, 8 columns per pass.  On entry the
   * three source lines a, b, c are read from rows 0, 1, 2 of dest and the
   * first line of the next band ("an") from row 5.  Rows 1..4 are then
   * overwritten with:
   *
   *   row 1 = (102*a + 154*b  + 128) >> 8
   *   row 2 = (205*b +  51*c  + 128) >> 8
   *   row 3 = ( 51*b + 205*c  + 128) >> 8
   *   row 4 = (154*c + 102*an + 128) >> 8
   *
   * Row 0 is left untouched.
   *
   * mm7 is normally the zero register for unpacking, but the middle section
   * borrows it as scratch; it is cleared again before the last unpack, which
   * also leaves it zero for the next pass of the loop.
   *
   * The loop counter is tested after the body (sub/jg), so 8 columns are
   * always processed at least once and the width is effectively rounded up
   * to a multiple of 8.
   * NOTE(review): no emms is issued here; presumably the caller restores the
   * x87 state after the MMX work -- confirm.
   */
  __asm {
    mov         esi,    dest                    // Get the source and destination pointer
    mov         ecx,    dest_pitch               // Get the pitch size

    lea         edi,    [esi+ecx*2]             // two lines below
    add         edi,    ecx                     // three lines below

    pxor        mm7,    mm7                     // clear out mm7 (zero for unpacking)
    mov         edx,    dest_width               // Loop counter

    vs_3_5_loop:

    movq        mm0,    QWORD ptr [esi]         // src[0];
    movq        mm1,    QWORD ptr [esi+ecx]     // src[1];

    movq        mm2,    mm0                     // Make a copy
    punpcklbw   mm0,    mm7                     // unpack low to word

    movq        mm5,    two_fifths               // mm5 = 2/5
    punpckhbw   mm2,    mm7                     // unpack high to word

    pmullw      mm0,    mm5                     // a * 2/5

    movq        mm3,    mm1                     // make a copy
    punpcklbw   mm1,    mm7                     // unpack low to word

    pmullw      mm2,    mm5                     // a * 2/5
    movq        mm6,    three_fifths             // mm6 = 3/5

    movq        mm4,    mm1                     // copy of low b
    pmullw      mm4,    mm6                     // b * 3/5

    punpckhbw   mm3,    mm7                     // unpack high to word
    movq        mm5,    mm3                     // copy of high b

    pmullw      mm5,    mm6                     // b * 3/5
    paddw       mm0,    mm4                     // a * 2/5 + b * 3/5

    paddw       mm2,    mm5                     // a * 2/5 + b * 3/5
    paddw       mm0,    round_values             // + 128

    paddw       mm2,    round_values             // + 128
    psrlw       mm0,    8

    psrlw       mm2,    8
    packuswb    mm0,    mm2                     // des [1]

    movq        QWORD ptr [esi+ecx], mm0        // write des[1]
    movq        mm0,    [esi+ecx*2]             // mm0 = src[2]

    // mm1, mm3 --- Src[1] (unpacked low/high)
    // mm0 --- Src[2]
    // mm7 for unpacking

    movq        mm4,    mm1                     // b low
    pmullw      mm1,    four_fifths              // b * 4/5 low

    movq        mm5,    mm3                     // b high
    pmullw      mm3,    four_fifths              // b * 4/5 high

    movq        mm2,    mm0                     // c
    pmullw      mm4,    one_fifth                // b * 1/5

    punpcklbw   mm0,    mm7                     // c low
    pmullw      mm5,    one_fifth                // b * 1/5

    movq        mm6,    mm0                     // make copy of c low
    punpckhbw   mm2,    mm7                     // c high (last use of mm7 as zero in this section)

    pmullw      mm6,    one_fifth                // c * 1/5 low
    movq        mm7,    mm2                     // make copy of c high (mm7 now scratch, no longer zero)

    pmullw      mm7,    one_fifth                // c * 1/5 high
    paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low

    paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
    movq        mm6,    mm0                     // make copy of c low

    pmullw      mm6,    four_fifths              // c * 4/5 low
    movq        mm7,    mm2                     // make copy of c high

    pmullw      mm7,    four_fifths              // c * 4/5 high

    paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
    paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high

    paddw       mm1,    round_values             // + 128
    paddw       mm3,    round_values             // + 128

    psrlw       mm1,    8
    psrlw       mm3,    8

    packuswb    mm1,    mm3                     // des[2]
    movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]

    paddw       mm4,    round_values             // + 128
    paddw       mm5,    round_values             // + 128

    psrlw       mm4,    8
    psrlw       mm5,    8

    packuswb    mm4,    mm5                     // des[3]
    movq        QWORD ptr [edi], mm4            // write des[3]

    //  mm0, mm2 --- Src[2] (c, unpacked low/high)

    pxor        mm7,    mm7                     // clear mm7 for unpacking (it was used as scratch above)
    movq        mm1,    [edi+ecx*2]             // mm1 = Src[0] of the next group (row 5)

    movq        mm5,    three_fifths             // mm5 = 3/5
    pmullw      mm0,    mm5                     // c * 3/5

    movq        mm6,    two_fifths                // mm6 = 2/5
    movq        mm3,    mm1                     // make a copy

    pmullw      mm2,    mm5                     // c * 3/5
    punpcklbw   mm1,    mm7                     // unpack low

    pmullw      mm1,    mm6                     // an * 2/5
    punpckhbw   mm3,    mm7                     // unpack high

    paddw       mm0,    mm1                     // c * 3/5 + an * 2/5
    pmullw      mm3,    mm6                     // an * 2/5

    paddw       mm2,    mm3                     // c * 3/5 + an * 2/5
    paddw       mm0,    round_values             // + 128

    paddw       mm2,    round_values             // + 128
    psrlw       mm0,    8

    psrlw       mm2,    8
    packuswb    mm0,    mm2                     // des[4]

    movq        QWORD ptr [edi+ecx], mm0        // write des[4] (row 4)

    add         edi,    8                       // next 8 columns
    add         esi,    8

    sub         edx,    8                       // 8 columns done
    jg          vs_3_5_loop
  }
}

/****************************************************************************
 *
 *  ROUTINE       : last_vertical_band_3_5_scale_mmx
 *
 *  INPUTS        : unsigned char *dest    :
 *                  unsigned int dest_pitch :
 *                  unsigned int dest_width :
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
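 *  FUNCTION      : 3 to 5 up-scaling of the last 3-pixel high band in an image.
 *
 *  SPECIAL NOTES : Unlike vertical_band_3_5_scale_mmx, this routine does not
 *                  read a line from the band below the current band; the
 *                  fifth output line is a copy of the third source line.
 *                  The function also has a "C" only version.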
 *
 ****************************************************************************/
static
void last_vertical_band_3_5_scale_mmx
(
  unsigned char *dest,
  unsigned int dest_pitch,
  unsigned int dest_width
) {
  /*
   * In-place vertical 3:5 interpolation for a band with no band below it,
   * 8 columns per pass.  The three source lines a, b, c are read from rows
   * 0, 1, 2 of dest; rows 1..4 are then overwritten with:
   *
   *   row 1 = (102*a + 154*b + 128) >> 8
   *   row 2 = (205*b +  51*c + 128) >> 8
   *   row 3 = ( 51*b + 205*c + 128) >> 8
   *   row 4 = c                          (plain copy)
   *
   * Row 0 is left untouched.
   *
   * mm7 is the zero register used to widen bytes to words, but the middle
   * section borrows it as scratch.  It must therefore be cleared again
   * before the loop repeats; without that, every 8-column group after the
   * first would be unpacked against leftover products instead of zero.
   *
   * The loop counter is tested after the body (sub/jg), so 8 columns are
   * always processed at least once and the width is effectively rounded up
   * to a multiple of 8.
   * NOTE(review): no emms is issued here; presumably the caller restores the
   * x87 state after the MMX work -- confirm.
   */
  __asm {
    mov         esi,    dest                    // Get the source and destination pointer
    mov         ecx,    dest_pitch               // Get the pitch size

    lea         edi,    [esi+ecx*2]             // two lines below
    add         edi,    ecx                     // three lines below

    pxor        mm7,    mm7                     // clear out mm7 (zero for unpacking)
    mov         edx,    dest_width               // Loop counter


    last_vs_3_5_loop:

    movq        mm0,    QWORD ptr [esi]         // src[0];
    movq        mm1,    QWORD ptr [esi+ecx]     // src[1];

    movq        mm2,    mm0                     // Make a copy
    punpcklbw   mm0,    mm7                     // unpack low to word

    movq        mm5,    two_fifths               // mm5 = 2/5
    punpckhbw   mm2,    mm7                     // unpack high to word

    pmullw      mm0,    mm5                     // a * 2/5

    movq        mm3,    mm1                     // make a copy
    punpcklbw   mm1,    mm7                     // unpack low to word

    pmullw      mm2,    mm5                     // a * 2/5
    movq        mm6,    three_fifths             // mm6 = 3/5

    movq        mm4,    mm1                     // copy of low b
    pmullw      mm4,    mm6                     // b * 3/5

    punpckhbw   mm3,    mm7                     // unpack high to word
    movq        mm5,    mm3                     // copy of high b

    pmullw      mm5,    mm6                     // b * 3/5
    paddw       mm0,    mm4                     // a * 2/5 + b * 3/5

    paddw       mm2,    mm5                     // a * 2/5 + b * 3/5
    paddw       mm0,    round_values             // + 128

    paddw       mm2,    round_values             // + 128
    psrlw       mm0,    8

    psrlw       mm2,    8
    packuswb    mm0,    mm2                     // des [1]

    movq        QWORD ptr [esi+ecx], mm0        // write des[1]
    movq        mm0,    [esi+ecx*2]             // mm0 = src[2]


    // mm1, mm3 --- Src[1] (unpacked low/high)
    // mm0 --- Src[2]
    // mm7 for unpacking

    movq        mm4,    mm1                     // b low
    pmullw      mm1,    four_fifths              // b * 4/5 low

    movq        QWORD ptr [edi+ecx], mm0        // write des[4] = unmodified copy of Src[2]

    movq        mm5,    mm3                     // b high
    pmullw      mm3,    four_fifths              // b * 4/5 high

    movq        mm2,    mm0                     // c
    pmullw      mm4,    one_fifth                // b * 1/5

    punpcklbw   mm0,    mm7                     // c low
    pmullw      mm5,    one_fifth                // b * 1/5

    movq        mm6,    mm0                     // make copy of c low
    punpckhbw   mm2,    mm7                     // c high (last use of mm7 as zero in this pass)

    pmullw      mm6,    one_fifth                // c * 1/5 low
    movq        mm7,    mm2                     // make copy of c high (mm7 now scratch, no longer zero)

    pmullw      mm7,    one_fifth                // c * 1/5 high
    paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low

    paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
    movq        mm6,    mm0                     // make copy of c low

    pmullw      mm6,    four_fifths              // c * 4/5 low
    movq        mm7,    mm2                     // make copy of c high

    pmullw      mm7,    four_fifths              // c * 4/5 high

    paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
    paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high

    paddw       mm1,    round_values             // + 128
    paddw       mm3,    round_values             // + 128

    psrlw       mm1,    8
    psrlw       mm3,    8

    packuswb    mm1,    mm3                     // des[2]
    movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]

    paddw       mm4,    round_values             // + 128
    paddw       mm5,    round_values             // + 128

    psrlw       mm4,    8
    psrlw       mm5,    8

    packuswb    mm4,    mm5                     // des[3]
    movq        QWORD ptr [edi], mm4            // write des[3]

    //  mm0, mm2 --- Src[2] (c, unpacked low/high)

    pxor        mm7,    mm7                     // restore mm7 = 0 so the next pass unpacks correctly

    add         edi,    8                       // next 8 columns
    add         esi,    8

    sub         edx,    8                       // 8 columns done
    jg          last_vs_3_5_loop
  }
}

/****************************************************************************
 *
 *  ROUTINE       : vertical_band_1_2_scale_mmx
 *
 *  INPUTS        : unsigned char *dest    :
 *                  unsigned int dest_pitch :
 *                  unsigned int dest_width :
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 1 to 2 up-scaling of a band of pixels.
 *
 *  SPECIAL NOTES : The routine uses the first line of the band below
 *                  the current band. The function also has an "C" only
 *                  version.
 *
 ****************************************************************************/
static
void vertical_band_1_2_scale_mmx
(
  unsigned char *dest,
  unsigned int dest_pitch,
  unsigned int dest_width
) {
  /*
   * Fills the row between two existing rows with their rounded average,
   * 8 columns per pass: row 1 = (row 0 + row 2 + 1) >> 1, where the rows
   * are dest, dest + dest_pitch and dest + 2 * dest_pitch.
   *
   * The column counter is tested after the body, so 8 columns are always
   * processed at least once.
   */
  __asm {

    mov         edx,    dest_width               // columns still to do
    mov         ecx,    dest_pitch               // bytes per row
    mov         esi,    dest                    // row 0 of the current column group

    pxor        mm7,    mm7                     // zero, for widening bytes to words
    movq        mm6,    four_ones                // rounding term: 1, 1, 1, 1

    average_next_8_columns:

    // upper row -> words
    movq        mm0,    [esi]                   // row 0
    movq        mm2,    mm0
    punpcklbw   mm0,    mm7                     // row 0, columns 0..3
    punpckhbw   mm2,    mm7                     // row 0, columns 4..7

    // lower row -> words
    movq        mm1,    [esi + ecx * 2]         // row 2
    movq        mm3,    mm1
    punpcklbw   mm1,    mm7                     // row 2, columns 0..3
    punpckhbw   mm3,    mm7                     // row 2, columns 4..7

    // (a + b + 1) >> 1
    paddw       mm0,    mm1
    paddw       mm2,    mm3
    paddw       mm0,    mm6
    paddw       mm2,    mm6
    psraw       mm0,    1
    psraw       mm2,    1

    // words -> bytes, store the in-between row
    packuswb    mm0,    mm2
    movq        [esi+ecx], mm0

    add         esi,    8
    sub         edx,    8                       // must stay last: jg tests these flags
    jg          average_next_8_columns
  }

}

/****************************************************************************
 *
 *  ROUTINE       : last_vertical_band_1_2_scale_mmx
 *
 *  INPUTS        : unsigned char *dest    : Address of the row to duplicate.
 *                  unsigned int dest_pitch : Byte offset from one row to the
 *                                            next.
 *                  unsigned int dest_width : Number of bytes per row to
 *                                            process.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 1 to 2 up-scaling of the last band of pixels: the row at
 *                  dest is copied unchanged into the row at
 *                  dest + dest_pitch.
 *
 *  SPECIAL NOTES : Unlike vertical_band_1_2_scale_mmx this routine does not
 *                  read any row below the current one; it only reads the
 *                  row at dest. Eight bytes are copied per iteration and the
 *                  loop body always runs at least once. The function also
 *                  has a "C" only version.
 *
 ****************************************************************************/
static
void last_vertical_band_1_2_scale_mmx
(
  unsigned char *dest,
  unsigned int dest_pitch,
  unsigned int dest_width
) {
  __asm {
    mov         esi,    dest                    // esi walks along the row being duplicated
    mov         ecx,    dest_pitch               // Get the pitch size

    mov         edx,    dest_width               // Loop counter (bytes still to do)

    last_vs_1_2_loop:

    movq        mm0,    [esi]                   // load eight bytes of the row
    movq        [esi+ecx], mm0                  // store them one pitch further down

    add         esi,    8
    sub         edx,    8                       // sub sets the flags tested by jg

    jg         last_vs_1_2_loop
  }
}

/****************************************************************************
 *
 *  ROUTINE       : horizontal_line_1_2_scale_mmx
 *
 *  INPUTS        : const unsigned char *source : First pixel of the source line.
 *                  unsigned int source_width    : Number of source pixels.
 *                  unsigned char *dest         : First pixel of the output line.
 *                  unsigned int dest_width      : Not used.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 1 to 2 up-scaling of a horizontal line of pixels. Each
 *                  source pixel s[i] produces two output pixels: s[i]
 *                  itself followed by (s[i] + s[i+1] + 1) >> 1. For the
 *                  final pixel of the line the right-hand neighbour is the
 *                  pixel itself.
 *
 *  SPECIAL NOTES : Pixels are handled in groups of eight: the main loop
 *                  body always runs once and repeats while more than eight
 *                  pixels remain, then a separate tail handles the final
 *                  eight. NOTE(review): this appears to require
 *                  source_width to be a multiple of 8 and at least 16 -
 *                  confirm against callers.
 *
 ****************************************************************************/
static
void horizontal_line_1_2_scale_mmx
(
  const unsigned char *source,
  unsigned int source_width,
  unsigned char *dest,
  unsigned int dest_width
) {
  (void) dest_width;

  __asm {
    mov         esi,    source
    mov         edi,    dest

    pxor        mm7,    mm7                     // mm7 = 0, used to widen bytes to words
    movq        mm6,    four_ones               // rounding term: 1 in every word

    mov         ecx,    source_width            // source pixels still to process

    hs_1_2_loop:

    movq        mm0,    [esi]                   // a = s[0..7]
    movq        mm1,    [esi+1]                 // b = s[1..8], the right-hand neighbours

    movq        mm2,    mm0                     // copy of a for the high half
    movq        mm3,    mm1                     // copy of b for the high half

    movq        mm4,    mm0                     // keep the original bytes for interleaving
    punpcklbw   mm0,    mm7                     // low four bytes of a as words

    punpcklbw   mm1,    mm7                     // low four bytes of b as words
    paddw       mm0,    mm1                     // low (a + b)

    paddw       mm0,    mm6                     // low (a + b + 1)
    punpckhbw   mm2,    mm7                     // high four bytes of a as words

    punpckhbw   mm3,    mm7                     // high four bytes of b as words
    paddw       mm2,    mm3                     // high (a + b)

    paddw       mm2,    mm6                     // high (a + b + 1)
    psraw       mm0,    1                       // low (a + b + 1) / 2

    psraw       mm2,    1                       // high (a + b + 1) / 2
    packuswb    mm0,    mm2                     // eight averaged bytes

    movq        mm2,    mm4                     // original bytes again
    punpcklbw   mm2,    mm0                     // interleave: s0 avg0 s1 avg1 s2 avg2 s3 avg3

    movq        [edi],  mm2                     // first eight output bytes
    punpckhbw   mm4,    mm0                     // interleave: s4 avg4 ... s7 avg7

    movq        [edi+8], mm4                    // second eight output bytes
    add         esi,    8

    add         edi,    16
    sub         ecx,    8

    cmp         ecx,    8                       // keep looping while more than eight pixels remain
    jg          hs_1_2_loop

// last eight pixels: no byte is read beyond s[7]; the neighbour of s[7] is s[7]

    movq        mm0,    [esi]                   // a = s[0..7]
    movq        mm1,    mm0

    movq        mm2,    mm0                     // copy of a for the high half
    movq        mm3,    mm1

    psrlq       mm1,    8                       // s[1..7] moved down one byte, top byte cleared
    psrlq       mm3,    56                      // isolate s[7] in the lowest byte ...

    psllq       mm3,    56                      // ... and move it back to the top byte
    por         mm1,    mm3                     // b = s1 s2 s3 s4 s5 s6 s7 s7

    movq        mm3,    mm1                     // copy of b for the high half
    movq        mm4,    mm0                     // keep the original bytes for interleaving

    punpcklbw   mm0,    mm7                     // low four bytes of a as words
    punpcklbw   mm1,    mm7                     // low four bytes of b as words

    paddw       mm0,    mm1                     // low (a + b)
    paddw       mm0,    mm6                     // low (a + b + 1)

    punpckhbw   mm2,    mm7                     // high four bytes of a as words
    punpckhbw   mm3,    mm7                     // high four bytes of b as words

    paddw       mm2,    mm3                     // high (a + b)
    paddw       mm2,    mm6                     // high (a + b + 1)

    psraw       mm0,    1                       // low (a + b + 1) / 2
    psraw       mm2,    1                       // high (a + b + 1) / 2

    packuswb    mm0,    mm2                     // eight averaged bytes
    movq        mm2,    mm4                     // original bytes again

    punpcklbw   mm2,    mm0                     // interleave low half
    movq        [edi],  mm2

    punpckhbw   mm4,    mm0                     // interleave high half
    movq        [edi+8], mm4
  }
}


// Per-lane weights, in 1/256 units, used by horizontal_line_5_4_scale_mmx:
// lane i of const54_1 multiplies src[i] and lane i of const54_2 multiplies
// src[i + 1]. Each pair of lanes sums to 256.
__declspec(align(16)) const static unsigned short const54_2[] = {  0,  64, 128, 192 };
__declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128,  64 };


/****************************************************************************
 *
 *  ROUTINE       : horizontal_line_5_4_scale_mmx
 *
 *  INPUTS        : const unsigned char *source : Pointer to source data.
 *                  unsigned int source_width    : Stride of source.
 *                  unsigned char *dest         : Pointer to destination data.
 *                  unsigned int dest_width      : Stride of destination (NOT USED).
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Copies horizontal line of pixels from source to
 *                  destination scaling up by 4 to 5.
 *
 *  SPECIAL NOTES : None.
 *
 ****************************************************************************/
static
void horizontal_line_5_4_scale_mmx
(
  const unsigned char *source,
  unsigned int source_width,
  unsigned char *dest,
  unsigned int dest_width
) {
  /*
  unsigned i;
  unsigned int a, b, c, d, e;
  unsigned char *des = dest;
  const unsigned char *src = source;

  (void) dest_width;

  for ( i=0; i<source_width; i+=5 )
  {
      a = src[0];
      b = src[1];
      c = src[2];
      d = src[3];
      e = src[4];

      des[0] = a;
      des[1] = ((b*192 + c* 64 + 128)>>8);
      des[2] = ((c*128 + d*128 + 128)>>8);
      des[3] = ((d* 64 + e*192 + 128)>>8);

      src += 5;
      des += 4;
  }
  */
  (void) dest_width;

  __asm {

    mov         esi,        source;
    mov         edi,        dest;

    mov         ecx,        source_width;
    movq        mm5,        const54_1;

    pxor        mm7,        mm7;
    movq        mm6,        const54_2;

    movq        mm4,        round_values;
    lea         edx,        [esi+ecx];
    horizontal_line_5_4_loop:

    movq        mm0,        QWORD PTR  [esi];
    00 01 02 03 04 05 06 07
    movq        mm1,        mm0;
    00 01 02 03 04 05 06 07

    psrlq       mm0,        8;
    01 02 03 04 05 06 07 xx
    punpcklbw   mm1,        mm7;
    xx 00 xx 01 xx 02 xx 03

    punpcklbw   mm0,        mm7;
    xx 01 xx 02 xx 03 xx 04
    pmullw      mm1,        mm5

    pmullw      mm0,        mm6
    add         esi,        5

    add         edi,        4
    paddw       mm1,        mm0

    paddw       mm1,        mm4
    psrlw       mm1,        8

    cmp         esi,        edx
    packuswb    mm1,        mm7

    movd        DWORD PTR [edi-4], mm1

    jl          horizontal_line_5_4_loop

  }

}
// Quarter fractions expressed in 1/256 units (64 = 1/4, 128 = 2/4, 192 = 3/4),
// replicated across four 16-bit lanes. Used as row weights by
// vertical_band_5_4_scale_mmx.
__declspec(align(16)) const static unsigned short one_fourths[]   = {  64,  64,  64, 64  };
__declspec(align(16)) const static unsigned short two_fourths[]   = { 128, 128, 128, 128 };
__declspec(align(16)) const static unsigned short three_fourths[] = { 192, 192, 192, 192 };

// 5-to-4 vertical down-scaling of one band. Reads five source rows
// (s0..s4, src_pitch bytes apart) and writes four destination rows
// (dest_pitch bytes apart), four pixels per iteration:
//   row 0 = s0
//   row 1 = (s1 * 192 + s2 *  64 + 128) >> 8
//   row 2 = (s2 * 128 + s3 * 128 + 128) >> 8
//   row 3 = (s3 *  64 + s4 * 192 + 128) >> 8
// dest_width is the number of bytes per row to produce; the loop body
// always runs at least once and works in whole 4-byte groups.
static
void vertical_band_5_4_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) {

  __asm {
    push        ebx                             // ebx is used as the loop counter below

    mov         esi,    source                    // esi walks along source row 0
    mov         ecx,    src_pitch               // source pitch

    mov         edi,    dest                    // edi walks along destination row 0
    pxor        mm7,    mm7                     // clear out mm7

    mov         edx,    dest_pitch               // destination pitch
    mov         ebx,    dest_width              // loop counter (bytes still to do)

    vs_5_4_loop:

    movd        mm0,    DWORD ptr [esi]         // s0
    movd        mm1,    DWORD ptr [esi+ecx]     // s1

    movd        mm2,    DWORD ptr [esi+ecx*2]   // s2
    lea         eax,    [esi+ecx*2]             // eax = address of source row 2

    punpcklbw   mm1,    mm7                     // s1 as words
    punpcklbw   mm2,    mm7                     // s2 as words

    movq        mm3,    mm2                     // second copy of s2
    pmullw      mm1,    three_fourths           // s1 * 192

    pmullw      mm2,    one_fourths             // s2 * 64
    movd        mm4,    [eax+ecx]               // s3

    pmullw      mm3,    two_fourths             // s2 * 128
    punpcklbw   mm4,    mm7                     // s3 as words

    movq        mm5,    mm4                     // second copy of s3
    pmullw      mm4,    two_fourths             // s3 * 128

    paddw       mm1,    mm2                     // s1 * 192 + s2 * 64
    movd        mm6,    [eax+ecx*2]             // s4

    pmullw      mm5,    one_fourths             // s3 * 64
    paddw       mm1,    round_values;

    paddw       mm3,    mm4                     // s2 * 128 + s3 * 128
    psrlw       mm1,    8                       // row 1 result as words

    punpcklbw   mm6,    mm7                     // s4 as words
    paddw       mm3,    round_values

    pmullw      mm6,    three_fourths           // s4 * 192
    psrlw       mm3,    8                       // row 2 result as words

    packuswb    mm1,    mm7                     // row 1 result as bytes
    packuswb    mm3,    mm7                     // row 2 result as bytes

    movd        DWORD PTR [edi], mm0            // row 0 is a straight copy of s0
    movd        DWORD PTR [edi+edx], mm1        // write row 1


    paddw       mm5,    mm6                     // s3 * 64 + s4 * 192
    movd        DWORD PTR [edi+edx*2], mm3      // write row 2

    lea         eax,    [edi+edx*2]             // eax = address of destination row 2
    paddw       mm5,    round_values

    psrlw       mm5,    8                       // row 3 result as words
    add         edi,    4                       // eax still refers to the current column

    packuswb    mm5,    mm7                     // row 3 result as bytes
    movd        DWORD PTR [eax+edx], mm5        // write row 3

    add         esi,    4
    sub         ebx,    4                       // sub sets the flags tested by jg

    jg         vs_5_4_loop

    pop         ebx
  }
}


// Per-lane weights, in 1/256 units, used by horizontal_line_5_3_scale_mmx.
// const53_2 multiplies the even source bytes (src[0], src[2], src[4]) and
// const53_1 multiplies the odd source bytes moved up one lane
// (0, src[1], src[3]). The fourth lane is zero in both tables because only
// three output pixels are produced per group.
__declspec(align(16)) const static unsigned short const53_1[] = {  0,  85, 171, 0 };
__declspec(align(16)) const static unsigned short const53_2[] = {256, 171,  85, 0 };


static
void horizontal_line_5_3_scale_mmx
(
  const unsigned char *source,
  unsigned int source_width,
  unsigned char *dest,
  unsigned int dest_width
) {

  (void) dest_width;
  __asm {

    mov         esi,        source;
    mov         edi,        dest;

    mov         ecx,        source_width;
    movq        mm5,        const53_1;

    pxor        mm7,        mm7;
    movq        mm6,        const53_2;

    movq        mm4,        round_values;
    lea         edx,        [esi+ecx-5];
    horizontal_line_5_3_loop:

    movq        mm0,        QWORD PTR  [esi];
    00 01 02 03 04 05 06 07
    movq        mm1,        mm0;
    00 01 02 03 04 05 06 07

    psllw       mm0,        8;
    xx 00 xx 02 xx 04 xx 06
    psrlw       mm1,        8;
    01 xx 03 xx 05 xx 07 xx

    psrlw       mm0,        8;
    00 xx 02 xx 04 xx 06 xx
    psllq       mm1,        16;
    xx xx 01 xx 03 xx 05 xx

    pmullw      mm0,        mm6

    pmullw      mm1,        mm5
    add         esi,        5

    add         edi,        3
    paddw       mm1,        mm0

    paddw       mm1,        mm4
    psrlw       mm1,        8

    cmp         esi,        edx
    packuswb    mm1,        mm7

    movd        DWORD PTR [edi-3], mm1
    jl          horizontal_line_5_3_loop

// exit condition
    movq        mm0,        QWORD PTR  [esi];
    00 01 02 03 04 05 06 07
    movq        mm1,        mm0;
    00 01 02 03 04 05 06 07

    psllw       mm0,        8;
    xx 00 xx 02 xx 04 xx 06
    psrlw       mm1,        8;
    01 xx 03 xx 05 xx 07 xx

    psrlw       mm0,        8;
    00 xx 02 xx 04 xx 06 xx
    psllq       mm1,        16;
    xx xx 01 xx 03 xx 05 xx

    pmullw      mm0,        mm6

    pmullw      mm1,        mm5
    paddw       mm1,        mm0

    paddw       mm1,        mm4
    psrlw       mm1,        8

    packuswb    mm1,        mm7
    movd        eax,        mm1

    mov         edx,        eax
    shr         edx,        16

    mov         WORD PTR[edi],   ax
    mov         BYTE PTR[edi+2], dl

  }

}

// Approximations of 1/3 and 2/3 in 1/256 units (85 + 171 = 256), replicated
// across four 16-bit lanes. Used as row weights by
// vertical_band_5_3_scale_mmx.
__declspec(align(16)) const static unsigned short one_thirds[] = {  85,  85,  85,  85 };
__declspec(align(16)) const static unsigned short two_thirds[] = { 171, 171, 171, 171 };

// 5-to-3 vertical down-scaling of one band. Reads five source rows
// (s0..s4, src_pitch bytes apart) and writes three destination rows
// (dest_pitch bytes apart), four pixels per iteration:
//   row 0 = s0
//   row 1 = (s1 *  85 + s2 * 171 + 128) >> 8
//   row 2 = (s3 * 171 + s4 *  85 + 128) >> 8
// dest_width is the number of bytes per row to produce; the loop body
// always runs at least once and works in whole 4-byte groups.
static
void vertical_band_5_3_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) {

  __asm {
    push        ebx                             // ebx is used as the loop counter below

    mov         esi,    source                    // esi walks along source row 0
    mov         ecx,    src_pitch               // source pitch

    mov         edi,    dest                    // edi walks along destination row 0
    pxor        mm7,    mm7                     // clear out mm7

    mov         edx,    dest_pitch               // destination pitch
    movq        mm5,    one_thirds              // mm5 = 85 in every lane

    movq        mm6,    two_thirds              // mm6 = 171 in every lane
    mov         ebx,    dest_width;

    vs_5_3_loop:

    movd        mm0,    DWORD ptr [esi]         // s0
    movd        mm1,    DWORD ptr [esi+ecx]     // s1

    movd        mm2,    DWORD ptr [esi+ecx*2]   // s2
    lea         eax,    [esi+ecx*2]             // eax = address of source row 2

    punpcklbw   mm1,    mm7                     // s1 as words
    punpcklbw   mm2,    mm7                     // s2 as words

    pmullw      mm1,    mm5                     // s1 * 85
    pmullw      mm2,    mm6                     // s2 * 171

    movd        mm3,    DWORD ptr [eax+ecx]     // s3
    movd        mm4,    DWORD ptr [eax+ecx*2]   // s4

    punpcklbw   mm3,    mm7                     // s3 as words
    punpcklbw   mm4,    mm7                     // s4 as words

    pmullw      mm3,    mm6                     // s3 * 171
    pmullw      mm4,    mm5                     // s4 * 85


    movd        DWORD PTR [edi], mm0            // row 0 is a straight copy of s0
    paddw       mm1,    mm2                     // s1 * 85 + s2 * 171

    paddw       mm1,    round_values
    psrlw       mm1,    8                       // row 1 result as words

    packuswb    mm1,    mm7                     // row 1 result as bytes
    paddw       mm3,    mm4                     // s3 * 171 + s4 * 85

    paddw       mm3,    round_values
    movd        DWORD PTR [edi+edx], mm1        // write row 1

    psrlw       mm3,    8                       // row 2 result as words
    packuswb    mm3,    mm7                     // row 2 result as bytes

    movd        DWORD PTR [edi+edx*2], mm3      // write row 2


    add         edi,    4
    add         esi,    4

    sub         ebx,    4                       // sub sets the flags tested by jg
    jg          vs_5_3_loop

    pop         ebx
  }
}


/****************************************************************************
 *
 *  ROUTINE       : horizontal_line_2_1_scale_mmx
 *
 *  INPUTS        : const unsigned char *source : First pixel of the source line.
 *                  unsigned int source_width    : Not used.
 *                  unsigned char *dest         : First pixel of the output line.
 *                  unsigned int dest_width      : Number of output pixels to
 *                                                 produce.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 2 to 1 down-scaling of a horizontal line of pixels by
 *                  decimation: output pixel i is source pixel 2 * i, with
 *                  no filtering.
 *
 *  SPECIAL NOTES : Four output pixels are written per iteration from eight
 *                  source bytes, and the loop body always runs at least
 *                  once, so output is produced in whole 4-byte groups.
 *
 ****************************************************************************/
static
void horizontal_line_2_1_scale_mmx
(
  const unsigned char *source,
  unsigned int source_width,
  unsigned char *dest,
  unsigned int dest_width
) {
  // NOTE(review): dest_width IS read by the asm block below; only
  // source_width is genuinely unused.
  (void) dest_width;
  (void) source_width;
  __asm {
    mov         esi,    source
    mov         edi,    dest

    pxor        mm7,    mm7                     // mm7 = 0 for packing
    mov         ecx,    dest_width              // number of output pixels

    xor         edx,    edx                     // edx = output index
    hs_2_1_loop:

    movq        mm0,    [esi+edx*2]             // eight source bytes starting at 2 * index
    psllw       mm0,    8                       // drop the odd-numbered byte of every word ...

    psrlw       mm0,    8                       // ... leaving the even-numbered bytes as words
    packuswb    mm0,    mm7                     // four kept bytes in the low dword

    movd        DWORD Ptr [edi+edx], mm0;
    add         edx,    4

    cmp         edx,    ecx
    jl          hs_2_1_loop

  }
}


/*
 * 2-to-1 vertical scaling of one band without filtering: the first
 * dest_width bytes of the source row are handed to vpx_memcpy to fill the
 * destination row. Neither pitch is needed because only a single row is
 * touched.
 */
static
void vertical_band_2_1_scale_mmx
(
  unsigned char *source,
  unsigned int src_pitch,
  unsigned char *dest,
  unsigned int dest_pitch,
  unsigned int dest_width
) {
  /* Unused; kept so the signature matches the other vertical band scalers. */
  (void) src_pitch;
  (void) dest_pitch;

  vpx_memcpy(dest, source, dest_width);
}


// 3/16 and 10/16 expressed in 1/256 units (48 + 160 + 48 = 256), replicated
// across four 16-bit lanes. Used as the three row weights by
// vertical_band_2_1_scale_i_mmx.
__declspec(align(16)) const static unsigned short three_sixteenths[] = {  48,  48,  48,  48 };
__declspec(align(16)) const static unsigned short ten_sixteenths[]   = { 160, 160, 160, 160 };

/*
 * 2-to-1 vertical scaling of one band with a 3-tap filter. Each output
 * pixel is
 *
 *     (above * 48 + current * 160 + below * 48 + 128) >> 8
 *
 * where "current" is the row at source, "above" is the row one src_pitch
 * before it and "below" is the row one src_pitch after it. dest_width is
 * the number of bytes to produce; dest_pitch is not used because a single
 * output row is written. Four pixels are processed per iteration and the
 * loop body always runs at least once.
 *
 * The row above source is read, so source must not be the first row of the
 * buffer unless readable memory precedes it.
 */
static
void vertical_band_2_1_scale_i_mmx
(
  unsigned char *source,
  unsigned int src_pitch,
  unsigned char *dest,
  unsigned int dest_pitch,
  unsigned int dest_width
) {

  (void) dest_pitch;
  __asm {
    mov         esi,        source
    mov         edi,        dest

    mov         eax,        src_pitch
    mov         edx,        dest_width

    pxor        mm7,        mm7             // mm7 = 0 for unpack / pack
    sub         esi,        eax             // back one line

    lea         ecx,        [esi+edx]       // ecx = end of the row above
    movq        mm6,        round_values    // + 128 rounding term

    movq        mm5,        three_sixteenths
    movq        mm4,        ten_sixteenths

    vs_2_1_i_loop:
    movd        mm0,        [esi]           // row above
    movd        mm1,        [esi+eax]       // current row

    movd        mm2,        [esi+eax*2]     // row below
    punpcklbw   mm0,        mm7

    pmullw      mm0,        mm5             // above * 48
    punpcklbw   mm1,        mm7

    pmullw      mm1,        mm4             // current * 160
    punpcklbw   mm2,        mm7

    pmullw      mm2,        mm5             // below * 48
    paddw       mm0,        mm6             // use the preloaded rounding term

    paddw       mm1,        mm2
    paddw       mm0,        mm1

    psrlw       mm0,        8
    packuswb    mm0,        mm7

    movd        DWORD PTR [edi],        mm0
    add         esi,        4

    add         edi,        4
    cmp         esi,        ecx

    // esi and ecx are addresses, so the comparison must be unsigned: a
    // signed test ends the loop early when the row straddles 0x80000000.
    jb          vs_2_1_i_loop

  }
}


void
register_mmxscalers(void) {
  vp8_horizontal_line_1_2_scale        = horizontal_line_1_2_scale_mmx;
  vp8_vertical_band_1_2_scale          = vertical_band_1_2_scale_mmx;
  vp8_last_vertical_band_1_2_scale      = last_vertical_band_1_2_scale_mmx;
  vp8_horizontal_line_3_5_scale        = horizontal_line_3_5_scale_mmx;
  vp8_vertical_band_3_5_scale          = vertical_band_3_5_scale_mmx;
  vp8_last_vertical_band_3_5_scale      = last_vertical_band_3_5_scale_mmx;
  vp8_horizontal_line_4_5_scale        = horizontal_line_4_5_scale_mmx;
  vp8_vertical_band_4_5_scale          = vertical_band_4_5_scale_mmx;
  vp8_last_vertical_band_4_5_scale      = last_vertical_band_4_5_scale_mmx;

  vp8_horizontal_line_3_4_scale        = vp8cx_horizontal_line_3_4_scale_c;
  vp8_vertical_band_3_4_scale          = vp8cx_vertical_band_3_4_scale_c;
  vp8_last_vertical_band_3_4_scale      = vp8cx_last_vertical_band_3_4_scale_c;
  vp8_horizontal_line_2_3_scale        = vp8cx_horizontal_line_2_3_scale_c;
  vp8_vertical_band_2_3_scale          = vp8cx_vertical_band_2_3_scale_c;
  vp8_last_vertical_band_2_3_scale      = vp8cx_last_vertical_band_2_3_scale_c;


  vp8_vertical_band_5_4_scale           = vertical_band_5_4_scale_mmx;
  vp8_vertical_band_5_3_scale           = vertical_band_5_3_scale_mmx;
  vp8_vertical_band_2_1_scale           = vertical_band_2_1_scale_mmx;
  vp8_vertical_band_2_1_scale_i         = vertical_band_2_1_scale_i_mmx;
  vp8_horizontal_line_2_1_scale         = horizontal_line_2_1_scale_mmx;
  vp8_horizontal_line_5_3_scale         = horizontal_line_5_3_scale_mmx;
  vp8_horizontal_line_5_4_scale         = horizontal_line_5_4_scale_mmx;


}