vpx/vp8/encoder/x86/preproc_mmx.c

/*
 *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */


#include "memory.h"
#include "preproc.h"
#include "pragmas.h"

/****************************************************************************
*  Macros
****************************************************************************/
/* Number of history slots kept per pixel group in the pre-processor frame buffer. */
#define FRAMECOUNT 7
/* Round X (an integer or pointer value) up to the next multiple of 32.
 * The argument is parenthesized so the cast applies to the whole expression,
 * and the mask is built at the width of unsigned long so no upper bits are
 * discarded when unsigned long is wider than 32 bits. */
#define ROUNDUP32(X) ( ( ( (unsigned long) (X) ) + 31 )&( ~31UL ) )

/****************************************************************************
*  Imports
****************************************************************************/
extern void vpx_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled);

/****************************************************************************
*  Exported Global Variables
****************************************************************************/
/* Function pointer with the same signature as temp_filter_wmt and
 * temp_filter_mmx.  Nothing in this file assigns it; presumably it is pointed
 * at one of those routines by CPU-feature dispatch code elsewhere -- TODO confirm. */
void (*temp_filter)(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength);

/****************************************************************************
 *
 *  ROUTINE       : temp_filter_wmt
 *
 *  INPUTS        : pre_proc_instance *ppi : Pointer to pre-processor instance.
 *                  unsigned char *s     : Pointer to source frame.
 *                  unsigned char *d     : Pointer to destination frame.
 *                  int bytes            : Number of bytes to filter.
 *                  int strength         : Strength of filter to apply.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Performs a closeness-adjusted temporal blur
 *
 *  SPECIAL NOTES : Destination frame can be same as source frame.
 *
 ****************************************************************************/
void temp_filter_wmt
(
    pre_proc_instance *ppi,
    unsigned char *s,
    unsigned char *d,
    int bytes,
    int strength
)
{
    /* Temporal blur, eight pixels per step, using XMM registers.
     *
     * ppi->frame_buffer is treated as a sequence of groups, one per eight
     * pixels, each holding FRAMECOUNT consecutive 8-byte history slots.
     * On the first call (ppi->frame == 0) every slot is primed with the
     * source pixels and the source is copied straight to d.  On later calls
     * the source overwrites slot (ppi->frame % FRAMECOUNT) and each output
     * pixel becomes a weighted average of the FRAMECOUNT slots, where the
     * weight of a slot is 16 - 3 * ((slot - source)^2 >> strength), floored
     * at zero.  Both loops are do/while, so one group of eight is always
     * processed, and bytes is consumed in whole groups of eight.
     */
    int done = 0;                                   /* pixels handled so far */
    unsigned char *frameptr = ppi->frame_buffer;    /* walks the history slots */

    __declspec(align(16)) unsigned short threes[]  = { 3, 3, 3, 3, 3, 3, 3, 3};
    __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16, 16, 16, 16, 16};

    if (ppi->frame != 0)
    {
        int lane;
        int offset2 = (ppi->frame % FRAMECOUNT);    /* slot that receives this frame */

        do
        {
            __declspec(align(16)) unsigned short counts[8];   /* per-pixel sum of weights */
            __declspec(align(16)) unsigned short sums[8];     /* per-pixel weighted sum + rounding */
            __asm
            {
                mov         edi, s                  // edi -> current eight source pixels
                mov         esi, frameptr           // esi -> first history slot of this group
                mov         eax, offset2            // eax  = slot index for the current frame
                mov         ecx,  FRAMECOUNT        // ecx  = history slots left to blend

                pxor        xmm1, xmm1              // xmm1 = weighted pixel sum
                pxor        xmm2, xmm2              // xmm2 = weight sum
                pxor        xmm7, xmm7              // xmm7 = zero, used for byte->word unpack

                movq        xmm3, QWORD PTR [edi]
                movq        QWORD PTR [esi+8*eax], xmm3   // store source into its history slot

                punpcklbw   xmm3, xmm2              // xmm3 = source pixels as words (xmm2 is zero here)

                next_frame:
                movq        xmm4, QWORD PTR [esi]   // load one history slot
                punpcklbw   xmm4, xmm7              // widen slot pixels to words
                movdqa      xmm6, xmm4              // xmm6 = slot pixels, kept for the multiply
                psubsw      xmm4, xmm3              // slot - source
                pmullw      xmm4, xmm4              // squared difference (low 16 bits)
                movd        xmm5, strength
                psrlw       xmm4, xmm5              // squared difference >> strength
                pmullw      xmm4, threes            // times three
                movdqa      xmm5, sixteens
                psubusw     xmm5, xmm4              // weight = 16 - penalty, saturating at 0
                movdqa      xmm4, xmm5
                pmullw      xmm4, xmm6              // weight * slot pixel
                paddusw     xmm1, xmm4              // add to weighted pixel sum
                paddusw     xmm2, xmm5              // add to weight sum
                add         esi, 8                  // advance to the following slot
                dec         ecx                     // one slot consumed
                jnz         next_frame

                movdqa      counts, xmm2            // export weight sums
                psrlw       xmm2, 1                 // half the weight sum, for rounding
                paddusw     xmm1, xmm2

                mov         frameptr, esi           // esi now points at the next group's first slot

                movdqa      sums, xmm1              // export rounded weighted sums
            }

            /* Divide through the lookup table: sum * fixed_divide[weight] >> 16. */
            for (lane = 0; lane < 8; ++lane)
            {
                int scaled = sums[lane] * ppi->fixed_divide[counts[lane]];
                scaled >>= 16;
                d[lane] = scaled;
            }

            done += 8;
            d += 8;
            s += 8;
        }
        while (done < bytes);
    }
    else
    {
        /* First frame: fill every history slot with the source and pass it through. */
        do
        {
            const unsigned char *src = s + done;
            unsigned char *dst = d + done;
            int copy;
            int lane;

            for (copy = 0; copy < FRAMECOUNT; ++copy)
            {
                for (lane = 0; lane < 8; ++lane)
                    *frameptr++ = src[lane];
            }

            for (lane = 0; lane < 8; ++lane)
                dst[lane] = src[lane];

            done += 8;
        }
        while (done < bytes);
    }

    ++ppi->frame;
    __asm emms
}

/****************************************************************************
 *
 *  ROUTINE       : temp_filter_mmx
 *
 *  INPUTS        : pre_proc_instance *ppi : Pointer to pre-processor instance.
 *                  unsigned char *s     : Pointer to source frame.
 *                  unsigned char *d     : Pointer to destination frame.
 *                  int bytes            : Number of bytes to filter.
 *                  int strength         : Strength of filter to apply.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Performs a closeness-adjusted temporal blur
 *
 *  SPECIAL NOTES : Destination frame can be same as source frame.
 *
 ****************************************************************************/
void temp_filter_mmx
(
    pre_proc_instance *ppi,
    unsigned char *s,
    unsigned char *d,
    int bytes,
    int strength
)
{
    /* Temporal blur, four pixels per step, using MMX registers.
     *
     * ppi->frame_buffer is treated as a sequence of groups, one per four
     * pixels, each holding FRAMECOUNT consecutive 4-byte history slots.
     * On the first call (ppi->frame == 0) every slot is primed with the
     * source pixels and the source is copied straight to d.  On later calls
     * the source overwrites slot (ppi->frame % FRAMECOUNT) and each output
     * pixel becomes a weighted average of the FRAMECOUNT slots, where the
     * weight of a slot is 16 - 3 * ((slot - source)^2 >> strength), floored
     * at zero.  Both loops are do/while, so one group of four is always
     * processed, and bytes is consumed in whole groups of four.
     */
    int done = 0;                                   /* pixels handled so far */
    unsigned char *frameptr = ppi->frame_buffer;    /* walks the history slots */

    __declspec(align(16)) unsigned short threes[]  = { 3, 3, 3, 3};
    __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16};

    if (ppi->frame != 0)
    {
        int lane;
        int offset2 = (ppi->frame % FRAMECOUNT);    /* slot that receives this frame */

        do
        {
            /* Only the first four entries of each array are written and read here. */
            __declspec(align(16)) unsigned short counts[8];   /* per-pixel sum of weights */
            __declspec(align(16)) unsigned short sums[8];     /* per-pixel weighted sum + rounding */
            __asm
            {
                mov         edi, s                  // edi -> current four source pixels
                mov         esi, frameptr           // esi -> first history slot of this group
                mov         eax, offset2            // eax  = slot index for the current frame
                mov         ecx,  FRAMECOUNT        // ecx  = history slots left to blend

                pxor        mm1, mm1                // mm1 = weighted pixel sum
                pxor        mm2, mm2                // mm2 = weight sum
                pxor        mm7, mm7                // mm7 = zero, used for byte->word unpack

                movd        mm3, DWORD PTR [edi]
                movd        DWORD PTR [esi+4*eax], mm3    // store source into its history slot

                punpcklbw   mm3, mm2                // mm3 = source pixels as words (mm2 is zero here)

                next_frame:
                movd        mm4, DWORD PTR [esi]    // load one history slot
                punpcklbw   mm4, mm7                // widen slot pixels to words
                movq        mm6, mm4                // mm6 = slot pixels, kept for the multiply
                psubsw      mm4, mm3                // slot - source
                pmullw      mm4, mm4                // squared difference (low 16 bits)
                movd        mm5, strength
                psrlw       mm4, mm5                // squared difference >> strength
                pmullw      mm4, threes             // times three
                movq        mm5, sixteens
                psubusw     mm5, mm4                // weight = 16 - penalty, saturating at 0
                movq        mm4, mm5
                pmullw      mm4, mm6                // weight * slot pixel
                paddusw     mm1, mm4                // add to weighted pixel sum
                paddusw     mm2, mm5                // add to weight sum
                add         esi, 4                  // advance to the following slot
                dec         ecx                     // one slot consumed
                jnz         next_frame

                movq        counts, mm2             // export weight sums
                psrlw       mm2, 1                  // half the weight sum, for rounding
                paddusw     mm1, mm2

                mov         frameptr, esi           // esi now points at the next group's first slot

                movq        sums, mm1               // export rounded weighted sums
            }

            /* Divide through the lookup table: sum * fixed_divide[weight] >> 16. */
            for (lane = 0; lane < 4; ++lane)
            {
                int scaled = sums[lane] * ppi->fixed_divide[counts[lane]];
                scaled >>= 16;
                d[lane] = scaled;
            }

            done += 4;
            d += 4;
            s += 4;
        }
        while (done < bytes);
    }
    else
    {
        /* First frame: fill every history slot with the source and pass it through. */
        do
        {
            const unsigned char *src = s + done;
            unsigned char *dst = d + done;
            int copy;
            int lane;

            for (copy = 0; copy < FRAMECOUNT; ++copy)
            {
                for (lane = 0; lane < 4; ++lane)
                    *frameptr++ = src[lane];
            }

            for (lane = 0; lane < 4; ++lane)
                dst[lane] = src[lane];

            done += 4;
        }
        while (done < bytes);
    }

    ++ppi->frame;
    __asm emms
}
Initial WebM release 2010-05-18 17:58:33 +02:00			`/*`
			`* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.`
			`*`
cosmetics: trim trailing whitespace When the license headers were updated, they accidentally contained trailing whitespace, so unfortunately we have to touch all the files again. Change-Id: I236c05fade06589e417179c0444cb39b09e4200d 2010-06-18 18:39:21 +02:00			`* Use of this source code is governed by a BSD-style license`
LICENSE: update with latest text Change-Id: Ieebea089095d9073b3a94932791099f614ce120c 2010-06-04 22:19:40 +02:00			`* that can be found in the LICENSE file in the root of the source`
			`* tree. An additional intellectual property rights grant can be found`
cosmetics: trim trailing whitespace When the license headers were updated, they accidentally contained trailing whitespace, so unfortunately we have to touch all the files again. Change-Id: I236c05fade06589e417179c0444cb39b09e4200d 2010-06-18 18:39:21 +02:00			`* in the file PATENTS. All contributing project authors may`
LICENSE: update with latest text Change-Id: Ieebea089095d9073b3a94932791099f614ce120c 2010-06-04 22:19:40 +02:00			`* be found in the AUTHORS file in the root of the source tree.`
Initial WebM release 2010-05-18 17:58:33 +02:00			`*/`


			`#include "memory.h"`
			`#include "preproc.h"`
			`#include "pragmas.h"`

			`/****************************************************************************`
			`* Macros`
			`****************************************************************************/`
			`#define FRAMECOUNT 7`
			`#define ROUNDUP32(X) ( ( ( (unsigned long) X ) + 31 )&( 0xFFFFFFE0 ) )`

			`/****************************************************************************`
			`* Imports`
			`****************************************************************************/`
			`extern void vpx_get_processor_flags(int mmx_enabled, int xmm_enabled, int *wmt_enabled);`

			`/****************************************************************************`
			`* Exported Global Variables`
			`****************************************************************************/`
			`void (temp_filter)(pre_proc_instance ppi, unsigned char s, unsigned char d, int bytes, int strength);`

			`/****************************************************************************`
			`*`
			`* ROUTINE : temp_filter_wmt`
			`*`
			`* INPUTS : pre_proc_instance *ppi : Pointer to pre-processor instance.`
			`* unsigned char *s : Pointer to source frame.`
			`* unsigned char *d : Pointer to destination frame.`
			`* int bytes : Number of bytes to filter.`
			`* int strength : Strength of filter to apply.`
			`*`
			`* OUTPUTS : None.`
			`*`
			`* RETURNS : void`
			`*`
			`* FUNCTION : Performs a closesness adjusted temporarl blur`
			`*`
			`* SPECIAL NOTES : Destination frame can be same as source frame.`
			`*`
			`****************************************************************************/`
			`void temp_filter_wmt`
			`(`
			`pre_proc_instance *ppi,`
			`unsigned char *s,`
			`unsigned char *d,`
			`int bytes,`
			`int strength`
			`)`
			`{`
			`int byte = 0;`
			`unsigned char *frameptr = ppi->frame_buffer;`

			`__declspec(align(16)) unsigned short threes[] = { 3, 3, 3, 3, 3, 3, 3, 3};`
			`__declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16, 16, 16, 16, 16};`

			`if (ppi->frame == 0)`
			`{`
			`do`
			`{`
			`int i;`
			`int frame = 0;`

			`do`
			`{`
			`for (i = 0; i < 8; i++)`
			`{`
			`*frameptr = s[byte+i];`
			`++frameptr;`
			`}`

			`++frame;`
			`}`
			`while (frame < FRAMECOUNT);`

			`for (i = 0; i < 8; i++)`
			`d[byte+i] = s[byte+i];`

			`byte += 8;`

			`}`
			`while (byte < bytes);`
			`}`
			`else`
			`{`
			`int i;`
			`int offset2 = (ppi->frame % FRAMECOUNT);`

			`do`
			`{`
			`__declspec(align(16)) unsigned short counts[8];`
			`__declspec(align(16)) unsigned short sums[8];`
			`__asm`
			`{`
			`mov eax, offset2`
			`mov edi, s // source pixels`
			`pxor xmm1, xmm1 // accumulator`

			`pxor xmm7, xmm7`

			`mov esi, frameptr // accumulator`
			`pxor xmm2, xmm2 // count`

			`movq xmm3, QWORD PTR [edi]`

			`movq QWORD PTR [esi+8*eax], xmm3`

			`punpcklbw xmm3, xmm2 // xmm3 source pixels`
			`mov ecx, FRAMECOUNT`

			`next_frame:`
			`movq xmm4, QWORD PTR [esi] // get frame buffer values`
			`punpcklbw xmm4, xmm7 // xmm4 frame buffer pixels`
			`movdqa xmm6, xmm4 // save the pixel values`
			`psubsw xmm4, xmm3 // subtracted pixel values`
			`pmullw xmm4, xmm4 // square xmm4`
			`movd xmm5, strength`
			`psrlw xmm4, xmm5 // should be strength`
			`pmullw xmm4, threes // 3 * modifier`
			`movdqa xmm5, sixteens // 16s`
			`psubusw xmm5, xmm4 // 16 - modifiers`
			`movdqa xmm4, xmm5 // save the modifiers`
			`pmullw xmm4, xmm6 // multiplier values`
			`paddusw xmm1, xmm4 // accumulator`
			`paddusw xmm2, xmm5 // count`
			`add esi, 8 // next frame`
			`dec ecx // next set of eight pixels`
			`jnz next_frame`

			`movdqa counts, xmm2`
			`psrlw xmm2, 1 // divide count by 2 for rounding`
			`paddusw xmm1, xmm2 // rounding added in`

			`mov frameptr, esi`

			`movdqa sums, xmm1`
			`}`

			`for (i = 0; i < 8; i++)`
			`{`
			`int blurvalue = sums[i] * ppi->fixed_divide[counts[i]];`
			`blurvalue >>= 16;`
			`d[i] = blurvalue;`
			`}`

			`s += 8;`
			`d += 8;`
			`byte += 8;`
			`}`
			`while (byte < bytes);`
			`}`

			`++ppi->frame;`
			`__asm emms`
			`}`

			`/****************************************************************************`
			`*`
			`* ROUTINE : temp_filter_mmx`
			`*`
			`* INPUTS : pre_proc_instance *ppi : Pointer to pre-processor instance.`
			`* unsigned char *s : Pointer to source frame.`
			`* unsigned char *d : Pointer to destination frame.`
			`* int bytes : Number of bytes to filter.`
			`* int strength : Strength of filter to apply.`
			`*`
			`* OUTPUTS : None.`
			`*`
			`* RETURNS : void`
			`*`
			`* FUNCTION : Performs a closesness adjusted temporarl blur`
			`*`
			`* SPECIAL NOTES : Destination frame can be same as source frame.`
			`*`
			`****************************************************************************/`
			`void temp_filter_mmx`
			`(`
			`pre_proc_instance *ppi,`
			`unsigned char *s,`
			`unsigned char *d,`
			`int bytes,`
			`int strength`
			`)`
			`{`
			`int byte = 0;`
			`unsigned char *frameptr = ppi->frame_buffer;`

			`__declspec(align(16)) unsigned short threes[] = { 3, 3, 3, 3};`
			`__declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16};`

			`if (ppi->frame == 0)`
			`{`
			`do`
			`{`
			`int i;`
			`int frame = 0;`

			`do`
			`{`
			`for (i = 0; i < 4; i++)`
			`{`
			`*frameptr = s[byte+i];`
			`++frameptr;`
			`}`

			`++frame;`
			`}`
			`while (frame < FRAMECOUNT);`

			`for (i = 0; i < 4; i++)`
			`d[byte+i] = s[byte+i];`

			`byte += 4;`

			`}`
			`while (byte < bytes);`
			`}`
			`else`
			`{`
			`int i;`
			`int offset2 = (ppi->frame % FRAMECOUNT);`

			`do`
			`{`
			`__declspec(align(16)) unsigned short counts[8];`
			`__declspec(align(16)) unsigned short sums[8];`
			`__asm`
			`{`

			`mov eax, offset2`
			`mov edi, s // source pixels`
			`pxor mm1, mm1 // accumulator`
			`pxor mm7, mm7`

			`mov esi, frameptr // accumulator`
			`pxor mm2, mm2 // count`

			`movd mm3, DWORD PTR [edi]`
			`movd DWORD PTR [esi+4*eax], mm3`

			`punpcklbw mm3, mm2 // mm3 source pixels`
			`mov ecx, FRAMECOUNT`

			`next_frame:`
			`movd mm4, DWORD PTR [esi] // get frame buffer values`
			`punpcklbw mm4, mm7 // mm4 frame buffer pixels`
			`movq mm6, mm4 // save the pixel values`
			`psubsw mm4, mm3 // subtracted pixel values`
			`pmullw mm4, mm4 // square mm4`
			`movd mm5, strength`
			`psrlw mm4, mm5 // should be strength`
			`pmullw mm4, threes // 3 * modifier`
			`movq mm5, sixteens // 16s`
			`psubusw mm5, mm4 // 16 - modifiers`
			`movq mm4, mm5 // save the modifiers`
			`pmullw mm4, mm6 // multiplier values`
			`paddusw mm1, mm4 // accumulator`
			`paddusw mm2, mm5 // count`
			`add esi, 4 // next frame`
			`dec ecx // next set of eight pixels`
			`jnz next_frame`

			`movq counts, mm2`
			`psrlw mm2, 1 // divide count by 2 for rounding`
			`paddusw mm1, mm2 // rounding added in`

			`mov frameptr, esi`

			`movq sums, mm1`

			`}`

			`for (i = 0; i < 4; i++)`
			`{`
			`int blurvalue = sums[i] * ppi->fixed_divide[counts[i]];`
			`blurvalue >>= 16;`
			`d[i] = blurvalue;`
			`}`

			`s += 4;`
			`d += 4;`
			`byte += 4;`
			`}`
			`while (byte < bytes);`
			`}`

			`++ppi->frame;`
			`__asm emms`
			`}`