vpx/vp8/encoder/x86/variance_ssse3.c

/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "vpx_config.h"
#include "vp8/encoder/variance.h"
#include "vp8/common/pragmas.h"
#include "vpx_ports/mem.h"

/*
 * HALFNDX is the xoffset/yoffset value that the functions below compare
 * against to select their dedicated half-pel routines. It is 8 when
 * CONFIG_SIXTEENTH_SUBPEL_UV is enabled and 4 otherwise.
 * NOTE(review): presumably the midpoint of a 16-step vs. 8-step sub-pel
 * grid -- confirm against the filter tables.
 */
#if CONFIG_SIXTEENTH_SUBPEL_UV
#define HALFNDX 8
#else
#define HALFNDX 4
#endif

/*
 * Prototypes for routines defined outside this translation unit.
 * NOTE(review): the _sse2/_ssse3 suffixes suggest hand-written SIMD
 * implementations -- confirm where they live before changing a signature.
 */

/*
 * NOTE(review): not referenced by the functions visible in this file;
 * the declaration may be removable -- confirm.
 */
extern unsigned int vp8_get16x16var_sse2
(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *SSE,
    int *Sum
);
/*
 * Called below when both xoffset and yoffset equal HALFNDX.
 * Height is the number of rows to process (the callers pass 16 or 8);
 * the callers read their results back through sum and sumsquared.
 */
extern void vp8_half_horiz_vert_variance16x_h_sse2
(
    const unsigned char *ref_ptr,
    int ref_pixels_per_line,
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    unsigned int Height,
    int *sum,
    unsigned int *sumsquared
);
/*
 * Called below when xoffset equals HALFNDX and yoffset is 0.
 * Same parameter layout as the routine above.
 */
extern void vp8_half_horiz_variance16x_h_sse2
(
    const unsigned char *ref_ptr,
    int ref_pixels_per_line,
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    unsigned int Height,
    int *sum,
    unsigned int *sumsquared
);
/*
 * Called below when xoffset is 0 and yoffset equals HALFNDX.
 * Same parameter layout as the routine above.
 */
extern void vp8_half_vert_variance16x_h_sse2
(
    const unsigned char *ref_ptr,
    int ref_pixels_per_line,
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    unsigned int Height,
    int *sum,
    unsigned int *sumsquared
);
/*
 * General path used below for every other (xoffset, yoffset) pair; the
 * offsets are forwarded unchanged.
 * NOTE(review): "bil" presumably means bilinear filtering -- confirm.
 */
extern void vp8_filter_block2d_bil_var_ssse3
(
    const unsigned char *ref_ptr,
    int ref_pixels_per_line,
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    unsigned int Height,
    int  xoffset,
    int  yoffset,
    int *sum,
    unsigned int *sumsquared
);

/*
 * Sub-pixel variance for a 16-row block.
 *
 * Dispatches to one of four external routines depending on the sub-pixel
 * offsets, then derives the variance from the sum and the sum of squares
 * those routines report.
 *
 *   src_ptr / src_pixels_per_line  first buffer and its stride
 *   xoffset / yoffset              sub-pixel offsets; the value HALFNDX
 *                                  selects a dedicated half-pel routine
 *   dst_ptr / dst_pixels_per_line  second buffer and its stride
 *   sse                            out: receives the sum of squares
 *
 * Returns sse - (sum * sum) / 256.
 */
unsigned int vp8_sub_pixel_variance16x16_ssse3
(
    const unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    int xsum0;
    unsigned int xxsum0;
    unsigned int sum_squared;

    // note we could avoid these if statements if the calling function
    // just called the appropriate functions inside.
    if (xoffset == HALFNDX && yoffset == 0)
    {
        vp8_half_horiz_variance16x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            &xsum0, &xxsum0);
    }
    else if (xoffset == 0 && yoffset == HALFNDX)
    {
        vp8_half_vert_variance16x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            &xsum0, &xxsum0);
    }
    else if (xoffset == HALFNDX && yoffset == HALFNDX)
    {
        vp8_half_horiz_vert_variance16x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            &xsum0, &xxsum0);
    }
    else
    {
        vp8_filter_block2d_bil_var_ssse3(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            xoffset, yoffset,
            &xsum0, &xxsum0);
    }

    *sse = xxsum0;

    /*
     * Square the sum in unsigned arithmetic. Assuming xsum0 is a sum of
     * 256 differences of 8-bit samples, its magnitude can reach
     * 255 * 256 = 65280, and 65280^2 = 4261478400 does not fit in a signed
     * 32-bit int, so (xsum0 * xsum0) would be a signed overflow. The square
     * does fit in 32 unsigned bits, and converting a negative sum to
     * unsigned before multiplying still yields sum^2 modulo 2^32.
     */
    sum_squared = (unsigned int)xsum0 * (unsigned int)xsum0;

    return (xxsum0 - (sum_squared >> 8));
}

/*
 * Sub-pixel variance for an 8-row block.
 *
 * Picks one of four external routines from the (xoffset, yoffset) pair,
 * stores the reported sum of squares in *sse, and returns
 * sse - (sum * sum) / 128.
 */
unsigned int vp8_sub_pixel_variance16x8_ssse3
(
    const unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse

)
{
    const int x_is_half = (xoffset == HALFNDX);
    const int y_is_half = (yoffset == HALFNDX);
    int diff_sum;
    unsigned int diff_sq_sum;

    /* The four cases are mutually exclusive, so their order is free. */
    if (x_is_half && y_is_half)
    {
        vp8_half_horiz_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                               dst_ptr, dst_pixels_per_line,
                                               8, &diff_sum, &diff_sq_sum);
    }
    else if (x_is_half && yoffset == 0)
    {
        vp8_half_horiz_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                          dst_ptr, dst_pixels_per_line,
                                          8, &diff_sum, &diff_sq_sum);
    }
    else if (y_is_half && xoffset == 0)
    {
        vp8_half_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line,
                                         8, &diff_sum, &diff_sq_sum);
    }
    else
    {
        /* Any other offset pair goes through the general filter. */
        vp8_filter_block2d_bil_var_ssse3(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line,
                                         8, xoffset, yoffset,
                                         &diff_sum, &diff_sq_sum);
    }

    *sse = diff_sq_sum;

    return diff_sq_sum - ((diff_sum * diff_sum) >> 7);
}
Write SSSE3 sub-pixel filter function 1. Process 16 pixels at one time instead of 8. 2. Add check for both xoffset =0 and yoffset=0, which happens during motion search. This change gave encoder 1%~3% performance gain. Change-Id: Idaa39506b48f4f8b2fbbeb45aae8226fa32afb3e 2011-03-04 01:02:45 +01:00			`/*`
			`* Copyright (c) 2010 The WebM project authors. All Rights Reserved.`
			`*`
			`* Use of this source code is governed by a BSD-style license`
			`* that can be found in the LICENSE file in the root of the source`
			`* tree. An additional intellectual property rights grant can be found`
			`* in the file PATENTS. All contributing project authors may`
			`* be found in the AUTHORS file in the root of the source tree.`
			`*/`

Return sse value in vp8_variance SSE2 functions Minor modification. Change-Id: I09511d38fd1451d5c4106a48acdb3f766ce59cb7 2011-05-25 15:26:29 +02:00			`#include "vpx_config.h"`
Write SSSE3 sub-pixel filter function 1. Process 16 pixels at one time instead of 8. 2. Add check for both xoffset =0 and yoffset=0, which happens during motion search. This change gave encoder 1%~3% performance gain. Change-Id: Idaa39506b48f4f8b2fbbeb45aae8226fa32afb3e 2011-03-04 01:02:45 +01:00			`#include "vp8/encoder/variance.h"`
			`#include "vp8/common/pragmas.h"`
			`#include "vpx_ports/mem.h"`

Supporting high precision 1/8-pel motion vectors This is the initial patch for supporting 1/8th pel motion. Currently if we configure with enable-high-precision-mv, all motion vectors would default to 1/8 pel. Encode and decode syncs fine with the current code. In the next phase the code will be refactored so that we can choose the 1/8 pel mode adaptively at a frame/segment/mb level. Derf results: http://www.corp.google.com/~debargha/vp8_results/enhinterp_hpmv.html (about 0.83% better than 8-tap interpoaltion) Patch 3: Rebased. Also adding 1/16th pel interpolation for U and V Patch 4: HD results. http://www.corp.google.com/~debargha/vp8_results/enhinterp_hd_hpmv.html Seems impressive (unless I am doing something wrong). Patch 5: Added mmx/sse for bilateral filtering, as well as enforced use of c-versions of subpel filters with 8-taps and 1/16th pel; Also redesigned the 8-tap filters to reduce the cut-off in order to introduce a denoising effect. There is a new configure option sixteenth-subpel-uv which will use 1/16 th pel interpolation for uv, if the motion vectors have 1/8 pel accuracy. With the fixes the results are promising on the derf set. The enhanced interpolation option with 8-taps alone gives 3% improvement over thei derf set: http://www.corp.google.com/~debargha/vp8_results/enhinterpn.html Results on high precision mv and on the hd set are to follow. Patch 6: Adding a missing condition for CONFIG_SIXTEENTH_SUBPEL_UV in vp8/common/x86/x86_systemdependent.c Patch 7: Cleaning up various debug messages. Patch 8: Merge conflict Change-Id: I5b1d844457aefd7414a9e4e0e06c6ed38fd8cc04 2012-02-16 18:29:54 +01:00			`#if CONFIG_SIXTEENTH_SUBPEL_UV`
			`#define HALFNDX 8`
			`#else`
			`#define HALFNDX 4`
			`#endif`

Write SSSE3 sub-pixel filter function 1. Process 16 pixels at one time instead of 8. 2. Add check for both xoffset =0 and yoffset=0, which happens during motion search. This change gave encoder 1%~3% performance gain. Change-Id: Idaa39506b48f4f8b2fbbeb45aae8226fa32afb3e 2011-03-04 01:02:45 +01:00			`extern unsigned int vp8_get16x16var_sse2`
			`(`
			`const unsigned char *src_ptr,`
			`int source_stride,`
			`const unsigned char *ref_ptr,`
			`int recon_stride,`
			`unsigned int *SSE,`
			`int *Sum`
			`);`
			`extern void vp8_half_horiz_vert_variance16x_h_sse2`
			`(`
			`const unsigned char *ref_ptr,`
			`int ref_pixels_per_line,`
			`const unsigned char *src_ptr,`
			`int src_pixels_per_line,`
			`unsigned int Height,`
			`int *sum,`
			`unsigned int *sumsquared`
			`);`
			`extern void vp8_half_horiz_variance16x_h_sse2`
			`(`
			`const unsigned char *ref_ptr,`
			`int ref_pixels_per_line,`
			`const unsigned char *src_ptr,`
			`int src_pixels_per_line,`
			`unsigned int Height,`
			`int *sum,`
			`unsigned int *sumsquared`
			`);`
			`extern void vp8_half_vert_variance16x_h_sse2`
			`(`
			`const unsigned char *ref_ptr,`
			`int ref_pixels_per_line,`
			`const unsigned char *src_ptr,`
			`int src_pixels_per_line,`
			`unsigned int Height,`
			`int *sum,`
			`unsigned int *sumsquared`
			`);`
			`extern void vp8_filter_block2d_bil_var_ssse3`
			`(`
			`const unsigned char *ref_ptr,`
			`int ref_pixels_per_line,`
			`const unsigned char *src_ptr,`
			`int src_pixels_per_line,`
			`unsigned int Height,`
			`int xoffset,`
			`int yoffset,`
			`int *sum,`
			`unsigned int *sumsquared`
			`);`

			`unsigned int vp8_sub_pixel_variance16x16_ssse3`
			`(`
			`const unsigned char *src_ptr,`
			`int src_pixels_per_line,`
			`int xoffset,`
			`int yoffset,`
			`const unsigned char *dst_ptr,`
			`int dst_pixels_per_line,`
			`unsigned int *sse`
			`)`
			`{`
Add vp8_sub_pixel_variance16x8_ssse3 function Added SSSE3 function Change-Id: I8c304c92458618d93fda3a2f62bd09ccb63e75ad 2011-03-09 17:16:30 +01:00			`int xsum0;`
			`unsigned int xxsum0;`
Write SSSE3 sub-pixel filter function 1. Process 16 pixels at one time instead of 8. 2. Add check for both xoffset =0 and yoffset=0, which happens during motion search. This change gave encoder 1%~3% performance gain. Change-Id: Idaa39506b48f4f8b2fbbeb45aae8226fa32afb3e 2011-03-04 01:02:45 +01:00
			`// note we could avoid these if statements if the calling function`
			`// just called the appropriate functions inside.`
Bug fix in ssse3 variance computation. Fixes a bug that was introduced in the high precision mv patch. Change-Id: Ieadb433ebe4c3ef3e0e63944dab11528bf8bd73a 2012-02-25 05:24:54 +01:00			`if (xoffset == HALFNDX && yoffset == 0)`
Write SSSE3 sub-pixel filter function 1. Process 16 pixels at one time instead of 8. 2. Add check for both xoffset =0 and yoffset=0, which happens during motion search. This change gave encoder 1%~3% performance gain. Change-Id: Idaa39506b48f4f8b2fbbeb45aae8226fa32afb3e 2011-03-04 01:02:45 +01:00			`{`
			`vp8_half_horiz_variance16x_h_sse2(`
			`src_ptr, src_pixels_per_line,`
			`dst_ptr, dst_pixels_per_line, 16,`
			`&xsum0, &xxsum0);`
			`}`
Supporting high precision 1/8-pel motion vectors This is the initial patch for supporting 1/8th pel motion. Currently if we configure with enable-high-precision-mv, all motion vectors would default to 1/8 pel. Encode and decode syncs fine with the current code. In the next phase the code will be refactored so that we can choose the 1/8 pel mode adaptively at a frame/segment/mb level. Derf results: http://www.corp.google.com/~debargha/vp8_results/enhinterp_hpmv.html (about 0.83% better than 8-tap interpoaltion) Patch 3: Rebased. Also adding 1/16th pel interpolation for U and V Patch 4: HD results. http://www.corp.google.com/~debargha/vp8_results/enhinterp_hd_hpmv.html Seems impressive (unless I am doing something wrong). Patch 5: Added mmx/sse for bilateral filtering, as well as enforced use of c-versions of subpel filters with 8-taps and 1/16th pel; Also redesigned the 8-tap filters to reduce the cut-off in order to introduce a denoising effect. There is a new configure option sixteenth-subpel-uv which will use 1/16 th pel interpolation for uv, if the motion vectors have 1/8 pel accuracy. With the fixes the results are promising on the derf set. The enhanced interpolation option with 8-taps alone gives 3% improvement over thei derf set: http://www.corp.google.com/~debargha/vp8_results/enhinterpn.html Results on high precision mv and on the hd set are to follow. Patch 6: Adding a missing condition for CONFIG_SIXTEENTH_SUBPEL_UV in vp8/common/x86/x86_systemdependent.c Patch 7: Cleaning up various debug messages. Patch 8: Merge conflict Change-Id: I5b1d844457aefd7414a9e4e0e06c6ed38fd8cc04 2012-02-16 18:29:54 +01:00			`else if (xoffset == 0 && yoffset == HALFNDX)`
Write SSSE3 sub-pixel filter function 1. Process 16 pixels at one time instead of 8. 2. Add check for both xoffset =0 and yoffset=0, which happens during motion search. This change gave encoder 1%~3% performance gain. Change-Id: Idaa39506b48f4f8b2fbbeb45aae8226fa32afb3e 2011-03-04 01:02:45 +01:00			`{`
			`vp8_half_vert_variance16x_h_sse2(`
			`src_ptr, src_pixels_per_line,`
			`dst_ptr, dst_pixels_per_line, 16,`
			`&xsum0, &xxsum0);`
			`}`
Supporting high precision 1/8-pel motion vectors This is the initial patch for supporting 1/8th pel motion. Currently if we configure with enable-high-precision-mv, all motion vectors would default to 1/8 pel. Encode and decode syncs fine with the current code. In the next phase the code will be refactored so that we can choose the 1/8 pel mode adaptively at a frame/segment/mb level. Derf results: http://www.corp.google.com/~debargha/vp8_results/enhinterp_hpmv.html (about 0.83% better than 8-tap interpoaltion) Patch 3: Rebased. Also adding 1/16th pel interpolation for U and V Patch 4: HD results. http://www.corp.google.com/~debargha/vp8_results/enhinterp_hd_hpmv.html Seems impressive (unless I am doing something wrong). Patch 5: Added mmx/sse for bilateral filtering, as well as enforced use of c-versions of subpel filters with 8-taps and 1/16th pel; Also redesigned the 8-tap filters to reduce the cut-off in order to introduce a denoising effect. There is a new configure option sixteenth-subpel-uv which will use 1/16 th pel interpolation for uv, if the motion vectors have 1/8 pel accuracy. With the fixes the results are promising on the derf set. The enhanced interpolation option with 8-taps alone gives 3% improvement over thei derf set: http://www.corp.google.com/~debargha/vp8_results/enhinterpn.html Results on high precision mv and on the hd set are to follow. Patch 6: Adding a missing condition for CONFIG_SIXTEENTH_SUBPEL_UV in vp8/common/x86/x86_systemdependent.c Patch 7: Cleaning up various debug messages. Patch 8: Merge conflict Change-Id: I5b1d844457aefd7414a9e4e0e06c6ed38fd8cc04 2012-02-16 18:29:54 +01:00			`else if (xoffset == HALFNDX && yoffset == HALFNDX)`
Write SSSE3 sub-pixel filter function 1. Process 16 pixels at one time instead of 8. 2. Add check for both xoffset =0 and yoffset=0, which happens during motion search. This change gave encoder 1%~3% performance gain. Change-Id: Idaa39506b48f4f8b2fbbeb45aae8226fa32afb3e 2011-03-04 01:02:45 +01:00			`{`
			`vp8_half_horiz_vert_variance16x_h_sse2(`
			`src_ptr, src_pixels_per_line,`
			`dst_ptr, dst_pixels_per_line, 16,`
			`&xsum0, &xxsum0);`
			`}`
			`else`
			`{`
Improve SSE2 half-pixel filter functions Rewrote these functions to process 16 pixels at once instead of 8. Change-Id: Ic67e80124467a446a3df4cfecfb76a4248602adb 2011-03-08 22:25:06 +01:00			`vp8_filter_block2d_bil_var_ssse3(`
			`src_ptr, src_pixels_per_line,`
			`dst_ptr, dst_pixels_per_line, 16,`
			`xoffset, yoffset,`
			`&xsum0, &xxsum0);`
Write SSSE3 sub-pixel filter function 1. Process 16 pixels at one time instead of 8. 2. Add check for both xoffset =0 and yoffset=0, which happens during motion search. This change gave encoder 1%~3% performance gain. Change-Id: Idaa39506b48f4f8b2fbbeb45aae8226fa32afb3e 2011-03-04 01:02:45 +01:00			`}`

			`*sse = xxsum0;`
			`return (xxsum0 - ((xsum0 * xsum0) >> 8));`
			`}`
Add vp8_sub_pixel_variance16x8_ssse3 function Added SSSE3 function Change-Id: I8c304c92458618d93fda3a2f62bd09ccb63e75ad 2011-03-09 17:16:30 +01:00
			`unsigned int vp8_sub_pixel_variance16x8_ssse3`
			`(`
			`const unsigned char *src_ptr,`
			`int src_pixels_per_line,`
			`int xoffset,`
			`int yoffset,`
			`const unsigned char *dst_ptr,`
			`int dst_pixels_per_line,`
			`unsigned int *sse`

			`)`
			`{`
			`int xsum0;`
			`unsigned int xxsum0;`

Supporting high precision 1/8-pel motion vectors This is the initial patch for supporting 1/8th pel motion. Currently if we configure with enable-high-precision-mv, all motion vectors would default to 1/8 pel. Encode and decode syncs fine with the current code. In the next phase the code will be refactored so that we can choose the 1/8 pel mode adaptively at a frame/segment/mb level. Derf results: http://www.corp.google.com/~debargha/vp8_results/enhinterp_hpmv.html (about 0.83% better than 8-tap interpoaltion) Patch 3: Rebased. Also adding 1/16th pel interpolation for U and V Patch 4: HD results. http://www.corp.google.com/~debargha/vp8_results/enhinterp_hd_hpmv.html Seems impressive (unless I am doing something wrong). Patch 5: Added mmx/sse for bilateral filtering, as well as enforced use of c-versions of subpel filters with 8-taps and 1/16th pel; Also redesigned the 8-tap filters to reduce the cut-off in order to introduce a denoising effect. There is a new configure option sixteenth-subpel-uv which will use 1/16 th pel interpolation for uv, if the motion vectors have 1/8 pel accuracy. With the fixes the results are promising on the derf set. The enhanced interpolation option with 8-taps alone gives 3% improvement over thei derf set: http://www.corp.google.com/~debargha/vp8_results/enhinterpn.html Results on high precision mv and on the hd set are to follow. Patch 6: Adding a missing condition for CONFIG_SIXTEENTH_SUBPEL_UV in vp8/common/x86/x86_systemdependent.c Patch 7: Cleaning up various debug messages. Patch 8: Merge conflict Change-Id: I5b1d844457aefd7414a9e4e0e06c6ed38fd8cc04 2012-02-16 18:29:54 +01:00			`if (xoffset == HALFNDX && yoffset == 0)`
Add vp8_sub_pixel_variance16x8_ssse3 function Added SSSE3 function Change-Id: I8c304c92458618d93fda3a2f62bd09ccb63e75ad 2011-03-09 17:16:30 +01:00			`{`
			`vp8_half_horiz_variance16x_h_sse2(`
			`src_ptr, src_pixels_per_line,`
			`dst_ptr, dst_pixels_per_line, 8,`
			`&xsum0, &xxsum0);`
			`}`
Supporting high precision 1/8-pel motion vectors This is the initial patch for supporting 1/8th pel motion. Currently if we configure with enable-high-precision-mv, all motion vectors would default to 1/8 pel. Encode and decode syncs fine with the current code. In the next phase the code will be refactored so that we can choose the 1/8 pel mode adaptively at a frame/segment/mb level. Derf results: http://www.corp.google.com/~debargha/vp8_results/enhinterp_hpmv.html (about 0.83% better than 8-tap interpoaltion) Patch 3: Rebased. Also adding 1/16th pel interpolation for U and V Patch 4: HD results. http://www.corp.google.com/~debargha/vp8_results/enhinterp_hd_hpmv.html Seems impressive (unless I am doing something wrong). Patch 5: Added mmx/sse for bilateral filtering, as well as enforced use of c-versions of subpel filters with 8-taps and 1/16th pel; Also redesigned the 8-tap filters to reduce the cut-off in order to introduce a denoising effect. There is a new configure option sixteenth-subpel-uv which will use 1/16 th pel interpolation for uv, if the motion vectors have 1/8 pel accuracy. With the fixes the results are promising on the derf set. The enhanced interpolation option with 8-taps alone gives 3% improvement over thei derf set: http://www.corp.google.com/~debargha/vp8_results/enhinterpn.html Results on high precision mv and on the hd set are to follow. Patch 6: Adding a missing condition for CONFIG_SIXTEENTH_SUBPEL_UV in vp8/common/x86/x86_systemdependent.c Patch 7: Cleaning up various debug messages. Patch 8: Merge conflict Change-Id: I5b1d844457aefd7414a9e4e0e06c6ed38fd8cc04 2012-02-16 18:29:54 +01:00			`else if (xoffset == 0 && yoffset == HALFNDX)`
Add vp8_sub_pixel_variance16x8_ssse3 function Added SSSE3 function Change-Id: I8c304c92458618d93fda3a2f62bd09ccb63e75ad 2011-03-09 17:16:30 +01:00			`{`
			`vp8_half_vert_variance16x_h_sse2(`
			`src_ptr, src_pixels_per_line,`
			`dst_ptr, dst_pixels_per_line, 8,`
			`&xsum0, &xxsum0);`
			`}`
Supporting high precision 1/8-pel motion vectors This is the initial patch for supporting 1/8th pel motion. Currently if we configure with enable-high-precision-mv, all motion vectors would default to 1/8 pel. Encode and decode syncs fine with the current code. In the next phase the code will be refactored so that we can choose the 1/8 pel mode adaptively at a frame/segment/mb level. Derf results: http://www.corp.google.com/~debargha/vp8_results/enhinterp_hpmv.html (about 0.83% better than 8-tap interpoaltion) Patch 3: Rebased. Also adding 1/16th pel interpolation for U and V Patch 4: HD results. http://www.corp.google.com/~debargha/vp8_results/enhinterp_hd_hpmv.html Seems impressive (unless I am doing something wrong). Patch 5: Added mmx/sse for bilateral filtering, as well as enforced use of c-versions of subpel filters with 8-taps and 1/16th pel; Also redesigned the 8-tap filters to reduce the cut-off in order to introduce a denoising effect. There is a new configure option sixteenth-subpel-uv which will use 1/16 th pel interpolation for uv, if the motion vectors have 1/8 pel accuracy. With the fixes the results are promising on the derf set. The enhanced interpolation option with 8-taps alone gives 3% improvement over thei derf set: http://www.corp.google.com/~debargha/vp8_results/enhinterpn.html Results on high precision mv and on the hd set are to follow. Patch 6: Adding a missing condition for CONFIG_SIXTEENTH_SUBPEL_UV in vp8/common/x86/x86_systemdependent.c Patch 7: Cleaning up various debug messages. Patch 8: Merge conflict Change-Id: I5b1d844457aefd7414a9e4e0e06c6ed38fd8cc04 2012-02-16 18:29:54 +01:00			`else if (xoffset == HALFNDX && yoffset == HALFNDX)`
Add vp8_sub_pixel_variance16x8_ssse3 function Added SSSE3 function Change-Id: I8c304c92458618d93fda3a2f62bd09ccb63e75ad 2011-03-09 17:16:30 +01:00			`{`
			`vp8_half_horiz_vert_variance16x_h_sse2(`
			`src_ptr, src_pixels_per_line,`
			`dst_ptr, dst_pixels_per_line, 8,`
			`&xsum0, &xxsum0);`
			`}`
			`else`
			`{`
			`vp8_filter_block2d_bil_var_ssse3(`
			`src_ptr, src_pixels_per_line,`
			`dst_ptr, dst_pixels_per_line, 8,`
			`xoffset, yoffset,`
			`&xsum0, &xxsum0);`
			`}`

			`*sse = xxsum0;`
			`return (xxsum0 - ((xsum0 * xsum0) >> 7));`
			`}`