vpx/vpx_scale/generic/bicubic_scaler.c

/*
 *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license 
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may 
 *  be found in the AUTHORS file in the root of the source tree.
 */


#include <float.h>
#include <math.h>
#include <stdio.h>
#include "vpx_mem/vpx_mem.h"
#include "vpxscale_arbitrary.h"

// When defined, the integer c0_fixed()..c3_fixed() coefficient generators are
//  compiled and used; otherwise the floating point C0()..C3() versions are.
#define FIXED_POINT

// Dimension limits.  In the code visible in this file only MAX_OUT_DIMENSION
//  is referenced, to size the scratch row buffers declared below.
#define MAX_IN_WIDTH        800
#define MAX_IN_HEIGHT       600
#define MAX_OUT_WIDTH       800
#define MAX_OUT_HEIGHT      600
#define MAX_OUT_DIMENSION   ((MAX_OUT_WIDTH > MAX_OUT_HEIGHT) ? \
                             MAX_OUT_WIDTH : MAX_OUT_HEIGHT)

// Scaler state shared by every function in this file: cached dimensions,
//  reduced ratios, line maps, filter taps and scratch buffer pointers.
BICUBIC_SCALER_STRUCT g_b_scaler;
// Non-zero until bicubic_coefficient_init() has zeroed g_b_scaler.
static int g_first_time = 1;

// Scratch row that bicubic_scale() fills with the vertically filtered line.
// NOTE(review): DATA_SECTION / DATA_ALIGN look like toolchain specific pragmas
//  (section placement and 32 byte alignment) - confirm against the compiler
//  this file is built with.
#pragma DATA_SECTION(g_hbuf, "VP6_HEAP")
#pragma DATA_ALIGN (g_hbuf, 32);
unsigned char g_hbuf[MAX_OUT_DIMENSION];

// Second scratch row; bicubic_coefficient_setup() stores its address in
//  g_b_scaler.hbuf_uv.  No other use is visible in this file.
#pragma DATA_SECTION(g_hbuf_uv, "VP6_HEAP")
#pragma DATA_ALIGN (g_hbuf_uv, 32);
unsigned char g_hbuf_uv[MAX_OUT_DIMENSION];


// Shape parameter of the cubic.  The fixed point build stores the magnitude
//  0.6 scaled by 2^16 and the c*_fixed() functions apply the sign themselves;
//  the floating point build stores the signed value -0.6 directly.
#ifdef FIXED_POINT
static int a_i = 0.6 * 65536;
#else
static float a = -0.6;
#endif

#ifdef FIXED_POINT
// With A = a_i / 65536 and t a fraction scaled by 2^16:
//
//   c0_fixed(t) = -(A*t^2 - A*t^3)
//
// returned scaled by 2^12.  Every intermediate product is shifted back down
// by 16 bits and truncated to 16 bits before the next multiply.
static INLINE short c0_fixed(unsigned int t)
{
    // A*t, A*t^2 and A*t^3, each scaled by 2^16
    const unsigned short a_t1 = (a_i * t) >> 16;
    const unsigned short a_t2 = (a_t1 * t) >> 16;
    const unsigned short a_t3 = (a_t2 * t) >> 16;

    // drop four fractional bits (2^16 -> 2^12) and negate
    return -((a_t2 - a_t3) >> 4);
}

// With A = a_i / 65536 and t a fraction scaled by 2^16:
//
//   c1_fixed(t) = A*t + (3 - 2*A)*t^2 - (2 - A)*t^3
//
// returned scaled by 2^12.  The linear term is carried scaled by 2^16, the
// other two terms scaled by 2^13; every intermediate value is truncated to
// 16 bits.
static INLINE short c1_fixed(unsigned int t)
{
    unsigned short linear, quadratic, cubic;
    int pass;

    // A*t, scaled by 2^16
    linear = (a_i * t) >> 16;

    // (2 - A)*t^3, scaled by 2^13
    cubic = (2 << 13) - (a_i >> 3);

    for (pass = 0; pass < 3; pass++)
        cubic = (cubic * t) >> 16;

    // (3 - 2*A)*t^2, scaled by 2^13
    quadratic = (3 << 13) - (2 * (a_i >> 3));

    for (pass = 0; pass < 2; pass++)
        quadratic = (quadratic * t) >> 16;

    // bring the linear term to 2^13, combine, then halve to reach 2^12
    return (((linear >> 3) - cubic + quadratic) >> 1);
}

// With A = a_i / 65536 and t a fraction scaled by 2^16:
//
//   c2_fixed(t) = 1 - (3 - A)*t^2 + (2 - A)*t^3
//
// returned scaled by 2^12.  All three terms are carried scaled by 2^13 and
// truncated to 16 bits after every multiply.
static INLINE short c2_fixed(unsigned int t)
{
    const unsigned short one = 1 << 13;
    unsigned short quadratic = (3 << 13) - (a_i >> 3);
    unsigned short cubic = (2 << 13) - (a_i >> 3);

    // (3 - A)*t^2
    quadratic = (quadratic * t) >> 16;
    quadratic = (quadratic * t) >> 16;

    // (2 - A)*t^3
    cubic = (cubic * t) >> 16;
    cubic = (cubic * t) >> 16;
    cubic = (cubic * t) >> 16;

    // halve to go from 2^13 to 2^12 scaling
    return (one - quadratic + cubic) >> 1;
}

// With A = a_i / 65536 and t a fraction scaled by 2^16:
//
//   c3_fixed(t) = 2*A*t^2 - A*t - A*t^3
//
// returned scaled by 2^12.
static INLINE short c3_fixed(unsigned int t)
{
    // A*t, scaled by 2^16
    const int linear = (a_i * t) >> 16;
    // 2*A, scaled by 2^15
    int quadratic = 2 * (a_i >> 1);
    // starts as A*t (2^16 scaling) and is multiplied by t twice more below
    int cubic = linear;

    quadratic = (quadratic * t) >> 16;
    quadratic = (quadratic * t) >> 16;

    cubic = (cubic * t) >> 16;
    cubic = (cubic * t) >> 16;

    // halve the 2^16 terms to 2^15, combine, then drop 3 bits to reach 2^12
    return ((quadratic - (linear >> 1) - (cubic >> 1)) >> 3);
}
#else
// Floating point counterpart of c0_fixed(), compiled only when FIXED_POINT is
// not defined.  Evaluates
//
//   C0(t) = -a*t^3 + a*t^2
//
// using the file-level constant a.  The result is unscaled; the floating
// point setup path multiplies it by 4096.0 before storing it.
float C0(float t)
{
    return -a * t * t * t + a * t * t;
}

// Floating point counterpart of c1_fixed(), compiled only when FIXED_POINT is
// not defined.  Evaluates
//
//   C1(t) = -a*t + (2*a + 3)*t^2 - (a + 2)*t^3
//
// using the file-level constant a.  The result is unscaled.
float C1(float t)
{
    return -(a + 2.0f) * t * t * t + (2.0f * a + 3.0f) * t * t - a * t;
}

// Floating point counterpart of c2_fixed(), compiled only when FIXED_POINT is
// not defined.  Evaluates
//
//   C2(t) = 1 - (a + 3)*t^2 + (a + 2)*t^3
//
// using the file-level constant a.  C2(0) is exactly 1.
float C2(float t)
{
    return (a + 2.0f) * t * t * t - (a + 3.0f) * t * t + 1.0f;
}

// Floating point counterpart of c3_fixed(), compiled only when FIXED_POINT is
// not defined.  Evaluates
//
//   C3(t) = a*t - 2*a*t^2 + a*t^3
//
// using the file-level constant a.  The result is unscaled.
float C3(float t)
{
    return a * t * t * t - 2.0f * a * t * t + a * t;
}
#endif

#if 0
// Disabled (#if 0) self check that compares the fixed point coefficient
//  generators against the floating point ones at 10000 evenly spaced phases
//  and returns the number of mismatches found.
// NOTE(review): as written it needs c0_fixed()..c3_fixed() and C0()..C3() at
//  the same time, but those are defined in opposite branches of
//  #ifdef FIXED_POINT, so enabling this block alone will not build.
int compare_real_fixed()
{
    int i, errors = 0;
    float mult = 1.0 / 10000.0;
    // step between phases, scaled by 2^32
    unsigned int fixed_mult = mult * 4294967296;//65536;
    unsigned int phase_offset_int;
    float phase_offset_real;

    for (i = 0; i < 10000; i++)
    {
        int fixed0, fixed1, fixed2, fixed3, fixed_total;
        int real0, real1, real2, real3, real_total;

        // same phase expressed as a float and as a 2^16 scaled integer
        phase_offset_real = (float)i * mult;
        phase_offset_int = (fixed_mult * i) >> 16;
//      phase_offset_int = phase_offset_real * 65536;

        // each pair must agree in magnitude to within +/-1 at 2^12 scaling
        fixed0 = c0_fixed(phase_offset_int);
        real0 = C0(phase_offset_real) * 4096.0;

        if ((abs(fixed0) > (abs(real0) + 1)) || (abs(fixed0) < (abs(real0) - 1)))
            errors++;

        fixed1 = c1_fixed(phase_offset_int);
        real1 = C1(phase_offset_real) * 4096.0;

        if ((abs(fixed1) > (abs(real1) + 1)) || (abs(fixed1) < (abs(real1) - 1)))
            errors++;

        fixed2 = c2_fixed(phase_offset_int);
        real2 = C2(phase_offset_real) * 4096.0;

        if ((abs(fixed2) > (abs(real2) + 1)) || (abs(fixed2) < (abs(real2) - 1)))
            errors++;

        fixed3 = c3_fixed(phase_offset_int);
        real3 = C3(phase_offset_real) * 4096.0;

        if ((abs(fixed3) > (abs(real3) + 1)) || (abs(fixed3) < (abs(real3) - 1)))
            errors++;

        // the four taps of one phase are expected to sum to about 4096
        fixed_total = fixed0 + fixed1 + fixed2 + fixed3;
        real_total = real0 + real1 + real2 + real3;

        if ((fixed_total > 4097) || (fixed_total < 4094))
            errors ++;

        if ((real_total > 4097) || (real_total < 4095))
            errors ++;
    }

    return errors;
}
#endif

// Greatest common divisor of two positive integers, computed by repeated
//  subtraction so that no division or modulo operation is needed.
//
// If either argument is zero or negative the subtraction loop could not
//  terminate, so 1 is returned instead.  1 rather than 0 is used because
//  callers divide by the result to reduce a ratio.
int gcd(int a, int b)
{
    if (a <= 0 || b <= 0)
        return 1;

    // keep taking the smaller value away from the larger one until both
    //  have collapsed onto the common divisor
    while (a != b)
    {
        if (a > b)
            a -= b;
        else
            b -= a;
    }

    return a;
}

// Put the global scaler state into its empty condition: every field of
//  g_b_scaler is zeroed and the first-use flag is cleared so that
//  bicubic_coefficient_setup() does not initialise again.
void bicubic_coefficient_init()
{
    g_first_time = 0;
    vpx_memset(&g_b_scaler, 0, sizeof(g_b_scaler));
}

// Release the line map and filter tap tables owned by g_b_scaler and zero
//  the whole structure.  Does nothing if the scaler state was never
//  initialised.
void bicubic_coefficient_destroy()
{
    // g_b_scaler has not been zeroed yet, so its pointers mean nothing
    if (g_first_time)
        return;

    // line maps
    if (g_b_scaler.l_w)
        vpx_free(g_b_scaler.l_w);

    if (g_b_scaler.l_h)
        vpx_free(g_b_scaler.l_h);

    if (g_b_scaler.l_h_uv)
        vpx_free(g_b_scaler.l_h_uv);

    // filter taps
    if (g_b_scaler.c_w)
        vpx_free(g_b_scaler.c_w);

    if (g_b_scaler.c_h)
        vpx_free(g_b_scaler.c_h);

    if (g_b_scaler.c_h_uv)
        vpx_free(g_b_scaler.c_h_uv);

    // clears the freed pointers together with the cached dimensions
    vpx_memset(&g_b_scaler, 0, sizeof(BICUBIC_SCALER_STRUCT));
}

#ifdef FIXED_POINT
// Fill taps[0 .. 4*n - 1] with the four filter taps (scaled by 2^12) of each
//  of the n phases of a scaling ratio whose reduced numerator is n and whose
//  reduced denominator is d.  Phase i uses the offset (i * d) / n, kept as a
//  fraction scaled by 2^16.  When reverse is non-zero the taps of a phase are
//  stored in the order c3, c2, c1, c0, otherwise in the order c0, c1, c2, c3.
static void bicubic_fill_taps_fixed(short *taps, int n, int d, int reverse)
{
    unsigned int fixed_mult = 0;
    int product_val = 0;
    int i;

    // 1/n scaled by 2^32.  For n == 1 that value is 2^32 itself, which does
    //  not fit in an unsigned int, and converting an out of range floating
    //  point value to an integer is undefined.  With n == 1 the only phase
    //  visited has product_val == 0, so the multiplier can stay 0.
    if (n > 1)
        fixed_mult = (unsigned int)((1.0 / (float)n) * 4294967296);

    for (i = 0; i < n; i++)
    {
        unsigned int phase_offset;

        if (product_val > n)
            product_val -= n;

        // unsigned arithmetic: the product is reduced modulo 2^32 before
        //  the top 16 bits are taken as the phase
        phase_offset = (fixed_mult * product_val) >> 16;

        if (reverse)
        {
            taps[i*4]   = c3_fixed(phase_offset);
            taps[i*4+1] = c2_fixed(phase_offset);
            taps[i*4+2] = c1_fixed(phase_offset);
            taps[i*4+3] = c0_fixed(phase_offset);
        }
        else
        {
            taps[i*4]   = c0_fixed(phase_offset);
            taps[i*4+1] = c1_fixed(phase_offset);
            taps[i*4+2] = c2_fixed(phase_offset);
            taps[i*4+3] = c3_fixed(phase_offset);
        }

        product_val += d;
    }
}
#endif

// Create the coefficients that will be used for the cubic interpolation.
//  Because scaling does not have to be equal in the vertical and horizontal
//  directions the phase offsets will be different.  There are 4 coefficients
//  for each point, two on each side.  The layout is that there are the
//  4 coefficients for each phase in the array and then the next phase.
//
// The tables are stored in g_b_scaler:
//   l_w, l_h, l_h_uv  - for every output column/row, the input column/row
//                       it is taken from
//   c_w, c_h, c_h_uv  - 4 taps (scaled by 2^12) per phase
//   nw, nh, nh_uv     - number of phases in each direction
//
// Returns 0 when the tables are ready (including when the requested
//  dimensions match the ones already cached) and -1 when a dimension is not
//  positive, when in_width does not fit in the scratch row buffer, or when a
//  table could not be allocated.
//
// NOTE(review): as in the original code the requested dimensions are cached
//  before they are validated, so repeating a call with the same invalid
//  dimensions returns 0.  Only an allocation failure clears the cache.
int bicubic_coefficient_setup(int in_width, int in_height, int out_width, int out_height)
{
    int i;
#ifndef FIXED_POINT
    float phase_offset;
#endif
    int gcd_w, gcd_h, gcd_h_uv, d_w, d_h, d_h_uv;

    if (g_first_time)
        bicubic_coefficient_init();


    // check to see if the coefficients have already been set up correctly
    if ((in_width == g_b_scaler.in_width) && (in_height == g_b_scaler.in_height)
        && (out_width == g_b_scaler.out_width) && (out_height == g_b_scaler.out_height))
        return 0;

    g_b_scaler.in_width = in_width;
    g_b_scaler.in_height = in_height;
    g_b_scaler.out_width = out_width;
    g_b_scaler.out_height = out_height;

    // Don't want to allow crazy scaling, just try and prevent a catastrophic
    //  failure here.  Non-positive input sizes would produce negative line
    //  map entries, so they are rejected together with the output sizes.
    if (in_width <= 0 || in_height <= 0 || out_width <= 0 || out_height <= 0)
        return -1;

    // bicubic_scale() writes one vertically filtered input row of in_width
    //  bytes into the scratch buffer, so the row has to fit.
    if (in_width > (int)sizeof(g_hbuf))
        return -1;

    // reduce in/out width and height ratios using the gcd
    gcd_w = gcd(out_width, in_width);
    gcd_h = gcd(out_height, in_height);
    gcd_h_uv = gcd(out_height, in_height / 2);

    // the numerator width and height are to be saved in
    //  globals so they can be used during the scaling process
    //  without having to be recalculated.
    g_b_scaler.nw = out_width / gcd_w;
    d_w = in_width / gcd_w;

    g_b_scaler.nh = out_height / gcd_h;
    d_h = in_height / gcd_h;

    g_b_scaler.nh_uv = out_height / gcd_h_uv;
    d_h_uv = (in_height / 2) / gcd_h_uv;

    // release the tables of the previous configuration
    if (g_b_scaler.l_w) vpx_free(g_b_scaler.l_w);

    if (g_b_scaler.l_h) vpx_free(g_b_scaler.l_h);

    if (g_b_scaler.l_h_uv) vpx_free(g_b_scaler.l_h_uv);

    if (g_b_scaler.c_w) vpx_free(g_b_scaler.c_w);

    if (g_b_scaler.c_h) vpx_free(g_b_scaler.c_h);

    if (g_b_scaler.c_h_uv) vpx_free(g_b_scaler.c_h_uv);

    // allocate memory for the line maps and coefficients.  The row maps get
    //  out_height + 1 entries because the loop at the end of this function
    //  fills indices 0 .. out_height inclusive.
    g_b_scaler.l_w = (short *)vpx_memalign(32, out_width * sizeof(short));
    g_b_scaler.l_h = (short *)vpx_memalign(32, (out_height + 1) * sizeof(short));
    g_b_scaler.l_h_uv = (short *)vpx_memalign(32, (out_height + 1) * sizeof(short));

    g_b_scaler.c_w = (short *)vpx_memalign(32, g_b_scaler.nw * 4 * sizeof(short));
    g_b_scaler.c_h = (short *)vpx_memalign(32, g_b_scaler.nh * 4 * sizeof(short));
    g_b_scaler.c_h_uv = (short *)vpx_memalign(32, g_b_scaler.nh_uv * 4 * sizeof(short));

    if (!g_b_scaler.l_w || !g_b_scaler.l_h || !g_b_scaler.l_h_uv
        || !g_b_scaler.c_w || !g_b_scaler.c_h || !g_b_scaler.c_h_uv)
    {
        // do not leave a partial set of tables behind
        if (g_b_scaler.l_w) vpx_free(g_b_scaler.l_w);

        if (g_b_scaler.l_h) vpx_free(g_b_scaler.l_h);

        if (g_b_scaler.l_h_uv) vpx_free(g_b_scaler.l_h_uv);

        if (g_b_scaler.c_w) vpx_free(g_b_scaler.c_w);

        if (g_b_scaler.c_h) vpx_free(g_b_scaler.c_h);

        if (g_b_scaler.c_h_uv) vpx_free(g_b_scaler.c_h_uv);

        g_b_scaler.l_w = NULL;
        g_b_scaler.l_h = NULL;
        g_b_scaler.l_h_uv = NULL;
        g_b_scaler.c_w = NULL;
        g_b_scaler.c_h = NULL;
        g_b_scaler.c_h_uv = NULL;

        // forget the cached dimensions so that the next call tries again
        //  instead of reporting the tables as already set up
        g_b_scaler.in_width = 0;
        g_b_scaler.in_height = 0;
        g_b_scaler.out_width = 0;
        g_b_scaler.out_height = 0;

        return -1;
    }

    g_b_scaler.hbuf = g_hbuf;
    g_b_scaler.hbuf_uv = g_hbuf_uv;

    // Set up polyphase filter taps.  This needs to be done before
    //  the scaling because of the floating point math required.  The
    //  coefficients are multiplied by 2^12 so that fixed point math
    //  can be used in the main scaling loop.
#ifdef FIXED_POINT
    // horizontal taps are stored c3..c0, vertical taps c0..c3; this matches
    //  the order in which bicubic_scale() applies them
    bicubic_fill_taps_fixed(g_b_scaler.c_w, g_b_scaler.nw, d_w, 1);
    bicubic_fill_taps_fixed(g_b_scaler.c_h, g_b_scaler.nh, d_h, 0);
    bicubic_fill_taps_fixed(g_b_scaler.c_h_uv, g_b_scaler.nh_uv, d_h_uv, 0);
#else

    // The tables live in g_b_scaler, which is what bicubic_scale() reads.
    for (i = 0; i < g_b_scaler.nw; i++)
    {
        phase_offset = (float)((i * d_w) % g_b_scaler.nw) / (float)g_b_scaler.nw;
        g_b_scaler.c_w[i*4]   = (short)(C3(phase_offset) * 4096.0);
        g_b_scaler.c_w[i*4+1] = (short)(C2(phase_offset) * 4096.0);
        g_b_scaler.c_w[i*4+2] = (short)(C1(phase_offset) * 4096.0);
        g_b_scaler.c_w[i*4+3] = (short)(C0(phase_offset) * 4096.0);
    }

    for (i = 0; i < g_b_scaler.nh; i++)
    {
        phase_offset = (float)((i * d_h) % g_b_scaler.nh) / (float)g_b_scaler.nh;
        g_b_scaler.c_h[i*4]   = (short)(C0(phase_offset) * 4096.0);
        g_b_scaler.c_h[i*4+1] = (short)(C1(phase_offset) * 4096.0);
        g_b_scaler.c_h[i*4+2] = (short)(C2(phase_offset) * 4096.0);
        g_b_scaler.c_h[i*4+3] = (short)(C3(phase_offset) * 4096.0);
    }

    for (i = 0; i < g_b_scaler.nh_uv; i++)
    {
        phase_offset = (float)((i * d_h_uv) % g_b_scaler.nh_uv) / (float)g_b_scaler.nh_uv;
        g_b_scaler.c_h_uv[i*4]   = (short)(C0(phase_offset) * 4096.0);
        g_b_scaler.c_h_uv[i*4+1] = (short)(C1(phase_offset) * 4096.0);
        g_b_scaler.c_h_uv[i*4+2] = (short)(C2(phase_offset) * 4096.0);
        g_b_scaler.c_h_uv[i*4+3] = (short)(C3(phase_offset) * 4096.0);
    }

#endif

    // Create an array that corresponds input lines to output lines.
    //  This doesn't require floating point math, but it does require
    //  a division and because hardware division is not present that
    //  is a call.
    g_b_scaler.max_usable_out_width = 0;

    for (i = 0; i < out_width; i++)
    {
        g_b_scaler.l_w[i] = (i * d_w) / g_b_scaler.nw;

        // remember the last output column whose source column still has
        //  at least one more input column after it
        if ((g_b_scaler.l_w[i] + 2) <= in_width)
            g_b_scaler.max_usable_out_width = i;

    }

    for (i = 0; i < out_height + 1; i++)
    {
        g_b_scaler.l_h[i] = (i * d_h) / g_b_scaler.nh;
        g_b_scaler.l_h_uv[i] = (i * d_h_uv) / g_b_scaler.nh_uv;
    }

    return 0;
}

// Limit a filtered sample to the range an unsigned char pixel can hold.
//  The cubic filter has negative taps, so the weighted sum can fall below 0
//  or rise above 255; storing such a value unclamped would wrap around.
static int bicubic_clamp_pixel(int value)
{
    if (value < 0)
        return 0;

    if (value > 255)
        return 255;

    return value;
}

// Scale one 8 bit plane from in_width x in_height (rows in_stride bytes
//  apart, starting at input_image) to out_width x out_height (rows
//  out_stride bytes apart, starting at output_image).
//
// The line maps, filter taps and phase counts are read from g_b_scaler, so
//  bicubic_coefficient_setup() must have built them beforehand.  Every
//  output row is produced in two steps: the input rows around the source
//  row are filtered vertically into the scratch row g_b_scaler.hbuf, then
//  that row is filtered horizontally into the output.  Rows and columns on
//  the image border are copied from the nearest source sample instead of
//  being filtered, so no sample outside the plane is read.
//
// Returns 0 on success and -1 if the tables in g_b_scaler are missing.
//
// NOTE(review): in_width must not exceed the capacity of the scratch row
//  that g_b_scaler.hbuf points to; that capacity is not visible from here.
int bicubic_scale(int in_width, int in_height, int in_stride,
                  int out_width, int out_height, int out_stride,
                  unsigned char *input_image, unsigned char *output_image)
{
    short *RESTRICT l_w, * RESTRICT l_h;
    short *RESTRICT c_w, * RESTRICT c_h;
    unsigned char *RESTRICT ip, * RESTRICT op;
    unsigned char *RESTRICT hbuf;
    int h, w, lw, lh;
    int temp_sum;
    int phase_offset_w, phase_offset_h;

    c_w = g_b_scaler.c_w;
    c_h = g_b_scaler.c_h;

    op = output_image;

    l_w = g_b_scaler.l_w;
    l_h = g_b_scaler.l_h;

    // the tables only exist after a successful coefficient setup
    if (!c_w || !c_h || !l_w || !l_h)
        return -1;

    phase_offset_h = 0;

    for (h = 0; h < out_height; h++)
    {
        // select the row to work on
        lh = l_h[h];
        ip = input_image + (in_stride * lh);

        // filter the row vertically into a temporary buffer.
        //  If the phase offset == 0 then all the multiplication
        //  is going to result in the output equalling the input.
        //  So instead point the temporary buffer to the input.
        //  The filter reads rows lh - 1 .. lh + 2, so the first row and
        //  the last two rows are boundary cases that are passed through
        //  unfiltered as well.
        if (phase_offset_h && (lh > 0) && (lh < in_height - 2))
        {
            hbuf = g_b_scaler.hbuf;

            for (w = 0; w < in_width; w++)
            {
                temp_sum =  c_h[phase_offset_h*4+3] * ip[w - in_stride];
                temp_sum += c_h[phase_offset_h*4+2] * ip[w];
                temp_sum += c_h[phase_offset_h*4+1] * ip[w + in_stride];
                temp_sum += c_h[phase_offset_h*4]   * ip[w + 2*in_stride];

                hbuf[w] = bicubic_clamp_pixel(temp_sum >> 12);
            }
        }
        else
            hbuf = ip;

        // increase the phase offset for the next time around.
        if (++phase_offset_h >= g_b_scaler.nh)
            phase_offset_h = 0;

        // now filter and expand it horizontally into the final
        //  output buffer
        phase_offset_w = 0;

        for (w = 0; w < out_width; w++)
        {
            // get the index to use to expand the image
            lw = l_w[w];

            // boundary conditions are decided before anything is read, so
            //  that hbuf[lw - 1] and hbuf[lw + 2] are only touched when
            //  they lie inside the row
            if (lw == 0)
                temp_sum = hbuf[0];
            else if ((lw + 2) >= in_width)
                temp_sum = hbuf[lw];
            else
            {
                temp_sum =  c_w[phase_offset_w*4]   * hbuf[lw - 1];
                temp_sum += c_w[phase_offset_w*4+1] * hbuf[lw];
                temp_sum += c_w[phase_offset_w*4+2] * hbuf[lw + 1];
                temp_sum += c_w[phase_offset_w*4+3] * hbuf[lw + 2];
                temp_sum = bicubic_clamp_pixel(temp_sum >> 12);
            }

            if (++phase_offset_w >= g_b_scaler.nw)
                phase_offset_w = 0;

            op[w] = temp_sum;
        }

        op += out_stride;
    }

    return 0;
}

// Clear the cached output dimensions.  bicubic_coefficient_setup() compares
//  the requested dimensions against the cached ones, so after this call a
//  request with positive output dimensions no longer matches the cache and
//  the tables are rebuilt.
void bicubic_scale_frame_reset()
{
    g_b_scaler.out_height = 0;
    g_b_scaler.out_width = 0;
}

// Scale the three planes of src into dst at new_width x new_height.
//
// dst's luma size is set to the new size, its chroma size to half of it
//  (integer division) and its strides to the respective widths; the Y, U
//  and V planes are then scaled one after another with bicubic_scale().
//  This function does not call bicubic_coefficient_setup() itself, so the
//  tables in g_b_scaler are used as they are at the time of the call.
void bicubic_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
                         int new_width, int new_height)
{
    const int chroma_width = new_width / 2;
    const int chroma_height = new_height / 2;

    // describe the destination frame
    dst->y_width = new_width;
    dst->y_height = new_height;
    dst->uv_width = chroma_width;
    dst->uv_height = chroma_height;

    // output rows are packed with no padding between them
    dst->y_stride = dst->y_width;
    dst->uv_stride = dst->uv_width;

    // luma
    bicubic_scale(src->y_width, src->y_height, src->y_stride,
                  new_width, new_height, dst->y_stride,
                  src->y_buffer, dst->y_buffer);

    // chroma U
    bicubic_scale(src->uv_width, src->uv_height, src->uv_stride,
                  chroma_width, chroma_height, dst->uv_stride,
                  src->u_buffer, dst->u_buffer);

    // chroma V
    bicubic_scale(src->uv_width, src->uv_height, src->uv_stride,
                  chroma_width, chroma_height, dst->uv_stride,
                  src->v_buffer, dst->v_buffer);
}
Initial WebM release 2010-05-18 17:58:33 +02:00			`/*`
			`* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.`
			`*`
LICENSE: update with latest text Change-Id: Ieebea089095d9073b3a94932791099f614ce120c 2010-06-04 22:19:40 +02:00			`* Use of this source code is governed by a BSD-style license`
			`* that can be found in the LICENSE file in the root of the source`
			`* tree. An additional intellectual property rights grant can be found`
			`* in the file PATENTS. All contributing project authors may`
			`* be found in the AUTHORS file in the root of the source tree.`
Initial WebM release 2010-05-18 17:58:33 +02:00			`*/`


			`#include <float.h>`
			`#include <math.h>`
			`#include <stdio.h>`
			`#include "vpx_mem/vpx_mem.h"`
			`#include "vpxscale_arbitrary.h"`

			`#define FIXED_POINT`

			`#define MAX_IN_WIDTH 800`
			`#define MAX_IN_HEIGHT 600`
			`#define MAX_OUT_WIDTH 800`
			`#define MAX_OUT_HEIGHT 600`
			`#define MAX_OUT_DIMENSION ((MAX_OUT_WIDTH > MAX_OUT_HEIGHT) ? \`
			`MAX_OUT_WIDTH : MAX_OUT_HEIGHT)`

			`BICUBIC_SCALER_STRUCT g_b_scaler;`
			`static int g_first_time = 1;`

			`#pragma DATA_SECTION(g_hbuf, "VP6_HEAP")`
			`#pragma DATA_ALIGN (g_hbuf, 32);`
			`unsigned char g_hbuf[MAX_OUT_DIMENSION];`

			`#pragma DATA_SECTION(g_hbuf_uv, "VP6_HEAP")`
			`#pragma DATA_ALIGN (g_hbuf_uv, 32);`
			`unsigned char g_hbuf_uv[MAX_OUT_DIMENSION];`


			`#ifdef FIXED_POINT`
			`static int a_i = 0.6 * 65536;`
			`#else`
			`static float a = -0.6;`
			`#endif`

			`#ifdef FIXED_POINT`
			`// 3 2`
			`// C0 = at - at`
			`//`
			`static INLINE short c0_fixed(unsigned int t)`
			`{`
			`// put t in Q16 notation`
			`unsigned short v1, v2;`

			`// Q16`
			`v1 = (a_i * t) >> 16;`
			`v1 = (v1 * t) >> 16;`

			`// Q16`
			`v2 = (a_i * t) >> 16;`
			`v2 = (v2 * t) >> 16;`
			`v2 = (v2 * t) >> 16;`

			`// Q12`
			`return -((v1 - v2) >> 4);`
			`}`

			`// 2 3`
			`// C1 = at + (3-2a)t - (2-a)t`
			`//`
			`static INLINE short c1_fixed(unsigned int t)`
			`{`
			`unsigned short v1, v2, v3;`
			`unsigned short two, three;`

			`// Q16`
			`v1 = (a_i * t) >> 16;`

			`// Q13`
			`two = 2 << 13;`
			`v2 = two - (a_i >> 3);`
			`v2 = (v2 * t) >> 16;`
			`v2 = (v2 * t) >> 16;`
			`v2 = (v2 * t) >> 16;`

			`// Q13`
			`three = 3 << 13;`
			`v3 = three - (2 * (a_i >> 3));`
			`v3 = (v3 * t) >> 16;`
			`v3 = (v3 * t) >> 16;`

			`// Q12`
			`return (((v1 >> 3) - v2 + v3) >> 1);`

			`}`

			`// 2 3`
			`// C2 = 1 - (3-a)t + (2-a)t`
			`//`
			`static INLINE short c2_fixed(unsigned int t)`
			`{`
			`unsigned short v1, v2, v3;`
			`unsigned short two, three;`

			`// Q13`
			`v1 = 1 << 13;`

			`// Q13`
			`three = 3 << 13;`
			`v2 = three - (a_i >> 3);`
			`v2 = (v2 * t) >> 16;`
			`v2 = (v2 * t) >> 16;`

			`// Q13`
			`two = 2 << 13;`
			`v3 = two - (a_i >> 3);`
			`v3 = (v3 * t) >> 16;`
			`v3 = (v3 * t) >> 16;`
			`v3 = (v3 * t) >> 16;`

			`// Q12`
			`return (v1 - v2 + v3) >> 1;`
			`}`

			`// 2 3`
			`// C3 = at - 2at + at`
			`//`
			`static INLINE short c3_fixed(unsigned int t)`
			`{`
			`int v1, v2, v3;`

			`// Q16`
			`v1 = (a_i * t) >> 16;`

			`// Q15`
			`v2 = 2 * (a_i >> 1);`
			`v2 = (v2 * t) >> 16;`
			`v2 = (v2 * t) >> 16;`

			`// Q16`
			`v3 = (a_i * t) >> 16;`
			`v3 = (v3 * t) >> 16;`
			`v3 = (v3 * t) >> 16;`

			`// Q12`
			`return ((v2 - (v1 >> 1) - (v3 >> 1)) >> 3);`
			`}`
			`#else`
			`// 3 2`
			`// C0 = -at + at`
			`//`
			`float C0(float t)`
			`{`
			`return -a * t * t * t + a * t * t;`
			`}`

			`// 2 3`
			`// C1 = -at + (2a+3)t - (a+2)t`
			`//`
			`float C1(float t)`
			`{`
			`return -(a + 2.0f) * t * t * t + (2.0f * a + 3.0f) * t * t - a * t;`
			`}`

			`// 2 3`
			`// C2 = 1 - (a+3)t + (a+2)t`
			`//`
			`float C2(float t)`
			`{`
			`return (a + 2.0f) * t * t * t - (a + 3.0f) * t * t + 1.0f;`
			`}`

			`// 2 3`
			`// C3 = at - 2at + at`
			`//`
			`float C3(float t)`
			`{`
			`return a * t * t * t - 2.0f * a * t * t + a * t;`
			`}`
			`#endif`

			`#if 0`
			`int compare_real_fixed()`
			`{`
			`int i, errors = 0;`
			`float mult = 1.0 / 10000.0;`
			`unsigned int fixed_mult = mult * 4294967296;//65536;`
			`unsigned int phase_offset_int;`
			`float phase_offset_real;`

			`for (i = 0; i < 10000; i++)`
			`{`
			`int fixed0, fixed1, fixed2, fixed3, fixed_total;`
			`int real0, real1, real2, real3, real_total;`

			`phase_offset_real = (float)i * mult;`
			`phase_offset_int = (fixed_mult * i) >> 16;`
			`// phase_offset_int = phase_offset_real * 65536;`

			`fixed0 = c0_fixed(phase_offset_int);`
			`real0 = C0(phase_offset_real) * 4096.0;`

			`if ((abs(fixed0) > (abs(real0) + 1)) \|\| (abs(fixed0) < (abs(real0) - 1)))`
			`errors++;`

			`fixed1 = c1_fixed(phase_offset_int);`
			`real1 = C1(phase_offset_real) * 4096.0;`

			`if ((abs(fixed1) > (abs(real1) + 1)) \|\| (abs(fixed1) < (abs(real1) - 1)))`
			`errors++;`

			`fixed2 = c2_fixed(phase_offset_int);`
			`real2 = C2(phase_offset_real) * 4096.0;`

			`if ((abs(fixed2) > (abs(real2) + 1)) \|\| (abs(fixed2) < (abs(real2) - 1)))`
			`errors++;`

			`fixed3 = c3_fixed(phase_offset_int);`
			`real3 = C3(phase_offset_real) * 4096.0;`

			`if ((abs(fixed3) > (abs(real3) + 1)) \|\| (abs(fixed3) < (abs(real3) - 1)))`
			`errors++;`

			`fixed_total = fixed0 + fixed1 + fixed2 + fixed3;`
			`real_total = real0 + real1 + real2 + real3;`

			`if ((fixed_total > 4097) \|\| (fixed_total < 4094))`
			`errors ++;`

			`if ((real_total > 4097) \|\| (real_total < 4095))`
			`errors ++;`
			`}`

			`return errors;`
			`}`
			`#endif`

			`// Find greatest common denominator between two integers. Method used here is`
			`// slow compared to Euclid's algorithm, but does not require any division.`
			`int gcd(int a, int b)`
			`{`
			`// Problem with this algorithm is that if a or b = 0 this function`
			`// will never exit. Don't want to return 0 because any computation`
			`// that was based on a common denoninator and tried to reduce by`
			`// dividing by 0 would fail. Best solution that could be thought of`
			`// would to be fail by returing a 1;`
			`if (a <= 0 \|\| b <= 0)`
			`return 1;`

			`while (a != b)`
			`{`
			`if (b > a)`
			`b = b - a;`
			`else`
			`{`
			`int tmp = a;//swap large and`
			`a = b; //small`
			`b = tmp;`
			`}`
			`}`

			`return b;`
			`}`

			`void bicubic_coefficient_init()`
			`{`
			`vpx_memset(&g_b_scaler, 0, sizeof(BICUBIC_SCALER_STRUCT));`
			`g_first_time = 0;`
			`}`

			`void bicubic_coefficient_destroy()`
			`{`
			`if (!g_first_time)`
			`{`
			`if (g_b_scaler.l_w) vpx_free(g_b_scaler.l_w);`

			`if (g_b_scaler.l_h) vpx_free(g_b_scaler.l_h);`

			`if (g_b_scaler.l_h_uv) vpx_free(g_b_scaler.l_h_uv);`

			`if (g_b_scaler.c_w) vpx_free(g_b_scaler.c_w);`

			`if (g_b_scaler.c_h) vpx_free(g_b_scaler.c_h);`

			`if (g_b_scaler.c_h_uv) vpx_free(g_b_scaler.c_h_uv);`

			`vpx_memset(&g_b_scaler, 0, sizeof(BICUBIC_SCALER_STRUCT));`
			`}`
			`}`

			`// Create the coeffients that will be used for the cubic interpolation.`
			`// Because scaling does not have to be equal in the vertical and horizontal`
			`// regimes the phase offsets will be different. There are 4 coefficents`
			`// for each point, two on each side. The layout is that there are the`
			`// 4 coefficents for each phase in the array and then the next phase.`
			`int bicubic_coefficient_setup(int in_width, int in_height, int out_width, int out_height)`
			`{`
			`int i;`
			`#ifdef FIXED_POINT`
			`int phase_offset_int;`
			`unsigned int fixed_mult;`
			`int product_val = 0;`
			`#else`
			`float phase_offset;`
			`#endif`
			`int gcd_w, gcd_h, gcd_h_uv, d_w, d_h, d_h_uv;`

			`if (g_first_time)`
			`bicubic_coefficient_init();`


			`// check to see if the coefficents have already been set up correctly`
			`if ((in_width == g_b_scaler.in_width) && (in_height == g_b_scaler.in_height)`
			`&& (out_width == g_b_scaler.out_width) && (out_height == g_b_scaler.out_height))`
			`return 0;`

			`g_b_scaler.in_width = in_width;`
			`g_b_scaler.in_height = in_height;`
			`g_b_scaler.out_width = out_width;`
			`g_b_scaler.out_height = out_height;`

			`// Don't want to allow crazy scaling, just try and prevent a catastrophic`
			`// failure here. Want to fail after setting the member functions so if`
			`// if the scaler is called the member functions will not scale.`
			`if (out_width <= 0 \|\| out_height <= 0)`
			`return -1;`

			`// reduce in/out width and height ratios using the gcd`
			`gcd_w = gcd(out_width, in_width);`
			`gcd_h = gcd(out_height, in_height);`
			`gcd_h_uv = gcd(out_height, in_height / 2);`

			`// the numerator width and height are to be saved in`
			`// globals so they can be used during the scaling process`
			`// without having to be recalculated.`
			`g_b_scaler.nw = out_width / gcd_w;`
			`d_w = in_width / gcd_w;`

			`g_b_scaler.nh = out_height / gcd_h;`
			`d_h = in_height / gcd_h;`

			`g_b_scaler.nh_uv = out_height / gcd_h_uv;`
			`d_h_uv = (in_height / 2) / gcd_h_uv;`

			`// allocate memory for the coefficents`
			`if (g_b_scaler.l_w) vpx_free(g_b_scaler.l_w);`

			`if (g_b_scaler.l_h) vpx_free(g_b_scaler.l_h);`

			`if (g_b_scaler.l_h_uv) vpx_free(g_b_scaler.l_h_uv);`

			`g_b_scaler.l_w = (short )vpx_memalign(32, out_width 2);`
			`g_b_scaler.l_h = (short )vpx_memalign(32, out_height 2);`
			`g_b_scaler.l_h_uv = (short )vpx_memalign(32, out_height 2);`

			`if (g_b_scaler.c_w) vpx_free(g_b_scaler.c_w);`

			`if (g_b_scaler.c_h) vpx_free(g_b_scaler.c_h);`

			`if (g_b_scaler.c_h_uv) vpx_free(g_b_scaler.c_h_uv);`

			`g_b_scaler.c_w = (short )vpx_memalign(32, g_b_scaler.nw 4 * 2);`
			`g_b_scaler.c_h = (short )vpx_memalign(32, g_b_scaler.nh 4 * 2);`
			`g_b_scaler.c_h_uv = (short )vpx_memalign(32, g_b_scaler.nh_uv 4 * 2);`

			`g_b_scaler.hbuf = g_hbuf;`
			`g_b_scaler.hbuf_uv = g_hbuf_uv;`

			`// Set up polyphase filter taps. This needs to be done before`
			`// the scaling because of the floating point math required. The`
			`// coefficients are multiplied by 2^12 so that fixed point math`
			`// can be used in the main scaling loop.`
			`#ifdef FIXED_POINT`
			`fixed_mult = (1.0 / (float)g_b_scaler.nw) * 4294967296;`

			`product_val = 0;`

			`for (i = 0; i < g_b_scaler.nw; i++)`
			`{`
			`if (product_val > g_b_scaler.nw)`
			`product_val -= g_b_scaler.nw;`

			`phase_offset_int = (fixed_mult * product_val) >> 16;`

			`g_b_scaler.c_w[i*4] = c3_fixed(phase_offset_int);`
			`g_b_scaler.c_w[i*4+1] = c2_fixed(phase_offset_int);`
			`g_b_scaler.c_w[i*4+2] = c1_fixed(phase_offset_int);`
			`g_b_scaler.c_w[i*4+3] = c0_fixed(phase_offset_int);`

			`product_val += d_w;`
			`}`


			`fixed_mult = (1.0 / (float)g_b_scaler.nh) * 4294967296;`

			`product_val = 0;`

			`for (i = 0; i < g_b_scaler.nh; i++)`
			`{`
			`if (product_val > g_b_scaler.nh)`
			`product_val -= g_b_scaler.nh;`

			`phase_offset_int = (fixed_mult * product_val) >> 16;`

			`g_b_scaler.c_h[i*4] = c0_fixed(phase_offset_int);`
			`g_b_scaler.c_h[i*4+1] = c1_fixed(phase_offset_int);`
			`g_b_scaler.c_h[i*4+2] = c2_fixed(phase_offset_int);`
			`g_b_scaler.c_h[i*4+3] = c3_fixed(phase_offset_int);`

			`product_val += d_h;`
			`}`

			`fixed_mult = (1.0 / (float)g_b_scaler.nh_uv) * 4294967296;`

			`product_val = 0;`

			`for (i = 0; i < g_b_scaler.nh_uv; i++)`
			`{`
			`if (product_val > g_b_scaler.nh_uv)`
			`product_val -= g_b_scaler.nh_uv;`

			`phase_offset_int = (fixed_mult * product_val) >> 16;`

			`g_b_scaler.c_h_uv[i*4] = c0_fixed(phase_offset_int);`
			`g_b_scaler.c_h_uv[i*4+1] = c1_fixed(phase_offset_int);`
			`g_b_scaler.c_h_uv[i*4+2] = c2_fixed(phase_offset_int);`
			`g_b_scaler.c_h_uv[i*4+3] = c3_fixed(phase_offset_int);`

			`product_val += d_h_uv;`
			`}`

			`#else`

			`for (i = 0; i < g_nw; i++)`
			`{`
			`phase_offset = (float)((i * d_w) % g_nw) / (float)g_nw;`
			`g_c_w[i4] = (C3(phase_offset) 4096.0);`
			`g_c_w[i4+1] = (C2(phase_offset) 4096.0);`
			`g_c_w[i4+2] = (C1(phase_offset) 4096.0);`
			`g_c_w[i4+3] = (C0(phase_offset) 4096.0);`
			`}`

			`for (i = 0; i < g_nh; i++)`
			`{`
			`phase_offset = (float)((i * d_h) % g_nh) / (float)g_nh;`
			`g_c_h[i4] = (C0(phase_offset) 4096.0);`
			`g_c_h[i4+1] = (C1(phase_offset) 4096.0);`
			`g_c_h[i4+2] = (C2(phase_offset) 4096.0);`
			`g_c_h[i4+3] = (C3(phase_offset) 4096.0);`
			`}`

			`for (i = 0; i < g_nh_uv; i++)`
			`{`
			`phase_offset = (float)((i * d_h_uv) % g_nh_uv) / (float)g_nh_uv;`
			`g_c_h_uv[i4] = (C0(phase_offset) 4096.0);`
			`g_c_h_uv[i4+1] = (C1(phase_offset) 4096.0);`
			`g_c_h_uv[i4+2] = (C2(phase_offset) 4096.0);`
			`g_c_h_uv[i4+3] = (C3(phase_offset) 4096.0);`
			`}`

			`#endif`

			`// Create an array that corresponds input lines to output lines.`
			`// This doesn't require floating point math, but it does require`
			`// a division and because hardware division is not present that`
			`// is a call.`
			`for (i = 0; i < out_width; i++)`
			`{`
			`g_b_scaler.l_w[i] = (i * d_w) / g_b_scaler.nw;`

			`if ((g_b_scaler.l_w[i] + 2) <= in_width)`
			`g_b_scaler.max_usable_out_width = i;`

			`}`

			`for (i = 0; i < out_height + 1; i++)`
			`{`
			`g_b_scaler.l_h[i] = (i * d_h) / g_b_scaler.nh;`
			`g_b_scaler.l_h_uv[i] = (i * d_h_uv) / g_b_scaler.nh_uv;`
			`}`

			`return 0;`
			`}`

			`int bicubic_scale(int in_width, int in_height, int in_stride,`
			`int out_width, int out_height, int out_stride,`
			`unsigned char input_image, unsigned char output_image)`
			`{`
			`short RESTRICT l_w, RESTRICT l_h;`
			`short RESTRICT c_w, RESTRICT c_h;`
			`unsigned char RESTRICT ip, RESTRICT op;`
			`unsigned char *RESTRICT hbuf;`
			`int h, w, lw, lh;`
			`int temp_sum;`
			`int phase_offset_w, phase_offset_h;`

			`c_w = g_b_scaler.c_w;`
			`c_h = g_b_scaler.c_h;`

			`op = output_image;`

			`l_w = g_b_scaler.l_w;`
			`l_h = g_b_scaler.l_h;`

			`phase_offset_h = 0;`

			`for (h = 0; h < out_height; h++)`
			`{`
			`// select the row to work on`
			`lh = l_h[h];`
			`ip = input_image + (in_stride * lh);`

			`// vp8_filter the row vertically into an temporary buffer.`
			`// If the phase offset == 0 then all the multiplication`
			`// is going to result in the output equalling the input.`
			`// So instead point the temporary buffer to the input.`
			`// Also handle the boundry condition of not being able to`
			`// filter that last lines.`
			`if (phase_offset_h && (lh < in_height - 2))`
			`{`
			`hbuf = g_b_scaler.hbuf;`

			`for (w = 0; w < in_width; w++)`
			`{`
			`temp_sum = c_h[phase_offset_h4+3] ip[w - in_stride];`
			`temp_sum += c_h[phase_offset_h4+2] ip[w];`
			`temp_sum += c_h[phase_offset_h4+1] ip[w + in_stride];`
			`temp_sum += c_h[phase_offset_h4] ip[w + 2*in_stride];`

			`hbuf[w] = temp_sum >> 12;`
			`}`
			`}`
			`else`
			`hbuf = ip;`

			`// increase the phase offset for the next time around.`
			`if (++phase_offset_h >= g_b_scaler.nh)`
			`phase_offset_h = 0;`

			`// now filter and expand it horizontally into the final`
			`// output buffer`
			`phase_offset_w = 0;`

			`for (w = 0; w < out_width; w++)`
			`{`
			`// get the index to use to expand the image`
			`lw = l_w[w];`

			`temp_sum = c_w[phase_offset_w4] hbuf[lw - 1];`
			`temp_sum += c_w[phase_offset_w4+1] hbuf[lw];`
			`temp_sum += c_w[phase_offset_w4+2] hbuf[lw + 1];`
			`temp_sum += c_w[phase_offset_w4+3] hbuf[lw + 2];`
			`temp_sum = temp_sum >> 12;`

			`if (++phase_offset_w >= g_b_scaler.nw)`
			`phase_offset_w = 0;`

			`// boundry conditions`
			`if ((lw + 2) >= in_width)`
			`temp_sum = hbuf[lw];`

			`if (lw == 0)`
			`temp_sum = hbuf[0];`

			`op[w] = temp_sum;`
			`}`

			`op += out_stride;`
			`}`

			`return 0;`
			`}`

			`void bicubic_scale_frame_reset()`
			`{`
			`g_b_scaler.out_width = 0;`
			`g_b_scaler.out_height = 0;`
			`}`

			`void bicubic_scale_frame(YV12_BUFFER_CONFIG src, YV12_BUFFER_CONFIG dst,`
			`int new_width, int new_height)`
			`{`

			`dst->y_width = new_width;`
			`dst->y_height = new_height;`
			`dst->uv_width = new_width / 2;`
			`dst->uv_height = new_height / 2;`

			`dst->y_stride = dst->y_width;`
			`dst->uv_stride = dst->uv_width;`

			`bicubic_scale(src->y_width, src->y_height, src->y_stride,`
			`new_width, new_height, dst->y_stride,`
			`src->y_buffer, dst->y_buffer);`

			`bicubic_scale(src->uv_width, src->uv_height, src->uv_stride,`
			`new_width / 2, new_height / 2, dst->uv_stride,`
			`src->u_buffer, dst->u_buffer);`

			`bicubic_scale(src->uv_width, src->uv_height, src->uv_stride,`
			`new_width / 2, new_height / 2, dst->uv_stride,`
			`src->v_buffer, dst->v_buffer);`
			`}`