/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be
 *  found  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <time.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <unistd.h>
#include <math.h>
#include <stdio.h>

#include "vp9/common/vp9_motion_model.h"
#include "vp9/encoder/vp9_opticalflow.h"
#include "vp9/encoder/vp9_ransac.h"
#include "vp9/encoder/vp9_resize.h"

// ITERATIONS is the number of iterative refinements to make on an initial
// flow estimate. 3 seems to work well but feel free to play with this.
#define ITERATIONS 3
#define MAX_MEDIAN_LENGTH  121
#define MAX_LEVELS 8
#define MIN_LEVEL_SIZE 30
#define MAX_ERROR  0.00001
#define BLUR_SIZE 11
#define BLUR_SIGMA 1.5
#define BLUR_AMP 1
#define MEDIAN_FILTER_SIZE 5
#define RELATIVE_ERROR(a, b) (fabs(((a) - (b))))
#define WRAP_PIXEL(p, size) ((p) < 0 ? abs((p)) - 1 : \
                            ((p) >= (size) ? (2 * (size) - 1 - (p)) : (p)))

// Struct for an image pyramid
typedef struct {
  int n_levels;
  int widths[MAX_LEVELS];
  int heights[MAX_LEVELS];
  int strides[MAX_LEVELS];
  unsigned char **levels;
} imPyramid;

typedef struct {
  int stride;
  double *confidence;
  double *u;
  double *v;
} flowMV;

static void * safe_malloc(size_t size) {
  void *v = malloc(size);
  if (!v) {
    fprintf(stderr, "Not enough memory\n");
    exit(EXIT_FAILURE);
  }
  return v;
}

// Produces an image pyramid where each level is resized by a
// specified resize factor
static void image_pyramid(unsigned char *img, imPyramid *pyr, int width,
                          int height, int stride, int n_levels) {
  int i;
  int max_levels = 1;
  int scaled = width < height ? (width/MIN_LEVEL_SIZE) :
               (height/MIN_LEVEL_SIZE);
  pyr->widths[0] = width;
  pyr->heights[0] = height;
  pyr->strides[0] = stride;

  if (n_levels == 1) {
    pyr->n_levels = 1;
    pyr->levels = (unsigned char**)safe_malloc(sizeof(*pyr->levels));
    pyr->levels[0] = img;
    return;
  }
  // compute number of possible levels of the pyramid. The smallest dim of the
  // smallest level should be no less than 30 px.
  while (scaled > 1) {
    max_levels++;
    scaled >>= 1;
  }

  if (n_levels > max_levels)
    n_levels = max_levels;

  pyr->n_levels = n_levels;
  pyr->levels = (unsigned char**)safe_malloc(n_levels * sizeof(*pyr->levels));
  pyr->levels[0] = img;

  // compute each level
  for (i = 1; i < n_levels; ++i) {
    pyr->widths[i] = pyr->widths[i-1] >> 1;
    pyr->heights[i] = pyr->heights[i-1] >> 1;
    pyr->strides[i] = pyr->widths[i];
    pyr->levels[i] = (unsigned char*)safe_malloc(pyr->widths[i] *
                                                 pyr->heights[i] *
                                                 sizeof(*pyr->levels[i]));

    vp9_resize_plane(pyr->levels[i-1], pyr->heights[i-1], pyr->widths[i-1],
                     pyr->strides[i-1], pyr->levels[i], pyr->heights[i],
                     pyr->widths[i], pyr->strides[i]);
  }
}

static void destruct_pyramid(imPyramid *pyr) {
  int i;

  // start at level 1 because level 0 is a reference to the original image
  for (i = 1; i < pyr->n_levels; ++i)
    free(pyr->levels[i]);
  free(pyr->levels);
}

// Convolution with reflected content padding to minimize edge artifacts.
static void convolve_char(double *filter, unsigned char *image,
                          double *convolved, int filt_width, int filt_height,
                          int image_width, int image_height, int image_stride,
                          int convolved_stride) {
  int i, j, x, y, imx, imy;
  double pixel_val;
  double filter_val;
  double image_val;
  int x_pad_size = filt_width >> 1;
  int y_pad_size = filt_height >> 1;
  for (j = 0; j < image_height; ++j)
    for (i = 0; i < image_width; ++i) {
      pixel_val = 0;
      for (y = (-y_pad_size); y<= y_pad_size; y++)
        for (x = (-x_pad_size); x <= x_pad_size; x++) {
          filter_val = filter[(x + x_pad_size) + (y + y_pad_size) * filt_width];
          imx = WRAP_PIXEL(i + x, image_width);
          imy = WRAP_PIXEL(j + y, image_height);
          image_val = image[imx + imy * image_stride];
          pixel_val += (filter_val * image_val);
        }
      convolved[i + j * convolved_stride] =  pixel_val;
    }
}

// Convolution with reflected content padding to minimize edge artifacts.
static void convolve_double(const double *filter, double *image,
                            double *convolved, int filt_width,
                            int filt_height, int image_width,
                            int image_height, int image_stride,
                            int convolved_stride) {
  int i, j, x, y, imx, imy;
  double pixel_val;
  double filter_val;
  double image_val;
  int x_pad_size = filt_width >> 1;
  int y_pad_size = filt_height >> 1;
  for (j = 0; j < image_height; ++j)
    for (i = 0; i < image_width; ++i) {
      pixel_val = 0;
      for (y = (-y_pad_size); y<= y_pad_size; y++)
        for (x = (-x_pad_size); x <= x_pad_size; x++) {
          filter_val = filter[(x + x_pad_size) + (y + y_pad_size) * filt_width];
          imx = WRAP_PIXEL(i + x, image_width);
          imy = WRAP_PIXEL(j + y, image_height);
          image_val = image[imx + imy * image_stride];
          pixel_val += (filter_val * image_val);
        }
      convolved[i + j * convolved_stride] = pixel_val;
    }
}

// computes x and y spatial derivatives of an image.
static void differentiate(double *img, int width,  int height,
                          int stride, double *dx_img,  double *dy_img,
                          int deriv_stride) {
  const double spatial_deriv_kernel[] = {-0.0833, 0.6667, 0.0,
                                         -0.6667, 0.0833};
  convolve_double(spatial_deriv_kernel, img, dx_img, 5, 1, width, height,
                  stride, deriv_stride);
  convolve_double(spatial_deriv_kernel, img, dy_img, 1, 5, width, height,
                  stride, deriv_stride);
}

// creates a 2D gaussian kernel
static void gaussian_kernel(double *mat, int size, double sig, double amp) {
  int x, y;
  double filter_val;
  int half;
  double denominator = 1 / (2.0 * sig * sig);
  double sum = 0.0f;

  size |= 1;
  half = size >> 1;
  for (y = -half; y<= half; y++)
    for (x = -half; x<= half; x++) {
      filter_val = amp * exp((double)(-1.0 * ((x * x + y * y) * denominator)));
      mat[(x + half) + ((y + half) * size)] = filter_val;
      sum += filter_val;
    }

  // normalize the filter
  if (sum > MAX_ERROR) {
    sum = 1 / sum;
    for (x = 0; x < size * size; x++) {
      mat[x] = mat[x] * sum;
    }
  }
}

// blurs image with a gaussian kernel
static void blur_img(unsigned char *img, int width, int height,
                     int stride, double *smooth_img, int smooth_stride,
                     int smoothing_size, double smoothing_sig,
                     double smoothing_amp) {
  double *smoothing_kernel = (double *)safe_malloc(smoothing_size *
                                                   smoothing_size *
                                                   sizeof(*smoothing_kernel));
  gaussian_kernel(smoothing_kernel, smoothing_size, smoothing_sig,
                  smoothing_amp);
  convolve_char(smoothing_kernel, img, smooth_img, smoothing_size,
                smoothing_size, width, height, stride, smooth_stride);
  free(smoothing_kernel);
}

// Implementation of Hoare's select for linear time median selection.
// Takes in a pointer to the beginning of a list of values, the length
// of the list, an integer representing the index of the number to be
// selected, and an unsigned integer used to seed a random number generator.
static double selection(double *vals, int length, int k, unsigned int seed) {
  int pivot_ind, x;
  double pivot;
  double L[MAX_MEDIAN_LENGTH], G[MAX_MEDIAN_LENGTH];
  int l = 0, e = 0, g = 0;
  pivot_ind = (int) ((length / ((double)RAND_MAX)) * rand_r(&seed));
  pivot = vals[pivot_ind];
  for (x = 0; x < length; ++x) {
    if (RELATIVE_ERROR(vals[x], pivot) <= MAX_ERROR) {
      e++;
    } else if (vals[x] < pivot) {
      L[l] = vals[x];
      l++;
    } else if (vals[x] > pivot) {
      G[g] = vals[x];
      g++;
    }
  }

  if (k <= l)
    return selection(L, l, k, seed);
  else if (k <= l + e)
    return pivot;
  else
    return selection(G, g, k - l - e, seed);
}

// Performs median filtering for denoising. This implementation uses hoare's
// select to find the median in a block. block_x, block_y are
// the x and y coordinates of the center of the block in the source.
static void median_filter(double *source, double *filtered, int block_size,
                          int width, int height, int source_stride,
                          int filtered_stride) {
  int i, j, block_x, block_y, imx, imy;
  double pivot, val;
  int length = block_size * block_size;
  double L[MAX_MEDIAN_LENGTH], G[MAX_MEDIAN_LENGTH];
  int k = length >> 1 | 1;
  int l, e, g;
  int pad_size = block_size >> 1;
  unsigned int seed = (unsigned int)*source;

  // find the median within each block. Reflected content is used for padding.
  for (block_y = 0; block_y < height; ++block_y)
    for (block_x = 0; block_x < width; ++block_x) {
      l = 0, e = 0, g = 0;
      memset(L, 0, length * sizeof(*L));
      memset(G, 0, length * sizeof(*G));
      pivot = source[block_x + block_y * source_stride];
      for (j = -pad_size; j <= pad_size; ++j)
        for (i = -pad_size; i <= pad_size; ++i) {
          imx = WRAP_PIXEL(i + block_x, width);
          imy = WRAP_PIXEL(j + block_y, height);
          val = source[imx + imy * source_stride];

          // pulling out the first iteration of selection so we don't have
          // to iterate through the block to flatten it and then
          // iterate through those same values to put them into
          // the L E G bins in selection. Ugly but more efficent.
          if (RELATIVE_ERROR(val, pivot) <= MAX_ERROR) {
            e++;
          } else if (val < pivot) {
            L[l] = val;
            l++;
          } else if (val > pivot) {
            G[g] = val;
            g++;
          }
        }
      if (k <= l)
        filtered[block_x + block_y * filtered_stride] =
            selection(L, l, k, seed);
      else if (k <= l + e)
        filtered[block_x + block_y * filtered_stride] = pivot;
      else
        filtered[block_x + block_y * filtered_stride] = selection(G, g, k -
                                                                  l - e, seed);
  }
}

static inline void pointwise_matrix_sub(double *mat1, double *mat2,
                                        double *diff, int width, int height,
                                        int stride1, int stride2,
                                        int diffstride) {
  int i, j;
  for (j = 0; j < height; ++j)
    for (i = 0; i < width; ++i) {
      diff[i + j * diffstride] = mat1[i + j * stride1] - mat2[i + j * stride2];
    }
}

static inline void pointwise_matrix_add(double *mat1, double *mat2,
                                        double *sum, int width, int height,
                                        int stride1, int stride2,
                                        int sumstride) {
  int i, j;
  for (j = 0; j < height; ++j)
    for (i = 0; i < width; ++i) {
      sum[i + j * sumstride] = mat1[i + j * stride1] + mat2[i + j * stride2];
    }
}

// Solves lucas kanade equation at any give pixel in the image using a local
// neighborhood for support. loc_x and loc_y are the x and y components
// of the center of the local neighborhood. Assumes dx, dy and dt have same
// stride. window is a wind_size x wind_size set of weights for the
// window_size x window_size neighborhood surrounding loc_x and loc_y.
static void optical_flow_per_pixel(double *dx, double *dy, double *dt,
                                   const double *window, int wind_size,
                                   flowMV *flow, int width, int height,
                                   int stride, int loc_x, int loc_y) {
  int i, j, iw, jw, im_ind;
  double g = 0;
  // M and b are matrices used to solve the equation a = M^-1 * b where a
  // are the desired optical flow parameters
  double M[4] = {0, 0, 0, 0};
  double b[2] = {0, 0};
  double det = 0;
  double trace2 = 0;
  double corner_score = 0;
  int step = wind_size >> 1;
  double gdx = 0, gdy = 0;

  for (j = loc_y - step; j <= loc_y + step; ++j)
    for (i = loc_x - step; i <= loc_x + step; ++i) {
      // if pixel is out of bounds, use reflected image content
      iw = WRAP_PIXEL(i, width);
      jw = WRAP_PIXEL(j, height);
      im_ind = iw + jw * stride;
      g = window[(i - loc_x + step) + (j - loc_y + step) * wind_size];
      gdx = g * dx[im_ind];
      gdy = g * dy[im_ind];
      M[0] += gdx * dx[im_ind];
      M[1] += gdx * dy[im_ind];
      M[3] += gdy * dy[im_ind];
      b[0] += -gdx * dt[im_ind];
      b[1] += -gdy * dt[im_ind];
    }
  M[2] = M[1];
  det = (M[0] * M[3]) - (M[1] * M[1]);
  trace2 = (M[0] + M[3]) * (M[0] + M[3]);
  if (RELATIVE_ERROR(det, 0) > MAX_ERROR) {
    const double det_inv = 1 / det;
    const double mult_b0 = det_inv * b[0];
    const double mult_b1 = det_inv * b[1];
    flow->u[loc_x + loc_y * flow->stride] = M[3] * mult_b0 - M[1] * mult_b1;
    flow->v[loc_x + loc_y * flow->stride] = -M[2] * mult_b0 + M[0] * mult_b1;
  } else {
    if (M[0] == 0 && M[3] == 0) {
      flow->u[loc_x + loc_y * flow->stride] = 0;
      flow->v[loc_x + loc_y * flow->stride] = 0;
    } else {
      const double trace2_inv = 1 / trace2;
      const double mult_b0 = trace2_inv * b[0];
      const double mult_b1 = trace2_inv * b[1];
      flow->u[loc_x + loc_y * flow->stride] = M[0] * mult_b0 + M[1] * mult_b1;
      flow->v[loc_x + loc_y * flow->stride] = M[2] * mult_b0 + M[3] * mult_b1;
    }
  }
  // compute inverse harris corner score as confidence metric
  corner_score = 1 / (det - (0.01 * trace2));
  flow->confidence[loc_x + loc_y * flow->stride] = corner_score > 2 ?
                                                   2 : corner_score;
}

// computes lucas kanade optical flow. Note that this assumes images that
// already denoised and differentiated.
static void lucas_kanade_base(double *smooth_frm, double *smooth_ref,
                              double *dx, double *dy, double *dt, flowMV *flow,
                              int width, int height, int smooth_frm_stride,
                              int smooth_ref_stride, int deriv_stride) {
  int i, j;
  const int local_neighborhood_sz = 5;
  // windowing function for local neighborhood weights
  const double window[] = {0.0039, 0.0156, 0.0234, 0.0156, 0.0039,
                           0.0156, 0.0625, 0.0938, 0.0625, 0.0156,
                           0.0234, 0.0938, 0.1406, 0.0938, 0.0234,
                           0.0156, 0.0625, 0.0938, 0.0625, 0.0156,
                           0.0039, 0.0156, 0.0234, 0.0156, 0.0039};
  // compute temporal derivative
  pointwise_matrix_sub(smooth_ref, smooth_frm, dt, width, height,
                       smooth_ref_stride, smooth_frm_stride, deriv_stride);

  for (j = 0; j < height; ++j)
    for (i = 0; i < width; ++i) {
      optical_flow_per_pixel(dx, dy, dt, window, local_neighborhood_sz,
                             flow, width, height, deriv_stride, i, j);
    }
}

// Improves an initial approximation for the vector field by iteratively
// warping one image towards the other.
static void iterative_refinement(unsigned char *ref, double *smooth_frm,
                                 double *dx, double *dy, double *dt,
                                 flowMV *flow, int width, int height,
                                 int ref_stride, int smooth_frm_stride,
                                 int deriv_stride, int n_refinements) {
  int i, j, n;
  double x, y;
  unsigned char *estimate = (unsigned char*)safe_malloc(width * height *
                                                        sizeof(*estimate));
  double *smooth_estimate = (double*)safe_malloc(width * height *
                                                 sizeof(*smooth_estimate));
  flowMV new_flow;
  new_flow.u = (double*)safe_malloc(width * height * sizeof(*new_flow.u));
  new_flow.v = (double*)safe_malloc(width * height * sizeof(*new_flow.v));
  new_flow.confidence = (double*)safe_malloc(width * height *
                                             sizeof(*new_flow.confidence));
  new_flow.stride = width;
  // warp one image toward the other
  for (n = 0; n < n_refinements; ++n) {
    for (j = 0; j < height; ++j)
      for (i = 0; i < width; ++i) {
        x = i - flow->u[i + j * flow->stride];
        y = j - flow->v[i + j * flow->stride];
        estimate[i + j * width] = interpolate(ref, x, y, width,
                                              height, ref_stride);
      }

    // compute flow between frame and warped estimate
    blur_img(estimate, width, height, width, smooth_estimate, width, BLUR_SIZE,
             BLUR_SIGMA, BLUR_AMP);
    lucas_kanade_base(smooth_frm, smooth_estimate, dx, dy, dt, &new_flow,
                      width, height, smooth_frm_stride, width, deriv_stride);

    // add residual and apply median filter for denoising
    pointwise_matrix_add(flow->u, new_flow.u, new_flow.u, width, height,
                         flow->stride, new_flow.stride, new_flow.stride);
    pointwise_matrix_add(flow->v, new_flow.v, new_flow.v, width, height,
                         flow->stride, new_flow.stride, new_flow.stride);
    median_filter(new_flow.u, flow->u, MEDIAN_FILTER_SIZE, width, height,
                  new_flow.stride, flow->stride);
    median_filter(new_flow.v, flow->v, MEDIAN_FILTER_SIZE, width, height,
                  new_flow.stride, flow->stride);
    median_filter(new_flow.confidence, flow->confidence, MEDIAN_FILTER_SIZE,
                  width, height, new_flow.stride, flow->stride);
  }

  free(smooth_estimate);
  free(estimate);
  free(new_flow.u);
  free(new_flow.v);
  free(new_flow.confidence);
}

// interface for computing optical flow.
void compute_flow(unsigned char *frm, unsigned char *ref,
                  double *u, double *v, double *confidence,
                  int width, int height, int frm_stride,
                  int ref_stride, int n_levels) {
  double *smooth_ref = (double*)safe_malloc(width * height *
                                            sizeof(*smooth_ref));
  double *smooth_frm = (double*)safe_malloc(width * height *
                                            sizeof(*smooth_frm));
  double *dx_frm = (double*)safe_malloc(width * height *
                                        sizeof(*dx_frm));
  double *dy_frm = (double*)safe_malloc(width * height *
                                       sizeof(*dx_frm));
  double *dt = (double *)safe_malloc(width * height * sizeof(*dt));
  flowMV flow;
  flow.u = u;
  flow.v = v;
  flow.confidence = confidence;
  flow.stride = width;
  if (n_levels == 1) {
    // Uses lucas kanade and iterative estimation without coarse to fine.
    // This is more efficient for small motion but becomes less accurate
    // as motion gets larger.
    blur_img(ref, width, height, ref_stride, smooth_ref, width, BLUR_SIZE,
             BLUR_SIGMA, BLUR_AMP);
    blur_img(frm, width, height, frm_stride, smooth_frm, width, BLUR_SIZE,
             BLUR_SIGMA, BLUR_AMP);
    differentiate(smooth_frm, width, height, width, dx_frm, dy_frm, width);
    lucas_kanade_base(smooth_frm, smooth_ref, dx_frm, dy_frm, dt, &flow,
                      width, height, width, width, width);
    iterative_refinement(ref, smooth_frm, dx_frm, dy_frm, dt, &flow, width,
                         height, ref_stride, width, width, ITERATIONS);
  } else {
    int i, j, k;
    // w, h, s_r, s_f are intermediate width, height, ref stride, frame stride
    // as the resolution changes up the pyramid
    int w, h, s_r, s_f, w_upscale, h_upscale, s_upscale;
    double x, y;
    double *smooth_frm_approx = (double*)safe_malloc(width * height *
                                             sizeof(*smooth_frm_approx));
    double *dx_frm_approx = (double*)safe_malloc(width * height *
                                             sizeof(*dx_frm_approx));
    double *dy_frm_approx = (double*)safe_malloc(width * height *
                                             sizeof(*dy_frm_approx));
    double *u_upscale = (double*)safe_malloc(width * height *
                                             sizeof(*u_upscale));
    double *v_upscale = (double*)safe_malloc(width * height *
                                             sizeof(*v_upscale));
    unsigned char *frm_approx = (unsigned char*)safe_malloc(width * height *
                                                           sizeof(*frm_approx));
    imPyramid *frm_pyramid = (imPyramid*)safe_malloc(sizeof(imPyramid));
    imPyramid *ref_pyramid = (imPyramid*)safe_malloc(sizeof(imPyramid));
    // create image pyramids
    image_pyramid(frm, frm_pyramid, width, height, frm_stride, n_levels);
    image_pyramid(ref, ref_pyramid, width, height, ref_stride, n_levels);
    n_levels = frm_pyramid->n_levels;
    w = frm_pyramid->widths[n_levels - 1];
    h = frm_pyramid->heights[n_levels - 1];
    s_r = ref_pyramid->strides[n_levels - 1];
    s_f = frm_pyramid->strides[n_levels - 1];

    // for each level in the pyramid starting with the coarsest
    for (i = n_levels - 1; i >= 0; --i) {
      assert(frm_pyramid->widths[i] == ref_pyramid->widths[i]);
      assert(frm_pyramid->heights[i] == ref_pyramid->heights[i]);
      blur_img(ref_pyramid->levels[i], w, h, s_r, smooth_ref, width, BLUR_SIZE,
               BLUR_SIGMA, BLUR_AMP);
      blur_img(frm_pyramid->levels[i], w, h, s_f, smooth_frm, width, BLUR_SIZE,
               BLUR_SIGMA, BLUR_AMP);
      differentiate(smooth_frm, w, h, width, dx_frm, dy_frm, width);


      //  Compute optical flow at this level between the reference frame and
      //  the estimate produced from warping. If this is the first iteration
      //  (meaning no warping has happened yet) then we have no approximation
      //  for the frame and only have to worry about flow between the original
      //  frame and reference. Every subsequent iteration requires computing
      //  and estimate for the frame based on previously computed flow vectors.

      if (i < n_levels - 1) {
        blur_img(frm_approx, w, h, width, smooth_frm_approx, width, BLUR_SIZE,
                 BLUR_SIGMA, BLUR_AMP);
        differentiate(smooth_frm_approx, w, h, width, dx_frm_approx,
                      dy_frm_approx, width);
        lucas_kanade_base(smooth_frm_approx, smooth_ref, dx_frm_approx,
                          dy_frm_approx, dt, &flow, w, h, width, width, width);
        pointwise_matrix_add(flow.u, u_upscale, u_upscale, w, h, flow.stride,
                             width, width);
        pointwise_matrix_add(flow.v, v_upscale, v_upscale, w, h, flow.stride,
                             width, width);
        median_filter(u_upscale, flow.u, MEDIAN_FILTER_SIZE, w, h, width,
                      flow.stride);
        median_filter(v_upscale, flow.v, MEDIAN_FILTER_SIZE, w, h, width,
                      flow.stride);
      } else {
        lucas_kanade_base(smooth_frm, smooth_ref, dx_frm,
                          dy_frm, dt, &flow, w, h, width, width, width);
      }
      iterative_refinement(ref_pyramid->levels[i], smooth_frm, dx_frm, dy_frm,
                           dt, &flow, w, h, s_r, width, width, ITERATIONS);


      // if we're at the finest level, we're ready to return u and v
      if (i == 0) {
        assert(w == width);
        assert(h == height);
        destruct_pyramid(frm_pyramid);
        destruct_pyramid(ref_pyramid);
        free(frm_pyramid);
        free(ref_pyramid);
        free(frm_approx);
        free(smooth_frm_approx);
        free(dx_frm_approx);
        free(dy_frm_approx);
        free(u_upscale);
        free(v_upscale);
        break;
      }

      w_upscale = ref_pyramid->widths[i - 1];
      h_upscale = ref_pyramid->heights[i - 1];
      s_upscale = ref_pyramid->strides[i - 1];

      // warp image according to optical flow estimate
      for (j = 0; j < h_upscale; ++j)
        for (k = 0; k < w_upscale; ++k) {
          u_upscale[k + j * w_upscale] = flow.u[(int)(k >> 1) +
                                                 (int)(j >> 1) * flow.stride];
          v_upscale[k + j * w_upscale] = flow.v[(int)(k >> 1) +
                                                 (int)(j >> 1) * flow.stride];
          x = k - u_upscale[k + j * w_upscale];
          y = j - v_upscale[k + j * w_upscale];
          frm_approx[k + j * width] = interpolate(ref_pyramid->levels[i - 1],
                                                  x, y, w_upscale, h_upscale,
                                                  s_upscale);
        }

      // assign dimensions for next level
      w = frm_pyramid->widths[i - 1];
      h = frm_pyramid->heights[i - 1];
      s_r = ref_pyramid->strides[i - 1];
      s_f = frm_pyramid->strides[i - 1];
    }
  }
  free(smooth_ref);
  free(smooth_frm);
  free(dx_frm);
  free(dy_frm);
  free(dt);
}