/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <stdio.h>
#include <string.h>
#include <tmmintrin.h>

#include "vpx_ports/mem.h"
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"

static INLINE __m128i width8_load_2rows(const uint8_t *ptr, int stride) {
  __m128i temp1 = _mm_loadl_epi64((const __m128i*)ptr);
  __m128i temp2 = _mm_loadl_epi64((const __m128i*)(ptr + stride));
  return _mm_unpacklo_epi64(temp1, temp2);
}

static INLINE __m128i width4_load_4rows(const uint8_t *ptr, int stride) {
  __m128i temp1 = _mm_cvtsi32_si128(*(const uint32_t*)ptr);
  __m128i temp2 = _mm_cvtsi32_si128(*(const uint32_t*)(ptr + stride));
  __m128i temp3 = _mm_unpacklo_epi32(temp1, temp2);
  temp1 = _mm_cvtsi32_si128(*(const uint32_t*)(ptr + stride * 2));
  temp2 = _mm_cvtsi32_si128(*(const uint32_t*)(ptr + stride * 3));
  temp1 = _mm_unpacklo_epi32(temp1, temp2);
  return _mm_unpacklo_epi64(temp3, temp1);
}

static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
                                            const uint8_t *b_ptr, int b_stride,
                                            const uint8_t *m_ptr, int m_stride,
                                            int width, int height);

static INLINE unsigned int masked_sad8xh_ssse3(const uint8_t *a_ptr,
                                               int a_stride,
                                               const uint8_t *b_ptr,
                                               int b_stride,
                                               const uint8_t *m_ptr,
                                               int m_stride, int height);

static INLINE unsigned int masked_sad4xh_ssse3(const uint8_t *a_ptr,
                                               int a_stride,
                                               const uint8_t *b_ptr,
                                               int b_stride,
                                               const uint8_t *m_ptr,
                                               int m_stride, int height);

#define MASKSADMXN_SSSE3(m, n)                                               \
  unsigned int vpx_masked_sad##m##x##n##_ssse3(const uint8_t *src,           \
                                               int src_stride,               \
                                               const uint8_t *ref,           \
                                               int ref_stride,               \
                                               const uint8_t *msk,           \
                                               int msk_stride) {             \
    return masked_sad_ssse3(src, src_stride, ref, ref_stride, msk,           \
                            msk_stride, m, n);                               \
  }

#if CONFIG_EXT_PARTITION
MASKSADMXN_SSSE3(128, 128)
MASKSADMXN_SSSE3(128, 64)
MASKSADMXN_SSSE3(64, 128)
#endif  // CONFIG_EXT_PARTITION
MASKSADMXN_SSSE3(64, 64)
MASKSADMXN_SSSE3(64, 32)
MASKSADMXN_SSSE3(32, 64)
MASKSADMXN_SSSE3(32, 32)
MASKSADMXN_SSSE3(32, 16)
MASKSADMXN_SSSE3(16, 32)
MASKSADMXN_SSSE3(16, 16)
MASKSADMXN_SSSE3(16, 8)

#define MASKSAD8XN_SSSE3(n)                                                  \
  unsigned int vpx_masked_sad8x##n##_ssse3(const uint8_t *src,               \
                                           int src_stride,                   \
                                           const uint8_t *ref,               \
                                           int ref_stride,                   \
                                           const uint8_t *msk,               \
                                           int msk_stride) {                 \
    return masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, msk,        \
                               msk_stride, n);                               \
  }

MASKSAD8XN_SSSE3(16)
MASKSAD8XN_SSSE3(8)
MASKSAD8XN_SSSE3(4)

#define MASKSAD4XN_SSSE3(n)                                                  \
  unsigned int vpx_masked_sad4x##n##_ssse3(const uint8_t *src,               \
                                           int src_stride,                   \
                                           const uint8_t *ref,               \
                                           int ref_stride,                   \
                                           const uint8_t *msk,               \
                                           int msk_stride) {                 \
    return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk,        \
                               msk_stride, n);                               \
  }

MASKSAD4XN_SSSE3(8)
MASKSAD4XN_SSSE3(4)
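
/* For reference, the kernels below compute a mask-weighted SAD, normalized
 * by the maximum mask value (64):
 *   sad = (sum over y,x of m[y][x] * |src[y][x] - ref[y][x]| + 31) >> 6
 * The following minimal scalar sketch of that computation is illustrative
 * only; masked_sad_c_sketch is a hypothetical name and the block is
 * compiled out. */
#if 0
static unsigned int masked_sad_c_sketch(const uint8_t *a, int a_stride,
                                        const uint8_t *b, int b_stride,
                                        const uint8_t *m, int m_stride,
                                        int width, int height) {
  unsigned int sad = 0;
  int y, x;
  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x++) {
      const int diff = a[x] - b[x];
      sad += m[x] * (unsigned int)(diff < 0 ? -diff : diff);  // m[x] <= 64
    }
    a += a_stride;
    b += b_stride;
    m += m_stride;
  }
  // Round and normalize by the maximum mask value, 64 = 1 << 6.
  return (sad + 31) >> 6;
}
#endif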
// For width a multiple of 16
// Assumes values in m are <=64
static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
                                            const uint8_t *b_ptr, int b_stride,
                                            const uint8_t *m_ptr, int m_stride,
                                            int width, int height) {
  int y, x;
  __m128i a, b, m, temp1, temp2;
  __m128i res = _mm_setzero_si128();
  __m128i one = _mm_set1_epi16(1);
  // For each row
  for (y = 0; y < height; y++) {
    // Covering the full width
    for (x = 0; x < width; x += 16) {
      // Load a, b, m in xmm registers
      a = _mm_loadu_si128((const __m128i*)(a_ptr + x));
      b = _mm_loadu_si128((const __m128i*)(b_ptr + x));
      m = _mm_loadu_si128((const __m128i*)(m_ptr + x));
      // Calculate the difference between a & b
      temp1 = _mm_subs_epu8(a, b);
      temp2 = _mm_subs_epu8(b, a);
      temp1 = _mm_or_si128(temp1, temp2);
      // Multiply by m and add together
      temp2 = _mm_maddubs_epi16(temp1, m);
      // Pad out row result to 32 bit integers & add to running total
      res = _mm_add_epi32(res, _mm_madd_epi16(temp2, one));
    }
    // Move onto the next row
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  // sad = (sad + 31) >> 6;
  return (_mm_cvtsi128_si32(res) + 31) >> 6;
}

static INLINE unsigned int masked_sad8xh_ssse3(const uint8_t *a_ptr,
                                               int a_stride,
                                               const uint8_t *b_ptr,
                                               int b_stride,
                                               const uint8_t *m_ptr,
                                               int m_stride, int height) {
  int y;
  __m128i a, b, m, temp1, temp2, row_res;
  __m128i res = _mm_setzero_si128();
  __m128i one = _mm_set1_epi16(1);
  // Add the masked SAD for 2 rows at a time
  for (y = 0; y < height; y += 2) {
    // Load a, b, m in xmm registers
    a = width8_load_2rows(a_ptr, a_stride);
    b = width8_load_2rows(b_ptr, b_stride);
    m = width8_load_2rows(m_ptr, m_stride);
    // Calculate the difference between a & b
    temp1 = _mm_subs_epu8(a, b);
    temp2 = _mm_subs_epu8(b, a);
    temp1 = _mm_or_si128(temp1, temp2);
    // Multiply by m and add together
    row_res = _mm_maddubs_epi16(temp1, m);
    // Pad out row result to 32 bit integers & add to running total
    res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one));
    // Move onto the next rows
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  // sad = (sad + 31) >> 6;
  return (_mm_cvtsi128_si32(res) + 31) >> 6;
}

static INLINE unsigned int masked_sad4xh_ssse3(const uint8_t *a_ptr,
                                               int a_stride,
                                               const uint8_t *b_ptr,
                                               int b_stride,
                                               const uint8_t *m_ptr,
                                               int m_stride, int height) {
  int y;
  __m128i a, b, m, temp1, temp2, row_res;
  __m128i res = _mm_setzero_si128();
  __m128i one = _mm_set1_epi16(1);
  // Add the masked SAD for 4 rows at a time
  for (y = 0; y < height; y += 4) {
    // Load a, b, m in xmm registers
    a = width4_load_4rows(a_ptr, a_stride);
    b = width4_load_4rows(b_ptr, b_stride);
    m = width4_load_4rows(m_ptr, m_stride);
    // Calculate the difference between a & b
    temp1 = _mm_subs_epu8(a, b);
    temp2 = _mm_subs_epu8(b, a);
    temp1 = _mm_or_si128(temp1, temp2);
    // Multiply by m and add together
    row_res = _mm_maddubs_epi16(temp1, m);
    // Pad out row result to 32 bit integers & add to running total
    res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one));
    // Move onto the next rows
    a_ptr += a_stride * 4;
    b_ptr += b_stride * 4;
    m_ptr += m_stride * 4;
  }
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  // sad = (sad + 31) >> 6;
  return (_mm_cvtsi128_si32(res) + 31) >> 6;
}
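
/* Why _mm_maddubs_epi16 is safe in the kernels above: it multiplies the
 * unsigned 8-bit |a - b| values (<= 255) by the signed 8-bit mask values
 * (<= 64) and sums adjacent pairs with signed 16-bit saturation. The worst
 * case per pair is 2 * 255 * 64 = 32640 <= 32767, so the saturating add
 * never clips. A hypothetical compile-time check of that bound, not part of
 * the build: */
#if 0
typedef char maddubs_pair_fits_in_int16[(2 * 255 * 64 <= 32767) ? 1 : -1];
#endif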
#if CONFIG_VPX_HIGHBITDEPTH
static INLINE __m128i highbd_width4_load_2rows(const uint16_t *ptr,
                                               int stride) {
  __m128i temp1 = _mm_loadl_epi64((const __m128i*)ptr);
  __m128i temp2 = _mm_loadl_epi64((const __m128i*)(ptr + stride));
  return _mm_unpacklo_epi64(temp1, temp2);
}

static INLINE unsigned int highbd_masked_sad_ssse3(const uint8_t *a8_ptr,
                                                   int a_stride,
                                                   const uint8_t *b8_ptr,
                                                   int b_stride,
                                                   const uint8_t *m_ptr,
                                                   int m_stride,
                                                   int width, int height);

static INLINE unsigned int highbd_masked_sad4xh_ssse3(const uint8_t *a8_ptr,
                                                      int a_stride,
                                                      const uint8_t *b8_ptr,
                                                      int b_stride,
                                                      const uint8_t *m_ptr,
                                                      int m_stride,
                                                      int height);

#define HIGHBD_MASKSADMXN_SSSE3(m, n)                                        \
  unsigned int vpx_highbd_masked_sad##m##x##n##_ssse3(const uint8_t *src,    \
                                                      int src_stride,        \
                                                      const uint8_t *ref,    \
                                                      int ref_stride,        \
                                                      const uint8_t *msk,    \
                                                      int msk_stride) {      \
    return highbd_masked_sad_ssse3(src, src_stride, ref, ref_stride, msk,    \
                                   msk_stride, m, n);                        \
  }

#if CONFIG_EXT_PARTITION
HIGHBD_MASKSADMXN_SSSE3(128, 128)
HIGHBD_MASKSADMXN_SSSE3(128, 64)
HIGHBD_MASKSADMXN_SSSE3(64, 128)
#endif  // CONFIG_EXT_PARTITION
HIGHBD_MASKSADMXN_SSSE3(64, 64)
HIGHBD_MASKSADMXN_SSSE3(64, 32)
HIGHBD_MASKSADMXN_SSSE3(32, 64)
HIGHBD_MASKSADMXN_SSSE3(32, 32)
HIGHBD_MASKSADMXN_SSSE3(32, 16)
HIGHBD_MASKSADMXN_SSSE3(16, 32)
HIGHBD_MASKSADMXN_SSSE3(16, 16)
HIGHBD_MASKSADMXN_SSSE3(16, 8)
HIGHBD_MASKSADMXN_SSSE3(8, 16)
HIGHBD_MASKSADMXN_SSSE3(8, 8)
HIGHBD_MASKSADMXN_SSSE3(8, 4)

#define HIGHBD_MASKSAD4XN_SSSE3(n)                                           \
  unsigned int vpx_highbd_masked_sad4x##n##_ssse3(const uint8_t *src,        \
                                                  int src_stride,            \
                                                  const uint8_t *ref,        \
                                                  int ref_stride,            \
                                                  const uint8_t *msk,        \
                                                  int msk_stride) {          \
    return highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk, \
                                      msk_stride, n);                        \
  }

HIGHBD_MASKSAD4XN_SSSE3(8)
HIGHBD_MASKSAD4XN_SSSE3(4)

// For width a multiple of 8
// Assumes values in m are <=64
static INLINE unsigned int highbd_masked_sad_ssse3(const uint8_t *a8_ptr,
                                                   int a_stride,
                                                   const uint8_t *b8_ptr,
                                                   int b_stride,
                                                   const uint8_t *m_ptr,
                                                   int m_stride,
                                                   int width, int height) {
  int y, x;
  __m128i a, b, m, temp1, temp2;
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr);
  __m128i res = _mm_setzero_si128();
  // For each row
  for (y = 0; y < height; y++) {
    // Covering the full width
    for (x = 0; x < width; x += 8) {
      // Load a, b, m in xmm registers
      a = _mm_loadu_si128((const __m128i*)(a_ptr + x));
      b = _mm_loadu_si128((const __m128i*)(b_ptr + x));
      m = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(m_ptr + x)),
                            _mm_setzero_si128());
      // Calculate the difference between a & b
      temp1 = _mm_subs_epu16(a, b);
      temp2 = _mm_subs_epu16(b, a);
      temp1 = _mm_or_si128(temp1, temp2);
      // Multiply by m, add adjacent pairs, and accumulate the running total
      res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m));
    }
    // Move onto the next row
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  // sad = (sad + 31) >> 6;
  return (_mm_cvtsi128_si32(res) + 31) >> 6;
}
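
/* In this high-bitdepth section the pixel buffers hold 16-bit samples
 * reached through CONVERT_TO_SHORTPTR(), while the mask stays 8-bit and is
 * zero-extended to 16 bits before _mm_madd_epi16. With mask values <= 64 and
 * up-to-12-bit pixels, each summed product pair is at most
 * 2 * 64 * 4095 = 524160, well within a 32-bit lane. A hypothetical,
 * compiled-out scalar model of the computation, for illustration only: */
#if 0
static unsigned int highbd_masked_sad_c_sketch(
    const uint16_t *a, int a_stride, const uint16_t *b, int b_stride,
    const uint8_t *m, int m_stride, int width, int height) {
  unsigned int sad = 0;
  int y, x;
  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x++) {
      const int diff = (int)a[x] - (int)b[x];
      sad += m[x] * (unsigned int)(diff < 0 ? -diff : diff);  // m[x] <= 64
    }
    a += a_stride;
    b += b_stride;
    m += m_stride;
  }
  return (sad + 31) >> 6;
}
#endif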
static INLINE unsigned int highbd_masked_sad4xh_ssse3(const uint8_t *a8_ptr,
                                                      int a_stride,
                                                      const uint8_t *b8_ptr,
                                                      int b_stride,
                                                      const uint8_t *m_ptr,
                                                      int m_stride,
                                                      int height) {
  int y;
  __m128i a, b, m, temp1, temp2;
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr);
  __m128i res = _mm_setzero_si128();
  // Add the masked SAD for 2 rows at a time
  for (y = 0; y < height; y += 2) {
    // Load a, b, m in xmm registers
    a = highbd_width4_load_2rows(a_ptr, a_stride);
    b = highbd_width4_load_2rows(b_ptr, b_stride);
    temp1 = _mm_loadl_epi64((const __m128i*)m_ptr);
    temp2 = _mm_loadl_epi64((const __m128i*)(m_ptr + m_stride));
    m = _mm_unpacklo_epi8(_mm_unpacklo_epi32(temp1, temp2),
                          _mm_setzero_si128());
    // Calculate the difference between a & b
    temp1 = _mm_subs_epu16(a, b);
    temp2 = _mm_subs_epu16(b, a);
    temp1 = _mm_or_si128(temp1, temp2);
    // Multiply by m and add together
    res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m));
    // Move onto the next rows
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  // sad = (sad + 31) >> 6;
  return (_mm_cvtsi128_si32(res) + 31) >> 6;
}
#endif  // CONFIG_VPX_HIGHBITDEPTH
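
/* Illustrative call into one of the public entry points defined above. The
 * buffers and values here are hypothetical; within libvpx these functions
 * are normally reached through the run-time CPU dispatch (rtcd) tables
 * rather than called directly. Compiled out. */
#if 0
static unsigned int example_usage(void) {
  uint8_t src[16 * 16] = { 0 };
  uint8_t ref[16 * 16] = { 0 };
  uint8_t msk[16 * 16] = { 0 };  // per-pixel weights, each <= 64
  return vpx_masked_sad16x16_ssse3(src, 16, ref, 16, msk, 16);
}
#endif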