/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <stdio.h>
#include <string.h>
#include <tmmintrin.h>

#include "vpx_ports/mem.h"
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"

static INLINE __m128i width8_load_2rows(const uint8_t *ptr, int stride) {
  __m128i temp1 = _mm_loadl_epi64((const __m128i*)ptr);
  __m128i temp2 = _mm_loadl_epi64((const __m128i*)(ptr + stride));
  return _mm_unpacklo_epi64(temp1, temp2);
}

static INLINE __m128i width4_load_4rows(const uint8_t *ptr, int stride) {
  __m128i temp1 = _mm_cvtsi32_si128(*(const uint32_t*)ptr);
  __m128i temp2 = _mm_cvtsi32_si128(*(const uint32_t*)(ptr + stride));
  __m128i temp3 = _mm_unpacklo_epi32(temp1, temp2);
  temp1 = _mm_cvtsi32_si128(*(const uint32_t*)(ptr + stride * 2));
  temp2 = _mm_cvtsi32_si128(*(const uint32_t*)(ptr + stride * 3));
  temp1 = _mm_unpacklo_epi32(temp1, temp2);
  return _mm_unpacklo_epi64(temp3, temp1);
}

static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
                                            const uint8_t *b_ptr, int b_stride,
                                            const uint8_t *m_ptr, int m_stride,
                                            int width, int height);

static INLINE unsigned int masked_sad8xh_ssse3(const uint8_t *a_ptr,
                                               int a_stride,
                                               const uint8_t *b_ptr,
                                               int b_stride,
                                               const uint8_t *m_ptr,
                                               int m_stride, int height);

static INLINE unsigned int masked_sad4xh_ssse3(const uint8_t *a_ptr,
                                               int a_stride,
                                               const uint8_t *b_ptr,
                                               int b_stride,
                                               const uint8_t *m_ptr,
                                               int m_stride, int height);

#define MASKSADMXN_SSSE3(m, n)                                               \
  unsigned int vpx_masked_sad##m##x##n##_ssse3(const uint8_t *src,           \
                                               int src_stride,               \
                                               const uint8_t *ref,           \
                                               int ref_stride,               \
                                               const uint8_t *msk,           \
                                               int msk_stride) {             \
    return masked_sad_ssse3(src, src_stride, ref, ref_stride, msk,           \
                            msk_stride, m, n);                               \
  }

#if CONFIG_EXT_PARTITION
MASKSADMXN_SSSE3(128, 128)
MASKSADMXN_SSSE3(128, 64)
MASKSADMXN_SSSE3(64, 128)
#endif  // CONFIG_EXT_PARTITION
MASKSADMXN_SSSE3(64, 64)
MASKSADMXN_SSSE3(64, 32)
MASKSADMXN_SSSE3(32, 64)
MASKSADMXN_SSSE3(32, 32)
MASKSADMXN_SSSE3(32, 16)
MASKSADMXN_SSSE3(16, 32)
MASKSADMXN_SSSE3(16, 16)
MASKSADMXN_SSSE3(16, 8)

#define MASKSAD8XN_SSSE3(n)                                                  \
  unsigned int vpx_masked_sad8x##n##_ssse3(const uint8_t *src,               \
                                           int src_stride,                   \
                                           const uint8_t *ref,               \
                                           int ref_stride,                   \
                                           const uint8_t *msk,               \
                                           int msk_stride) {                 \
    return masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, msk,        \
                               msk_stride, n);                               \
  }

MASKSAD8XN_SSSE3(16)
MASKSAD8XN_SSSE3(8)
MASKSAD8XN_SSSE3(4)

#define MASKSAD4XN_SSSE3(n)                                                  \
  unsigned int vpx_masked_sad4x##n##_ssse3(const uint8_t *src,               \
                                           int src_stride,                   \
                                           const uint8_t *ref,               \
                                           int ref_stride,                   \
                                           const uint8_t *msk,               \
                                           int msk_stride) {                 \
    return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk,        \
                               msk_stride, n);                               \
  }

MASKSAD4XN_SSSE3(8)
MASKSAD4XN_SSSE3(4)
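
/* For reference, the kernels below compute a mask-weighted SAD, normalized
 * by the maximum mask value (64):
 *   sad = (sum over y,x of m[y][x] * |src[y][x] - ref[y][x]| + 31) >> 6
 * The following minimal scalar sketch of that computation is illustrative
 * only; masked_sad_c_sketch is a hypothetical name and the block is
 * compiled out. */
#if 0
static unsigned int masked_sad_c_sketch(const uint8_t *a, int a_stride,
                                        const uint8_t *b, int b_stride,
                                        const uint8_t *m, int m_stride,
                                        int width, int height) {
  unsigned int sad = 0;
  int y, x;
  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x++) {
      const int diff = a[x] - b[x];
      sad += m[x] * (unsigned int)(diff < 0 ? -diff : diff);  // m[x] <= 64
    }
    a += a_stride;
    b += b_stride;
    m += m_stride;
  }
  // Round and normalize by the maximum mask value, 64 = 1 << 6.
  return (sad + 31) >> 6;
}
#endif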
// For width a multiple of 16
// Assumes values in m are <=64
static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
                                            const uint8_t *b_ptr, int b_stride,
                                            const uint8_t *m_ptr, int m_stride,
                                            int width, int height) {
  int y, x;
  __m128i a, b, m, temp1, temp2;
  __m128i res = _mm_setzero_si128();
  __m128i one = _mm_set1_epi16(1);
  // For each row
  for (y = 0; y < height; y++) {
    // Covering the full width
    for (x = 0; x < width; x += 16) {
      // Load a, b, m in xmm registers
      a = _mm_loadu_si128((const __m128i*)(a_ptr + x));
      b = _mm_loadu_si128((const __m128i*)(b_ptr + x));
      m = _mm_loadu_si128((const __m128i*)(m_ptr + x));
      // Calculate the difference between a & b
      temp1 = _mm_subs_epu8(a, b);
      temp2 = _mm_subs_epu8(b, a);
      temp1 = _mm_or_si128(temp1, temp2);
      // Multiply by m and add together
      temp2 = _mm_maddubs_epi16(temp1, m);
      // Pad out row result to 32 bit integers & add to running total
      res = _mm_add_epi32(res, _mm_madd_epi16(temp2, one));
    }
    // Move onto the next row
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  // sad = (sad + 31) >> 6;
  return (_mm_cvtsi128_si32(res) + 31) >> 6;
}

static INLINE unsigned int masked_sad8xh_ssse3(const uint8_t *a_ptr,
                                               int a_stride,
                                               const uint8_t *b_ptr,
                                               int b_stride,
                                               const uint8_t *m_ptr,
                                               int m_stride, int height) {
  int y;
  __m128i a, b, m, temp1, temp2, row_res;
  __m128i res = _mm_setzero_si128();
  __m128i one = _mm_set1_epi16(1);
  // Add the masked SAD for 2 rows at a time
  for (y = 0; y < height; y += 2) {
    // Load a, b, m in xmm registers
    a = width8_load_2rows(a_ptr, a_stride);
    b = width8_load_2rows(b_ptr, b_stride);
    m = width8_load_2rows(m_ptr, m_stride);
    // Calculate the difference between a & b
    temp1 = _mm_subs_epu8(a, b);
    temp2 = _mm_subs_epu8(b, a);
    temp1 = _mm_or_si128(temp1, temp2);
    // Multiply by m and add together
    row_res = _mm_maddubs_epi16(temp1, m);
    // Pad out row result to 32 bit integers & add to running total
    res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one));
    // Move onto the next rows
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  // sad = (sad + 31) >> 6;
  return (_mm_cvtsi128_si32(res) + 31) >> 6;
}

static INLINE unsigned int masked_sad4xh_ssse3(const uint8_t *a_ptr,
                                               int a_stride,
                                               const uint8_t *b_ptr,
                                               int b_stride,
                                               const uint8_t *m_ptr,
                                               int m_stride, int height) {
  int y;
  __m128i a, b, m, temp1, temp2, row_res;
  __m128i res = _mm_setzero_si128();
  __m128i one = _mm_set1_epi16(1);
  // Add the masked SAD for 4 rows at a time
  for (y = 0; y < height; y += 4) {
    // Load a, b, m in xmm registers
    a = width4_load_4rows(a_ptr, a_stride);
    b = width4_load_4rows(b_ptr, b_stride);
    m = width4_load_4rows(m_ptr, m_stride);
    // Calculate the difference between a & b
    temp1 = _mm_subs_epu8(a, b);
    temp2 = _mm_subs_epu8(b, a);
    temp1 = _mm_or_si128(temp1, temp2);
    // Multiply by m and add together
    row_res = _mm_maddubs_epi16(temp1, m);
    // Pad out row result to 32 bit integers & add to running total
    res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one));
    // Move onto the next rows
    a_ptr += a_stride * 4;
    b_ptr += b_stride * 4;
    m_ptr += m_stride * 4;
  }
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  // sad = (sad + 31) >> 6;
  return (_mm_cvtsi128_si32(res) + 31) >> 6;
}
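
/* Why _mm_maddubs_epi16 is safe in the kernels above: it multiplies the
 * unsigned 8-bit |a - b| values (<= 255) by the signed 8-bit mask values
 * (<= 64) and sums adjacent pairs with signed 16-bit saturation. The worst
 * case per pair is 2 * 255 * 64 = 32640 <= 32767, so the saturating add
 * never clips. A hypothetical compile-time check of that bound, not part of
 * the build: */
#if 0
typedef char maddubs_pair_fits_in_int16[(2 * 255 * 64 <= 32767) ? 1 : -1];
#endif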
#if CONFIG_VPX_HIGHBITDEPTH
static INLINE __m128i highbd_width4_load_2rows(const uint16_t *ptr,
                                               int stride) {
  __m128i temp1 = _mm_loadl_epi64((const __m128i*)ptr);
  __m128i temp2 = _mm_loadl_epi64((const __m128i*)(ptr + stride));
  return _mm_unpacklo_epi64(temp1, temp2);
}

static INLINE unsigned int highbd_masked_sad_ssse3(const uint8_t *a8_ptr,
                                                   int a_stride,
                                                   const uint8_t *b8_ptr,
                                                   int b_stride,
                                                   const uint8_t *m_ptr,
                                                   int m_stride,
                                                   int width, int height);

static INLINE unsigned int highbd_masked_sad4xh_ssse3(const uint8_t *a8_ptr,
                                                      int a_stride,
                                                      const uint8_t *b8_ptr,
                                                      int b_stride,
                                                      const uint8_t *m_ptr,
                                                      int m_stride,
                                                      int height);

#define HIGHBD_MASKSADMXN_SSSE3(m, n)                                        \
  unsigned int vpx_highbd_masked_sad##m##x##n##_ssse3(const uint8_t *src,    \
                                                      int src_stride,        \
                                                      const uint8_t *ref,    \
                                                      int ref_stride,        \
                                                      const uint8_t *msk,    \
                                                      int msk_stride) {      \
    return highbd_masked_sad_ssse3(src, src_stride, ref, ref_stride, msk,    \
                                   msk_stride, m, n);                        \
  }

#if CONFIG_EXT_PARTITION
HIGHBD_MASKSADMXN_SSSE3(128, 128)
HIGHBD_MASKSADMXN_SSSE3(128, 64)
HIGHBD_MASKSADMXN_SSSE3(64, 128)
#endif  // CONFIG_EXT_PARTITION
HIGHBD_MASKSADMXN_SSSE3(64, 64)
HIGHBD_MASKSADMXN_SSSE3(64, 32)
HIGHBD_MASKSADMXN_SSSE3(32, 64)
HIGHBD_MASKSADMXN_SSSE3(32, 32)
HIGHBD_MASKSADMXN_SSSE3(32, 16)
HIGHBD_MASKSADMXN_SSSE3(16, 32)
HIGHBD_MASKSADMXN_SSSE3(16, 16)
HIGHBD_MASKSADMXN_SSSE3(16, 8)
HIGHBD_MASKSADMXN_SSSE3(8, 16)
HIGHBD_MASKSADMXN_SSSE3(8, 8)
HIGHBD_MASKSADMXN_SSSE3(8, 4)

#define HIGHBD_MASKSAD4XN_SSSE3(n)                                           \
  unsigned int vpx_highbd_masked_sad4x##n##_ssse3(const uint8_t *src,        \
                                                  int src_stride,            \
                                                  const uint8_t *ref,        \
                                                  int ref_stride,            \
                                                  const uint8_t *msk,        \
                                                  int msk_stride) {          \
    return highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk, \
                                      msk_stride, n);                        \
  }

HIGHBD_MASKSAD4XN_SSSE3(8)
HIGHBD_MASKSAD4XN_SSSE3(4)

// For width a multiple of 8
// Assumes values in m are <=64
static INLINE unsigned int highbd_masked_sad_ssse3(const uint8_t *a8_ptr,
                                                   int a_stride,
                                                   const uint8_t *b8_ptr,
                                                   int b_stride,
                                                   const uint8_t *m_ptr,
                                                   int m_stride,
                                                   int width, int height) {
  int y, x;
  __m128i a, b, m, temp1, temp2;
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr);
  __m128i res = _mm_setzero_si128();
  // For each row
  for (y = 0; y < height; y++) {
    // Covering the full width
    for (x = 0; x < width; x += 8) {
      // Load a, b, m in xmm registers
      a = _mm_loadu_si128((const __m128i*)(a_ptr + x));
      b = _mm_loadu_si128((const __m128i*)(b_ptr + x));
      m = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(m_ptr + x)),
                            _mm_setzero_si128());
      // Calculate the difference between a & b
      temp1 = _mm_subs_epu16(a, b);
      temp2 = _mm_subs_epu16(b, a);
      temp1 = _mm_or_si128(temp1, temp2);
      // Multiply by m, add adjacent pairs, and accumulate the running total
      res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m));
    }
    // Move onto the next row
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  // sad = (sad + 31) >> 6;
  return (_mm_cvtsi128_si32(res) + 31) >> 6;
}
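
/* In this high-bitdepth section the pixel buffers hold 16-bit samples
 * reached through CONVERT_TO_SHORTPTR(), while the mask stays 8-bit and is
 * zero-extended to 16 bits before _mm_madd_epi16. With mask values <= 64 and
 * up-to-12-bit pixels, each summed product pair is at most
 * 2 * 64 * 4095 = 524160, well within a 32-bit lane. A hypothetical,
 * compiled-out scalar model of the computation, for illustration only: */
#if 0
static unsigned int highbd_masked_sad_c_sketch(
    const uint16_t *a, int a_stride, const uint16_t *b, int b_stride,
    const uint8_t *m, int m_stride, int width, int height) {
  unsigned int sad = 0;
  int y, x;
  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x++) {
      const int diff = (int)a[x] - (int)b[x];
      sad += m[x] * (unsigned int)(diff < 0 ? -diff : diff);  // m[x] <= 64
    }
    a += a_stride;
    b += b_stride;
    m += m_stride;
  }
  return (sad + 31) >> 6;
}
#endif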
static INLINE unsigned int highbd_masked_sad4xh_ssse3(const uint8_t *a8_ptr,
                                                      int a_stride,
                                                      const uint8_t *b8_ptr,
                                                      int b_stride,
                                                      const uint8_t *m_ptr,
                                                      int m_stride,
                                                      int height) {
  int y;
  __m128i a, b, m, temp1, temp2;
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr);
  __m128i res = _mm_setzero_si128();
  // Add the masked SAD for 2 rows at a time
  for (y = 0; y < height; y += 2) {
    // Load a, b, m in xmm registers
    a = highbd_width4_load_2rows(a_ptr, a_stride);
    b = highbd_width4_load_2rows(b_ptr, b_stride);
    temp1 = _mm_loadl_epi64((const __m128i*)m_ptr);
    temp2 = _mm_loadl_epi64((const __m128i*)(m_ptr + m_stride));
    m = _mm_unpacklo_epi8(_mm_unpacklo_epi32(temp1, temp2),
                          _mm_setzero_si128());
    // Calculate the difference between a & b
    temp1 = _mm_subs_epu16(a, b);
    temp2 = _mm_subs_epu16(b, a);
    temp1 = _mm_or_si128(temp1, temp2);
    // Multiply by m and add together
    res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m));
    // Move onto the next rows
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  // sad = (sad + 31) >> 6;
  return (_mm_cvtsi128_si32(res) + 31) >> 6;
}
#endif  // CONFIG_VPX_HIGHBITDEPTH
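
/* Illustrative call into one of the public entry points defined above. The
 * buffers and values here are hypothetical; within libvpx these functions
 * are normally reached through the run-time CPU dispatch (rtcd) tables
 * rather than called directly. Compiled out. */
#if 0
static unsigned int example_usage(void) {
  uint8_t src[16 * 16] = { 0 };
  uint8_t ref[16 * 16] = { 0 };
  uint8_t msk[16 * 16] = { 0 };  // per-pixel weights, each <= 64
  return vpx_masked_sad16x16_ssse3(src, 16, ref, 16, msk, 16);
}
#endif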