Merge "move vp9_avg to vpx_dsp"
@@ -15,9 +15,7 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"

 #include "./vpx_config.h"
-#if CONFIG_VP9_ENCODER
-#include "./vp9_rtcd.h"
-#endif
+#include "./vpx_dsp_rtcd.h"

 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
@@ -323,91 +321,91 @@ using std::tr1::make_tuple;
 INSTANTIATE_TEST_CASE_P(
 C, AverageTest,
 ::testing::Values(
-make_tuple(16, 16, 1, 8, &vp9_avg_8x8_c),
+make_tuple(16, 16, 1, 8, &vpx_avg_8x8_c),
-make_tuple(16, 16, 1, 4, &vp9_avg_4x4_c)));
+make_tuple(16, 16, 1, 4, &vpx_avg_4x4_c)));

 INSTANTIATE_TEST_CASE_P(
 C, SatdTest,
 ::testing::Values(
-make_tuple(16, &vp9_satd_c),
+make_tuple(16, &vpx_satd_c),
-make_tuple(64, &vp9_satd_c),
+make_tuple(64, &vpx_satd_c),
-make_tuple(256, &vp9_satd_c),
+make_tuple(256, &vpx_satd_c),
-make_tuple(1024, &vp9_satd_c)));
+make_tuple(1024, &vpx_satd_c)));

 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
 SSE2, AverageTest,
 ::testing::Values(
-make_tuple(16, 16, 0, 8, &vp9_avg_8x8_sse2),
+make_tuple(16, 16, 0, 8, &vpx_avg_8x8_sse2),
-make_tuple(16, 16, 5, 8, &vp9_avg_8x8_sse2),
+make_tuple(16, 16, 5, 8, &vpx_avg_8x8_sse2),
-make_tuple(32, 32, 15, 8, &vp9_avg_8x8_sse2),
+make_tuple(32, 32, 15, 8, &vpx_avg_8x8_sse2),
-make_tuple(16, 16, 0, 4, &vp9_avg_4x4_sse2),
+make_tuple(16, 16, 0, 4, &vpx_avg_4x4_sse2),
-make_tuple(16, 16, 5, 4, &vp9_avg_4x4_sse2),
+make_tuple(16, 16, 5, 4, &vpx_avg_4x4_sse2),
-make_tuple(32, 32, 15, 4, &vp9_avg_4x4_sse2)));
+make_tuple(32, 32, 15, 4, &vpx_avg_4x4_sse2)));

 INSTANTIATE_TEST_CASE_P(
 SSE2, IntProRowTest, ::testing::Values(
-make_tuple(16, &vp9_int_pro_row_sse2, &vp9_int_pro_row_c),
+make_tuple(16, &vpx_int_pro_row_sse2, &vpx_int_pro_row_c),
-make_tuple(32, &vp9_int_pro_row_sse2, &vp9_int_pro_row_c),
+make_tuple(32, &vpx_int_pro_row_sse2, &vpx_int_pro_row_c),
-make_tuple(64, &vp9_int_pro_row_sse2, &vp9_int_pro_row_c)));
+make_tuple(64, &vpx_int_pro_row_sse2, &vpx_int_pro_row_c)));

 INSTANTIATE_TEST_CASE_P(
 SSE2, IntProColTest, ::testing::Values(
-make_tuple(16, &vp9_int_pro_col_sse2, &vp9_int_pro_col_c),
+make_tuple(16, &vpx_int_pro_col_sse2, &vpx_int_pro_col_c),
-make_tuple(32, &vp9_int_pro_col_sse2, &vp9_int_pro_col_c),
+make_tuple(32, &vpx_int_pro_col_sse2, &vpx_int_pro_col_c),
-make_tuple(64, &vp9_int_pro_col_sse2, &vp9_int_pro_col_c)));
+make_tuple(64, &vpx_int_pro_col_sse2, &vpx_int_pro_col_c)));

 INSTANTIATE_TEST_CASE_P(
 SSE2, SatdTest,
 ::testing::Values(
-make_tuple(16, &vp9_satd_sse2),
+make_tuple(16, &vpx_satd_sse2),
-make_tuple(64, &vp9_satd_sse2),
+make_tuple(64, &vpx_satd_sse2),
-make_tuple(256, &vp9_satd_sse2),
+make_tuple(256, &vpx_satd_sse2),
-make_tuple(1024, &vp9_satd_sse2)));
+make_tuple(1024, &vpx_satd_sse2)));
 #endif

 #if HAVE_NEON
 INSTANTIATE_TEST_CASE_P(
 NEON, AverageTest,
 ::testing::Values(
-make_tuple(16, 16, 0, 8, &vp9_avg_8x8_neon),
+make_tuple(16, 16, 0, 8, &vpx_avg_8x8_neon),
-make_tuple(16, 16, 5, 8, &vp9_avg_8x8_neon),
+make_tuple(16, 16, 5, 8, &vpx_avg_8x8_neon),
-make_tuple(32, 32, 15, 8, &vp9_avg_8x8_neon),
+make_tuple(32, 32, 15, 8, &vpx_avg_8x8_neon),
-make_tuple(16, 16, 0, 4, &vp9_avg_4x4_neon),
+make_tuple(16, 16, 0, 4, &vpx_avg_4x4_neon),
-make_tuple(16, 16, 5, 4, &vp9_avg_4x4_neon),
+make_tuple(16, 16, 5, 4, &vpx_avg_4x4_neon),
-make_tuple(32, 32, 15, 4, &vp9_avg_4x4_neon)));
+make_tuple(32, 32, 15, 4, &vpx_avg_4x4_neon)));

 INSTANTIATE_TEST_CASE_P(
 NEON, IntProRowTest, ::testing::Values(
-make_tuple(16, &vp9_int_pro_row_neon, &vp9_int_pro_row_c),
+make_tuple(16, &vpx_int_pro_row_neon, &vpx_int_pro_row_c),
-make_tuple(32, &vp9_int_pro_row_neon, &vp9_int_pro_row_c),
+make_tuple(32, &vpx_int_pro_row_neon, &vpx_int_pro_row_c),
-make_tuple(64, &vp9_int_pro_row_neon, &vp9_int_pro_row_c)));
+make_tuple(64, &vpx_int_pro_row_neon, &vpx_int_pro_row_c)));

 INSTANTIATE_TEST_CASE_P(
 NEON, IntProColTest, ::testing::Values(
-make_tuple(16, &vp9_int_pro_col_neon, &vp9_int_pro_col_c),
+make_tuple(16, &vpx_int_pro_col_neon, &vpx_int_pro_col_c),
-make_tuple(32, &vp9_int_pro_col_neon, &vp9_int_pro_col_c),
+make_tuple(32, &vpx_int_pro_col_neon, &vpx_int_pro_col_c),
-make_tuple(64, &vp9_int_pro_col_neon, &vp9_int_pro_col_c)));
+make_tuple(64, &vpx_int_pro_col_neon, &vpx_int_pro_col_c)));

 INSTANTIATE_TEST_CASE_P(
 NEON, SatdTest,
 ::testing::Values(
-make_tuple(16, &vp9_satd_neon),
+make_tuple(16, &vpx_satd_neon),
-make_tuple(64, &vp9_satd_neon),
+make_tuple(64, &vpx_satd_neon),
-make_tuple(256, &vp9_satd_neon),
+make_tuple(256, &vpx_satd_neon),
-make_tuple(1024, &vp9_satd_neon)));
+make_tuple(1024, &vpx_satd_neon)));
 #endif

 #if HAVE_MSA
 INSTANTIATE_TEST_CASE_P(
 MSA, AverageTest,
 ::testing::Values(
-make_tuple(16, 16, 0, 8, &vp9_avg_8x8_msa),
+make_tuple(16, 16, 0, 8, &vpx_avg_8x8_msa),
-make_tuple(16, 16, 5, 8, &vp9_avg_8x8_msa),
+make_tuple(16, 16, 5, 8, &vpx_avg_8x8_msa),
-make_tuple(32, 32, 15, 8, &vp9_avg_8x8_msa),
+make_tuple(32, 32, 15, 8, &vpx_avg_8x8_msa),
-make_tuple(16, 16, 0, 4, &vp9_avg_4x4_msa),
+make_tuple(16, 16, 0, 4, &vpx_avg_4x4_msa),
-make_tuple(16, 16, 5, 4, &vp9_avg_4x4_msa),
+make_tuple(16, 16, 5, 4, &vpx_avg_4x4_msa),
-make_tuple(32, 32, 15, 4, &vp9_avg_4x4_msa)));
+make_tuple(32, 32, 15, 4, &vpx_avg_4x4_msa)));
 #endif

 } // namespace
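Note: the AverageTest cases above check a rounded block mean. A minimal scalar sketch of the 8x8 variant, equivalent to the vpx_avg_8x8_c reference whose implementation is moved later in this diff (the helper name below is illustrative, not part of the change):

    static unsigned int avg_8x8_sketch(const uint8_t *s, int stride) {
      int sum = 0;
      for (int i = 0; i < 8; ++i, s += stride)
        for (int j = 0; j < 8; ++j) sum += s[j];
      return (sum + 32) >> 6;  /* round-to-nearest mean of the 64 pixels */
    }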
@@ -143,7 +143,6 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct32x32_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_avg_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_error_block_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
@@ -170,6 +169,11 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_dct_test.cc
 endif # VP10

 ## Multi-codec / unconditional whitebox tests.

+ifeq ($(findstring yes,$(CONFIG_VP9_ENCODER)$(CONFIG_VP10_ENCODER)),yes)
+LIBVPX_TEST_SRCS-yes += avg_test.cc
+endif

 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc

 TEST_INTRA_PRED_SPEED_SRCS-yes := test_intra_pred_speed.cc
@@ -351,42 +351,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 #
 if (vpx_config("CONFIG_VP10_ENCODER") eq "yes") {

-add_proto qw/unsigned int vp10_avg_8x8/, "const uint8_t *, int p";
-specialize qw/vp10_avg_8x8 sse2 neon msa/;
-
-add_proto qw/unsigned int vp10_avg_4x4/, "const uint8_t *, int p";
-specialize qw/vp10_avg_4x4 sse2 msa/;
-
-add_proto qw/void vp10_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
-specialize qw/vp10_minmax_8x8 sse2/;
-
-add_proto qw/void vp10_hadamard_8x8/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
-specialize qw/vp10_hadamard_8x8 sse2/, "$ssse3_x86_64_x86inc";
-
-add_proto qw/void vp10_hadamard_16x16/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
-specialize qw/vp10_hadamard_16x16 sse2/;
-
-add_proto qw/int16_t vp10_satd/, "const int16_t *coeff, int length";
-specialize qw/vp10_satd sse2/;
-
-add_proto qw/void vp10_int_pro_row/, "int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height";
-specialize qw/vp10_int_pro_row sse2 neon/;
-
-add_proto qw/int16_t vp10_int_pro_col/, "uint8_t const *ref, const int width";
-specialize qw/vp10_int_pro_col sse2 neon/;
-
-add_proto qw/int vp10_vector_var/, "int16_t const *ref, int16_t const *src, const int bwl";
-specialize qw/vp10_vector_var neon sse2/;
-
-if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-add_proto qw/unsigned int vp10_highbd_avg_8x8/, "const uint8_t *, int p";
-specialize qw/vp10_highbd_avg_8x8/;
-add_proto qw/unsigned int vp10_highbd_avg_4x4/, "const uint8_t *, int p";
-specialize qw/vp10_highbd_avg_4x4/;
-add_proto qw/void vp10_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
-specialize qw/vp10_highbd_minmax_8x8/;
-}
-
 # ENCODEMB INVOKE

 #
@@ -1,160 +0,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-#include <assert.h>
-
-#include "./vp10_rtcd.h"
-#include "./vpx_config.h"
-
-#include "vpx/vpx_integer.h"
-
-static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
-  const uint32x4_t a = vpaddlq_u16(v_16x8);
-  const uint64x2_t b = vpaddlq_u32(a);
-  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
-                                vreinterpret_u32_u64(vget_high_u64(b)));
-  return vget_lane_u32(c, 0);
-}
-
-unsigned int vp10_avg_8x8_neon(const uint8_t *s, int p) {
-  uint8x8_t v_s0 = vld1_u8(s);
-  const uint8x8_t v_s1 = vld1_u8(s + p);
-  uint16x8_t v_sum = vaddl_u8(v_s0, v_s1);
-
-  v_s0 = vld1_u8(s + 2 * p);
-  v_sum = vaddw_u8(v_sum, v_s0);
-
-  v_s0 = vld1_u8(s + 3 * p);
-  v_sum = vaddw_u8(v_sum, v_s0);
-
-  v_s0 = vld1_u8(s + 4 * p);
-  v_sum = vaddw_u8(v_sum, v_s0);
-
-  v_s0 = vld1_u8(s + 5 * p);
-  v_sum = vaddw_u8(v_sum, v_s0);
-
-  v_s0 = vld1_u8(s + 6 * p);
-  v_sum = vaddw_u8(v_sum, v_s0);
-
-  v_s0 = vld1_u8(s + 7 * p);
-  v_sum = vaddw_u8(v_sum, v_s0);
-
-  return (horizontal_add_u16x8(v_sum) + 32) >> 6;
-}
-
-void vp10_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
-                           const int ref_stride, const int height) {
-  int i;
-  uint16x8_t vec_sum_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_hi = vdupq_n_u16(0);
-  const int shift_factor = ((height >> 5) + 3) * -1;
-  const int16x8_t vec_shift = vdupq_n_s16(shift_factor);
-
-  for (i = 0; i < height; i += 8) {
-    const uint8x16_t vec_row1 = vld1q_u8(ref);
-    const uint8x16_t vec_row2 = vld1q_u8(ref + ref_stride);
-    const uint8x16_t vec_row3 = vld1q_u8(ref + ref_stride * 2);
-    const uint8x16_t vec_row4 = vld1q_u8(ref + ref_stride * 3);
-    const uint8x16_t vec_row5 = vld1q_u8(ref + ref_stride * 4);
-    const uint8x16_t vec_row6 = vld1q_u8(ref + ref_stride * 5);
-    const uint8x16_t vec_row7 = vld1q_u8(ref + ref_stride * 6);
-    const uint8x16_t vec_row8 = vld1q_u8(ref + ref_stride * 7);
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row1));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row1));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row2));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row2));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row3));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row3));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row4));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row4));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row5));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row5));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row6));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row6));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row7));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row7));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row8));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row8));
-
-    ref += ref_stride * 8;
-  }
-
-  vec_sum_lo = vshlq_u16(vec_sum_lo, vec_shift);
-  vec_sum_hi = vshlq_u16(vec_sum_hi, vec_shift);
-
-  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_lo));
-  hbuf += 8;
-  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi));
-}
-
-int16_t vp10_int_pro_col_neon(uint8_t const *ref, const int width) {
-  int i;
-  uint16x8_t vec_sum = vdupq_n_u16(0);
-
-  for (i = 0; i < width; i += 16) {
-    const uint8x16_t vec_row = vld1q_u8(ref);
-    vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row));
-    vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row));
-    ref += 16;
-  }
-
-  return horizontal_add_u16x8(vec_sum);
-}
-
-// ref, src = [0, 510] - max diff = 16-bits
-// bwl = {2, 3, 4}, width = {16, 32, 64}
-int vp10_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) {
-  int width = 4 << bwl;
-  int32x4_t sse = vdupq_n_s32(0);
-  int16x8_t total = vdupq_n_s16(0);
-
-  assert(width >= 8);
-  assert((width % 8) == 0);
-
-  do {
-    const int16x8_t r = vld1q_s16(ref);
-    const int16x8_t s = vld1q_s16(src);
-    const int16x8_t diff = vsubq_s16(r, s);  // [-510, 510], 10 bits.
-    const int16x4_t diff_lo = vget_low_s16(diff);
-    const int16x4_t diff_hi = vget_high_s16(diff);
-    sse = vmlal_s16(sse, diff_lo, diff_lo);  // dynamic range 26 bits.
-    sse = vmlal_s16(sse, diff_hi, diff_hi);
-    total = vaddq_s16(total, diff);  // dynamic range 16 bits.
-
-    ref += 8;
-    src += 8;
-    width -= 8;
-  } while (width != 0);
-
-  {
-    // Note: 'total''s pairwise addition could be implemented similarly to
-    // horizontal_add_u16x8(), but one less vpaddl with 'total' when paired
-    // with the summation of 'sse' performed better on a Cortex-A15.
-    const int32x4_t t0 = vpaddlq_s16(total);  // cascading summation of 'total'
-    const int32x2_t t1 = vadd_s32(vget_low_s32(t0), vget_high_s32(t0));
-    const int32x2_t t2 = vpadd_s32(t1, t1);
-    const int t = vget_lane_s32(t2, 0);
-    const int64x2_t s0 = vpaddlq_s32(sse);  // cascading summation of 'sse'.
-    const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),
-                                  vreinterpret_s32_s64(vget_high_s64(s0)));
-    const int s = vget_lane_s32(s1, 0);
-    const int shift_factor = bwl + 2;
-    return s - ((t * t) >> shift_factor);
-  }
-}
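Note: vp10_vector_var_neon above computes the same quantity as the C reference that appears further down in this diff: with N = 4 << bwl elements and d[i] = ref[i] - src[i], it returns sum(d[i]^2) - ((sum(d[i]))^2 >> (bwl + 2)), i.e. N times the variance of the difference vector. A scalar sketch of that reduction (the function name is illustrative only):

    int vector_var_sketch(const int16_t *ref, const int16_t *src, int bwl) {
      const int width = 4 << bwl;  /* 16, 32 or 64 elements */
      int sse = 0, sum = 0;
      for (int i = 0; i < width; ++i) {
        const int diff = ref[i] - src[i];  /* dynamic range [-510, 510] */
        sum += diff;
        sse += diff * diff;
      }
      return sse - ((sum * sum) >> (bwl + 2));  /* sum^2 / width, width = 2^(bwl+2) */
    }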
@@ -536,16 +536,16 @@ static int compute_minmax_8x8(const uint8_t *s, int sp, const uint8_t *d,
 if (x8_idx < pixels_wide && y8_idx < pixels_high) {
 #if CONFIG_VP9_HIGHBITDEPTH
 if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
-vp10_highbd_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+vpx_highbd_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
 d + y8_idx * dp + x8_idx, dp,
 &min, &max);
 } else {
-vp10_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+vpx_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
 d + y8_idx * dp + x8_idx, dp,
 &min, &max);
 }
 #else
-vp10_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+vpx_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
 d + y8_idx * dp + x8_idx, dp,
 &min, &max);
 #endif
@@ -577,18 +577,18 @@ static void fill_variance_4x4avg(const uint8_t *s, int sp, const uint8_t *d,
 int d_avg = 128;
 #if CONFIG_VP9_HIGHBITDEPTH
 if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
-s_avg = vp10_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+s_avg = vpx_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp);
 if (!is_key_frame)
-d_avg = vp10_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+d_avg = vpx_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp);
 } else {
-s_avg = vp10_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+s_avg = vpx_avg_4x4(s + y4_idx * sp + x4_idx, sp);
 if (!is_key_frame)
-d_avg = vp10_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+d_avg = vpx_avg_4x4(d + y4_idx * dp + x4_idx, dp);
 }
 #else
-s_avg = vp10_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+s_avg = vpx_avg_4x4(s + y4_idx * sp + x4_idx, sp);
 if (!is_key_frame)
-d_avg = vp10_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+d_avg = vpx_avg_4x4(d + y4_idx * dp + x4_idx, dp);
 #endif
 sum = s_avg - d_avg;
 sse = sum * sum;
@@ -616,18 +616,18 @@ static void fill_variance_8x8avg(const uint8_t *s, int sp, const uint8_t *d,
 int d_avg = 128;
 #if CONFIG_VP9_HIGHBITDEPTH
 if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
-s_avg = vp10_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+s_avg = vpx_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
 if (!is_key_frame)
-d_avg = vp10_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+d_avg = vpx_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
 } else {
-s_avg = vp10_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+s_avg = vpx_avg_8x8(s + y8_idx * sp + x8_idx, sp);
 if (!is_key_frame)
-d_avg = vp10_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+d_avg = vpx_avg_8x8(d + y8_idx * dp + x8_idx, dp);
 }
 #else
-s_avg = vp10_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+s_avg = vpx_avg_8x8(s + y8_idx * sp + x8_idx, sp);
 if (!is_key_frame)
-d_avg = vp10_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+d_avg = vpx_avg_8x8(d + y8_idx * dp + x8_idx, dp);
 #endif
 sum = s_avg - d_avg;
 sse = sum * sum;
@@ -1759,7 +1759,7 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) {
 int center, offset = 0;
 int bw = 4 << bwl; // redundant variable, to be changed in the experiments.
 for (d = 0; d <= bw; d += 16) {
-this_sad = vp10_vector_var(&ref[d], src, bwl);
+this_sad = vpx_vector_var(&ref[d], src, bwl);
 if (this_sad < best_sad) {
 best_sad = this_sad;
 offset = d;
@@ -1772,7 +1772,7 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) {
 // check limit
 if (this_pos < 0 || this_pos > bw)
 continue;
-this_sad = vp10_vector_var(&ref[this_pos], src, bwl);
+this_sad = vpx_vector_var(&ref[this_pos], src, bwl);
 if (this_sad < best_sad) {
 best_sad = this_sad;
 center = this_pos;
@@ -1785,7 +1785,7 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) {
 // check limit
 if (this_pos < 0 || this_pos > bw)
 continue;
-this_sad = vp10_vector_var(&ref[this_pos], src, bwl);
+this_sad = vpx_vector_var(&ref[this_pos], src, bwl);
 if (this_sad < best_sad) {
 best_sad = this_sad;
 center = this_pos;
@@ -1798,7 +1798,7 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) {
 // check limit
 if (this_pos < 0 || this_pos > bw)
 continue;
-this_sad = vp10_vector_var(&ref[this_pos], src, bwl);
+this_sad = vpx_vector_var(&ref[this_pos], src, bwl);
 if (this_sad < best_sad) {
 best_sad = this_sad;
 center = this_pos;
@@ -1811,7 +1811,7 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) {
 // check limit
 if (this_pos < 0 || this_pos > bw)
 continue;
-this_sad = vp10_vector_var(&ref[this_pos], src, bwl);
+this_sad = vpx_vector_var(&ref[this_pos], src, bwl);
 if (this_sad < best_sad) {
 best_sad = this_sad;
 center = this_pos;
@@ -1880,25 +1880,25 @@ unsigned int vp10_int_pro_motion_estimation(const VP10_COMP *cpi, MACROBLOCK *x,
 // Set up prediction 1-D reference set
 ref_buf = xd->plane[0].pre[0].buf - (bw >> 1);
 for (idx = 0; idx < search_width; idx += 16) {
-vp10_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh);
+vpx_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh);
 ref_buf += 16;
 }

 ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride;
 for (idx = 0; idx < search_height; ++idx) {
-vbuf[idx] = vp10_int_pro_col(ref_buf, bw) >> norm_factor;
+vbuf[idx] = vpx_int_pro_col(ref_buf, bw) >> norm_factor;
 ref_buf += ref_stride;
 }

 // Set up src 1-D reference set
 for (idx = 0; idx < bw; idx += 16) {
 src_buf = x->plane[0].src.buf + idx;
-vp10_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh);
+vpx_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh);
 }

 src_buf = x->plane[0].src.buf;
 for (idx = 0; idx < bh; ++idx) {
-src_vbuf[idx] = vp10_int_pro_col(src_buf, bw) >> norm_factor;
+src_vbuf[idx] = vpx_int_pro_col(src_buf, bw) >> norm_factor;
 src_buf += src_stride;
 }

@@ -17,7 +17,6 @@ VP10_CX_SRCS_REMOVE-no += $(VP10_COMMON_SRCS_REMOVE-no)

 VP10_CX_SRCS-yes += vp10_cx_iface.c

-VP10_CX_SRCS-yes += encoder/avg.c
 VP10_CX_SRCS-yes += encoder/bitstream.c
 VP10_CX_SRCS-yes += encoder/context_tree.c
 VP10_CX_SRCS-yes += encoder/context_tree.h
@@ -87,7 +86,6 @@ VP10_CX_SRCS-yes += encoder/temporal_filter.h
 VP10_CX_SRCS-yes += encoder/mbgraph.c
 VP10_CX_SRCS-yes += encoder/mbgraph.h

-VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/avg_intrin_sse2.c
 VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
 VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
@@ -102,7 +100,6 @@ endif
 ifeq ($(ARCH_X86_64),yes)
 ifeq ($(CONFIG_USE_X86INC),yes)
 VP10_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3_x86_64.asm
-VP10_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/dct_ssse3_x86_64.asm
 endif
 endif

@@ -119,10 +116,8 @@ ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP10_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/dct_neon.c
 VP10_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/error_neon.c
 endif
-VP10_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/avg_neon.c
 VP10_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/quantize_neon.c

-VP10_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/avg_msa.c
 VP10_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/error_msa.c
 VP10_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/fdct4x4_msa.c
 VP10_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/fdct8x8_msa.c
@@ -194,42 +194,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 #
 if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {

-add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p";
-specialize qw/vp9_avg_8x8 sse2 neon msa/;
-
-add_proto qw/unsigned int vp9_avg_4x4/, "const uint8_t *, int p";
-specialize qw/vp9_avg_4x4 sse2 neon msa/;
-
-add_proto qw/void vp9_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
-specialize qw/vp9_minmax_8x8 sse2/;
-
-add_proto qw/void vp9_hadamard_8x8/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
-specialize qw/vp9_hadamard_8x8 sse2/, "$ssse3_x86_64_x86inc";
-
-add_proto qw/void vp9_hadamard_16x16/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
-specialize qw/vp9_hadamard_16x16 sse2/;
-
-add_proto qw/int vp9_satd/, "const int16_t *coeff, int length";
-specialize qw/vp9_satd sse2 neon/;
-
-add_proto qw/void vp9_int_pro_row/, "int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height";
-specialize qw/vp9_int_pro_row sse2 neon/;
-
-add_proto qw/int16_t vp9_int_pro_col/, "uint8_t const *ref, const int width";
-specialize qw/vp9_int_pro_col sse2 neon/;
-
-add_proto qw/int vp9_vector_var/, "int16_t const *ref, int16_t const *src, const int bwl";
-specialize qw/vp9_vector_var neon sse2/;
-
-if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-add_proto qw/unsigned int vp9_highbd_avg_8x8/, "const uint8_t *, int p";
-specialize qw/vp9_highbd_avg_8x8/;
-add_proto qw/unsigned int vp9_highbd_avg_4x4/, "const uint8_t *, int p";
-specialize qw/vp9_highbd_avg_4x4/;
-add_proto qw/void vp9_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
-specialize qw/vp9_highbd_minmax_8x8/;
-}
-
 # ENCODEMB INVOKE

 #
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp9_rtcd.h"
-#include "vpx_dsp/mips/macros_msa.h"
-
-uint32_t vp9_avg_8x8_msa(const uint8_t *src, int32_t src_stride) {
-  uint32_t sum_out;
-  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-  v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
-  v4u32 sum = { 0 };
-
-  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
-  HADD_UB4_UH(src0, src1, src2, src3, sum0, sum1, sum2, sum3);
-  HADD_UB4_UH(src4, src5, src6, src7, sum4, sum5, sum6, sum7);
-  ADD4(sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum0, sum2, sum4, sum6);
-  ADD2(sum0, sum2, sum4, sum6, sum0, sum4);
-  sum0 += sum4;
-
-  sum = __msa_hadd_u_w(sum0, sum0);
-  sum0 = (v8u16)__msa_pckev_h((v8i16)sum, (v8i16)sum);
-  sum = __msa_hadd_u_w(sum0, sum0);
-  sum = (v4u32)__msa_srari_w((v4i32)sum, 6);
-  sum_out = __msa_copy_u_w((v4i32)sum, 0);
-
-  return sum_out;
-}
-
-uint32_t vp9_avg_4x4_msa(const uint8_t *src, int32_t src_stride) {
-  uint32_t sum_out;
-  uint32_t src0, src1, src2, src3;
-  v16u8 vec = { 0 };
-  v8u16 sum0;
-  v4u32 sum1;
-  v2u64 sum2;
-
-  LW4(src, src_stride, src0, src1, src2, src3);
-  INSERT_W4_UB(src0, src1, src2, src3, vec);
-
-  sum0 = __msa_hadd_u_h(vec, vec);
-  sum1 = __msa_hadd_u_w(sum0, sum0);
-  sum0 = (v8u16)__msa_pckev_h((v8i16)sum1, (v8i16)sum1);
-  sum1 = __msa_hadd_u_w(sum0, sum0);
-  sum2 = __msa_hadd_u_d(sum1, sum1);
-  sum1 = (v4u32)__msa_srari_w((v4i32)sum2, 4);
-  sum_out = __msa_copy_u_w((v4i32)sum1, 0);
-
-  return sum_out;
-}
@@ -1,230 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vpx_ports/mem.h"
-
-unsigned int vp9_avg_8x8_c(const uint8_t *s, int p) {
-  int i, j;
-  int sum = 0;
-  for (i = 0; i < 8; ++i, s+=p)
-    for (j = 0; j < 8; sum += s[j], ++j) {}
-
-  return (sum + 32) >> 6;
-}
-
-unsigned int vp9_avg_4x4_c(const uint8_t *s, int p) {
-  int i, j;
-  int sum = 0;
-  for (i = 0; i < 4; ++i, s+=p)
-    for (j = 0; j < 4; sum += s[j], ++j) {}
-
-  return (sum + 8) >> 4;
-}
-
-// src_diff: first pass, 9 bit, dynamic range [-255, 255]
-//           second pass, 12 bit, dynamic range [-2040, 2040]
-static void hadamard_col8(const int16_t *src_diff, int src_stride,
-                          int16_t *coeff) {
-  int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
-  int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
-  int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
-  int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
-  int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
-  int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
-  int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
-  int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];
-
-  int16_t c0 = b0 + b2;
-  int16_t c1 = b1 + b3;
-  int16_t c2 = b0 - b2;
-  int16_t c3 = b1 - b3;
-  int16_t c4 = b4 + b6;
-  int16_t c5 = b5 + b7;
-  int16_t c6 = b4 - b6;
-  int16_t c7 = b5 - b7;
-
-  coeff[0] = c0 + c4;
-  coeff[7] = c1 + c5;
-  coeff[3] = c2 + c6;
-  coeff[4] = c3 + c7;
-  coeff[2] = c0 - c4;
-  coeff[6] = c1 - c5;
-  coeff[1] = c2 - c6;
-  coeff[5] = c3 - c7;
-}
-
-void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride,
-                        int16_t *coeff) {
-  int idx;
-  int16_t buffer[64];
-  int16_t *tmp_buf = &buffer[0];
-  for (idx = 0; idx < 8; ++idx) {
-    hadamard_col8(src_diff, src_stride, tmp_buf);  // src_diff: 9 bit
-                                                   // dynamic range [-255, 255]
-    tmp_buf += 8;
-    ++src_diff;
-  }
-
-  tmp_buf = &buffer[0];
-  for (idx = 0; idx < 8; ++idx) {
-    hadamard_col8(tmp_buf, 8, coeff);  // tmp_buf: 12 bit
-                                       // dynamic range [-2040, 2040]
-    coeff += 8;  // coeff: 15 bit
-                 // dynamic range [-16320, 16320]
-    ++tmp_buf;
-  }
-}
-
-// In place 16x16 2D Hadamard transform
-void vp9_hadamard_16x16_c(int16_t const *src_diff, int src_stride,
-                          int16_t *coeff) {
-  int idx;
-  for (idx = 0; idx < 4; ++idx) {
-    // src_diff: 9 bit, dynamic range [-255, 255]
-    int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
-                                + (idx & 0x01) * 8;
-    vp9_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
-  }
-
-  // coeff: 15 bit, dynamic range [-16320, 16320]
-  for (idx = 0; idx < 64; ++idx) {
-    int16_t a0 = coeff[0];
-    int16_t a1 = coeff[64];
-    int16_t a2 = coeff[128];
-    int16_t a3 = coeff[192];
-
-    int16_t b0 = (a0 + a1) >> 1;  // (a0 + a1): 16 bit, [-32640, 32640]
-    int16_t b1 = (a0 - a1) >> 1;  // b0-b3: 15 bit, dynamic range
-    int16_t b2 = (a2 + a3) >> 1;  // [-16320, 16320]
-    int16_t b3 = (a2 - a3) >> 1;
-
-    coeff[0] = b0 + b2;  // 16 bit, [-32640, 32640]
-    coeff[64] = b1 + b3;
-    coeff[128] = b0 - b2;
-    coeff[192] = b1 - b3;
-
-    ++coeff;
-  }
-}
-
-// coeff: 16 bits, dynamic range [-32640, 32640].
-// length: value range {16, 64, 256, 1024}.
-int vp9_satd_c(const int16_t *coeff, int length) {
-  int i;
-  int satd = 0;
-  for (i = 0; i < length; ++i)
-    satd += abs(coeff[i]);
-
-  // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
-  return satd;
-}
-
-// Integer projection onto row vectors.
-// height: value range {16, 32, 64}.
-void vp9_int_pro_row_c(int16_t hbuf[16], uint8_t const *ref,
-                       const int ref_stride, const int height) {
-  int idx;
-  const int norm_factor = height >> 1;
-  for (idx = 0; idx < 16; ++idx) {
-    int i;
-    hbuf[idx] = 0;
-    // hbuf[idx]: 14 bit, dynamic range [0, 16320].
-    for (i = 0; i < height; ++i)
-      hbuf[idx] += ref[i * ref_stride];
-    // hbuf[idx]: 9 bit, dynamic range [0, 510].
-    hbuf[idx] /= norm_factor;
-    ++ref;
-  }
-}
-
-// width: value range {16, 32, 64}.
-int16_t vp9_int_pro_col_c(uint8_t const *ref, const int width) {
-  int idx;
-  int16_t sum = 0;
-  // sum: 14 bit, dynamic range [0, 16320]
-  for (idx = 0; idx < width; ++idx)
-    sum += ref[idx];
-  return sum;
-}
-
-// ref: [0 - 510]
-// src: [0 - 510]
-// bwl: {2, 3, 4}
-int vp9_vector_var_c(int16_t const *ref, int16_t const *src,
-                     const int bwl) {
-  int i;
-  int width = 4 << bwl;
-  int sse = 0, mean = 0, var;
-
-  for (i = 0; i < width; ++i) {
-    int diff = ref[i] - src[i];  // diff: dynamic range [-510, 510], 10 bits.
-    mean += diff;                // mean: dynamic range 16 bits.
-    sse += diff * diff;          // sse:  dynamic range 26 bits.
-  }
-
-  // (mean * mean): dynamic range 31 bits.
-  var = sse - ((mean * mean) >> (bwl + 2));
-  return var;
-}
-
-void vp9_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
-                      int *min, int *max) {
-  int i, j;
-  *min = 255;
-  *max = 0;
-  for (i = 0; i < 8; ++i, s += p, d += dp) {
-    for (j = 0; j < 8; ++j) {
-      int diff = abs(s[j]-d[j]);
-      *min = diff < *min ? diff : *min;
-      *max = diff > *max ? diff : *max;
-    }
-  }
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-unsigned int vp9_highbd_avg_8x8_c(const uint8_t *s8, int p) {
-  int i, j;
-  int sum = 0;
-  const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
-  for (i = 0; i < 8; ++i, s+=p)
-    for (j = 0; j < 8; sum += s[j], ++j) {}
-
-  return (sum + 32) >> 6;
-}
-
-unsigned int vp9_highbd_avg_4x4_c(const uint8_t *s8, int p) {
-  int i, j;
-  int sum = 0;
-  const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
-  for (i = 0; i < 4; ++i, s+=p)
-    for (j = 0; j < 4; sum += s[j], ++j) {}
-
-  return (sum + 8) >> 4;
-}
-
-void vp9_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
-                             int dp, int *min, int *max) {
-  int i, j;
-  const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
-  const uint16_t* d = CONVERT_TO_SHORTPTR(d8);
-  *min = 255;
-  *max = 0;
-  for (i = 0; i < 8; ++i, s += p, d += dp) {
-    for (j = 0; j < 8; ++j) {
-      int diff = abs(s[j]-d[j]);
-      *min = diff < *min ? diff : *min;
-      *max = diff > *max ? diff : *max;
-    }
-  }
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
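Note: in the encoder these kernels are used together: the residual block is transformed with the Hadamard routines and the rate proxy is the sum of absolute transformed coefficients. A minimal sketch of that pattern, mirroring the block_yrd() hunk later in this diff (the buffer names below are illustrative):

    int16_t coeff[256];
    vpx_hadamard_16x16(src_diff, diff_stride, coeff);  /* 16x16 residual -> Hadamard domain */
    const int rate_proxy = vpx_satd(coeff, 256);       /* sum of absolute coefficients */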
@@ -558,16 +558,16 @@ static int compute_minmax_8x8(const uint8_t *s, int sp, const uint8_t *d,
 if (x8_idx < pixels_wide && y8_idx < pixels_high) {
 #if CONFIG_VP9_HIGHBITDEPTH
 if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
-vp9_highbd_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+vpx_highbd_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
 d + y8_idx * dp + x8_idx, dp,
 &min, &max);
 } else {
-vp9_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+vpx_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
 d + y8_idx * dp + x8_idx, dp,
 &min, &max);
 }
 #else
-vp9_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+vpx_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
 d + y8_idx * dp + x8_idx, dp,
 &min, &max);
 #endif
@@ -599,18 +599,18 @@ static void fill_variance_4x4avg(const uint8_t *s, int sp, const uint8_t *d,
 int d_avg = 128;
 #if CONFIG_VP9_HIGHBITDEPTH
 if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
-s_avg = vp9_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+s_avg = vpx_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp);
 if (!is_key_frame)
-d_avg = vp9_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+d_avg = vpx_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp);
 } else {
-s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+s_avg = vpx_avg_4x4(s + y4_idx * sp + x4_idx, sp);
 if (!is_key_frame)
-d_avg = vp9_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+d_avg = vpx_avg_4x4(d + y4_idx * dp + x4_idx, dp);
 }
 #else
-s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+s_avg = vpx_avg_4x4(s + y4_idx * sp + x4_idx, sp);
 if (!is_key_frame)
-d_avg = vp9_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+d_avg = vpx_avg_4x4(d + y4_idx * dp + x4_idx, dp);
 #endif
 sum = s_avg - d_avg;
 sse = sum * sum;
@@ -638,18 +638,18 @@ static void fill_variance_8x8avg(const uint8_t *s, int sp, const uint8_t *d,
 int d_avg = 128;
 #if CONFIG_VP9_HIGHBITDEPTH
 if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
-s_avg = vp9_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+s_avg = vpx_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
 if (!is_key_frame)
-d_avg = vp9_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+d_avg = vpx_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
 } else {
-s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+s_avg = vpx_avg_8x8(s + y8_idx * sp + x8_idx, sp);
 if (!is_key_frame)
-d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+d_avg = vpx_avg_8x8(d + y8_idx * dp + x8_idx, dp);
 }
 #else
-s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+s_avg = vpx_avg_8x8(s + y8_idx * sp + x8_idx, sp);
 if (!is_key_frame)
-d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+d_avg = vpx_avg_8x8(d + y8_idx * dp + x8_idx, dp);
 #endif
 sum = s_avg - d_avg;
 sse = sum * sum;
@@ -1755,7 +1755,7 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) {
 int center, offset = 0;
 int bw = 4 << bwl; // redundant variable, to be changed in the experiments.
 for (d = 0; d <= bw; d += 16) {
-this_sad = vp9_vector_var(&ref[d], src, bwl);
+this_sad = vpx_vector_var(&ref[d], src, bwl);
 if (this_sad < best_sad) {
 best_sad = this_sad;
 offset = d;
@@ -1768,7 +1768,7 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) {
 // check limit
 if (this_pos < 0 || this_pos > bw)
 continue;
-this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
+this_sad = vpx_vector_var(&ref[this_pos], src, bwl);
 if (this_sad < best_sad) {
 best_sad = this_sad;
 center = this_pos;
@@ -1781,7 +1781,7 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) {
 // check limit
 if (this_pos < 0 || this_pos > bw)
 continue;
-this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
+this_sad = vpx_vector_var(&ref[this_pos], src, bwl);
 if (this_sad < best_sad) {
 best_sad = this_sad;
 center = this_pos;
@@ -1794,7 +1794,7 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) {
 // check limit
 if (this_pos < 0 || this_pos > bw)
 continue;
-this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
+this_sad = vpx_vector_var(&ref[this_pos], src, bwl);
 if (this_sad < best_sad) {
 best_sad = this_sad;
 center = this_pos;
@@ -1807,7 +1807,7 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) {
 // check limit
 if (this_pos < 0 || this_pos > bw)
 continue;
-this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
+this_sad = vpx_vector_var(&ref[this_pos], src, bwl);
 if (this_sad < best_sad) {
 best_sad = this_sad;
 center = this_pos;
@@ -1876,25 +1876,25 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
 // Set up prediction 1-D reference set
 ref_buf = xd->plane[0].pre[0].buf - (bw >> 1);
 for (idx = 0; idx < search_width; idx += 16) {
-vp9_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh);
+vpx_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh);
 ref_buf += 16;
 }

 ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride;
 for (idx = 0; idx < search_height; ++idx) {
-vbuf[idx] = vp9_int_pro_col(ref_buf, bw) >> norm_factor;
+vbuf[idx] = vpx_int_pro_col(ref_buf, bw) >> norm_factor;
 ref_buf += ref_stride;
 }

 // Set up src 1-D reference set
 for (idx = 0; idx < bw; idx += 16) {
 src_buf = x->plane[0].src.buf + idx;
-vp9_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh);
+vpx_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh);
 }

 src_buf = x->plane[0].src.buf;
 for (idx = 0; idx < bh; ++idx) {
-src_vbuf[idx] = vp9_int_pro_col(src_buf, bw) >> norm_factor;
+src_vbuf[idx] = vpx_int_pro_col(src_buf, bw) >> norm_factor;
 src_buf += src_stride;
 }

@@ -619,14 +619,14 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
 scan_order->scan, scan_order->iscan);
 break;
 case TX_16X16:
-vp9_hadamard_16x16(src_diff, diff_stride, (int16_t *)coeff);
+vpx_hadamard_16x16(src_diff, diff_stride, (int16_t *)coeff);
 vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
 p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
 pd->dequant, eob,
 scan_order->scan, scan_order->iscan);
 break;
 case TX_8X8:
-vp9_hadamard_8x8(src_diff, diff_stride, (int16_t *)coeff);
+vpx_hadamard_8x8(src_diff, diff_stride, (int16_t *)coeff);
 vp9_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
 p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
 pd->dequant, eob,
@@ -673,7 +673,7 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
 if (*eob == 1)
 *rate += (int)abs(qcoeff[0]);
 else if (*eob > 1)
-*rate += vp9_satd((const int16_t *)qcoeff, step << 4);
+*rate += vpx_satd((const int16_t *)qcoeff, step << 4);

 *dist += vp9_block_error_fp(coeff, dqcoeff, step << 4) >> shift;
 }
@@ -1,423 +0,0 @@
|
|||||||
/*
|
|
||||||
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
|
||||||
*
|
|
||||||
* Use of this source code is governed by a BSD-style license
|
|
||||||
* that can be found in the LICENSE file in the root of the source
|
|
||||||
* tree. An additional intellectual property rights grant can be found
|
|
||||||
* in the file PATENTS. All contributing project authors may
|
|
||||||
* be found in the AUTHORS file in the root of the source tree.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <emmintrin.h>
|
|
||||||
|
|
||||||
#include "./vp9_rtcd.h"
|
|
||||||
#include "vpx_ports/mem.h"
|
|
||||||
|
|
||||||
void vp9_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
|
|
||||||
                         int *min, int *max) {
  __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
  u0 = _mm_setzero_si128();
  // Row 0
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff0 = _mm_max_epi16(diff, negdiff);
  // Row 1
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(absdiff0, absdiff);
  minabsdiff = _mm_min_epi16(absdiff0, absdiff);
  // Row 2
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 3
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 4
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 5
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 6
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 7
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);

  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
  *max = _mm_extract_epi16(maxabsdiff, 0);

  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
  *min = _mm_extract_epi16(minabsdiff, 0);
}

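Note: the min/max kernel above mirrors the plain C routine that this change introduces as vpx_minmax_8x8_c further down; as a point of reference, a minimal scalar sketch of the same computation (the helper name minmax_8x8_ref is illustrative, not part of the change):

  // Smallest and largest |s[i][j] - d[i][j]| over an 8x8 block; the SSE2
  // code above computes the same pair with packed 16-bit max/min lanes.
  static void minmax_8x8_ref(const uint8_t *s, int p, const uint8_t *d, int dp,
                             int *min, int *max) {
    int i, j;
    *min = 255;
    *max = 0;
    for (i = 0; i < 8; ++i, s += p, d += dp) {
      for (j = 0; j < 8; ++j) {
        const int diff = s[j] > d[j] ? s[j] - d[j] : d[j] - s[j];
        if (diff < *min) *min = diff;
        if (diff > *max) *max = diff;
      }
    }
  }
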
unsigned int vp9_avg_8x8_sse2(const uint8_t *s, int p) {
  __m128i s0, s1, u0;
  unsigned int avg = 0;
  u0 = _mm_setzero_si128();
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);

  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32));
  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
  avg = _mm_extract_epi16(s0, 0);
  return (avg + 32) >> 6;
}

unsigned int vp9_avg_4x4_sse2(const uint8_t *s, int p) {
  __m128i s0, s1, u0;
  unsigned int avg = 0;
  u0 = _mm_setzero_si128();
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);

  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
  avg = _mm_extract_epi16(s0, 0);
  return (avg + 8) >> 4;
}

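Both averaging kernels follow the same pattern: widen the pixels to 16-bit, add with saturation, reduce horizontally, then round. The rounding constants are half the pixel count, so the result is the rounded mean: (sum + 32) >> 6 over 64 pixels for 8x8 and (sum + 8) >> 4 over 16 pixels for 4x4. A scalar sketch of the 8x8 case (helper name illustrative; it matches the C version, vpx_avg_8x8_c, that appears elsewhere in this change):

  static unsigned int avg_8x8_ref(const uint8_t *s, int p) {
    int i, j, sum = 0;
    for (i = 0; i < 8; ++i, s += p)
      for (j = 0; j < 8; ++j)
        sum += s[j];
    return (sum + 32) >> 6;  // add 64/2 before dividing by 64, i.e. round
  }
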
static void hadamard_col8_sse2(__m128i *in, int iter) {
  __m128i a0 = in[0];
  __m128i a1 = in[1];
  __m128i a2 = in[2];
  __m128i a3 = in[3];
  __m128i a4 = in[4];
  __m128i a5 = in[5];
  __m128i a6 = in[6];
  __m128i a7 = in[7];

  __m128i b0 = _mm_add_epi16(a0, a1);
  __m128i b1 = _mm_sub_epi16(a0, a1);
  __m128i b2 = _mm_add_epi16(a2, a3);
  __m128i b3 = _mm_sub_epi16(a2, a3);
  __m128i b4 = _mm_add_epi16(a4, a5);
  __m128i b5 = _mm_sub_epi16(a4, a5);
  __m128i b6 = _mm_add_epi16(a6, a7);
  __m128i b7 = _mm_sub_epi16(a6, a7);

  a0 = _mm_add_epi16(b0, b2);
  a1 = _mm_add_epi16(b1, b3);
  a2 = _mm_sub_epi16(b0, b2);
  a3 = _mm_sub_epi16(b1, b3);
  a4 = _mm_add_epi16(b4, b6);
  a5 = _mm_add_epi16(b5, b7);
  a6 = _mm_sub_epi16(b4, b6);
  a7 = _mm_sub_epi16(b5, b7);

  if (iter == 0) {
    b0 = _mm_add_epi16(a0, a4);
    b7 = _mm_add_epi16(a1, a5);
    b3 = _mm_add_epi16(a2, a6);
    b4 = _mm_add_epi16(a3, a7);
    b2 = _mm_sub_epi16(a0, a4);
    b6 = _mm_sub_epi16(a1, a5);
    b1 = _mm_sub_epi16(a2, a6);
    b5 = _mm_sub_epi16(a3, a7);

    a0 = _mm_unpacklo_epi16(b0, b1);
    a1 = _mm_unpacklo_epi16(b2, b3);
    a2 = _mm_unpackhi_epi16(b0, b1);
    a3 = _mm_unpackhi_epi16(b2, b3);
    a4 = _mm_unpacklo_epi16(b4, b5);
    a5 = _mm_unpacklo_epi16(b6, b7);
    a6 = _mm_unpackhi_epi16(b4, b5);
    a7 = _mm_unpackhi_epi16(b6, b7);

    b0 = _mm_unpacklo_epi32(a0, a1);
    b1 = _mm_unpacklo_epi32(a4, a5);
    b2 = _mm_unpackhi_epi32(a0, a1);
    b3 = _mm_unpackhi_epi32(a4, a5);
    b4 = _mm_unpacklo_epi32(a2, a3);
    b5 = _mm_unpacklo_epi32(a6, a7);
    b6 = _mm_unpackhi_epi32(a2, a3);
    b7 = _mm_unpackhi_epi32(a6, a7);

    in[0] = _mm_unpacklo_epi64(b0, b1);
    in[1] = _mm_unpackhi_epi64(b0, b1);
    in[2] = _mm_unpacklo_epi64(b2, b3);
    in[3] = _mm_unpackhi_epi64(b2, b3);
    in[4] = _mm_unpacklo_epi64(b4, b5);
    in[5] = _mm_unpackhi_epi64(b4, b5);
    in[6] = _mm_unpacklo_epi64(b6, b7);
    in[7] = _mm_unpackhi_epi64(b6, b7);
  } else {
    in[0] = _mm_add_epi16(a0, a4);
    in[7] = _mm_add_epi16(a1, a5);
    in[3] = _mm_add_epi16(a2, a6);
    in[4] = _mm_add_epi16(a3, a7);
    in[2] = _mm_sub_epi16(a0, a4);
    in[6] = _mm_sub_epi16(a1, a5);
    in[1] = _mm_sub_epi16(a2, a6);
    in[5] = _mm_sub_epi16(a3, a7);
  }
}

void vp9_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
                           int16_t *coeff) {
  __m128i src[8];
  src[0] = _mm_load_si128((const __m128i *)src_diff);
  src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride));

  hadamard_col8_sse2(src, 0);
  hadamard_col8_sse2(src, 1);

  _mm_store_si128((__m128i *)coeff, src[0]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[1]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[2]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[3]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[4]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[5]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[6]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[7]);
}

void vp9_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
                             int16_t *coeff) {
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
                                + (idx & 0x01) * 8;
    vp9_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
  }

  for (idx = 0; idx < 64; idx += 8) {
    __m128i coeff0 = _mm_load_si128((const __m128i *)coeff);
    __m128i coeff1 = _mm_load_si128((const __m128i *)(coeff + 64));
    __m128i coeff2 = _mm_load_si128((const __m128i *)(coeff + 128));
    __m128i coeff3 = _mm_load_si128((const __m128i *)(coeff + 192));

    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);

    b0 = _mm_srai_epi16(b0, 1);
    b1 = _mm_srai_epi16(b1, 1);
    b2 = _mm_srai_epi16(b2, 1);
    b3 = _mm_srai_epi16(b3, 1);

    coeff0 = _mm_add_epi16(b0, b2);
    coeff1 = _mm_add_epi16(b1, b3);
    _mm_store_si128((__m128i *)coeff, coeff0);
    _mm_store_si128((__m128i *)(coeff + 64), coeff1);

    coeff2 = _mm_sub_epi16(b0, b2);
    coeff3 = _mm_sub_epi16(b1, b3);
    _mm_store_si128((__m128i *)(coeff + 128), coeff2);
    _mm_store_si128((__m128i *)(coeff + 192), coeff3);

    coeff += 8;
  }
}

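The 16x16 transform is assembled from four 8x8 Hadamard transforms followed by a combine pass that shifts by 1 to keep coefficients within 15 bits. A rough usage sketch of how an encoder pairs it with the SATD kernel below (buffer and stride names are illustrative, not taken from this change; DECLARE_ALIGNED is the alignment macro from vpx_ports/mem.h, needed because these kernels use aligned loads and stores):

  DECLARE_ALIGNED(16, int16_t, coeff[16 * 16]);
  vp9_hadamard_16x16_sse2(src_diff, diff_stride, coeff);  // transform residual
  {
    const int satd = vp9_satd_sse2(coeff, 16 * 16);  // sum of |coeff|
    (void)satd;  // e.g. used by the caller as a coding-cost proxy
  }
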
int vp9_satd_sse2(const int16_t *coeff, int length) {
  int i;
  const __m128i zero = _mm_setzero_si128();
  __m128i accum = zero;

  for (i = 0; i < length; i += 8) {
    const __m128i src_line = _mm_load_si128((const __m128i *)coeff);
    const __m128i inv = _mm_sub_epi16(zero, src_line);
    const __m128i abs = _mm_max_epi16(src_line, inv);  // abs(src_line)
    const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero);
    const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero);
    const __m128i sum = _mm_add_epi32(abs_lo, abs_hi);
    accum = _mm_add_epi32(accum, sum);
    coeff += 8;
  }

  {  // cascading summation of accum
    __m128i hi = _mm_srli_si128(accum, 8);
    accum = _mm_add_epi32(accum, hi);
    hi = _mm_srli_epi64(accum, 32);
    accum = _mm_add_epi32(accum, hi);
  }

  return _mm_cvtsi128_si32(accum);
}

void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
                          const int ref_stride, const int height) {
  int idx;
  __m128i zero = _mm_setzero_si128();
  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
  __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
  __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
  __m128i t0, t1;
  int height_1 = height - 1;
  ref += ref_stride;

  for (idx = 1; idx < height_1; idx += 2) {
    src_line = _mm_loadu_si128((const __m128i *)ref);
    t0 = _mm_unpacklo_epi8(src_line, zero);
    t1 = _mm_unpackhi_epi8(src_line, zero);
    s0 = _mm_adds_epu16(s0, t0);
    s1 = _mm_adds_epu16(s1, t1);
    ref += ref_stride;

    src_line = _mm_loadu_si128((const __m128i *)ref);
    t0 = _mm_unpacklo_epi8(src_line, zero);
    t1 = _mm_unpackhi_epi8(src_line, zero);
    s0 = _mm_adds_epu16(s0, t0);
    s1 = _mm_adds_epu16(s1, t1);
    ref += ref_stride;
  }

  src_line = _mm_loadu_si128((const __m128i *)ref);
  t0 = _mm_unpacklo_epi8(src_line, zero);
  t1 = _mm_unpackhi_epi8(src_line, zero);
  s0 = _mm_adds_epu16(s0, t0);
  s1 = _mm_adds_epu16(s1, t1);

  if (height == 64) {
    s0 = _mm_srai_epi16(s0, 5);
    s1 = _mm_srai_epi16(s1, 5);
  } else if (height == 32) {
    s0 = _mm_srai_epi16(s0, 4);
    s1 = _mm_srai_epi16(s1, 4);
  } else {
    s0 = _mm_srai_epi16(s0, 3);
    s1 = _mm_srai_epi16(s1, 3);
  }

  _mm_storeu_si128((__m128i *)hbuf, s0);
  hbuf += 8;
  _mm_storeu_si128((__m128i *)hbuf, s1);
}

int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) {
  __m128i zero = _mm_setzero_si128();
  __m128i src_line = _mm_load_si128((const __m128i *)ref);
  __m128i s0 = _mm_sad_epu8(src_line, zero);
  __m128i s1;
  int i;

  for (i = 16; i < width; i += 16) {
    ref += 16;
    src_line = _mm_load_si128((const __m128i *)ref);
    s1 = _mm_sad_epu8(src_line, zero);
    s0 = _mm_adds_epu16(s0, s1);
  }

  s1 = _mm_srli_si128(s0, 8);
  s0 = _mm_adds_epu16(s0, s1);

  return _mm_extract_epi16(s0, 0);
}

int vp9_vector_var_sse2(int16_t const *ref, int16_t const *src,
                        const int bwl) {
  int idx;
  int width = 4 << bwl;
  int16_t mean;
  __m128i v0 = _mm_loadu_si128((const __m128i *)ref);
  __m128i v1 = _mm_load_si128((const __m128i *)src);
  __m128i diff = _mm_subs_epi16(v0, v1);
  __m128i sum = diff;
  __m128i sse = _mm_madd_epi16(diff, diff);

  ref += 8;
  src += 8;

  for (idx = 8; idx < width; idx += 8) {
    v0 = _mm_loadu_si128((const __m128i *)ref);
    v1 = _mm_load_si128((const __m128i *)src);
    diff = _mm_subs_epi16(v0, v1);

    sum = _mm_add_epi16(sum, diff);
    v0 = _mm_madd_epi16(diff, diff);
    sse = _mm_add_epi32(sse, v0);

    ref += 8;
    src += 8;
  }

  v0 = _mm_srli_si128(sum, 8);
  sum = _mm_add_epi16(sum, v0);
  v0 = _mm_srli_epi64(sum, 32);
  sum = _mm_add_epi16(sum, v0);
  v0 = _mm_srli_epi32(sum, 16);
  sum = _mm_add_epi16(sum, v0);

  v1 = _mm_srli_si128(sse, 8);
  sse = _mm_add_epi32(sse, v1);
  v1 = _mm_srli_epi64(sse, 32);
  sse = _mm_add_epi32(sse, v1);

  mean = _mm_extract_epi16(sum, 0);

  return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2));
}

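One note on the value returned by the vector-variance kernel: since width = 4 << bwl, the final expression sse - ((mean * mean) >> (bwl + 2)) equals sum(d*d) - sum(d)*sum(d)/width for the difference vector d = ref - src (the local named mean actually holds the sum of the differences), i.e. width times the variance of d. A scalar sketch of the same computation (helper name illustrative):

  static int vector_var_ref(const int16_t *ref, const int16_t *src, int bwl) {
    const int width = 4 << bwl;
    int i, sum = 0, sse = 0;
    for (i = 0; i < width; ++i) {
      const int d = ref[i] - src[i];
      sum += d;      // the SIMD code's "mean" holds this running sum
      sse += d * d;
    }
    return sse - ((sum * sum) >> (bwl + 2));  // sse - sum*sum / width
  }
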
@@ -1,121 +0,0 @@
;
; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%define private_prefix vp9

%include "third_party/x86inc/x86inc.asm"

; This file provides SSSE3 version of the forward transformation. Part
; of the macro definitions are originally derived from the ffmpeg project.
; The current version applies to x86 64-bit only.

SECTION .text

%if ARCH_X86_64
; matrix transpose
%macro INTERLEAVE_2X 4
  punpckh%1 m%4, m%2, m%3
  punpckl%1 m%2, m%3
  SWAP %3, %4
%endmacro

%macro TRANSPOSE8X8 9
  INTERLEAVE_2X wd, %1, %2, %9
  INTERLEAVE_2X wd, %3, %4, %9
  INTERLEAVE_2X wd, %5, %6, %9
  INTERLEAVE_2X wd, %7, %8, %9

  INTERLEAVE_2X dq, %1, %3, %9
  INTERLEAVE_2X dq, %2, %4, %9
  INTERLEAVE_2X dq, %5, %7, %9
  INTERLEAVE_2X dq, %6, %8, %9

  INTERLEAVE_2X qdq, %1, %5, %9
  INTERLEAVE_2X qdq, %3, %7, %9
  INTERLEAVE_2X qdq, %2, %6, %9
  INTERLEAVE_2X qdq, %4, %8, %9

  SWAP %2, %5
  SWAP %4, %7
%endmacro

%macro HMD8_1D 0
  psubw m8, m0, m1
  psubw m9, m2, m3
  paddw m0, m1
  paddw m2, m3
  SWAP 1, 8
  SWAP 3, 9
  psubw m8, m4, m5
  psubw m9, m6, m7
  paddw m4, m5
  paddw m6, m7
  SWAP 5, 8
  SWAP 7, 9

  psubw m8, m0, m2
  psubw m9, m1, m3
  paddw m0, m2
  paddw m1, m3
  SWAP 2, 8
  SWAP 3, 9
  psubw m8, m4, m6
  psubw m9, m5, m7
  paddw m4, m6
  paddw m5, m7
  SWAP 6, 8
  SWAP 7, 9

  psubw m8, m0, m4
  psubw m9, m1, m5
  paddw m0, m4
  paddw m1, m5
  SWAP 4, 8
  SWAP 5, 9
  psubw m8, m2, m6
  psubw m9, m3, m7
  paddw m2, m6
  paddw m3, m7
  SWAP 6, 8
  SWAP 7, 9
%endmacro

INIT_XMM ssse3
cglobal hadamard_8x8, 3, 5, 10, input, stride, output
  lea r3, [2 * strideq]
  lea r4, [4 * strideq]

  mova m0, [inputq]
  mova m1, [inputq + r3]
  lea inputq, [inputq + r4]
  mova m2, [inputq]
  mova m3, [inputq + r3]
  lea inputq, [inputq + r4]
  mova m4, [inputq]
  mova m5, [inputq + r3]
  lea inputq, [inputq + r4]
  mova m6, [inputq]
  mova m7, [inputq + r3]

  HMD8_1D
  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
  HMD8_1D

  mova [outputq + 0], m0
  mova [outputq + 16], m1
  mova [outputq + 32], m2
  mova [outputq + 48], m3
  mova [outputq + 64], m4
  mova [outputq + 80], m5
  mova [outputq + 96], m6
  mova [outputq + 112], m7

  RET
%endif
@@ -17,7 +17,6 @@ VP9_CX_SRCS_REMOVE-no += $(VP9_COMMON_SRCS_REMOVE-no)

VP9_CX_SRCS-yes += vp9_cx_iface.c

VP9_CX_SRCS-yes += encoder/vp9_avg.c
VP9_CX_SRCS-yes += encoder/vp9_bitstream.c
VP9_CX_SRCS-yes += encoder/vp9_context_tree.c
VP9_CX_SRCS-yes += encoder/vp9_context_tree.h
@@ -93,7 +92,6 @@ VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h

VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_avg_intrin_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c
@@ -114,7 +112,6 @@ endif
ifeq ($(ARCH_X86_64),yes)
ifeq ($(CONFIG_USE_X86INC),yes)
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3_x86_64.asm
endif
endif

@@ -131,10 +128,8 @@ ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_error_neon.c
endif
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_avg_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c

VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_avg_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_error_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct4x4_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct8x8_msa.c
@@ -11,7 +11,7 @@
#include <arm_neon.h>
#include <assert.h>

#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"

#include "vpx/vpx_integer.h"
@@ -24,7 +24,7 @@ static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
  return vget_lane_u32(c, 0);
}

unsigned int vp9_avg_4x4_neon(const uint8_t *s, int p) {
unsigned int vpx_avg_4x4_neon(const uint8_t *s, int p) {
  uint16x8_t v_sum;
  uint32x2_t v_s0 = vdup_n_u32(0);
  uint32x2_t v_s1 = vdup_n_u32(0);
@@ -36,7 +36,7 @@ unsigned int vp9_avg_4x4_neon(const uint8_t *s, int p) {
  return (horizontal_add_u16x8(v_sum) + 8) >> 4;
}

unsigned int vp9_avg_8x8_neon(const uint8_t *s, int p) {
unsigned int vpx_avg_8x8_neon(const uint8_t *s, int p) {
  uint8x8_t v_s0 = vld1_u8(s);
  const uint8x8_t v_s1 = vld1_u8(s + p);
  uint16x8_t v_sum = vaddl_u8(v_s0, v_s1);
@@ -64,7 +64,7 @@ unsigned int vp9_avg_8x8_neon(const uint8_t *s, int p) {

// coeff: 16 bits, dynamic range [-32640, 32640].
// length: value range {16, 64, 256, 1024}.
int vp9_satd_neon(const int16_t *coeff, int length) {
int vpx_satd_neon(const int16_t *coeff, int length) {
  const int16x4_t zero = vdup_n_s16(0);
  int32x4_t accum = vdupq_n_s32(0);

@@ -89,7 +89,7 @@ int vp9_satd_neon(const int16_t *coeff, int length) {
  }
}

void vp9_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
                          const int ref_stride, const int height) {
  int i;
  uint16x8_t vec_sum_lo = vdupq_n_u16(0);
@@ -142,7 +142,7 @@ void vp9_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi));
}

int16_t vp9_int_pro_col_neon(uint8_t const *ref, const int width) {
int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) {
  int i;
  uint16x8_t vec_sum = vdupq_n_u16(0);

@@ -158,7 +158,7 @@ int16_t vp9_int_pro_col_neon(uint8_t const *ref, const int width) {

// ref, src = [0, 510] - max diff = 16-bits
// bwl = {2, 3, 4}, width = {16, 32, 64}
int vp9_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) {
int vpx_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) {
  int width = 4 << bwl;
  int32x4_t sse = vdupq_n_s32(0);
  int16x8_t total = vdupq_n_s16(0);
@@ -7,11 +7,12 @@
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
#include "./vp10_rtcd.h"
#include <stdlib.h>
#include "vp10/common/common.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"

unsigned int vp10_avg_8x8_c(const uint8_t *s, int p) {
unsigned int vpx_avg_8x8_c(const uint8_t *s, int p) {
  int i, j;
  int sum = 0;
  for (i = 0; i < 8; ++i, s+=p)
@@ -20,7 +21,7 @@ unsigned int vp10_avg_8x8_c(const uint8_t *s, int p) {
  return (sum + 32) >> 6;
}

unsigned int vp10_avg_4x4_c(const uint8_t *s, int p) {
unsigned int vpx_avg_4x4_c(const uint8_t *s, int p) {
  int i, j;
  int sum = 0;
  for (i = 0; i < 4; ++i, s+=p)
@@ -61,7 +62,7 @@ static void hadamard_col8(const int16_t *src_diff, int src_stride,
  coeff[5] = c3 - c7;
}

void vp10_hadamard_8x8_c(int16_t const *src_diff, int src_stride,
void vpx_hadamard_8x8_c(int16_t const *src_diff, int src_stride,
                         int16_t *coeff) {
  int idx;
  int16_t buffer[64];
@@ -84,14 +85,14 @@ void vp10_hadamard_8x8_c(int16_t const *src_diff, int src_stride,
}

// In place 16x16 2D Hadamard transform
void vp10_hadamard_16x16_c(int16_t const *src_diff, int src_stride,
void vpx_hadamard_16x16_c(int16_t const *src_diff, int src_stride,
                          int16_t *coeff) {
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    // src_diff: 9 bit, dynamic range [-255, 255]
    int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
                                + (idx & 0x01) * 8;
    vp10_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
    vpx_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
  }

  // coeff: 15 bit, dynamic range [-16320, 16320]
@@ -117,19 +118,19 @@ void vp10_hadamard_16x16_c(int16_t const *src_diff, int src_stride,

// coeff: 16 bits, dynamic range [-32640, 32640].
// length: value range {16, 64, 256, 1024}.
int16_t vp10_satd_c(const int16_t *coeff, int length) {
int vpx_satd_c(const int16_t *coeff, int length) {
  int i;
  int satd = 0;
  for (i = 0; i < length; ++i)
    satd += abs(coeff[i]);

  // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
  return (int16_t)satd;
  return satd;
}

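The satd return-type change above is not cosmetic: coeff values can reach 32640 and length can be 1024, so the accumulated sum of absolute values can reach 32640 * 1024 = 33,423,360, which needs 26 bits, exactly as the comment says. Casting that to int16_t could silently truncate, and accumulating it in 16-bit SIMD lanes has the same hazard, which is why the SSE2 rewrite further down in this change unpacks the absolute values to 32-bit lanes before summing.
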
// Integer projection onto row vectors.
// height: value range {16, 32, 64}.
void vp10_int_pro_row_c(int16_t hbuf[16], uint8_t const *ref,
void vpx_int_pro_row_c(int16_t hbuf[16], uint8_t const *ref,
                        const int ref_stride, const int height) {
  int idx;
  const int norm_factor = height >> 1;
@@ -146,7 +147,7 @@ void vp10_int_pro_row_c(int16_t hbuf[16], uint8_t const *ref,
}

// width: value range {16, 32, 64}.
int16_t vp10_int_pro_col_c(uint8_t const *ref, const int width) {
int16_t vpx_int_pro_col_c(uint8_t const *ref, const int width) {
  int idx;
  int16_t sum = 0;
  // sum: 14 bit, dynamic range [0, 16320]
@@ -158,7 +159,7 @@ int16_t vp10_int_pro_col_c(uint8_t const *ref, const int width) {
// ref: [0 - 510]
// src: [0 - 510]
// bwl: {2, 3, 4}
int vp10_vector_var_c(int16_t const *ref, int16_t const *src,
int vpx_vector_var_c(int16_t const *ref, int16_t const *src,
                     const int bwl) {
  int i;
  int width = 4 << bwl;
@@ -175,7 +176,7 @@ int vp10_vector_var_c(int16_t const *ref, int16_t const *src,
  return var;
}

void vp10_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
                      int *min, int *max) {
  int i, j;
  *min = 255;
@@ -190,7 +191,7 @@ void vp10_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
}

#if CONFIG_VP9_HIGHBITDEPTH
unsigned int vp10_highbd_avg_8x8_c(const uint8_t *s8, int p) {
unsigned int vpx_highbd_avg_8x8_c(const uint8_t *s8, int p) {
  int i, j;
  int sum = 0;
  const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
@@ -200,7 +201,7 @@ unsigned int vp10_highbd_avg_8x8_c(const uint8_t *s8, int p) {
  return (sum + 32) >> 6;
}

unsigned int vp10_highbd_avg_4x4_c(const uint8_t *s8, int p) {
unsigned int vpx_highbd_avg_4x4_c(const uint8_t *s8, int p) {
  int i, j;
  int sum = 0;
  const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
@@ -210,7 +211,7 @@ unsigned int vp10_highbd_avg_4x4_c(const uint8_t *s8, int p) {
  return (sum + 8) >> 4;
}

void vp10_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
                             int dp, int *min, int *max) {
  int i, j;
  const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
@@ -8,10 +8,10 @@
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "./vp10_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"

uint32_t vp10_avg_8x8_msa(const uint8_t *src, int32_t src_stride) {
uint32_t vpx_avg_8x8_msa(const uint8_t *src, int32_t src_stride) {
  uint32_t sum_out;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
@@ -33,7 +33,7 @@ uint32_t vp10_avg_8x8_msa(const uint8_t *src, int32_t src_stride) {
  return sum_out;
}

uint32_t vp10_avg_4x4_msa(const uint8_t *src, int32_t src_stride) {
uint32_t vpx_avg_4x4_msa(const uint8_t *src, int32_t src_stride) {
  uint32_t sum_out;
  uint32_t src0, src1, src2, src3;
  v16u8 vec = { 0 };
@@ -252,6 +252,18 @@ DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3_x86_64.asm
DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx_x86_64.asm
endif
endif

# avg
DSP_SRCS-yes += avg.c
DSP_SRCS-$(HAVE_SSE2) += x86/avg_intrin_sse2.c
DSP_SRCS-$(HAVE_NEON) += arm/avg_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c
ifeq ($(ARCH_X86_64),yes)
ifeq ($(CONFIG_USE_X86INC),yes)
DSP_SRCS-$(HAVE_SSSE3) += x86/avg_ssse3_x86_64.asm
endif
endif

endif # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER

ifeq ($(CONFIG_ENCODERS),yes)
@@ -998,6 +998,35 @@ specialize qw/vpx_sad4x4 mmx neon msa/, "$sse_x86inc";
#
# Avg
#
if ((vpx_config("CONFIG_VP9_ENCODER") eq "yes") || (vpx_config("CONFIG_VP10_ENCODER") eq "yes")) {
  add_proto qw/unsigned int vpx_avg_8x8/, "const uint8_t *, int p";
  specialize qw/vpx_avg_8x8 sse2 neon msa/;

  add_proto qw/unsigned int vpx_avg_4x4/, "const uint8_t *, int p";
  specialize qw/vpx_avg_4x4 sse2 neon msa/;

  add_proto qw/void vpx_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
  specialize qw/vpx_minmax_8x8 sse2/;

  add_proto qw/void vpx_hadamard_8x8/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
  specialize qw/vpx_hadamard_8x8 sse2/, "$ssse3_x86_64_x86inc";

  add_proto qw/void vpx_hadamard_16x16/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
  specialize qw/vpx_hadamard_16x16 sse2/;

  add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";
  specialize qw/vpx_satd sse2 neon/;

  add_proto qw/void vpx_int_pro_row/, "int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height";
  specialize qw/vpx_int_pro_row sse2 neon/;

  add_proto qw/int16_t vpx_int_pro_col/, "uint8_t const *ref, const int width";
  specialize qw/vpx_int_pro_col sse2 neon/;

  add_proto qw/int vpx_vector_var/, "int16_t const *ref, int16_t const *src, const int bwl";
  specialize qw/vpx_vector_var neon sse2/;
}  # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER

add_proto qw/unsigned int vpx_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vpx_sad64x64_avg avx2 msa/, "$sse2_x86inc";

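With these prototypes registered, callers pull in the generated dispatch header and call the unsuffixed names; run-time CPU detection then routes each call to the best implementation that was built (C, SSE2, NEON or MSA). A hedged usage sketch, assuming the library has already run its vpx_dsp_rtcd() setup during initialization:

  #include "./vpx_dsp_rtcd.h"

  // Illustrative only: after rtcd initialization, the plain names below
  // resolve to the best compiled-in version for the running CPU.
  static void example(const uint8_t *src, int stride,
                      const uint8_t *pred, int pred_stride) {
    int mn, mx;
    const unsigned int mean = vpx_avg_8x8(src, stride);
    vpx_minmax_8x8(src, stride, pred, pred_stride, &mn, &mx);
    (void)mean; (void)mn; (void)mx;
  }
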
@@ -1195,6 +1224,13 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
  #
  # Avg
  #
  add_proto qw/unsigned int vpx_highbd_avg_8x8/, "const uint8_t *, int p";
  specialize qw/vpx_highbd_avg_8x8/;
  add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *, int p";
  specialize qw/vpx_highbd_avg_4x4/;
  add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
  specialize qw/vpx_highbd_minmax_8x8/;

  add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
  specialize qw/vpx_highbd_sad64x64_avg/, "$sse2_x86inc";

@@ -10,10 +10,10 @@

#include <emmintrin.h>

#include "./vp10_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"

void vp10_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
                         int *min, int *max) {
  __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
  u0 = _mm_setzero_si128();
@@ -91,7 +91,7 @@ void vp10_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
  *min = _mm_extract_epi16(minabsdiff, 0);
}

unsigned int vp10_avg_8x8_sse2(const uint8_t *s, int p) {
unsigned int vpx_avg_8x8_sse2(const uint8_t *s, int p) {
  __m128i s0, s1, u0;
  unsigned int avg = 0;
  u0 = _mm_setzero_si128();
@@ -118,7 +118,7 @@ unsigned int vp10_avg_8x8_sse2(const uint8_t *s, int p) {
  return (avg + 32) >> 6;
}

unsigned int vp10_avg_4x4_sse2(const uint8_t *s, int p) {
unsigned int vpx_avg_4x4_sse2(const uint8_t *s, int p) {
  __m128i s0, s1, u0;
  unsigned int avg = 0;
  u0 = _mm_setzero_si128();
@@ -212,7 +212,7 @@ static void hadamard_col8_sse2(__m128i *in, int iter) {
  }
}

void vp10_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
void vpx_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
                           int16_t *coeff) {
  __m128i src[8];
  src[0] = _mm_load_si128((const __m128i *)src_diff);
@@ -244,13 +244,13 @@ void vp10_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
  _mm_store_si128((__m128i *)coeff, src[7]);
}

void vp10_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
void vpx_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
                             int16_t *coeff) {
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
                                + (idx & 0x01) * 8;
    vp10_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
    vpx_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
  }

  for (idx = 0; idx < 64; idx += 8) {
@@ -283,34 +283,33 @@ void vp10_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
  }
}

int16_t vp10_satd_sse2(const int16_t *coeff, int length) {
  int i;
  __m128i sum = _mm_load_si128((const __m128i *)coeff);
  __m128i sign = _mm_srai_epi16(sum, 15);
  __m128i val = _mm_xor_si128(sum, sign);
  sum = _mm_sub_epi16(val, sign);
  coeff += 8;

  for (i = 8; i < length; i += 8) {
    __m128i src_line = _mm_load_si128((const __m128i *)coeff);
    sign = _mm_srai_epi16(src_line, 15);
    val = _mm_xor_si128(src_line, sign);
    val = _mm_sub_epi16(val, sign);
    sum = _mm_add_epi16(sum, val);
    coeff += 8;
  }

  val = _mm_srli_si128(sum, 8);
  sum = _mm_add_epi16(sum, val);
  val = _mm_srli_epi64(sum, 32);
  sum = _mm_add_epi16(sum, val);
  val = _mm_srli_epi32(sum, 16);
  sum = _mm_add_epi16(sum, val);

  return _mm_extract_epi16(sum, 0);
}

int vpx_satd_sse2(const int16_t *coeff, int length) {
  int i;
  const __m128i zero = _mm_setzero_si128();
  __m128i accum = zero;

  for (i = 0; i < length; i += 8) {
    const __m128i src_line = _mm_load_si128((const __m128i *)coeff);
    const __m128i inv = _mm_sub_epi16(zero, src_line);
    const __m128i abs = _mm_max_epi16(src_line, inv);  // abs(src_line)
    const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero);
    const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero);
    const __m128i sum = _mm_add_epi32(abs_lo, abs_hi);
    accum = _mm_add_epi32(accum, sum);
    coeff += 8;
  }

  {  // cascading summation of accum
    __m128i hi = _mm_srli_si128(accum, 8);
    accum = _mm_add_epi32(accum, hi);
    hi = _mm_srli_epi64(accum, 32);
    accum = _mm_add_epi32(accum, hi);
  }

  return _mm_cvtsi128_si32(accum);
}

void vp10_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
void vpx_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
                          const int ref_stride, const int height) {
  int idx;
  __m128i zero = _mm_setzero_si128();
@@ -359,7 +358,7 @@ void vp10_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
  _mm_storeu_si128((__m128i *)hbuf, s1);
}

int16_t vp10_int_pro_col_sse2(uint8_t const *ref, const int width) {
int16_t vpx_int_pro_col_sse2(uint8_t const *ref, const int width) {
  __m128i zero = _mm_setzero_si128();
  __m128i src_line = _mm_load_si128((const __m128i *)ref);
  __m128i s0 = _mm_sad_epu8(src_line, zero);
@@ -379,7 +378,7 @@ int16_t vp10_int_pro_col_sse2(uint8_t const *ref, const int width) {
  return _mm_extract_epi16(s0, 0);
}

int vp10_vector_var_sse2(int16_t const *ref, int16_t const *src,
int vpx_vector_var_sse2(int16_t const *ref, int16_t const *src,
                        const int bwl) {
  int idx;
  int width = 4 << bwl;
@@ -8,11 +8,11 @@
; be found in the AUTHORS file in the root of the source tree.
;

%define private_prefix vp10
%define private_prefix vpx

%include "third_party/x86inc/x86inc.asm"

; This file provides SSSE3 version of the forward transformation. Part
; This file provides SSSE3 version of the hadamard transformation. Part
; of the macro definitions are originally derived from the ffmpeg project.
; The current version applies to x86 64-bit only.