consolidate block_error functions

vp9_highbd_block_error_8bit_c was a very simple wrapper around vp9_block_error_c. The SSE2 implemention was practically identical to the non-HBD one. It was missing some minor improvements which only went into the original version. In quick speed tests, the AVX implementation showed minimal improvement over SSE2 when it does not detect overflow. However, when overflow is detected the function is run a second time. The OperationCheck test seems to trigger this case and reverses any speed benefits by running ~60% slower. AVX2 on the other hand is always 30-40% faster. Change-Id: I9fcb9afbcb560f234c7ae1b13ddb69eca3988ba1
2017-02-16 17:57:44 -08:00 · 2017-02-16 17:57:44 -08:00 · 904b957ae9
commit 904b957ae9
parent aa911e8b41
8 changed files with 54 additions and 438 deletions
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@ -234,11 +234,11 @@ class SatdTest : public ::testing::Test,

 typedef int64_t (*BlockErrorFunc)(const tran_low_t *coeff,
                                  const tran_low_t *dqcoeff, int block_size);
-typedef std::tr1::tuple<int, BlockErrorFunc> BlockErrorTestParam;
+typedef std::tr1::tuple<int, BlockErrorFunc> BlockErrorTestFPParam;

-class BlockErrorTest
+class BlockErrorTestFP
    : public ::testing::Test,
-      public ::testing::WithParamInterface<BlockErrorTestParam> {
+      public ::testing::WithParamInterface<BlockErrorTestFPParam> {
 protected:
  virtual void SetUp() {
    txfm_size_ = GET_PARAM(0);
@ -367,21 +367,21 @@ TEST_P(SatdTest, Random) {
  Check(expected);
 }

-TEST_P(BlockErrorTest, MinValue) {
+TEST_P(BlockErrorTestFP, MinValue) {
  const int64_t kMin = -32640;
  const int64_t expected = kMin * kMin * txfm_size_;
  FillConstant(kMin, 0);
  Check(expected);
 }

-TEST_P(BlockErrorTest, MaxValue) {
+TEST_P(BlockErrorTestFP, MaxValue) {
  const int64_t kMax = 32640;
  const int64_t expected = kMax * kMax * txfm_size_;
  FillConstant(kMax, 0);
  Check(expected);
 }

-TEST_P(BlockErrorTest, Random) {
+TEST_P(BlockErrorTestFP, Random) {
  int64_t expected;
  switch (txfm_size_) {
    case 16: expected = 2051681432; break;
@ -410,7 +410,7 @@ INSTANTIATE_TEST_CASE_P(C, SatdTest,
                                          make_tuple(1024, &vpx_satd_c)));

 INSTANTIATE_TEST_CASE_P(
-    C, BlockErrorTest,
+    C, BlockErrorTestFP,
    ::testing::Values(make_tuple(16, &vp9_block_error_fp_c),
                      make_tuple(64, &vp9_block_error_fp_c),
                      make_tuple(256, &vp9_block_error_fp_c),
@ -447,7 +447,7 @@ INSTANTIATE_TEST_CASE_P(SSE2, SatdTest,
                                          make_tuple(1024, &vpx_satd_sse2)));

 INSTANTIATE_TEST_CASE_P(
-    SSE2, BlockErrorTest,
+    SSE2, BlockErrorTestFP,
    ::testing::Values(make_tuple(16, &vp9_block_error_fp_sse2),
                      make_tuple(64, &vp9_block_error_fp_sse2),
                      make_tuple(256, &vp9_block_error_fp_sse2),
@ -488,7 +488,7 @@ INSTANTIATE_TEST_CASE_P(NEON, SatdTest,
 // in place.
 #if !CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
-    NEON, BlockErrorTest,
+    NEON, BlockErrorTestFP,
    ::testing::Values(make_tuple(16, &vp9_block_error_fp_neon),
                      make_tuple(64, &vp9_block_error_fp_neon),
                      make_tuple(256, &vp9_block_error_fp_neon),
--- a/test/test.mk
+++ b/test/test.mk
@ -157,7 +157,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += hadamard_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += minmax_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_error_block_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_block_error_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc

--- a/test/vp9_block_error_test.cc
+++ b/test/vp9_block_error_test.cc
@ -23,36 +23,36 @@
 #include "vp9/common/vp9_entropy.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"

 using libvpx_test::ACMRandom;

 namespace {
-#if CONFIG_VP9_HIGHBITDEPTH
 const int kNumIterations = 1000;

-typedef int64_t (*ErrorBlockFunc)(const tran_low_t *coeff,
+typedef int64_t (*HBDBlockErrorFunc)(const tran_low_t *coeff,
+                                     const tran_low_t *dqcoeff,
+                                     intptr_t block_size, int64_t *ssz,
+                                     int bps);
+
+typedef std::tr1::tuple<HBDBlockErrorFunc, HBDBlockErrorFunc, vpx_bit_depth_t>
+    BlockErrorParam;
+
+typedef int64_t (*BlockErrorFunc)(const tran_low_t *coeff,
                                  const tran_low_t *dqcoeff,
-                                  intptr_t block_size, int64_t *ssz, int bps);
+                                  intptr_t block_size, int64_t *ssz);

-typedef std::tr1::tuple<ErrorBlockFunc, ErrorBlockFunc, vpx_bit_depth_t>
-    ErrorBlockParam;
-
-// wrapper for 8-bit block error functions without a 'bps' param.
-typedef int64_t (*HighBdBlockError8bit)(const tran_low_t *coeff,
-                                        const tran_low_t *dqcoeff,
-                                        intptr_t block_size, int64_t *ssz);
-template <HighBdBlockError8bit fn>
-int64_t HighBdBlockError8bitWrapper(const tran_low_t *coeff,
-                                    const tran_low_t *dqcoeff,
-                                    intptr_t block_size, int64_t *ssz,
-                                    int bps) {
-  EXPECT_EQ(8, bps);
+template <BlockErrorFunc fn>
+int64_t BlockError8BitWrapper(const tran_low_t *coeff,
+                              const tran_low_t *dqcoeff, intptr_t block_size,
+                              int64_t *ssz, int bps) {
+  EXPECT_EQ(bps, 8);
  return fn(coeff, dqcoeff, block_size, ssz);
 }

-class ErrorBlockTest : public ::testing::TestWithParam<ErrorBlockParam> {
+class BlockErrorTest : public ::testing::TestWithParam<BlockErrorParam> {
 public:
-  virtual ~ErrorBlockTest() {}
+  virtual ~BlockErrorTest() {}
  virtual void SetUp() {
    error_block_op_ = GET_PARAM(0);
    ref_error_block_op_ = GET_PARAM(1);
@ -63,11 +63,11 @@ class ErrorBlockTest : public ::testing::TestWithParam<ErrorBlockParam> {

 protected:
  vpx_bit_depth_t bit_depth_;
-  ErrorBlockFunc error_block_op_;
-  ErrorBlockFunc ref_error_block_op_;
+  HBDBlockErrorFunc error_block_op_;
+  HBDBlockErrorFunc ref_error_block_op_;
 };

-TEST_P(ErrorBlockTest, OperationCheck) {
+TEST_P(BlockErrorTest, OperationCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  DECLARE_ALIGNED(16, tran_low_t, coeff[4096]);
  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]);
@ -110,7 +110,7 @@ TEST_P(ErrorBlockTest, OperationCheck) {
      << "First failed at test case " << first_failure;
 }

-TEST_P(ErrorBlockTest, ExtremeValues) {
+TEST_P(BlockErrorTest, ExtremeValues) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  DECLARE_ALIGNED(16, tran_low_t, coeff[4096]);
  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]);
@ -171,29 +171,28 @@ TEST_P(ErrorBlockTest, ExtremeValues) {
 using std::tr1::make_tuple;

 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(
-    SSE2, ErrorBlockTest,
-    ::testing::Values(
-        make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
-                   VPX_BITS_10),
-        make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
-                   VPX_BITS_12),
-        make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
-                   VPX_BITS_8),
-        make_tuple(
-            &HighBdBlockError8bitWrapper<vp9_highbd_block_error_8bit_sse2>,
-            &HighBdBlockError8bitWrapper<vp9_highbd_block_error_8bit_c>,
-            VPX_BITS_8)));
+const BlockErrorParam sse2_block_error_tests[] = {
+#if CONFIG_VP9_HIGHBITDEPTH
+  make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
+             VPX_BITS_10),
+  make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
+             VPX_BITS_12),
+  make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
+             VPX_BITS_8),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  make_tuple(&BlockError8BitWrapper<vp9_block_error_sse2>,
+             &BlockError8BitWrapper<vp9_block_error_c>, VPX_BITS_8)
+};
+
+INSTANTIATE_TEST_CASE_P(SSE2, BlockErrorTest,
+                        ::testing::ValuesIn(sse2_block_error_tests));
 #endif  // HAVE_SSE2

-#if HAVE_AVX
+#if HAVE_AVX2
 INSTANTIATE_TEST_CASE_P(
-    AVX, ErrorBlockTest,
-    ::testing::Values(make_tuple(
-        &HighBdBlockError8bitWrapper<vp9_highbd_block_error_8bit_avx>,
-        &HighBdBlockError8bitWrapper<vp9_highbd_block_error_8bit_c>,
-        VPX_BITS_8)));
-#endif  // HAVE_AVX
-
-#endif  // CONFIG_VP9_HIGHBITDEPTH
+    AVX2, BlockErrorTest,
+    ::testing::Values(make_tuple(&BlockError8BitWrapper<vp9_block_error_avx2>,
+                                 &BlockError8BitWrapper<vp9_block_error_c>,
+                                 VPX_BITS_8)));
+#endif  // HAVE_AVX2
 }  // namespace
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@ -130,9 +130,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
  add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
  specialize qw/vp9_highbd_block_error sse2/;

-  add_proto qw/int64_t vp9_highbd_block_error_8bit/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
-  specialize qw/vp9_highbd_block_error_8bit sse2 avx/;
-
  add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size";
  specialize qw/vp9_block_error_fp sse2/;

--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@ -284,22 +284,12 @@ int64_t vp9_highbd_block_error_c(const tran_low_t *coeff,
  return error;
 }

-int64_t vp9_highbd_block_error_8bit_c(const tran_low_t *coeff,
-                                      const tran_low_t *dqcoeff,
-                                      intptr_t block_size, int64_t *ssz) {
-  // Note that the C versions of these 2 functions (vp9_block_error and
-  // vp9_highbd_block_error_8bit are the same, but the optimized assembly
-  // routines are not compatible in the non high bitdepth configuration, so
-  // they still cannot share the same name.
-  return vp9_block_error_c(coeff, dqcoeff, block_size, ssz);
-}
-
 static int64_t vp9_highbd_block_error_dispatch(const tran_low_t *coeff,
                                               const tran_low_t *dqcoeff,
                                               intptr_t block_size,
                                               int64_t *ssz, int bd) {
  if (bd == 8) {
-    return vp9_highbd_block_error_8bit(coeff, dqcoeff, block_size, ssz);
+    return vp9_block_error(coeff, dqcoeff, block_size, ssz);
  } else {
    return vp9_highbd_block_error(coeff, dqcoeff, block_size, ssz, bd);
  }
@ -1130,16 +1120,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
          ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan,
                               so->neighbors, cpi->sf.use_fast_coef_costing);
          tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 1 : 0;
-#if CONFIG_VP9_HIGHBITDEPTH
-          distortion +=
-              vp9_highbd_block_error_8bit(
-                  coeff, BLOCK_OFFSET(pd->dqcoeff, block), 16, &unused) >>
-              2;
-#else
          distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
                                        16, &unused) >>
                        2;
-#endif
          if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
            goto next;
          vp9_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block), dst,
--- a/vp9/encoder/x86/vp9_highbd_error_avx.asm
+++ b/vp9/encoder/x86/vp9_highbd_error_avx.asm
@ -1,261 +0,0 @@
-;
-;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%define private_prefix vp9
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-ALIGN 16
-
-;
-; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff,
-;                                     intptr_t block_size, int64_t *ssz)
-;
-
-INIT_XMM avx
-cglobal highbd_block_error_8bit, 4, 5, 8, uqc, dqc, size, ssz
-  vzeroupper
-
-  ; If only one iteration is required, then handle this as a special case.
-  ; It is the most frequent case, so we can have a significant gain here
-  ; by not setting up a loop and accumulators.
-  cmp    sizeq, 16
-  jne   .generic
-
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-  ;; Common case of size == 16
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-  ; Load input vectors
-  mova      xm0, [dqcq]
-  packssdw  xm0, [dqcq+16]
-  mova      xm2, [uqcq]
-  packssdw  xm2, [uqcq+16]
-
-  mova      xm1, [dqcq+32]
-  packssdw  xm1, [dqcq+48]
-  mova      xm3, [uqcq+32]
-  packssdw  xm3, [uqcq+48]
-
-  ; Compute the errors.
-  psubw     xm0, xm2
-  psubw     xm1, xm3
-
-  ; Individual errors are max 15bit+sign, so squares are 30bit, and
-  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
-  pmaddwd   xm2, xm2
-  pmaddwd   xm3, xm3
-
-  pmaddwd   xm0, xm0
-  pmaddwd   xm1, xm1
-
-  ; Squares are always positive, so we can use unsigned arithmetic after
-  ; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will
-  ; fit in 32bits
-  paddd     xm2, xm3
-  paddd     xm0, xm1
-
-  ; Accumulate horizontally in 64 bits, there is no chance of overflow here
-  pxor      xm5, xm5
-
-  pblendw   xm3, xm5, xm2, 0x33 ; Zero extended  low of a pair of 32 bits
-  psrlq     xm2, 32             ; Zero extended high of a pair of 32 bits
-
-  pblendw   xm1, xm5, xm0, 0x33 ; Zero extended  low of a pair of 32 bits
-  psrlq     xm0, 32             ; Zero extended high of a pair of 32 bits
-
-  paddq     xm2, xm3
-  paddq     xm0, xm1
-
-  psrldq    xm3, xm2, 8
-  psrldq    xm1, xm0, 8
-
-  paddq     xm2, xm3
-  paddq     xm0, xm1
-
-  ; Store the return value
-%if ARCH_X86_64
-  movq      rax, xm0
-  movq   [sszq], xm2
-%else
-  movd      eax, xm0
-  pextrd    edx, xm0, 1
-  movq   [sszd], xm2
-%endif
-  RET
-
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-  ;; Generic case of size != 16, speculative low precision
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-  ALIGN 16
-.generic:
-  pxor      xm4, xm4                ; sse accumulator
-  pxor      xm5, xm5                ; overflow detection register for xm4
-  pxor      xm6, xm6                ; ssz accumulator
-  pxor      xm7, xm7                ; overflow detection register for xm6
-  lea      uqcq, [uqcq+sizeq*4]
-  lea      dqcq, [dqcq+sizeq*4]
-  neg     sizeq
-
-  ; Push the negative size as the high precision code might need it
-  push    sizeq
-
-.loop:
-  ; Load input vectors
-  mova      xm0, [dqcq+sizeq*4]
-  packssdw  xm0, [dqcq+sizeq*4+16]
-  mova      xm2, [uqcq+sizeq*4]
-  packssdw  xm2, [uqcq+sizeq*4+16]
-
-  mova      xm1, [dqcq+sizeq*4+32]
-  packssdw  xm1, [dqcq+sizeq*4+48]
-  mova      xm3, [uqcq+sizeq*4+32]
-  packssdw  xm3, [uqcq+sizeq*4+48]
-
-  add     sizeq, 16
-
-  ; Compute the squared errors.
-  ; Individual errors are max 15bit+sign, so squares are 30bit, and
-  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
-  psubw     xm0, xm2
-  pmaddwd   xm2, xm2
-  pmaddwd   xm0, xm0
-
-  psubw     xm1, xm3
-  pmaddwd   xm3, xm3
-  pmaddwd   xm1, xm1
-
-  ; Squares are always positive, so we can use unsigned arithmetic after
-  ; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will
-  ; fit in 32bits
-  paddd     xm2, xm3
-  paddd     xm0, xm1
-
-  ; We accumulate using 32 bit arithmetic, but detect potential overflow
-  ; by checking if the MSB of the accumulators have ever been a set bit.
-  ; If yes, we redo the whole compute at the end on higher precision, but
-  ; this happens extremely rarely, so we still achieve a net gain.
-  paddd     xm4, xm0
-  paddd     xm6, xm2
-  por       xm5, xm4  ; OR in the accumulator for overflow detection
-  por       xm7, xm6  ; OR in the accumulator for overflow detection
-
-  jnz .loop
-
-  ; Add pairs horizontally (still only on 32 bits)
-  phaddd    xm4, xm4
-  por       xm5, xm4  ; OR in the accumulator for overflow detection
-  phaddd    xm6, xm6
-  por       xm7, xm6  ; OR in the accumulator for overflow detection
-
-  ; Check for possibility of overflow by testing if bit 32 of each dword lane
-  ; have ever been set. If they were not, then there was no overflow and the
-  ; final sum will fit in 32 bits. If overflow happened, then
-  ; we redo the whole computation on higher precision.
-  por       xm7, xm5
-  pmovmskb   r4, xm7
-  test       r4, 0x8888
-  jnz .highprec
-
-  phaddd    xm4, xm4
-  phaddd    xm6, xm6
-  pmovzxdq  xm4, xm4
-  pmovzxdq  xm6, xm6
-
-  ; Restore stack
-  pop     sizeq
-
-  ; Store the return value
-%if ARCH_X86_64
-  movq      rax, xm4
-  movq   [sszq], xm6
-%else
-  movd      eax, xm4
-  pextrd    edx, xm4, 1
-  movq   [sszd], xm6
-%endif
-  RET
-
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-  ;; Generic case of size != 16, high precision case
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-.highprec:
-  pxor      xm4, xm4                 ; sse accumulator
-  pxor      xm5, xm5                 ; dedicated zero register
-  pxor      xm6, xm6                 ; ssz accumulator
-  pop     sizeq
-
-.loophp:
-  mova      xm0, [dqcq+sizeq*4]
-  packssdw  xm0, [dqcq+sizeq*4+16]
-  mova      xm2, [uqcq+sizeq*4]
-  packssdw  xm2, [uqcq+sizeq*4+16]
-
-  mova      xm1, [dqcq+sizeq*4+32]
-  packssdw  xm1, [dqcq+sizeq*4+48]
-  mova      xm3, [uqcq+sizeq*4+32]
-  packssdw  xm3, [uqcq+sizeq*4+48]
-
-  add     sizeq, 16
-
-  ; individual errors are max. 15bit+sign, so squares are 30bit, and
-  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
-
-  psubw     xm0, xm2
-  pmaddwd   xm2, xm2
-  pmaddwd   xm0, xm0
-
-  psubw     xm1, xm3
-  pmaddwd   xm3, xm3
-  pmaddwd   xm1, xm1
-
-  ; accumulate in 64bit
-  punpckldq xm7, xm0, xm5
-  punpckhdq xm0, xm5
-  paddq     xm4, xm7
-
-  punpckldq xm7, xm2, xm5
-  punpckhdq xm2, xm5
-  paddq     xm6, xm7
-
-  punpckldq xm7, xm1, xm5
-  punpckhdq xm1, xm5
-  paddq     xm4, xm7
-
-  punpckldq xm7, xm3, xm5
-  punpckhdq xm3, xm5
-  paddq     xm6, xm7
-
-  paddq     xm4, xm0
-  paddq     xm4, xm1
-  paddq     xm6, xm2
-  paddq     xm6, xm3
-
-  jnz .loophp
-
-  ; Accumulate horizontally
-  movhlps   xm5, xm4
-  movhlps   xm7, xm6
-  paddq     xm4, xm5
-  paddq     xm6, xm7
-
-  ; Store the return value
-%if ARCH_X86_64
-  movq      rax, xm4
-  movq   [sszq], xm6
-%else
-  movd      eax, xm4
-  pextrd    edx, xm4, 1
-  movq   [sszd], xm6
-%endif
-  RET
-
-END
--- a/vp9/encoder/x86/vp9_highbd_error_sse2.asm
+++ b/vp9/encoder/x86/vp9_highbd_error_sse2.asm
@ -1,98 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%define private_prefix vp9
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-ALIGN 16
-
-;
-; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff,
-;                                     intptr_t block_size, int64_t *ssz)
-;
-
-INIT_XMM sse2
-cglobal highbd_block_error_8bit, 3, 3, 8, uqc, dqc, size, ssz
-  pxor      m4, m4                 ; sse accumulator
-  pxor      m6, m6                 ; ssz accumulator
-  pxor      m5, m5                 ; dedicated zero register
-  lea     uqcq, [uqcq+sizeq*4]
-  lea     dqcq, [dqcq+sizeq*4]
-  neg    sizeq
-
-  ALIGN 16
-
-.loop:
-  mova      m0, [dqcq+sizeq*4]
-  packssdw  m0, [dqcq+sizeq*4+mmsize]
-  mova      m2, [uqcq+sizeq*4]
-  packssdw  m2, [uqcq+sizeq*4+mmsize]
-
-  mova      m1, [dqcq+sizeq*4+mmsize*2]
-  packssdw  m1, [dqcq+sizeq*4+mmsize*3]
-  mova      m3, [uqcq+sizeq*4+mmsize*2]
-  packssdw  m3, [uqcq+sizeq*4+mmsize*3]
-
-  add    sizeq, mmsize
-
-  ; individual errors are max. 15bit+sign, so squares are 30bit, and
-  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
-
-  psubw     m0, m2
-  pmaddwd   m2, m2
-  pmaddwd   m0, m0
-
-  psubw     m1, m3
-  pmaddwd   m3, m3
-  pmaddwd   m1, m1
-
-  ; accumulate in 64bit
-  punpckldq m7, m0, m5
-  punpckhdq m0, m5
-  paddq     m4, m7
-
-  punpckldq m7, m2, m5
-  punpckhdq m2, m5
-  paddq     m6, m7
-
-  punpckldq m7, m1, m5
-  punpckhdq m1, m5
-  paddq     m4, m7
-
-  punpckldq m7, m3, m5
-  punpckhdq m3, m5
-  paddq     m6, m7
-
-  paddq     m4, m0
-  paddq     m4, m1
-  paddq     m6, m2
-  paddq     m6, m3
-
-  jnz .loop
-
-  ; accumulate horizontally and store in return value
-  movhlps   m5, m4
-  movhlps   m7, m6
-  paddq     m4, m5
-  paddq     m6, m7
-
-%if ARCH_X86_64
-  movq    rax, m4
-  movq [sszq], m6
-%else
-  mov     eax, sszm
-  pshufd   m5, m4, 0x1
-  movq  [eax], m6
-  movd    eax, m4
-  movd    edx, m5
-%endif
-  RET
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@ -108,10 +108,6 @@ endif

 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
-ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_error_sse2.asm
-VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_highbd_error_avx.asm
-endif

 ifeq ($(ARCH_X86_64),yes)
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm