Remove vp8, vp9 folders

Change-Id: I09b8acd22d031ece52e1fee18b998349bf1cf06b
2016-07-25 15:39:46 -07:00
parent b124b243d4
commit 3bd709fafe
433 changed files with 24 additions and 152665 deletions
--- a/13
+++ b/13
@@ -38,8 +38,6 @@ Advanced options:
  ${toggle_better_hw_compatibility}
                                  enable encoder to produce streams with better
                                  hardware decoder compatibility
  ${toggle_vp8}                   VP8 codec support
  ${toggle_vp9}                   VP9 codec support
  ${toggle_vp10}                  VP10 codec support
  ${toggle_internal_stats}        output of encoder internal stats for debug, if supported (encoders)
  ${toggle_postproc}              postprocessing
@@ -191,13 +189,8 @@ if [ ${doxy_major:-0} -ge 1 ]; then
 fi
 # disable codecs when their source directory does not exist
 [ -d "${source_path}/vp8" ] || disable_codec vp8
 [ -d "${source_path}/vp9" ] || disable_codec vp9
 [ -d "${source_path}/vp10" ] || disable_codec vp10
 # disable vp10 codec by default
 disable_codec vp10
 # install everything except the sources, by default. sources will have
 # to be enabled when doing dist builds, since that's no longer a common
 # case.
@@ -214,16 +207,10 @@ enable_feature os_support
 enable_feature temporal_denoising
 CODECS="
    vp8_encoder
    vp8_decoder
    vp9_encoder
    vp9_decoder
    vp10_encoder
    vp10_decoder
 "
 CODEC_FAMILIES="
    vp8
    vp9
    vp10
 "
--- a/examples.mk
+++ b/examples.mk
@@ -248,10 +248,12 @@ endif
 ifeq ($(CONFIG_OS_SUPPORT), yes)
 CODEC_EXTRA_LIBS-$(CONFIG_VP8)         += m
 CODEC_EXTRA_LIBS-$(CONFIG_VP9)         += m
 CODEC_EXTRA_LIBS-$(CONFIG_VP10)        += m
 else
    ifeq ($(CONFIG_GCC), yes)
    CODEC_EXTRA_LIBS-$(CONFIG_VP8)         += m
    CODEC_EXTRA_LIBS-$(CONFIG_VP9)         += m
    CODEC_EXTRA_LIBS-$(CONFIG_VP10)        += m
    endif
 endif
 #
--- a/libs.mk
+++ b/libs.mk
@@ -53,62 +53,6 @@ CODEC_SRCS-yes += $(addprefix vpx_dsp/,$(call enabled,DSP_SRCS))
 include $(SRC_PATH_BARE)/vpx_util/vpx_util.mk
 CODEC_SRCS-yes += $(addprefix vpx_util/,$(call enabled,UTIL_SRCS))
 # VP8: pull in the codec's makefile fragments and register its sources,
 # exported symbols, installed headers and documentation sections.
 ifeq ($(CONFIG_VP8),yes)
  # Directory prefix (with trailing slash) used by the encoder/decoder
  # sections below; it is only assigned here, when CONFIG_VP8 is yes.
  VP8_PREFIX=vp8/
  include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8_common.mk
 endif
 # VP8 encoder: sources come from vp8cx.mk (VP8_CX_SRCS / VP8_CX_EXPORTS).
 ifeq ($(CONFIG_VP8_ENCODER),yes)
  include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8cx.mk
  CODEC_SRCS-yes += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_CX_SRCS))
  CODEC_EXPORTS-yes += $(addprefix $(VP8_PREFIX),$(VP8_CX_EXPORTS))
  INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8cx.h
  # VP8_PREFIX already ends in '/', so this mapping expands with a double
  # slash ("vp8//%").
  INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP8_PREFIX)/%
  CODEC_DOC_SECTIONS += vp8 vp8_encoder
 endif
 # VP8 decoder: sources come from vp8dx.mk (VP8_DX_SRCS / VP8_DX_EXPORTS).
 ifeq ($(CONFIG_VP8_DECODER),yes)
  include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8dx.mk
  CODEC_SRCS-yes += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_DX_SRCS))
  CODEC_EXPORTS-yes += $(addprefix $(VP8_PREFIX),$(VP8_DX_EXPORTS))
  INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8dx.h
  INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP8_PREFIX)/%
  CODEC_DOC_SECTIONS += vp8 vp8_decoder
 endif
 # VP9: same layout as the VP8 section, except that VP9_PREFIX is
 # (re)assigned in every conditional and once more unconditionally at the end.
 ifeq ($(CONFIG_VP9),yes)
  VP9_PREFIX=vp9/
  include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9_common.mk
 endif
 # VP9 encoder. Note that the public headers listed here are the vp8.h /
 # vp8cx.h files: the VP9 encoder ships and documents those headers.
 ifeq ($(CONFIG_VP9_ENCODER),yes)
  VP9_PREFIX=vp9/
  include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9cx.mk
  CODEC_SRCS-yes += $(addprefix $(VP9_PREFIX),$(call enabled,VP9_CX_SRCS))
  CODEC_EXPORTS-yes += $(addprefix $(VP9_PREFIX),$(VP9_CX_EXPORTS))
  CODEC_SRCS-yes += $(VP9_PREFIX)vp9cx.mk vpx/vp8.h vpx/vp8cx.h
  INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8cx.h
  # svc_context.h is installed only when CONFIG_SPATIAL_SVC expands to "yes".
  INSTALL-LIBS-$(CONFIG_SPATIAL_SVC) += include/vpx/svc_context.h
  INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP9_PREFIX)/%
  CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h
  CODEC_DOC_SECTIONS += vp9 vp9_encoder
 endif
 # VP9 decoder. As with the encoder, the public headers are vp8.h / vp8dx.h.
 ifeq ($(CONFIG_VP9_DECODER),yes)
  VP9_PREFIX=vp9/
  include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9dx.mk
  CODEC_SRCS-yes += $(addprefix $(VP9_PREFIX),$(call enabled,VP9_DX_SRCS))
  CODEC_EXPORTS-yes += $(addprefix $(VP9_PREFIX),$(VP9_DX_EXPORTS))
  CODEC_SRCS-yes += $(VP9_PREFIX)vp9dx.mk vpx/vp8.h vpx/vp8dx.h
  INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8dx.h
  INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP9_PREFIX)/%
  CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8dx.h
  CODEC_DOC_SECTIONS += vp9 vp9_decoder
 endif
 # Assigned unconditionally so the pattern rule below is defined regardless
 # of the VP9 configuration; it adds -Wextra to every C object built under
 # the vp9/ prefix.
 VP9_PREFIX=vp9/
 $(BUILD_PFX)$(VP9_PREFIX)%.c.o: CFLAGS += -Wextra
 #  VP10 make file
 ifeq ($(CONFIG_VP10),yes)
  VP10_PREFIX=vp10/
--- a/test/blockiness_test.cc
+++ b/test/blockiness_test.cc
@@ -15,9 +15,6 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "./vpx_config.h"
 #if CONFIG_VP9_ENCODER
 #include "./vp9_rtcd.h"
 #endif
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
--- a/test/consistency_test.cc
+++ b/test/consistency_test.cc
@@ -15,9 +15,6 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "./vpx_config.h"
 #if CONFIG_VP9_ENCODER
 #include "./vp9_rtcd.h"
 #endif
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
@@ -132,78 +129,10 @@ class ConsistencyTestBase : public ::testing::Test {
  ACMRandom rnd_;
 };
 #if CONFIG_VP9_ENCODER
 // Test parameter: two integers that are forwarded, in order, to the
 // ConsistencyTestBase constructor.
 typedef std::tr1::tuple<int, int> ConsistencyParam;
 // Value-parameterized fixture that measures frame-to-frame SSIM consistency
 // through vpx_get_ssim_metrics().
 class ConsistencyVP9Test
    : public ConsistencyTestBase,
      public ::testing::WithParamInterface<ConsistencyParam> {
 public:
  ConsistencyVP9Test()
      : ConsistencyTestBase(GET_PARAM(0), GET_PARAM(1)) {}
 protected:
  // Feeds the source/reference pair selected by |frame| to
  // vpx_get_ssim_metrics() and returns that function's result. |frame| must
  // be 0 or 1; larger values are reported as a (non-fatal) test failure.
  double CheckConsistency(int frame) {
    EXPECT_LT(frame, 2) << "Frame to check has to be less than 2.";
    uint8_t *const src_frame = source_data_[frame];
    uint8_t *const ref_frame = reference_data_[frame];
    const double metric =
        vpx_get_ssim_metrics(src_frame, source_stride_,
                             ref_frame, reference_stride_,
                             width_, height_, ssim_array_, &metrics_, 1);
    return metric;
  }
 };
 #endif  // CONFIG_VP9_ENCODER
 uint8_t* ConsistencyTestBase::source_data_[2] = {NULL, NULL};
 uint8_t* ConsistencyTestBase::reference_data_[2] = {NULL, NULL};
 Ssimv* ConsistencyTestBase::ssim_array_ = NULL;
 #if CONFIG_VP9_ENCODER
 // Exercises CheckConsistency() in three scenarios:
 //   1. identical sources and identically blurred references -> exactly 0;
 //   2. random data, inconsistent vs. consistent sources -> strict ordering;
 //   3. a weaker vs. a stronger blur on the second reference -> strict ordering.
 // The statement order matters: every CheckConsistency() call goes through the
 // shared ssim_array_ / metrics_ members of the fixture.
 TEST_P(ConsistencyVP9Test, ConsistencyIsZero) {
  // Scenario 1: frame 1 is a copy of frame 0 and both references receive the
  // same Blur(..., 3), so the two frame pairs are identical.
  // (Copy is presumably Copy(dst, src) -- confirm against the base fixture.)
  FillRandom(source_data_[0], source_stride_);
  Copy(source_data_[1], source_data_[0]);
  Copy(reference_data_[0], source_data_[0]);
  Blur(reference_data_[0], reference_stride_, 3);
  Copy(reference_data_[1], source_data_[0]);
  Blur(reference_data_[1], reference_stride_, 3);
  // The frame-1 result is immediately overwritten by the frame-0 result.
  // NOTE(review): the first call is presumably kept for its effect on the
  // shared SSIM state -- confirm before removing it.
  double inconsistency = CheckConsistency(1);
  inconsistency = CheckConsistency(0);
  EXPECT_EQ(inconsistency, 0.0)
      << "Should have 0 inconsistency if they are exactly the same.";
  // If the sources are not consistent, the reference frames' inconsistency
  // should be less than if the source is consistent.
  FillRandom(source_data_[0], source_stride_);
  FillRandom(source_data_[1], source_stride_);
  FillRandom(reference_data_[0], reference_stride_);
  FillRandom(reference_data_[1], reference_stride_);
  CheckConsistency(0);
  inconsistency = CheckConsistency(1);
  // Make the sources consistent again and re-measure with the same references.
  Copy(source_data_[1], source_data_[0]);
  CheckConsistency(0);
  double inconsistency2 = CheckConsistency(1);
  EXPECT_LT(inconsistency, inconsistency2)
      << "Should have less inconsistency if source itself is inconsistent.";
  // Less of a blur should be less inconsistent than more blur coming off a
  // frame with no blur.
  ClearSsim();
  FillRandom(source_data_[0], source_stride_);
  Copy(source_data_[1], source_data_[0]);
  Copy(reference_data_[0], source_data_[0]);
  Copy(reference_data_[1], source_data_[0]);
  Blur(reference_data_[1], reference_stride_, 4);
  CheckConsistency(0);
  inconsistency = CheckConsistency(1);
  // Repeat from a cleared SSIM state with a stronger blur (8 instead of 4) on
  // the second reference only.
  ClearSsim();
  Copy(reference_data_[1], source_data_[0]);
  Blur(reference_data_[1], reference_stride_, 8);
  CheckConsistency(0);
  inconsistency2 = CheckConsistency(1);
  EXPECT_LT(inconsistency, inconsistency2)
      << "Stronger Blur should produce more inconsistency.";
 }
 #endif  // CONFIG_VP9_ENCODER
 using std::tr1::make_tuple;
@@ -211,14 +140,4 @@ using std::tr1::make_tuple;
 //------------------------------------------------------------------------------
 // C functions
 #if CONFIG_VP9_ENCODER
 // Parameter tuples for the "C" instantiation of ConsistencyVP9Test. Each pair
 // is forwarded as (GET_PARAM(0), GET_PARAM(1)) to the ConsistencyTestBase
 // constructor -- presumably (width, height); confirm against that constructor.
 const ConsistencyParam c_vp9_tests[] = {
  make_tuple(320, 240),
  make_tuple(318, 242),
  make_tuple(318, 238),
 };
 // Runs every TEST_P of ConsistencyVP9Test once per tuple listed above.
 INSTANTIATE_TEST_CASE_P(C, ConsistencyVP9Test,
                        ::testing::ValuesIn(c_vp9_tests));
 #endif
 }  // namespace
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -13,14 +13,11 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "./vpx_config.h"
 #include "./vp9_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_filter.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_dsp/vpx_filter.h"
 #include "vpx_mem/vpx_mem.h"
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -14,14 +14,11 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "./vp9_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_scan.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -14,14 +14,12 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "./vp9_rtcd.h"
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "vp9/common/vp9_entropy.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
--- a/test/encode_api_test.cc
+++ b/test/encode_api_test.cc
@@ -20,12 +20,6 @@ namespace {
 TEST(EncodeAPI, InvalidParams) {
  static const vpx_codec_iface_t *kCodecs[] = {
 #if CONFIG_VP8_ENCODER
    &vpx_codec_vp8_cx_algo,
 #endif
 #if CONFIG_VP9_ENCODER
    &vpx_codec_vp9_cx_algo,
 #endif
 #if CONFIG_VP10_ENCODER
    &vpx_codec_vp10_cx_algo,
 #endif
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -14,7 +14,6 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "./vp9_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -14,14 +14,11 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "./vp9_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_scan.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
--- a/test/idct_test.cc
+++ b/test/idct_test.cc
@@ -9,7 +9,6 @@
 */
 #include "./vpx_config.h"
 #include "./vp8_rtcd.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -14,14 +14,11 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "./vp9_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_scan.h"
 #include "vpx/vpx_integer.h"
 using libvpx_test::ACMRandom;
--- a/test/quantize_test.cc
+++ b/test/quantize_test.cc
@@ -13,7 +13,6 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "./vpx_config.h"
 #include "./vp8_rtcd.h"
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
--- a/test/sixtap_predict_test.cc
+++ b/test/sixtap_predict_test.cc
@@ -15,7 +15,6 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "./vpx_config.h"
 #include "./vp8_rtcd.h"
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
--- a/test/test.mk
+++ b/test/test.mk
@@ -10,6 +10,7 @@ LIBVPX_TEST_SRCS-yes += test_vectors.h
 LIBVPX_TEST_SRCS-yes += util.h
 LIBVPX_TEST_SRCS-yes += video_source.h
 LIBVPX_TEST_SRCS-yes += transform_test_base.h
 LIBVPX_TEST_SRCS-yes += function_equivalence_test.h
 ##
 ## BLACK BOX TESTS
@@ -142,7 +143,7 @@ LIBVPX_TEST_SRCS-yes                   += vp9_boolcoder_test.cc
 LIBVPX_TEST_SRCS-yes                   += vp9_encoder_parms_get_to_decoder.cc
 endif
-LIBVPX_TEST_SRCS-yes                   += convolve_test.cc
+#LIBVPX_TEST_SRCS-yes                   += convolve_test.cc
 LIBVPX_TEST_SRCS-yes                   += lpf_8_test.cc
 LIBVPX_TEST_SRCS-yes                   += vp9_intrapred_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_decrypt_test.cc
@@ -173,7 +174,7 @@ endif # VP9
 ## VP10
 ifeq ($(CONFIG_VP10),yes)
-LIBVPX_TEST_SRCS-yes                    += vp10_inv_txfm_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_inv_txfm_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_dct_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_fht4x4_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_fht8x8_test.cc
--- a/test/vp10_convolve_test.cc
+++ b/test/vp10_convolve_test.cc
@@ -12,7 +12,7 @@ using libvpx_test::ACMRandom;
 namespace {
 void setup_convolve() {
-#if HAVE_SSSE3
+#if HAVE_SSSE3 && CONFIG_RUNTIME_CPU_DETECT
  vp10_convolve_horiz = vp10_convolve_horiz_c;
  vp10_convolve_vert = vp10_convolve_vert_c;
 #endif
--- a/test/vp10_inv_txfm_test.cc
+++ b/test/vp10_inv_txfm_test.cc
@@ -289,31 +289,31 @@ using std::tr1::make_tuple;
 INSTANTIATE_TEST_CASE_P(
    C, Vp10PartialIDctTest,
    ::testing::Values(
-        make_tuple(&vpx_fdct32x32_c,
+        make_tuple(&vp10_fdct32x32_c,
                   &vp10_idct32x32_1024_add_c,
                   &vp10_idct32x32_34_add_c,
                   TX_32X32, 34),
-        make_tuple(&vpx_fdct32x32_c,
+        make_tuple(&vp10_fdct32x32_c,
                   &vp10_idct32x32_1024_add_c,
                   &vp10_idct32x32_1_add_c,
                   TX_32X32, 1),
-        make_tuple(&vpx_fdct16x16_c,
+        make_tuple(&vp10_fdct16x16_c,
                   &vp10_idct16x16_256_add_c,
                   &vp10_idct16x16_10_add_c,
                   TX_16X16, 10),
-        make_tuple(&vpx_fdct16x16_c,
+        make_tuple(&vp10_fdct16x16_c,
                   &vp10_idct16x16_256_add_c,
                   &vp10_idct16x16_1_add_c,
                   TX_16X16, 1),
-        make_tuple(&vpx_fdct8x8_c,
+        make_tuple(&vp10_fdct8x8_c,
                   &vp10_idct8x8_64_add_c,
                   &vp10_idct8x8_12_add_c,
                   TX_8X8, 12),
-        make_tuple(&vpx_fdct8x8_c,
+        make_tuple(&vp10_fdct8x8_c,
                   &vp10_idct8x8_64_add_c,
                   &vp10_idct8x8_1_add_c,
                   TX_8X8, 1),
-        make_tuple(&vpx_fdct4x4_c,
+        make_tuple(&vp10_fdct4x4_c,
                   &vp10_idct4x4_16_add_c,
                   &vp10_idct4x4_1_add_c,
                   TX_4X4, 1)));
--- a/test/vp8_fdct4x4_test.cc
+++ b/test/vp8_fdct4x4_test.cc
@@ -17,7 +17,6 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "./vp8_rtcd.h"
 #include "test/acm_random.h"
 #include "vpx/vpx_integer.h"
--- a/test/vp9_error_block_test.cc
+++ b/test/vp9_error_block_test.cc
@@ -15,12 +15,10 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "./vpx_config.h"
 #include "./vp9_rtcd.h"
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "vp9/common/vp9_entropy.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
--- a/tools_common.c
+++ b/tools_common.c
@@ -16,11 +16,11 @@
 #include "./tools_common.h"
-#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
+#if CONFIG_VP10_ENCODER
 #include "vpx/vp8cx.h"
 #endif
-#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER || CONFIG_VP10_DECODER
+#if CONFIG_VP10_DECODER
 #include "vpx/vp8dx.h"
 #endif
@@ -136,14 +136,6 @@ static const VpxInterface vpx_encoders[] = {
 #if CONFIG_VP10_ENCODER
  {"vp10", VP10_FOURCC, &vpx_codec_vp10_cx},
 #endif
 #if CONFIG_VP8_ENCODER
  {"vp8", VP8_FOURCC, &vpx_codec_vp8_cx},
 #endif
 #if CONFIG_VP9_ENCODER
  {"vp9", VP9_FOURCC, &vpx_codec_vp9_cx},
 #endif
 };
 int get_vpx_encoder_count(void) {
@@ -171,13 +163,6 @@ const VpxInterface *get_vpx_encoder_by_name(const char *name) {
 #if CONFIG_DECODERS
 static const VpxInterface vpx_decoders[] = {
 #if CONFIG_VP8_DECODER
  {"vp8", VP8_FOURCC, &vpx_codec_vp8_dx},
 #endif
 #if CONFIG_VP9_DECODER
  {"vp9", VP9_FOURCC, &vpx_codec_vp9_dx},
 #endif
 #if CONFIG_VP10_DECODER
  {"vp10", VP10_FOURCC, &vpx_codec_vp10_dx},
--- a/vp10/vp10_common.mk
+++ b/vp10/vp10_common.mk
@@ -10,10 +10,12 @@
 VP10_COMMON_SRCS-yes += vp10_common.mk
 VP10_COMMON_SRCS-yes += vp10_iface_common.h
 VP10_COMMON_SRCS-yes += common/ans.h
 VP10_COMMON_SRCS-yes += common/ppflags.h
 VP10_COMMON_SRCS-yes += common/alloccommon.c
 VP10_COMMON_SRCS-yes += common/blockd.c
 VP10_COMMON_SRCS-yes += common/debugmodes.c
 VP10_COMMON_SRCS-yes += common/divide.h
 VP10_COMMON_SRCS-yes += common/entropy.c
 VP10_COMMON_SRCS-yes += common/entropymode.c
 VP10_COMMON_SRCS-yes += common/entropymv.c
@@ -57,6 +59,7 @@ VP10_COMMON_SRCS-yes += common/mvref_common.h
 VP10_COMMON_SRCS-yes += common/quant_common.c
 VP10_COMMON_SRCS-yes += common/reconinter.c
 VP10_COMMON_SRCS-yes += common/reconintra.c
 VP10_COMMON_SRCS-yes += common/restoration.h
 VP10_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/textblit.c
 VP10_COMMON_SRCS-yes += common/common_data.h
 VP10_COMMON_SRCS-yes += common/scan.c
--- a/vp8/common/alloccommon.c
+++ b/vp8/common/alloccommon.c
@@ -1,190 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "vpx_config.h"
 #include "alloccommon.h"
 #include "blockd.h"
 #include "vpx_mem/vpx_mem.h"
 #include "onyxc_int.h"
 #include "findnearmv.h"
 #include "entropymode.h"
 #include "systemdependent.h"
 /* Releases every buffer that vp8_alloc_frame_buffers() attaches to *oci:
  * the YV12 frame buffers, the scaling scratch frame, the post-processing
  * buffers (CONFIG_POSTPROC) and the mode-info / above-context arrays.
  * Every pointer freed with vpx_free() is reset to NULL afterwards. */
 void vp8_de_alloc_frame_buffers(VP8_COMMON *oci)
 {
    int fb = 0;
    /* Frame buffers held in yv12_fb[]. */
    while (fb < NUM_YV12_BUFFERS)
    {
        vp8_yv12_de_alloc_frame_buffer(&oci->yv12_fb[fb]);
        ++fb;
    }
    vp8_yv12_de_alloc_frame_buffer(&oci->temp_scale_frame);
 #if CONFIG_POSTPROC
    vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer);
    /* The intermediate post-proc buffer is released only when flagged used. */
    if (oci->post_proc_buffer_int_used != 0)
    {
        vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer_int);
    }
    vpx_free(oci->pp_limits_buffer);
    oci->pp_limits_buffer = NULL;
 #endif
    vpx_free(oci->above_context);
    oci->above_context = NULL;
    vpx_free(oci->mip);
    oci->mip = NULL;
 #if CONFIG_ERROR_CONCEALMENT
    vpx_free(oci->prev_mip);
    oci->prev_mip = NULL;
 #endif
 }
 /* (Re)allocates all buffers of *oci for a width x height frame.
  *
  * Any previously held buffers are released first. Both dimensions are
  * rounded up to the next multiple of 16 before use. On success the
  * frame-buffer indices, reference counts and macroblock geometry
  * (mb_rows, mb_cols, MBs, mode_info_stride, mi) are initialized.
  *
  * Returns 0 on success. On any allocation failure everything allocated so
  * far is released again and 1 is returned. */
 int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height)
 {
    int idx;
    vp8_de_alloc_frame_buffers(oci);
    /* our internal buffers are always multiples of 16 */
    width = (width + 15) & ~15;
    height = (height + 15) & ~15;
    for (idx = 0; idx < NUM_YV12_BUFFERS; ++idx)
    {
        oci->fb_idx_ref_cnt[idx] = 0;
        oci->yv12_fb[idx].flags = 0;
        if (vp8_yv12_alloc_frame_buffer(&oci->yv12_fb[idx], width, height,
                                        VP8BORDERINPIXELS) < 0)
        {
            goto fail;
        }
    }
    /* Buffers 0..3 start out as new / last / golden / alt-ref, each holding
     * one reference. */
    oci->new_fb_idx = 0;
    oci->lst_fb_idx = 1;
    oci->gld_fb_idx = 2;
    oci->alt_fb_idx = 3;
    for (idx = 0; idx < 4; ++idx)
    {
        oci->fb_idx_ref_cnt[idx] = 1;
    }
    /* Scratch frame for scaling: full width, 16 rows. */
    if (vp8_yv12_alloc_frame_buffer(&oci->temp_scale_frame, width, 16,
                                    VP8BORDERINPIXELS) < 0)
    {
        goto fail;
    }
    /* Geometry in 16x16 macroblocks. */
    oci->mb_rows = height >> 4;
    oci->mb_cols = width >> 4;
    oci->MBs = oci->mb_rows * oci->mb_cols;
    oci->mode_info_stride = oci->mb_cols + 1;
    /* Mode info is allocated with one extra row and one extra column; mi
     * points one row and one entry past the start of that allocation. */
    oci->mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1),
                          sizeof(MODE_INFO));
    if (oci->mip == NULL)
    {
        goto fail;
    }
    oci->mi = oci->mip + oci->mode_info_stride + 1;
    /* Allocation of previous mode info will be done in vp8_decode_frame()
     * as it is a decoder only data */
    oci->above_context =
        vpx_calloc(sizeof(ENTROPY_CONTEXT_PLANES) * oci->mb_cols, 1);
    if (oci->above_context == NULL)
    {
        goto fail;
    }
 #if CONFIG_POSTPROC
    if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer, width, height,
                                    VP8BORDERINPIXELS) < 0)
    {
        goto fail;
    }
    oci->post_proc_buffer_int_used = 0;
    memset(&oci->postproc_state, 0, sizeof(oci->postproc_state));
    /* Fill the whole post-proc allocation with the value 128. */
    memset(oci->post_proc_buffer.buffer_alloc, 128,
           oci->post_proc_buffer.frame_size);
    /* Allocate buffer to store post-processing filter coefficients.
     *
     * Note: Round up mb_cols to support SIMD reads
     */
    oci->pp_limits_buffer = vpx_memalign(16, 24 * ((oci->mb_cols + 1) & ~1));
    if (oci->pp_limits_buffer == NULL)
    {
        goto fail;
    }
 #endif
    return 0;
 fail:
    /* Undo any partial allocation before reporting the failure. */
    vp8_de_alloc_frame_buffers(oci);
    return 1;
 }
 /* Selects loop-filter and motion-compensation settings from cm->version.
  *
  *   version  no_lpf  filter_type         bilinear  full_pixel
  *   0        0       NORMAL_LOOPFILTER   0         0
  *   1        0       SIMPLE_LOOPFILTER   1         0
  *   2        1       NORMAL_LOOPFILTER   1         0
  *   3        1       SIMPLE_LOOPFILTER   1         1
  *   other    same as version 0 (4,5,6,7 are reserved for future use)
  */
 void vp8_setup_version(VP8_COMMON *cm)
 {
    /* Start from the version-0 settings; the switch only records what each
     * other version changes relative to them. */
    cm->no_lpf = 0;
    cm->filter_type = NORMAL_LOOPFILTER;
    cm->use_bilinear_mc_filter = 0;
    cm->full_pixel = 0;
    switch (cm->version)
    {
    case 1:
        cm->filter_type = SIMPLE_LOOPFILTER;
        cm->use_bilinear_mc_filter = 1;
        break;
    case 2:
        cm->no_lpf = 1;
        cm->use_bilinear_mc_filter = 1;
        break;
    case 3:
        cm->no_lpf = 1;
        cm->filter_type = SIMPLE_LOOPFILTER;
        cm->use_bilinear_mc_filter = 1;
        cm->full_pixel = 1;
        break;
    default:
        /* Version 0 and the reserved versions keep the defaults. */
        break;
    }
 }
 /* Puts *oci into its default state: runs the machine-specific and
  * probability-table initializers, then applies the default bitstream
  * settings. No frame buffers are allocated here. */
 void vp8_create_common(VP8_COMMON *oci)
 {
    /* The helper-driven setup runs first; the explicit defaults below are
     * applied after it. */
    vp8_machine_specific_config(oci);
    vp8_init_mbmode_probs(oci);
    vp8_default_bmode_probs(oci->fc.bmode_prob);
    /* Initialize reference frame sign bias structure to defaults */
    memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
    /* Partitioning, clamping and coefficient-skip defaults. */
    oci->multi_token_partition = ONE_PARTITION;
    oci->clamp_type = RECON_CLAMP_REQUIRED;
    oci->mb_no_coeff_skip = 1;
    /* Same filter settings vp8_setup_version() selects for version 0. */
    oci->filter_type = NORMAL_LOOPFILTER;
    oci->no_lpf = 0;
    oci->use_bilinear_mc_filter = 0;
    oci->full_pixel = 0;
    /* Default disable buffer to buffer copying */
    oci->copy_buffer_to_arf = 0;
    oci->copy_buffer_to_gf = 0;
 }
 /* Tears down *oci. The only work done is releasing its buffers through
  * vp8_de_alloc_frame_buffers(). */
 void vp8_remove_common(VP8_COMMON *oci)
 {
    vp8_de_alloc_frame_buffers(oci);
 }
--- a/vp8/common/alloccommon.h
+++ b/vp8/common/alloccommon.h
@@ -1,31 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #ifndef VP8_COMMON_ALLOCCOMMON_H_
 #define VP8_COMMON_ALLOCCOMMON_H_
 #include "onyxc_int.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 /* Initializes *oci with the default common settings (no buffers are
  * allocated). */
 void vp8_create_common(VP8_COMMON *oci);
 /* Tears down *oci by releasing its frame buffers. */
 void vp8_remove_common(VP8_COMMON *oci);
 /* Frees all buffers attached to *oci and resets the freed pointers to
  * NULL. */
 void vp8_de_alloc_frame_buffers(VP8_COMMON *oci);
 /* Releases any existing buffers, then allocates buffers for a
  * width x height frame (each dimension rounded up to a multiple of 16).
  * Returns 0 on success and 1 on allocation failure. */
 int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height);
 /* Sets no_lpf, filter_type, use_bilinear_mc_filter and full_pixel from
  * oci->version. */
 void vp8_setup_version(VP8_COMMON *oci);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 #endif  // VP8_COMMON_ALLOCCOMMON_H_
--- a/vp8/common/arm/armv6/bilinearfilter_v6.asm
+++ b/vp8/common/arm/armv6/bilinearfilter_v6.asm
@@ -1,237 +0,0 @@
 ;
 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license
 ;  that can be found in the LICENSE file in the root of the source
 ;  tree. An additional intellectual property rights grant can be found
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
    EXPORT  |vp8_filter_block2d_bil_first_pass_armv6|
    EXPORT  |vp8_filter_block2d_bil_second_pass_armv6|
    AREA    |.text|, CODE, READONLY  ; name this block of code
 ;-------------------------------------
 ; r0    unsigned char  *src_ptr,
 ; r1    unsigned short *dst_ptr,
 ; r2    unsigned int    src_pitch,
 ; r3    unsigned int    height,
 ; stack unsigned int    width,
 ; stack const short    *vp8_filter
 ;-------------------------------------
 ; The output is stored transposed in the output array to make second-pass filtering easier.
 ; First bilinear pass: filters each pair of adjacent source bytes with the two
 ; 16-bit taps loaded from vp8_filter and stores the 16-bit results transposed
 ; (successive outputs of one input row are written r3 bytes apart).
 ; When the first filter word equals 128 the source bytes are copied unfiltered.
 |vp8_filter_block2d_bil_first_pass_armv6| PROC
    stmdb   sp!, {r4 - r11, lr}             ; save 9 registers (36 bytes)
    ldr     r11, [sp, #40]                  ; vp8_filter address (second stack argument)
    ldr     r4, [sp, #36]                   ; width (first stack argument)
    mov     r12, r3                         ; outer-loop counter
    add     r7, r2, r4                      ; preload next row
    pld     [r0, r7]
    sub     r2, r2, r4                      ; src increment for height loop
    ldr     r5, [r11]                       ; load up filter coefficients
    mov     r3, r3, lsl #1                  ; height*2
    add     r3, r3, #2                      ; plus 2 since height is actually (height+1); original note says this keeps the output buffer "4-bit aligned" -- presumably 4-byte, TODO confirm
    mov     r11, r1                         ; save dst_ptr for each row
    cmp     r5, #128                        ; if filter coef = 128, then skip the filter
    beq     bil_null_1st_filter
 |bil_height_loop_1st_v6|
    ldrb    r6, [r0]                        ; load source data
    ldrb    r7, [r0, #1]
    ldrb    r8, [r0, #2]
    mov     lr, r4, lsr #2                  ; 4-in-parallel loop counter (width / 4)
 |bil_width_loop_1st_v6|
    ldrb    r9, [r0, #3]
    ldrb    r10, [r0, #4]
    pkhbt   r6, r6, r7, lsl #16             ; src[1] | src[0]
    pkhbt   r7, r7, r8, lsl #16             ; src[2] | src[1]
    smuad   r6, r6, r5                      ; apply the filter
    pkhbt   r8, r8, r9, lsl #16             ; src[3] | src[2]
    smuad   r7, r7, r5
    pkhbt   r9, r9, r10, lsl #16            ; src[4] | src[3]
    smuad   r8, r8, r5
    smuad   r9, r9, r5
    add     r0, r0, #4
    subs    lr, lr, #1                      ; sets the flags tested by ldrne*/bne below
    add     r6, r6, #0x40                   ; round_shift_and_clamp
    add     r7, r7, #0x40
    usat    r6, #16, r6, asr #7
    usat    r7, #16, r7, asr #7
    strh    r6, [r1], r3                    ; result is transposed and stored
    add     r8, r8, #0x40                   ; round_shift_and_clamp
    strh    r7, [r1], r3
    add     r9, r9, #0x40
    usat    r8, #16, r8, asr #7
    usat    r9, #16, r9, asr #7
    strh    r8, [r1], r3                    ; result is transposed and stored
    ldrneb  r6, [r0]                        ; load source data (only if another iteration follows)
    strh    r9, [r1], r3
    ldrneb  r7, [r0, #1]
    ldrneb  r8, [r0, #2]
    bne     bil_width_loop_1st_v6
    add     r0, r0, r2                      ; move to next input row
    subs    r12, r12, #1
    add     r9, r2, r4, lsl #1              ; adding back block width
    pld     [r0, r9]                        ; preload next row
    add     r11, r11, #2                    ; move over to next column
    mov     r1, r11
    bne     bil_height_loop_1st_v6
    ldmia   sp!, {r4 - r11, pc}             ; restore registers and return
 ; Pass-through path: widen each source byte to 16 bits and store it transposed.
 |bil_null_1st_filter|
 |bil_height_loop_null_1st|
    mov     lr, r4, lsr #2                  ; loop counter
 |bil_width_loop_null_1st|
    ldrb    r6, [r0]                        ; load data
    ldrb    r7, [r0, #1]
    ldrb    r8, [r0, #2]
    ldrb    r9, [r0, #3]
    strh    r6, [r1], r3                    ; store it to the intermediate buffer
    add     r0, r0, #4
    strh    r7, [r1], r3
    subs    lr, lr, #1
    strh    r8, [r1], r3
    strh    r9, [r1], r3
    bne     bil_width_loop_null_1st
    subs    r12, r12, #1
    add     r0, r0, r2                      ; move to next input line
    add     r11, r11, #2                    ; move over to next column
    mov     r1, r11
    bne     bil_height_loop_null_1st
    ldmia   sp!, {r4 - r11, pc}             ; restore registers and return
    ENDP  ; |vp8_filter_block2d_bil_first_pass_armv6|
 ;---------------------------------
 ; r0    unsigned short *src_ptr,
 ; r1    unsigned char  *dst_ptr,
 ; r2    int             dst_pitch,
 ; r3    unsigned int    height,
 ; stack unsigned int    width,
 ; stack const short    *vp8_filter
 ;---------------------------------
 ; Second bilinear pass: reads the 16-bit values written by the first pass,
 ; filters adjacent pairs with the two 16-bit taps loaded from vp8_filter and
 ; stores the results as bytes, r2 (dst_pitch) bytes apart.
 ; When the first filter word equals 128 the low byte of each value is stored
 ; unfiltered.
 |vp8_filter_block2d_bil_second_pass_armv6| PROC
    stmdb   sp!, {r4 - r11, lr}             ; save 9 registers (36 bytes)
    ldr     r11, [sp, #40]                  ; vp8_filter address (second stack argument)
    ldr     r4, [sp, #36]                   ; width (first stack argument)
    ldr     r5, [r11]                       ; load up filter coefficients
    mov     r12, r4                         ; outer-loop counter = width, since we work on transposed data matrix
    mov     r11, r1                         ; save dst_ptr for each output column
    cmp     r5, #128                        ; if filter coef = 128, then skip the filter
    beq     bil_null_2nd_filter
 |bil_height_loop_2nd|
    ldr     r6, [r0]                        ; load the data
    ldr     r8, [r0, #4]
    ldrh    r10, [r0, #8]
    mov     lr, r3, lsr #2                  ; loop counter (height / 4)
 |bil_width_loop_2nd|
    pkhtb   r7, r6, r8                      ; src[1] | src[2]
    pkhtb   r9, r8, r10                     ; src[3] | src[4]
    smuad   r6, r6, r5                      ; apply filter
    smuad   r8, r8, r5                      ; apply filter
    subs    lr, lr, #1                      ; sets the flags tested by ldrne*/bne below
    smuadx  r7, r7, r5                      ; apply filter
    smuadx  r9, r9, r5                      ; apply filter
    add     r0, r0, #8
    add     r6, r6, #0x40                   ; round_shift_and_clamp
    add     r7, r7, #0x40
    usat    r6, #8, r6, asr #7
    usat    r7, #8, r7, asr #7
    strb    r6, [r1], r2                    ; the result is transposed back and stored
    add     r8, r8, #0x40                   ; round_shift_and_clamp
    strb    r7, [r1], r2
    add     r9, r9, #0x40
    usat    r8, #8, r8, asr #7
    usat    r9, #8, r9, asr #7
    strb    r8, [r1], r2                    ; the result is transposed back and stored
    ldrne   r6, [r0]                        ; load data (only if another iteration follows)
    strb    r9, [r1], r2
    ldrne   r8, [r0, #4]
    ldrneh  r10, [r0, #8]
    bne     bil_width_loop_2nd
    subs    r12, r12, #1
    add     r0, r0, #4                      ; update src for next row
    add     r11, r11, #1                    ; next output column
    mov     r1, r11
    bne     bil_height_loop_2nd
    ldmia   sp!, {r4 - r11, pc}             ; restore registers and return
 ; Pass-through path: store the low byte of each 16-bit input value.
 |bil_null_2nd_filter|
 |bil_height_loop_null_2nd|
    mov     lr, r3, lsr #2
 |bil_width_loop_null_2nd|
    ldr     r6, [r0], #4                    ; load data
    subs    lr, lr, #1
    ldr     r8, [r0], #4
    strb    r6, [r1], r2                    ; store data
    mov     r7, r6, lsr #16                 ; upper halfword of the first word
    strb    r7, [r1], r2
    mov     r9, r8, lsr #16                 ; upper halfword of the second word
    strb    r8, [r1], r2
    strb    r9, [r1], r2
    bne     bil_width_loop_null_2nd
    subs    r12, r12, #1
    add     r0, r0, #4
    add     r11, r11, #1
    mov     r1, r11
    bne     bil_height_loop_null_2nd
    ldmia   sp!, {r4 - r11, pc}             ; restore registers and return
    ENDP  ; |vp8_filter_block2d_bil_second_pass_armv6|
    END
--- a/vp8/common/arm/armv6/copymem16x16_v6.asm
+++ b/vp8/common/arm/armv6/copymem16x16_v6.asm
@@ -1,186 +0,0 @@
 ;
 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license
 ;  that can be found in the LICENSE file in the root of the source
 ;  tree. An additional intellectual property rights grant can be found
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
    EXPORT  |vp8_copy_mem16x16_v6|
    ; ARM
    ; REQUIRE8
    ; PRESERVE8
    AREA    Block, CODE, READONLY ; name this block of code
 ;void vp8_copy_mem16x16_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
 ; Copy a 16-byte-wide, 16-row block from src to dst.
 ;   r0 = src, r1 = src_stride, r2 = dst, r3 = dst_stride
 ; r4-r7 hold the data in flight (saved on entry, restored on every exit
 ; path) and r12 counts the remaining rows. One of four copy loops is
 ; selected from the alignment of the SOURCE pointer only.
 ; NOTE(review): the destination alignment is never tested, so the word
 ; and multi-word stores presumably rely on callers passing a suitably
 ; aligned dst - confirm against callers.
 |vp8_copy_mem16x16_v6| PROC
    stmdb       sp!, {r4 - r7}
    ;push   {r4-r7}
    ;preload
    pld     [r0, #31]                ; preload for next 16x16 block
    ands    r4, r0, #15              ; src 16-byte aligned -> 4-word path
    beq     copy_mem16x16_fast
    ands    r4, r0, #7               ; src 8-byte aligned -> 2x2-word path
    beq     copy_mem16x16_8
    ands    r4, r0, #3               ; src 4-byte aligned -> word path
    beq     copy_mem16x16_4
    ;copy one byte each time
    ; The first four bytes of a row are loaded ahead of the loop body:
    ; here for row 0, and by the conditional loads at the bottom of the
    ; loop for every following row.
    ldrb    r4, [r0]
    ldrb    r5, [r0, #1]
    ldrb    r6, [r0, #2]
    ldrb    r7, [r0, #3]
    mov     r12, #16                 ; 16 rows
 copy_mem16x16_1_loop
    strb    r4, [r2]
    strb    r5, [r2, #1]
    strb    r6, [r2, #2]
    strb    r7, [r2, #3]
    ldrb    r4, [r0, #4]
    ldrb    r5, [r0, #5]
    ldrb    r6, [r0, #6]
    ldrb    r7, [r0, #7]
    subs    r12, r12, #1             ; flags set here are consumed by the
                                     ; ldrneb/bne at the end of the loop;
                                     ; nothing in between updates them
    strb    r4, [r2, #4]
    strb    r5, [r2, #5]
    strb    r6, [r2, #6]
    strb    r7, [r2, #7]
    ldrb    r4, [r0, #8]
    ldrb    r5, [r0, #9]
    ldrb    r6, [r0, #10]
    ldrb    r7, [r0, #11]
    strb    r4, [r2, #8]
    strb    r5, [r2, #9]
    strb    r6, [r2, #10]
    strb    r7, [r2, #11]
    ldrb    r4, [r0, #12]
    ldrb    r5, [r0, #13]
    ldrb    r6, [r0, #14]
    ldrb    r7, [r0, #15]
    add     r0, r0, r1               ; src -> next row
    strb    r4, [r2, #12]
    strb    r5, [r2, #13]
    strb    r6, [r2, #14]
    strb    r7, [r2, #15]
    add     r2, r2, r3               ; dst -> next row
    ldrneb  r4, [r0]                 ; only when rows remain: fetch the
    ldrneb  r5, [r0, #1]             ; first four bytes of the next row
    ldrneb  r6, [r0, #2]
    ldrneb  r7, [r0, #3]
    pld     [r0, #31]               ; preload for next 16x16 block
    bne     copy_mem16x16_1_loop
    ldmia       sp!, {r4 - r7}
    ;pop        {r4-r7}
    mov     pc, lr
 ;copy 4 bytes each time
 ; Same software-pipelined shape as the byte loop: the row's four words
 ; are loaded before the stores that write them out.
 copy_mem16x16_4
    ldr     r4, [r0]
    ldr     r5, [r0, #4]
    ldr     r6, [r0, #8]
    ldr     r7, [r0, #12]
    mov     r12, #16                 ; 16 rows
 copy_mem16x16_4_loop
    subs    r12, r12, #1             ; flags kept until the ldrne/bne below
    add     r0, r0, r1               ; src -> next row
    str     r4, [r2]
    str     r5, [r2, #4]
    str     r6, [r2, #8]
    str     r7, [r2, #12]
    add     r2, r2, r3               ; dst -> next row
    ldrne   r4, [r0]                 ; next row's data, only if rows remain
    ldrne   r5, [r0, #4]
    ldrne   r6, [r0, #8]
    ldrne   r7, [r0, #12]
    pld     [r0, #31]               ; preload for next 16x16 block
    bne     copy_mem16x16_4_loop
    ldmia       sp!, {r4 - r7}
    ;pop        {r4-r7}
    mov     pc, lr
 ;copy 8 bytes each time
 ; Both ldmia/stmia pairs use write-back and together advance the
 ; pointers by 16 bytes per row, so the strides are reduced by 16 first.
 copy_mem16x16_8
    sub     r1, r1, #16
    sub     r3, r3, #16
    mov     r12, #16                 ; 16 rows
 copy_mem16x16_8_loop
    ldmia   r0!, {r4-r5}
    ;ldm        r0, {r4-r5}
    ldmia   r0!, {r6-r7}
    add     r0, r0, r1               ; src -> next row (adjusted stride)
    stmia   r2!, {r4-r5}
    subs    r12, r12, #1
    ;stm        r2, {r4-r5}
    stmia   r2!, {r6-r7}
    add     r2, r2, r3               ; dst -> next row (adjusted stride)
    pld     [r0, #31]               ; preload for next 16x16 block
    bne     copy_mem16x16_8_loop
    ldmia       sp!, {r4 - r7}
    ;pop        {r4-r7}
    mov     pc, lr
 ;copy 16 bytes each time
 ; ldmia/stmia are used without write-back here, so the strides are used
 ; unmodified (the stride adjustment below is intentionally disabled).
 copy_mem16x16_fast
    ;sub        r1, r1, #16
    ;sub        r3, r3, #16
    mov     r12, #16                 ; 16 rows
 copy_mem16x16_fast_loop
    ldmia   r0, {r4-r7}
    ;ldm        r0, {r4-r7}
    add     r0, r0, r1               ; src -> next row
    subs    r12, r12, #1
    stmia   r2, {r4-r7}
    ;stm        r2, {r4-r7}
    add     r2, r2, r3               ; dst -> next row
    pld     [r0, #31]               ; preload for next 16x16 block
    bne     copy_mem16x16_fast_loop
    ldmia       sp!, {r4 - r7}
    ;pop        {r4-r7}
    mov     pc, lr
    ENDP  ; |vp8_copy_mem16x16_v6|
    END
--- a/vp8/common/arm/armv6/copymem8x4_v6.asm
+++ b/vp8/common/arm/armv6/copymem8x4_v6.asm
@@ -1,128 +0,0 @@
 ;
 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license
 ;  that can be found in the LICENSE file in the root of the source
 ;  tree. An additional intellectual property rights grant can be found
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
    EXPORT  |vp8_copy_mem8x4_v6|
    ; ARM
    ; REQUIRE8
    ; PRESERVE8
    AREA    Block, CODE, READONLY ; name this block of code
 ;void vp8_copy_mem8x4_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
 ; Copy an 8-byte-wide, 4-row block from src to dst.
 ;   r0 = src, r1 = src_stride, r2 = dst, r3 = dst_stride
 ; r4-r5 hold the data in flight (saved on entry, restored on every exit
 ; path) and r12 counts the remaining rows. The copy loop is chosen from
 ; the alignment of the SOURCE pointer only.
 ; NOTE(review): dst alignment is not tested; the word and two-word
 ; stores presumably rely on an aligned dst - confirm against callers.
 |vp8_copy_mem8x4_v6| PROC
    ;push   {r4-r5}
    stmdb  sp!, {r4-r5}
    ;preload
    pld     [r0]                     ; rows 0, 1 and 2 of the source
    pld     [r0, r1]
    pld     [r0, r1, lsl #1]
    ands    r4, r0, #7               ; src 8-byte aligned -> two-word path
    beq     copy_mem8x4_fast
    ands    r4, r0, #3               ; src 4-byte aligned -> word path
    beq     copy_mem8x4_4
    ;copy 1 byte each time
    ; The first two bytes of each row are loaded ahead of the loop body
    ; (here for row 0, by the ldrneb pair for the following rows).
    ldrb    r4, [r0]
    ldrb    r5, [r0, #1]
    mov     r12, #4                  ; 4 rows
 copy_mem8x4_1_loop
    strb    r4, [r2]
    strb    r5, [r2, #1]
    ldrb    r4, [r0, #2]
    ldrb    r5, [r0, #3]
    subs    r12, r12, #1             ; flags kept until the ldrneb/bne below
    strb    r4, [r2, #2]
    strb    r5, [r2, #3]
    ldrb    r4, [r0, #4]
    ldrb    r5, [r0, #5]
    strb    r4, [r2, #4]
    strb    r5, [r2, #5]
    ldrb    r4, [r0, #6]
    ldrb    r5, [r0, #7]
    add     r0, r0, r1               ; src -> next row
    strb    r4, [r2, #6]
    strb    r5, [r2, #7]
    add     r2, r2, r3               ; dst -> next row
    ldrneb  r4, [r0]                 ; next row's first bytes, if any remain
    ldrneb  r5, [r0, #1]
    bne     copy_mem8x4_1_loop
    ldmia       sp!, {r4 - r5}
    ;pop        {r4-r5}
    mov     pc, lr
 ;copy 4 bytes each time
 copy_mem8x4_4
    ldr     r4, [r0]
    ldr     r5, [r0, #4]
    mov     r12, #4                  ; 4 rows
 copy_mem8x4_4_loop
    subs    r12, r12, #1             ; flags kept until the ldrne/bne below
    add     r0, r0, r1               ; src -> next row
    str     r4, [r2]
    str     r5, [r2, #4]
    add     r2, r2, r3               ; dst -> next row
    ldrne   r4, [r0]                 ; next row's data, only if rows remain
    ldrne   r5, [r0, #4]
    bne     copy_mem8x4_4_loop
    ldmia  sp!, {r4-r5}
    ;pop        {r4-r5}
    mov     pc, lr
 ;copy 8 bytes each time
 ; ldmia/stmia are used without write-back, so the strides are used
 ; unmodified (the stride adjustment below is intentionally disabled).
 copy_mem8x4_fast
    ;sub        r1, r1, #8
    ;sub        r3, r3, #8
    mov     r12, #4                  ; 4 rows
 copy_mem8x4_fast_loop
    ldmia   r0, {r4-r5}
    ;ldm        r0, {r4-r5}
    add     r0, r0, r1               ; src -> next row
    subs    r12, r12, #1
    stmia   r2, {r4-r5}
    ;stm        r2, {r4-r5}
    add     r2, r2, r3               ; dst -> next row
    bne     copy_mem8x4_fast_loop
    ldmia  sp!, {r4-r5}
    ;pop        {r4-r5}
    mov     pc, lr
    ENDP  ; |vp8_copy_mem8x4_v6|
    END
--- a/vp8/common/arm/armv6/copymem8x8_v6.asm
+++ b/vp8/common/arm/armv6/copymem8x8_v6.asm
@@ -1,128 +0,0 @@
 ;
 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license
 ;  that can be found in the LICENSE file in the root of the source
 ;  tree. An additional intellectual property rights grant can be found
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
    EXPORT  |vp8_copy_mem8x8_v6|
    ; ARM
    ; REQUIRE8
    ; PRESERVE8
    AREA    Block, CODE, READONLY ; name this block of code
 ;void vp8_copy_mem8x8_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
 ; Copy an 8-byte-wide, 8-row block from src to dst.
 ;   r0 = src, r1 = src_stride, r2 = dst, r3 = dst_stride
 ; r4-r5 hold the data in flight (saved on entry, restored on every exit
 ; path) and r12 counts the remaining rows. The copy loop is chosen from
 ; the alignment of the SOURCE pointer only.
 ; NOTE(review): dst alignment is not tested; the word and two-word
 ; stores presumably rely on an aligned dst - confirm against callers.
 |vp8_copy_mem8x8_v6| PROC
    ;push   {r4-r5}
    stmdb  sp!, {r4-r5}
    ;preload
    pld     [r0]                     ; rows 0, 1 and 2 of the source
    pld     [r0, r1]
    pld     [r0, r1, lsl #1]
    ands    r4, r0, #7               ; src 8-byte aligned -> two-word path
    beq     copy_mem8x8_fast
    ands    r4, r0, #3               ; src 4-byte aligned -> word path
    beq     copy_mem8x8_4
    ;copy 1 byte each time
    ; The first two bytes of each row are loaded ahead of the loop body
    ; (here for row 0, by the ldrneb pair for the following rows).
    ldrb    r4, [r0]
    ldrb    r5, [r0, #1]
    mov     r12, #8                  ; 8 rows
 copy_mem8x8_1_loop
    strb    r4, [r2]
    strb    r5, [r2, #1]
    ldrb    r4, [r0, #2]
    ldrb    r5, [r0, #3]
    subs    r12, r12, #1             ; flags kept until the ldrneb/bne below
    strb    r4, [r2, #2]
    strb    r5, [r2, #3]
    ldrb    r4, [r0, #4]
    ldrb    r5, [r0, #5]
    strb    r4, [r2, #4]
    strb    r5, [r2, #5]
    ldrb    r4, [r0, #6]
    ldrb    r5, [r0, #7]
    add     r0, r0, r1               ; src -> next row
    strb    r4, [r2, #6]
    strb    r5, [r2, #7]
    add     r2, r2, r3               ; dst -> next row
    ldrneb  r4, [r0]                 ; next row's first bytes, if any remain
    ldrneb  r5, [r0, #1]
    bne     copy_mem8x8_1_loop
    ldmia       sp!, {r4 - r5}
    ;pop        {r4-r5}
    mov     pc, lr
 ;copy 4 bytes each time
 copy_mem8x8_4
    ldr     r4, [r0]
    ldr     r5, [r0, #4]
    mov     r12, #8                  ; 8 rows
 copy_mem8x8_4_loop
    subs    r12, r12, #1             ; flags kept until the ldrne/bne below
    add     r0, r0, r1               ; src -> next row
    str     r4, [r2]
    str     r5, [r2, #4]
    add     r2, r2, r3               ; dst -> next row
    ldrne   r4, [r0]                 ; next row's data, only if rows remain
    ldrne   r5, [r0, #4]
    bne     copy_mem8x8_4_loop
    ldmia       sp!, {r4 - r5}
    ;pop        {r4-r5}
    mov     pc, lr
 ;copy 8 bytes each time
 ; ldmia/stmia are used without write-back, so the strides are used
 ; unmodified (the stride adjustment below is intentionally disabled).
 copy_mem8x8_fast
    ;sub        r1, r1, #8
    ;sub        r3, r3, #8
    mov     r12, #8                  ; 8 rows
 copy_mem8x8_fast_loop
    ldmia   r0, {r4-r5}
    ;ldm        r0, {r4-r5}
    add     r0, r0, r1               ; src -> next row
    subs    r12, r12, #1
    stmia   r2, {r4-r5}
    ;stm        r2, {r4-r5}
    add     r2, r2, r3               ; dst -> next row
    bne     copy_mem8x8_fast_loop
    ldmia  sp!, {r4-r5}
    ;pop        {r4-r5}
    mov     pc, lr
    ENDP  ; |vp8_copy_mem8x8_v6|
    END
--- a/vp8/common/arm/armv6/dc_only_idct_add_v6.asm
+++ b/vp8/common/arm/armv6/dc_only_idct_add_v6.asm
@@ -1,70 +0,0 @@
 ;
 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license and patent
 ;  grant that can be found in the LICENSE file in the root of the source
 ;  tree. All contributing project authors may be found in the AUTHORS
 ;  file in the root of the source tree.
 ;
    EXPORT  |vp8_dc_only_idct_add_v6|
    AREA    |.text|, CODE, READONLY
 ;void vp8_dc_only_idct_add_v6(short input_dc, unsigned char *pred_ptr,
 ;                             int pred_stride, unsigned char *dst_ptr,
 ;                             int dst_stride)
 ; r0  input_dc
 ; r1  pred_ptr
 ; r2  pred_stride
 ; r3  dst_ptr
 ; sp  dst_stride
 ; Adds the DC term a1 = (input_dc + 4) >> 3 to each byte of a 4x4
 ; prediction block, saturates every result to 0..255 and writes the
 ; four rows to dst.
 ; a1 is replicated into both halfwords of r0 so that each UXTAB16 adds
 ; it to two prediction bytes at once; rows are handled two at a time.
 ; Four registers (16 bytes) are pushed on entry, which is why the fifth
 ; argument, dst_stride, is read from [sp, #16].
 ; NOTE(review): each prediction row is read and each output row written
 ; with a single word access - presumably both pointers are word aligned
 ; or unaligned word access is enabled; confirm.
 |vp8_dc_only_idct_add_v6| PROC
    stmdb       sp!, {r4 - r7}
    add         r0, r0, #4                ; input_dc += 4
    ldr         r12, c0x0000FFFF
    ldr         r4, [r1], r2              ; pred row 0, then step to row 1
    and         r0, r12, r0, asr #3       ; input_dc >> 3 + mask
    ldr         r6, [r1], r2              ; pred row 1, then step to row 2
    orr         r0, r0, r0, lsl #16       ; a1 | a1
    ldr         r12, [sp, #16]            ; dst stride
    uxtab16     r5, r0, r4                ; a1+2 | a1+0
    uxtab16     r4, r0, r4, ror #8        ; a1+3 | a1+1
    uxtab16     r7, r0, r6                ; same two sums for row 1
    uxtab16     r6, r0, r6, ror #8
    usat16      r5, #8, r5                ; clamp each halfword to 0..255
    usat16      r4, #8, r4
    usat16      r7, #8, r7
    usat16      r6, #8, r6
    orr         r5, r5, r4, lsl #8        ; re-interleave into 4 bytes
    orr         r7, r7, r6, lsl #8
    ldr         r4, [r1], r2              ; pred row 2, then step to row 3
    str         r5, [r3], r12             ; store row 0
    ldr         r6, [r1]                  ; pred row 3
    str         r7, [r3], r12             ; store row 1
    uxtab16     r5, r0, r4                ; rows 2 and 3: same sequence
    uxtab16     r4, r0, r4, ror #8
    uxtab16     r7, r0, r6
    uxtab16     r6, r0, r6, ror #8
    usat16      r5, #8, r5
    usat16      r4, #8, r4
    usat16      r7, #8, r7
    usat16      r6, #8, r6
    orr         r5, r5, r4, lsl #8
    orr         r7, r7, r6, lsl #8
    str         r5, [r3], r12             ; store row 2
    str         r7, [r3]                  ; store row 3
    ldmia       sp!, {r4 - r7}
    bx          lr
    ENDP  ; |vp8_dc_only_idct_add_v6|
 ; Constant Pool
 ; Mask that keeps only the low halfword of the shifted DC value.
 c0x0000FFFF DCD 0x0000FFFF
    END
--- a/vp8/common/arm/armv6/dequant_idct_v6.asm
+++ b/vp8/common/arm/armv6/dequant_idct_v6.asm
@@ -1,190 +0,0 @@
 ;
 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license and patent
 ;  grant that can be found in the LICENSE file in the root of the source
 ;  tree. All contributing project authors may be found in the AUTHORS
 ;  file in the root of the source tree.
 ;
    EXPORT |vp8_dequant_idct_add_v6|
    AREA |.text|, CODE, READONLY
 ;void vp8_dequant_idct_add_v6(short *input, short *dq,
 ;                             unsigned char *dest, int stride)
 ; r0 = q
 ; r1 = dq
 ; r2 = dst
 ; r3 = stride
 ; Dequantizes a block of 16 coefficients in place, runs a two-pass 4x4
 ; inverse transform on it, adds the result to the four rows of pixels
 ; at dst, and finally clears the 32-byte coefficient buffer.
 ;
 ; Stages, in order:
 ;   1. vp8_dequant_add_loop      - input[i] *= dq[i], written back into
 ;                                  the input buffer (4 coeffs/iteration).
 ;   2. vp8_dequant_idct_loop1_v6 - first pass; each iteration combines
 ;                                  the words at byte offsets 0/8/16/24,
 ;                                  i.e. two 16-bit columns at once.
 ;   3. vp8_dequant_idct_loop2_v6 - second pass; each iteration consumes
 ;                                  16 bytes of intermediate data and
 ;                                  produces two output rows: round
 ;                                  (+4), >> 3, add to dst, clamp to
 ;                                  0..255, store.
 ;   4. memset                    - zero the 32-byte input buffer.
 ;
 ; The stride (r3) is parked in a 4-byte stack slot because r3 is reused
 ; for a transform constant. lr (r14) is used as a scratch register; the
 ; return goes through the saved copy popped into pc.
 |vp8_dequant_idct_add_v6| PROC
    stmdb   sp!, {r4-r11, lr}
    ldr     r4, [r0]                ;input
    ldr     r5, [r1], #4            ;dq
    sub     sp, sp, #4              ; scratch slot for stride
    str     r3, [sp]
    mov     r12, #4                 ; 4 iterations x 4 coefficients
 vp8_dequant_add_loop
    smulbb  r6, r4, r5              ; low halfword  * low halfword
    smultt  r7, r4, r5              ; high halfword * high halfword
    ldr     r4, [r0, #4]            ;input
    ldr     r5, [r1], #4            ;dq
    strh    r6, [r0], #2            ; write products back over the input
    strh    r7, [r0], #2
    smulbb  r6, r4, r5
    smultt  r7, r4, r5
    subs    r12, r12, #1            ; flags kept until the ldrne/bne below
    ldrne   r4, [r0, #4]            ; next pair, only if more remain
    ldrne   r5, [r1], #4
    strh    r6, [r0], #2
    strh    r7, [r0], #2
    bne     vp8_dequant_add_loop
    sub     r0, r0, #32             ; rewind to the start of the block
    mov     r1, r0                  ; r1 = write pointer for pass 1
 ; short_idct4x4llm_v6_dual
    ldr     r3, cospi8sqrt2minus1
    ldr     r4, sinpi8sqrt2
    ldr     r6, [r0, #8]
    mov     r5, #2                  ; 2 iterations x 2 columns
 vp8_dequant_idct_loop1_v6
    ; smulw{b,t} keep the top 32 bits of a 32x16-bit product; pkhbt then
    ; repacks two such results into one register of two halfwords.
    ldr     r12, [r0, #24]
    ldr     r14, [r0, #16]
    smulwt  r9, r3, r6
    smulwb  r7, r3, r6
    smulwt  r10, r4, r6
    smulwb  r8, r4, r6
    pkhbt   r7, r7, r9, lsl #16
    smulwt  r11, r3, r12
    pkhbt   r8, r8, r10, lsl #16
    uadd16  r6, r6, r7
    smulwt  r7, r4, r12
    smulwb  r9, r3, r12
    smulwb  r10, r4, r12
    subs    r5, r5, #1              ; flags kept until the ldrne/bne below
    pkhbt   r9, r9, r11, lsl #16
    ldr     r11, [r0], #4           ; offset-0 word; advance to next 2 columns
    pkhbt   r10, r10, r7, lsl #16
    uadd16  r7, r12, r9
    usub16  r7, r8, r7
    uadd16  r6, r6, r10
    uadd16  r10, r11, r14
    usub16  r8, r11, r14
    uadd16  r9, r10, r6
    usub16  r10, r10, r6
    uadd16  r6, r8, r7
    usub16  r7, r8, r7
    str     r6, [r1, #8]            ; results overwrite the input block
    ldrne   r6, [r0, #8]            ; preload for the next iteration
    str     r7, [r1, #16]
    str     r10, [r1, #24]
    str     r9, [r1], #4
    bne     vp8_dequant_idct_loop1_v6
    mov     r5, #2                  ; 2 iterations x 2 output rows
    sub     r0, r1, #8              ; r1 advanced 8 bytes; back to block start
 vp8_dequant_idct_loop2_v6
    ldr     r6, [r0], #4
    ldr     r7, [r0], #4
    ldr     r8, [r0], #4
    ldr     r9, [r0], #4
    smulwt  r1, r3, r6
    smulwt  r12, r4, r6
    smulwt  lr, r3, r8
    smulwt  r10, r4, r8
    pkhbt   r11, r8, r6, lsl #16
    pkhbt   r1, lr, r1, lsl #16
    pkhbt   r12, r10, r12, lsl #16
    pkhtb   r6, r6, r8, asr #16
    uadd16  r6, r1, r6
    pkhbt   lr, r9, r7, lsl #16
    uadd16  r10, r11, lr
    usub16  lr, r11, lr
    pkhtb   r8, r7, r9, asr #16
    subs    r5, r5, #1              ; flags kept until the final bne
    smulwt  r1, r3, r8
    smulwb  r7, r3, r8
    smulwt  r11, r4, r8
    smulwb  r9, r4, r8
    pkhbt   r1, r7, r1, lsl #16
    uadd16  r8, r1, r8
    pkhbt   r11, r9, r11, lsl #16
    usub16  r1, r12, r8
    uadd16  r8, r11, r6
    ldr     r9, c0x00040004         ; rounding term, 4 in each halfword
    ldr     r12, [sp]               ; get stride from stack
    uadd16  r6, r10, r8
    usub16  r7, r10, r8
    uadd16  r7, r7, r9              ; add rounding before the >> 3 below
    uadd16  r6, r6, r9
    uadd16  r10, r14, r1
    usub16  r1, r14, r1
    uadd16  r10, r10, r9
    uadd16  r1, r1, r9
    ; First output row: built from the HIGH halfwords (asr #3 on the
    ; register / asr #19 on the pkhtb operand both yield "high >> 3").
    ldr     r11, [r2]               ; load input from dst
    mov     r8, r7, asr #3
    pkhtb   r9, r8, r10, asr #19
    mov     r8, r1, asr #3
    pkhtb   r8, r8, r6, asr #19
    uxtb16  lr, r11, ror #8         ; dst bytes 1 and 3
    qadd16  r9, r9, lr
    uxtb16  lr, r11                 ; dst bytes 0 and 2
    qadd16  r8, r8, lr
    usat16  r9, #8, r9              ; clamp to 0..255
    usat16  r8, #8, r8
    orr     r9, r8, r9, lsl #8      ; re-interleave into 4 pixels
    ; Second output row: the LOW halfwords are first moved to the top so
    ; the same shift-and-pack sequence can be reused.
    ldr     r11, [r2, r12]          ; load input from dst
    mov     r7, r7, lsl #16
    mov     r1, r1, lsl #16
    mov     r10, r10, lsl #16
    mov     r6, r6, lsl #16
    mov     r7, r7, asr #3
    pkhtb   r7, r7, r10, asr #19
    mov     r1, r1, asr #3
    pkhtb   r1, r1, r6, asr #19
    uxtb16  r8, r11, ror #8
    qadd16  r7, r7, r8
    uxtb16  r8, r11
    qadd16  r1, r1, r8
    usat16  r7, #8, r7
    usat16  r1, #8, r1
    orr     r1, r1, r7, lsl #8
    str     r9, [r2], r12           ; store output to dst
    str     r1, [r2], r12           ; store output to dst
    bne     vp8_dequant_idct_loop2_v6
 ; memset
    sub     r0, r0, #32             ; loop2 consumed 32 bytes; rewind
    add     sp, sp, #4              ; release the stride slot
    mov     r12, #0
    str     r12, [r0]
    str     r12, [r0, #4]
    str     r12, [r0, #8]
    str     r12, [r0, #12]
    str     r12, [r0, #16]
    str     r12, [r0, #20]
    str     r12, [r0, #24]
    str     r12, [r0, #28]
    ldmia   sp!, {r4 - r11, pc}
    ENDP    ; |vp8_dequant_idct_add_v6|
 ; Constant Pool
 ; Transform multipliers (used with smulw*, i.e. scaled by 1/65536) and
 ; the packed rounding term applied before the final >> 3.
 cospi8sqrt2minus1 DCD 0x00004E7B
 sinpi8sqrt2       DCD 0x00008A8C
 c0x00040004       DCD 0x00040004
    END
--- a/vp8/common/arm/armv6/dequantize_v6.asm
+++ b/vp8/common/arm/armv6/dequantize_v6.asm
@@ -1,69 +0,0 @@
 ;
 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license
 ;  that can be found in the LICENSE file in the root of the source
 ;  tree. An additional intellectual property rights grant can be found
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
    EXPORT  |vp8_dequantize_b_loop_v6|
    AREA    |.text|, CODE, READONLY  ; name this block of code
 ;-------------------------------
 ;void   vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
 ; r0    short *Q,
 ; r1    short *DQC
 ; r2    short *DQ
 ; Element-wise multiply of 16 signed 16-bit values: DQ[i] = Q[i] * DQC[i].
 ; Two coefficients are held per 32-bit register and multiplied with
 ; smulbb (low halves) / smultt (high halves); strh keeps only the low
 ; 16 bits of each product. The loop runs twice and handles eight
 ; coefficients per iteration, with the loads for the next group
 ; interleaved between the stores of the current one.
 |vp8_dequantize_b_loop_v6| PROC
    stmdb   sp!, {r4-r9, lr}
    ldr     r3, [r0]                ;load Q
    ldr     r4, [r1]                ;load DQC
    ldr     r5, [r0, #4]
    ldr     r6, [r1, #4]
    mov     r12, #2                 ;loop counter
 dequant_loop
    smulbb  r7, r3, r4              ;multiply
    smultt  r8, r3, r4
    smulbb  r9, r5, r6
    smultt  lr, r5, r6              ; lr is scratch; return uses the saved copy
    ldr     r3, [r0, #8]            ; second group of four coefficients
    ldr     r4, [r1, #8]
    ldr     r5, [r0, #12]
    ldr     r6, [r1, #12]
    strh    r7, [r2], #2            ;store result
    smulbb  r7, r3, r4              ;multiply
    strh    r8, [r2], #2
    smultt  r8, r3, r4
    strh    r9, [r2], #2
    smulbb  r9, r5, r6
    strh    lr, [r2], #2
    smultt  lr, r5, r6
    subs    r12, r12, #1            ; flags kept until the ldrne/bne below
    add     r0, r0, #16             ; advance Q and DQC by 8 shorts
    add     r1, r1, #16
    ldrne       r3, [r0]            ; next iteration's inputs, if any remain
    strh    r7, [r2], #2            ;store result
    ldrne       r4, [r1]
    strh    r8, [r2], #2
    ldrne       r5, [r0, #4]
    strh    r9, [r2], #2
    ldrne       r6, [r1, #4]
    strh    lr, [r2], #2
    bne     dequant_loop
    ldmia   sp!, {r4-r9, pc}
    ENDP    ;|vp8_dequantize_b_loop_v6|
    END
--- a/vp8/common/arm/armv6/filter_v6.asm
+++ b/vp8/common/arm/armv6/filter_v6.asm
@@ -1,624 +0,0 @@
 ;
 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license
 ;  that can be found in the LICENSE file in the root of the source
 ;  tree. An additional intellectual property rights grant can be found
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
    EXPORT  |vp8_filter_block2d_first_pass_armv6|
    EXPORT  |vp8_filter_block2d_first_pass_16x16_armv6|
    EXPORT  |vp8_filter_block2d_first_pass_8x8_armv6|
    EXPORT  |vp8_filter_block2d_second_pass_armv6|
    EXPORT  |vp8_filter4_block2d_second_pass_armv6|
    EXPORT  |vp8_filter_block2d_first_pass_only_armv6|
    EXPORT  |vp8_filter_block2d_second_pass_only_armv6|
    AREA    |.text|, CODE, READONLY  ; name this block of code
 ;-------------------------------------
 ; r0    unsigned char *src_ptr
 ; r1    short         *output_ptr
 ; r2    unsigned int src_pixels_per_line
 ; r3    unsigned int output_width
 ; stack unsigned int output_height
 ; stack const short *vp8_filter
 ;-------------------------------------
 ; Filter the input and write the result to the output array. Applies the 6-tap FIR
 ; filter; each input sample is a 1-byte value and each output sample a 2-byte value.
 ; First (horizontal) pass of the 6-tap filter.
 ;   r0 = src_ptr, r1 = output_ptr, r2 = src_pixels_per_line,
 ;   r3 = output_width; output_height and the filter pointer are the
 ;   5th/6th arguments, read at [sp, #36] / [sp, #40] after the nine
 ;   registers (36 bytes) are pushed.
 ; The six taps are read as three words (r4, r5, r6), each holding two
 ; 16-bit coefficients for the dual-multiply instructions.
 ; Every inner iteration produces TWO output samples:
 ;   out = usat8((sum of 6 products + 0x40) >> 7)
 ; and stores each as a halfword, advancing the output pointer by
 ; r12 = 2*output_width + 16 bytes, so the result is written transposed.
 ; r7 is a combined counter: height in bits 16 and up, remaining
 ; sample pairs of the current row in the low byte.
 |vp8_filter_block2d_first_pass_armv6| PROC
    stmdb   sp!, {r4 - r11, lr}
    ldr     r11, [sp, #40]                  ; vp8_filter address
    ldr     r7, [sp, #36]                   ; output height
    sub     r2, r2, r3                      ; inside loop increments input array,
                                            ; so the height loop only needs to add
                                            ; r2 - width to the input pointer
    mov     r3, r3, lsl #1                  ; multiply width by 2 because using shorts
    add     r12, r3, #16                    ; square off the output
    sub     sp, sp, #4                      ; scratch slot for the dst pointer
    ldr     r4, [r11]                       ; load up packed filter coefficients
    ldr     r5, [r11, #4]
    ldr     r6, [r11, #8]
    str     r1, [sp]                        ; push destination to stack
    mov     r7, r7, lsl #16                 ; height is top part of counter
 ; six tap filter
 |height_loop_1st_6|
    ldrb    r8, [r0, #-2]                   ; load source data
    ldrb    r9, [r0, #-1]
    ldrb    r10, [r0], #2
    orr     r7, r7, r3, lsr #2              ; construct loop counter
 |width_loop_1st_6|
    ldrb    r11, [r0, #-1]
    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
    ldrb    r9, [r0]
    smuad   lr, lr, r4                      ; apply the filter
    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
    smuad   r8, r8, r4
    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
    smlad   lr, r10, r5, lr
    ldrb    r10, [r0, #1]
    smlad   r8, r11, r5, r8
    ldrb    r11, [r0, #2]
    sub     r7, r7, #1                      ; one sample pair done
    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
    smlad   lr, r9, r6, lr
    smlad   r11, r10, r6, r8
    ands    r10, r7, #0xff                  ; test loop counter
    add     lr, lr, #0x40                   ; round_shift_and_clamp
    ldrneb  r8, [r0, #-2]                   ; load data for next loop
    usat    lr, #8, lr, asr #7
    add     r11, r11, #0x40
    ldrneb  r9, [r0, #-1]
    usat    r11, #8, r11, asr #7
    strh    lr, [r1], r12                   ; result is transposed and stored, which
                                            ; will make second pass filtering easier.
    ldrneb  r10, [r0], #2
    strh    r11, [r1], r12
    bne     width_loop_1st_6
    ldr     r1, [sp]                        ; load and update dst address
    subs    r7, r7, #0x10000                ; one input row done
    add     r0, r0, r2                      ; move to next input line
    add     r1, r1, #2                      ; move over to next column
    str     r1, [sp]
    bne     height_loop_1st_6
    add     sp, sp, #4                      ; release the dst scratch slot
    ldmia   sp!, {r4 - r11, pc}
    ENDP
 ; --------------------------
 ; 16x16 version
 ; -----------------------------
 ; First (horizontal) pass of the 6-tap filter, variant that additionally
 ; issues a PLD for the following source row (at src + stride + 18) on
 ; entry and after each row.
 ;   r0 = src_ptr, r1 = output_ptr, r2 = src_pixels_per_line,
 ;   r3 = output_width; output_height and the filter pointer are read
 ;   from [sp, #36] / [sp, #40] after nine registers are pushed.
 ; Each inner iteration produces two samples,
 ;   out = usat8((sum of 6 products + 0x40) >> 7),
 ; stored as halfwords r12 = 2*output_width + 16 bytes apart (transposed
 ; output). r7 packs the height (bits 16 and up) and the per-row pair
 ; count (low byte).
 ; NOTE(review): the per-row prefetch offset (#34) assumes
 ; output_width == 16, since r2 then holds stride - width - confirm.
 |vp8_filter_block2d_first_pass_16x16_armv6| PROC
    stmdb   sp!, {r4 - r11, lr}
    ldr     r11, [sp, #40]                  ; vp8_filter address
    ldr     r7, [sp, #36]                   ; output height
    add     r4, r2, #18                     ; preload next row
    pld     [r0, r4]
    sub     r2, r2, r3                      ; inside loop increments input array,
                                            ; so the height loop only needs to add
                                            ; r2 - width to the input pointer
    mov     r3, r3, lsl #1                  ; multiply width by 2 because using shorts
    add     r12, r3, #16                    ; square off the output
    sub     sp, sp, #4                      ; scratch slot for the dst pointer
    ldr     r4, [r11]                       ; load up packed filter coefficients
    ldr     r5, [r11, #4]
    ldr     r6, [r11, #8]
    str     r1, [sp]                        ; push destination to stack
    mov     r7, r7, lsl #16                 ; height is top part of counter
 ; six tap filter
 |height_loop_1st_16_6|
    ldrb    r8, [r0, #-2]                   ; load source data
    ldrb    r9, [r0, #-1]
    ldrb    r10, [r0], #2
    orr     r7, r7, r3, lsr #2              ; construct loop counter
 |width_loop_1st_16_6|
    ldrb    r11, [r0, #-1]
    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
    ldrb    r9, [r0]
    smuad   lr, lr, r4                      ; apply the filter
    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
    smuad   r8, r8, r4
    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
    smlad   lr, r10, r5, lr
    ldrb    r10, [r0, #1]
    smlad   r8, r11, r5, r8
    ldrb    r11, [r0, #2]
    sub     r7, r7, #1                      ; one sample pair done
    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
    smlad   lr, r9, r6, lr
    smlad   r11, r10, r6, r8
    ands    r10, r7, #0xff                  ; test loop counter
    add     lr, lr, #0x40                   ; round_shift_and_clamp
    ldrneb  r8, [r0, #-2]                   ; load data for next loop
    usat    lr, #8, lr, asr #7
    add     r11, r11, #0x40
    ldrneb  r9, [r0, #-1]
    usat    r11, #8, r11, asr #7
    strh    lr, [r1], r12                   ; result is transposed and stored, which
                                            ; will make second pass filtering easier.
    ldrneb  r10, [r0], #2
    strh    r11, [r1], r12
    bne     width_loop_1st_16_6
    ldr     r1, [sp]                        ; load and update dst address
    subs    r7, r7, #0x10000                ; one input row done; flags are
                                            ; consumed by the bne below
    add     r0, r0, r2                      ; move to next input line
    add     r11, r2, #34                    ; adding back block width(=16)
    pld     [r0, r11]                       ; preload next row
    add     r1, r1, #2                      ; move over to next column
    str     r1, [sp]
    bne     height_loop_1st_16_6
    add     sp, sp, #4                      ; release the dst scratch slot
    ldmia   sp!, {r4 - r11, pc}
    ENDP
 ; --------------------------
 ; 8x8 version
 ; -----------------------------
 ; First (horizontal) pass of the 6-tap filter, variant that additionally
 ; issues a PLD for the following source row (at src + stride + 10) on
 ; entry and after each row.
 ;   r0 = src_ptr, r1 = output_ptr, r2 = src_pixels_per_line,
 ;   r3 = output_width; output_height and the filter pointer are read
 ;   from [sp, #36] / [sp, #40] after nine registers are pushed.
 ; Each inner iteration produces two samples,
 ;   out = usat8((sum of 6 products + 0x40) >> 7),
 ; stored as halfwords r12 = 2*output_width + 16 bytes apart (transposed
 ; output). r7 packs the height (bits 16 and up) and the per-row pair
 ; count (low byte).
 ; NOTE(review): the per-row prefetch offset (#18) assumes
 ; output_width == 8, since r2 then holds stride - width - confirm.
 |vp8_filter_block2d_first_pass_8x8_armv6| PROC
    stmdb   sp!, {r4 - r11, lr}
    ldr     r11, [sp, #40]                  ; vp8_filter address
    ldr     r7, [sp, #36]                   ; output height
    add     r4, r2, #10                     ; preload next row
    pld     [r0, r4]
    sub     r2, r2, r3                      ; inside loop increments input array,
                                            ; so the height loop only needs to add
                                            ; r2 - width to the input pointer
    mov     r3, r3, lsl #1                  ; multiply width by 2 because using shorts
    add     r12, r3, #16                    ; square off the output
    sub     sp, sp, #4                      ; scratch slot for the dst pointer
    ldr     r4, [r11]                       ; load up packed filter coefficients
    ldr     r5, [r11, #4]
    ldr     r6, [r11, #8]
    str     r1, [sp]                        ; push destination to stack
    mov     r7, r7, lsl #16                 ; height is top part of counter
 ; six tap filter
 |height_loop_1st_8_6|
    ldrb    r8, [r0, #-2]                   ; load source data
    ldrb    r9, [r0, #-1]
    ldrb    r10, [r0], #2
    orr     r7, r7, r3, lsr #2              ; construct loop counter
 |width_loop_1st_8_6|
    ldrb    r11, [r0, #-1]
    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
    ldrb    r9, [r0]
    smuad   lr, lr, r4                      ; apply the filter
    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
    smuad   r8, r8, r4
    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
    smlad   lr, r10, r5, lr
    ldrb    r10, [r0, #1]
    smlad   r8, r11, r5, r8
    ldrb    r11, [r0, #2]
    sub     r7, r7, #1                      ; one sample pair done
    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
    smlad   lr, r9, r6, lr
    smlad   r11, r10, r6, r8
    ands    r10, r7, #0xff                  ; test loop counter
    add     lr, lr, #0x40                   ; round_shift_and_clamp
    ldrneb  r8, [r0, #-2]                   ; load data for next loop
    usat    lr, #8, lr, asr #7
    add     r11, r11, #0x40
    ldrneb  r9, [r0, #-1]
    usat    r11, #8, r11, asr #7
    strh    lr, [r1], r12                   ; result is transposed and stored, which
                                            ; will make second pass filtering easier.
    ldrneb  r10, [r0], #2
    strh    r11, [r1], r12
    bne     width_loop_1st_8_6
    ldr     r1, [sp]                        ; load and update dst address
    subs    r7, r7, #0x10000                ; one input row done; flags are
                                            ; consumed by the bne below
    add     r0, r0, r2                      ; move to next input line
    add     r11, r2, #18                    ; adding back block width(=8)
    pld     [r0, r11]                       ; preload next row
    add     r1, r1, #2                      ; move over to next column
    str     r1, [sp]
    bne     height_loop_1st_8_6
    add     sp, sp, #4                      ; release the dst scratch slot
    ldmia   sp!, {r4 - r11, pc}
    ENDP
 ;---------------------------------
 ; r0    short         *src_ptr,
 ; r1    unsigned char *output_ptr,
 ; r2    unsigned int output_pitch,
 ; r3    unsigned int cnt,
 ; stack const short *vp8_filter
 ;---------------------------------
 ; Second pass of the 6-tap filter over 16-bit intermediate data.
 ;   r0 = src_ptr (shorts), r1 = output_ptr (bytes), r2 = output_pitch,
 ;   r3 = cnt; the filter pointer is the 5th argument, read at
 ;   [sp, #36] after nine registers are pushed.
 ; Besides the three coefficient words (r4, r5, r6), two re-paired
 ; copies (r12, r11) are built so that the second of each pair of
 ; outputs, whose window is shifted by one sample, can also be computed
 ; with dual multiplies.
 ; Each inner iteration produces two bytes,
 ;   out = usat8((sum + 0x40) >> 7),
 ; stored output_pitch apart; after each outer iteration the output
 ; pointer restarts one byte further along.
 ; r7 packs the outer count (cnt, bits 16 and up) and the inner count
 ; (cnt / 2, low byte).
 |vp8_filter_block2d_second_pass_armv6| PROC
    stmdb   sp!, {r4 - r11, lr}
    ldr     r11, [sp, #36]                  ; vp8_filter address
    sub     sp, sp, #4                      ; scratch slot for the dst pointer
    mov     r7, r3, lsl #16                 ; height is top part of counter
    str     r1, [sp]                        ; push destination to stack
    ldr     r4, [r11]                       ; load up packed filter coefficients
    ldr     r5, [r11, #4]
    ldr     r6, [r11, #8]
    pkhbt   r12, r5, r4                     ; pack the filter differently
    pkhbt   r11, r6, r5
    sub     r0, r0, #4                      ; offset input buffer
 |height_loop_2nd|
    ldr     r8, [r0]                        ; load the data
    ldr     r9, [r0, #4]
    orr     r7, r7, r3, lsr #1              ; loop counter
 |width_loop_2nd|
    smuad   lr, r4, r8                      ; apply filter
    sub     r7, r7, #1                      ; one output pair done
    smulbt  r8, r4, r8
    ldr     r10, [r0, #8]
    smlad   lr, r5, r9, lr
    smladx  r8, r12, r9, r8
    ldrh    r9, [r0, #12]
    smlad   lr, r6, r10, lr
    smladx  r8, r11, r10, r8
    add     r0, r0, #4                      ; advance two input samples
    smlatb  r10, r6, r9, r8
    add     lr, lr, #0x40                   ; round_shift_and_clamp
    ands    r8, r7, #0xff                   ; inner count exhausted?
    usat    lr, #8, lr, asr #7
    add     r10, r10, #0x40
    strb    lr, [r1], r2                    ; the result is transposed back and stored
    usat    r10, #8, r10, asr #7
    ldrne   r8, [r0]                        ; load data for next loop
    ldrne   r9, [r0, #4]
    strb    r10, [r1], r2
    bne     width_loop_2nd
    ldr     r1, [sp]                        ; update dst for next loop
    subs    r7, r7, #0x10000                ; one outer iteration done
    add     r0, r0, #16                     ; update src for next loop
    add     r1, r1, #1
    str     r1, [sp]
    bne     height_loop_2nd
    add     sp, sp, #4                      ; release the dst scratch slot
    ldmia   sp!, {r4 - r11, pc}
    ENDP
 ;---------------------------------
 ; r0    short         *src_ptr,
 ; r1    unsigned char *output_ptr,
 ; r2    unsigned int output_pitch,
 ; r3    unsigned int cnt,
 ; stack const short *vp8_filter
 ;---------------------------------
 ; Second-pass filter variant that uses only the two re-paired
 ; coefficient words (r12, r11), i.e. four of the six taps.
 ;   r0 = src_ptr (shorts), r1 = output_ptr (bytes), r2 = output_pitch,
 ;   r3 = cnt; the filter pointer is the 5th argument at [sp, #36].
 ; NOTE(review): the first and last taps never enter the sums, so this
 ; presumably is only called with filters whose outer taps are zero -
 ; confirm against callers.
 ; The rounding term 0x40 is kept in r4 and fed in as the accumulator of
 ; the first multiply-accumulate, so no separate add is needed. Each
 ; inner iteration produces two bytes, out = usat8(sum >> 7), stored
 ; output_pitch apart.
 ; No stack scratch is used: lr holds output_ptr + cnt and the next
 ; start address is derived as lr - (remaining outer iterations).
 |vp8_filter4_block2d_second_pass_armv6| PROC
    stmdb   sp!, {r4 - r11, lr}
    ldr     r11, [sp, #36]                  ; vp8_filter address
    mov     r7, r3, lsl #16                 ; height is top part of counter
    ldr     r4, [r11]                       ; load up packed filter coefficients
    add     lr, r1, r3                      ; save final destination pointer
    ldr     r5, [r11, #4]
    ldr     r6, [r11, #8]
    pkhbt   r12, r5, r4                     ; pack the filter differently
    pkhbt   r11, r6, r5
    mov     r4, #0x40                       ; rounding factor (for smlad{x})
 |height_loop_2nd_4|
    ldrd    r8, r9, [r0, #-4]               ; load the data
    orr     r7, r7, r3, lsr #1              ; loop counter
 |width_loop_2nd_4|
    ldr     r10, [r0, #4]!                  ; pre-increment: r0 += 4, then load
    smladx  r6, r9, r12, r4                 ; apply filter
    pkhbt   r8, r9, r8
    smlad   r5, r8, r12, r4
    pkhbt   r8, r10, r9
    smladx  r6, r10, r11, r6
    sub     r7, r7, #1                      ; one output pair done
    smlad   r5, r8, r11, r5
    mov     r8, r9                          ; shift the data for the next loop
    mov     r9, r10
    usat    r6, #8, r6, asr #7              ; shift and clamp
    usat    r5, #8, r5, asr #7
    strb    r5, [r1], r2                    ; the result is transposed back and stored
    tst     r7, #0xff                       ; inner count exhausted?
    strb    r6, [r1], r2
    bne     width_loop_2nd_4
    subs    r7, r7, #0x10000                ; one outer iteration done; flags
                                            ; are consumed by the bne below
    add     r0, r0, #16                     ; update src for next loop
    sub     r1, lr, r7, lsr #16             ; update dst for next loop
    bne     height_loop_2nd_4
    ldmia   sp!, {r4 - r11, pc}
    ENDP
 ;------------------------------------
 ; Six-tap filter applied along each source row, writing 8-bit results
 ; directly to the output. Each result is rounded with 0x40, shifted right
 ; by 7 and clamped to 0..255.
 ;
 ; r0    unsigned char *src_ptr
 ; r1    unsigned char *output_ptr,
 ; r2    unsigned int src_pixels_per_line
 ; r3    unsigned int cnt,
 ; stack unsigned int output_pitch,
 ; stack const short *vp8_filter
 ;
 ; cnt (r3) is used both as the number of pixels per row and as the number
 ; of rows processed.
 ;------------------------------------
 |vp8_filter_block2d_first_pass_only_armv6| PROC
    stmdb   sp!, {r4 - r11, lr}
    add     r7, r2, r3                      ; preload next row
    add     r7, r7, #2
    pld     [r0, r7]
    ; nine registers (36 bytes) were pushed, so the stack arguments are at
    ; [sp, #36] and [sp, #40]
    ldr     r4, [sp, #36]                   ; output pitch
    ldr     r11, [sp, #40]                  ; HFilter address
    sub     sp, sp, #8                      ; two local words: [sp] and [sp, #4]
    mov     r7, r3                          ; r7 = row counter
    sub     r2, r2, r3                      ; inside loop increments input array,
                                            ; so the height loop only needs to add
                                            ; r2 - width to the input pointer
    sub     r4, r4, r3                      ; same adjustment for the output pointer
    str     r4, [sp]                        ; save modified output pitch
    str     r2, [sp, #4]                    ; save modified source stride
    mov     r2, #0x40                       ; rounding constant, used as smlad accumulator
    ldr     r4, [r11]                       ; load up packed filter coefficients
    ldr     r5, [r11, #4]
    ldr     r6, [r11, #8]
 ; six tap filter
 |height_loop_1st_only_6|
    ldrb    r8, [r0, #-2]                   ; load data
    ldrb    r9, [r0, #-1]
    ldrb    r10, [r0], #2
    ; each inner iteration produces two output pixels
    mov     r12, r3, lsr #1                 ; loop counter
 |width_loop_1st_only_6|
    ldrb    r11, [r0, #-1]
    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
    ldrb    r9, [r0]
 ;;  smuad   lr, lr, r4
    smlad   lr, lr, r4, r2                  ; first tap pair, rounding folded in via r2
    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
 ;;  smuad   r8, r8, r4
    smlad   r8, r8, r4, r2
    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
    smlad   lr, r10, r5, lr
    ldrb    r10, [r0, #1]
    smlad   r8, r11, r5, r8
    ldrb    r11, [r0, #2]
    ; these flags drive the conditional loads and the bne below; nothing in
    ; between modifies them
    subs    r12, r12, #1
    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
    smlad   lr, r9, r6, lr
    smlad   r10, r10, r6, r8
 ;;  add     lr, lr, #0x40                   ; round_shift_and_clamp
    ldrneb  r8, [r0, #-2]                   ; load data for next loop
    usat    lr, #8, lr, asr #7
 ;;  add     r10, r10, #0x40
    strb    lr, [r1], #1                    ; store the result
    usat    r10, #8, r10, asr #7
    ldrneb  r9, [r0, #-1]
    strb    r10, [r1], #1
    ldrneb  r10, [r0], #2
    bne     width_loop_1st_only_6
    ldr     lr, [sp]                        ; load back modified output pitch
    ldr     r12, [sp, #4]                   ; load back modified source stride
    subs    r7, r7, #1                      ; one row done; flags consumed by the final bne
    add     r0, r0, r12                     ; update src for next loop
    ; r12 + r3 is the original src_pixels_per_line
    add     r11, r12, r3                    ; preload next row
    add     r11, r11, #2
    pld     [r0, r11]
    add     r1, r1, lr                      ; update dst for next loop
    bne     height_loop_1st_only_6
    add     sp, sp, #8                      ; release the two local words
    ldmia   sp!, {r4 - r11, pc}
    ENDP  ; |vp8_filter_block2d_first_pass_only_armv6|
 ;------------------------------------
 ; Six-tap filter applied down each source column (source bytes are read
 ; with a stride of src_pixels_per_line), writing 8-bit results with a
 ; stride of output_pitch. Each result is rounded with 0x40, shifted right
 ; by 7 and clamped to 0..255.
 ;
 ; r0    unsigned char *src_ptr,
 ; r1    unsigned char *output_ptr,
 ; r2    unsigned int src_pixels_per_line
 ; r3    unsigned int cnt,
 ; stack unsigned int output_pitch,
 ; stack const short *vp8_filter
 ;
 ; r7 is a packed loop counter: bits 16 and up hold the remaining columns,
 ; the low byte holds the remaining outputs in the current column.
 ;------------------------------------
 |vp8_filter_block2d_second_pass_only_armv6| PROC
    stmdb   sp!, {r4 - r11, lr}
    ; stack arguments are read before sp is lowered again, so the offsets
    ; only account for the nine pushed registers (36 bytes)
    ldr     r11, [sp, #40]                  ; VFilter address
    ldr     r12, [sp, #36]                  ; output pitch
    mov     r7, r3, lsl #16                 ; height is top part of counter
    sub     r0, r0, r2, lsl #1              ; need 6 elements for filtering, 2 before, 3 after
    sub     sp, sp, #8                      ; two local words: [sp] = src, [sp, #4] = dst
    ldr     r4, [r11]                       ; load up packed filter coefficients
    ldr     r5, [r11, #4]
    ldr     r6, [r11, #8]
    str     r0, [sp]                        ; save r0 to stack
    str     r1, [sp, #4]                    ; save dst to stack
 ; six tap filter
 |width_loop_2nd_only_6|
    ldrb    r8, [r0], r2                    ; load data
    orr     r7, r7, r3                      ; loop counter
    ldrb    r9, [r0], r2
    ldrb    r10, [r0], r2
 |height_loop_2nd_only_6|
    ; filter the first column in this inner loop, then move to the next column.
    ldrb    r11, [r0], r2
    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
    ldrb    r9, [r0], r2
    smuad   lr, lr, r4
    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
    smuad   r8, r8, r4
    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
    smlad   lr, r10, r5, lr
    ldrb    r10, [r0], r2
    smlad   r8, r11, r5, r8
    ldrb    r11, [r0]
    sub     r7, r7, #2                      ; two outputs are produced per iteration
    sub     r0, r0, r2, lsl #2              ; step the source pointer back four rows
    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
    smlad   lr, r9, r6, lr
    smlad   r10, r10, r6, r8
    ; sets the flags used by the conditional loads and the bne below;
    ; r9 is only a scratch destination here and is reloaded when looping
    ands    r9, r7, #0xff
    add     lr, lr, #0x40                   ; round_shift_and_clamp
    ldrneb  r8, [r0], r2                    ; load data for next loop
    usat    lr, #8, lr, asr #7
    add     r10, r10, #0x40
    strb    lr, [r1], r12                   ; store the result for the column
    usat    r10, #8, r10, asr #7
    ldrneb  r9, [r0], r2
    strb    r10, [r1], r12
    ldrneb  r10, [r0], r2
    bne     height_loop_2nd_only_6
    ldr     r0, [sp]                        ; reload column start pointers
    ldr     r1, [sp, #4]
    subs    r7, r7, #0x10000                ; one column done; flags consumed by the final bne
    add     r0, r0, #1                      ; move to filter next column
    str     r0, [sp]
    add     r1, r1, #1
    str     r1, [sp, #4]
    bne     width_loop_2nd_only_6
    add     sp, sp, #8                      ; release the two local words
    ldmia   sp!, {r4 - r11, pc}
    ENDP  ; |vp8_filter_block2d_second_pass_only_armv6|
    END
--- a/vp8/common/arm/armv6/idct_blk_v6.c
+++ b/vp8/common/arm/armv6/idct_blk_v6.c
@@ -1,115 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
 /*
  * Walks a 4x4 grid of 4x4 blocks and reconstructs each one in place in dst.
  *
  * q      coefficients, 16 shorts per block, 64 per grid row; modified in place
  * dq     dequantization factors, passed through to the per-block routines
  * dst    top-left pixel of the grid; blocks are 4 pixels apart horizontally
  *        and 4*stride bytes apart vertically
  * stride byte distance between two pixel rows of dst
  * eobs   one entry per block, 4 per grid row (presumably the end-of-block
  *        position of each block -- confirm against the caller)
  *
  * For every block:
  *   eobs > 1   the full dequant + IDCT + add routine is called;
  *   eobs == 1  only the DC term is used, and the first two coefficients are
  *              cleared afterwards;
  *   otherwise  the block is left untouched.
  */
 void vp8_dequant_idct_add_y_block_v6(short *q, short *dq,
                                      unsigned char *dst,
                                      int stride, char *eobs)
 {
    int row;
    int col;

    for (row = 0; row < 4; row++)
    {
        for (col = 0; col < 4; col++)
        {
            short *coeffs = q + 16 * col;
            unsigned char *out = dst + 4 * col;

            if (eobs[col] > 1)
            {
                vp8_dequant_idct_add_v6 (coeffs, dq, out, stride);
            }
            else if (eobs[col] == 1)
            {
                vp8_dc_only_idct_add_v6 (coeffs[0]*dq[0], out, stride,
                                         out, stride);
                /* Clear the same four bytes the old ((int *)q)[0] = 0 did,
                 * but through the array's own type: no strict-aliasing
                 * violation and no 4-byte alignment requirement. */
                coeffs[0] = 0;
                coeffs[1] = 0;
            }
        }

        q    += 64;
        dst  += 4*stride;
        eobs += 4;
    }
 }
 /*
  * Reconstructs two 2x2 grids of 4x4 blocks in place: first the grid at dstu,
  * then the grid at dstv. q and eobs are consumed continuously across both
  * grids (the dstv grid uses the coefficients and eobs entries that follow
  * those of the dstu grid).
  *
  * q      coefficients, 16 shorts per block, 32 per grid row; modified in place
  * dq     dequantization factors, passed through to the per-block routines
  * dstu   top-left pixel of the first grid
  * dstv   top-left pixel of the second grid
  * stride byte distance between two pixel rows (same for both grids)
  * eobs   one entry per block, 2 per grid row (presumably the end-of-block
  *        position of each block -- confirm against the caller)
  *
  * For every block:
  *   eobs > 1   the full dequant + IDCT + add routine is called;
  *   eobs == 1  only the DC term is used, and the first two coefficients are
  *              cleared afterwards;
  *   otherwise  the block is left untouched.
  */
 void vp8_dequant_idct_add_uv_block_v6(short *q, short *dq,
                                       unsigned char *dstu,
                                       unsigned char *dstv,
                                       int stride, char *eobs)
 {
    unsigned char *planes[2];
    int plane;
    int row;
    int col;

    planes[0] = dstu;
    planes[1] = dstv;

    for (plane = 0; plane < 2; plane++)
    {
        unsigned char *dst = planes[plane];

        for (row = 0; row < 2; row++)
        {
            for (col = 0; col < 2; col++)
            {
                short *coeffs = q + 16 * col;
                unsigned char *out = dst + 4 * col;

                if (eobs[col] > 1)
                {
                    vp8_dequant_idct_add_v6 (coeffs, dq, out, stride);
                }
                else if (eobs[col] == 1)
                {
                    vp8_dc_only_idct_add_v6 (coeffs[0]*dq[0], out, stride,
                                             out, stride);
                    /* Clear the same four bytes the old ((int *)q)[0] = 0
                     * did, but through the array's own type: no
                     * strict-aliasing violation and no 4-byte alignment
                     * requirement. */
                    coeffs[0] = 0;
                    coeffs[1] = 0;
                }
            }

            q    += 32;
            dst  += 4*stride;
            eobs += 2;
        }
    }
 }
--- a/vp8/common/arm/armv6/idct_v6.asm
+++ b/vp8/common/arm/armv6/idct_v6.asm
@@ -1,202 +0,0 @@
 ;
 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license
 ;  that can be found in the LICENSE file in the root of the source
 ;  tree. An additional intellectual property rights grant can be found
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
    EXPORT  |vp8_short_idct4x4llm_v6_dual|
    AREA    |.text|, CODE, READONLY
 ; void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch,
 ;                             unsigned char *dst, int stride)
 ; r0    short* input
 ; r1    unsigned char* pred
 ; r2    int pitch
 ; r3    unsigned char* dst
 ; sp    int stride
 ;
 ; 4x4 inverse transform working on packed pairs of 16-bit values.
 ;   loop1_dual: first pass, two columns per iteration (2 iterations); the
 ;               intermediate results are written back into the input buffer.
 ;   loop2_dual: second pass, two rows per iteration (2 iterations); adds 4,
 ;               shifts right by 3, adds the predictor bytes, clamps to
 ;               0..255 and stores four pixels per row to dst.
 ; The input buffer is therefore overwritten by this routine.
 ;
 ; Bit 31 of r5 is the iteration flag for both loops: "subs r5, r5, #1<<31"
 ; leaves carry set the first time (bit was 1, no borrow) and clear the
 ; second time, which also restores the bit for the next loop. The multiplies
 ; only read the bottom halfword of r5, so they are unaffected by the flag.
 |vp8_short_idct4x4llm_v6_dual| PROC
    stmdb   sp!, {r4-r11, lr}
    sub     sp, sp, #4              ; one local word: saved input pointer
    mov     r4, #0x00008A00         ; sin
    orr     r4, r4, #0x0000008C     ; sinpi8sqrt2
    mov     r5, #0x00004E00         ; cos
    orr     r5, r5, #0x0000007B     ; cospi8sqrt2minus1
    orr     r5, r5, #1<<31          ; loop counter on top bit
 loop1_dual
    ldr     r6, [r0, #(4*2)]        ; i5 | i4
    ldr     r12, [r0, #(12*2)]      ; i13|i12
    ldr     r14, [r0, #(8*2)]       ; i9 | i8
    smulbt  r9, r5, r6              ; (ip[5] * cospi8sqrt2minus1) >> 16
    smulbb  r7, r5, r6              ; (ip[4] * cospi8sqrt2minus1) >> 16
    smulwt  r10, r4, r6             ; (ip[5] * sinpi8sqrt2) >> 16
    smulwb  r8, r4, r6              ; (ip[4] * sinpi8sqrt2) >> 16
    smulbt  r11, r5, r12            ; (ip[13] * cospi8sqrt2minus1) >> 16
    pkhtb   r7, r9, r7, asr #16     ; 5c | 4c
    pkhbt   r8, r8, r10, lsl #16    ; 5s | 4s
    uadd16  r6, r6, r7              ; 5c+5 | 4c+4
    smulwt  r7, r4, r12             ; (ip[13] * sinpi8sqrt2) >> 16
    smulbb  r9, r5, r12             ; (ip[12] * cospi8sqrt2minus1) >> 16
    smulwb  r10, r4, r12            ; (ip[12] * sinpi8sqrt2) >> 16
    ; carry from this subs is tested by the bcs at the end of the loop;
    ; the instructions in between do not change the carry flag
    subs    r5, r5, #1<<31          ; i--
    pkhtb   r9, r11, r9, asr #16    ; 13c | 12c
    ldr     r11, [r0]               ; i1 | i0
    pkhbt   r10, r10, r7, lsl #16   ; 13s | 12s
    uadd16  r7, r12, r9             ; 13c+13 | 12c+12
    usub16  r7, r8, r7              ; c
    uadd16  r6, r6, r10             ; d
    uadd16  r10, r11, r14           ; a
    usub16  r8, r11, r14            ; b
    uadd16  r9, r10, r6             ; a+d
    usub16  r10, r10, r6            ; a-d
    uadd16  r6, r8, r7              ; b+c
    usub16  r7, r8, r7              ; b-c
    ; use input buffer to store intermediate results
    str      r6, [r0, #(4*2)]       ; o5 | o4
    str      r7, [r0, #(8*2)]       ; o9 | o8
    str      r10,[r0, #(12*2)]      ; o13|o12
    str      r9, [r0], #4           ; o1 | o0
    bcs loop1_dual
    ; two iterations advanced r0 by 4 bytes each
    sub     r0, r0, #8              ; reset input/output
    str     r0, [sp]                ; r0 is reused as a data register below
 loop2_dual
    ldr     r6, [r0, #(4*2)]        ; i5 | i4
    ldr     r12,[r0, #(2*2)]        ; i3 | i2
    ldr     r14,[r0, #(6*2)]        ; i7 | i6
    ldr     r0, [r0, #(0*2)]        ; i1 | i0
    smulbt  r9, r5, r6              ; (ip[5] * cospi8sqrt2minus1) >> 16
    smulbt  r7, r5, r0              ; (ip[1] * cospi8sqrt2minus1) >> 16
    smulwt  r10, r4, r6             ; (ip[5] * sinpi8sqrt2) >> 16
    smulwt  r8, r4, r0              ; (ip[1] * sinpi8sqrt2) >> 16
    pkhbt   r11, r6, r0, lsl #16    ; i0 | i4
    pkhtb   r7, r7, r9, asr #16     ; 1c | 5c
    pkhtb   r0, r0, r6, asr #16     ; i1 | i5
    pkhbt   r8, r10, r8, lsl #16    ; 1s | 5s = temp1
    uadd16  r0, r7, r0              ; 1c+1 | 5c+5 = temp2
    pkhbt   r9, r14, r12, lsl #16   ; i2 | i6
    uadd16  r10, r11, r9            ; a
    usub16  r9, r11, r9             ; b
    pkhtb   r6, r12, r14, asr #16   ; i3 | i7
    ; carry from this subs is tested by the bcs at the end of the loop;
    ; the instructions in between do not change the carry flag
    subs    r5, r5, #1<<31          ; i--
    smulbt  r7, r5, r6              ; (ip[3] * cospi8sqrt2minus1) >> 16
    smulwt  r11, r4, r6             ; (ip[3] * sinpi8sqrt2) >> 16
    smulbb  r12, r5, r6             ; (ip[7] * cospi8sqrt2minus1) >> 16
    smulwb  r14, r4, r6             ; (ip[7] * sinpi8sqrt2) >> 16
    pkhtb   r7, r7, r12, asr #16    ; 3c | 7c
    pkhbt   r11, r14, r11, lsl #16  ; 3s | 7s = temp1
    uadd16  r6, r7, r6              ; 3c+3 | 7c+7 = temp2
    usub16  r12, r8, r6             ; c (o1 | o5)
    uadd16  r6, r11, r0             ; d (o3 | o7)
    uadd16  r7, r10, r6             ; a+d
    mov     r8, #4                  ; set up 4's
    orr     r8, r8, #0x40000        ; 4|4
    usub16  r6, r10, r6             ; a-d
    uadd16  r6, r6, r8              ; a-d+4, 3|7
    uadd16  r7, r7, r8              ; a+d+4, 0|4
    uadd16  r10, r9, r12            ; b+c
    usub16  r0, r9, r12             ; b-c
    uadd16  r10, r10, r8            ; b+c+4, 1|5
    uadd16  r8, r0, r8              ; b-c+4, 2|6
    ; 4 bytes of local storage + 36 bytes of saved registers = offset 40
    ldr     lr, [sp, #40]           ; dst stride
    ldrb    r0, [r1]                ; pred p0
    ldrb    r11, [r1, #1]           ; pred p1
    ldrb    r12, [r1, #2]           ; pred p2
    ; "asr #19" takes the top halfword and applies the final >> 3 in one step
    add     r0, r0, r7, asr #19     ; p0 + o0
    add     r11, r11, r10, asr #19  ; p1 + o1
    add     r12, r12, r8, asr #19   ; p2 + o2
    usat    r0, #8, r0              ; d0 = clip8(p0 + o0)
    usat    r11, #8, r11            ; d1 = clip8(p1 + o1)
    usat    r12, #8, r12            ; d2 = clip8(p2 + o2)
    add     r0, r0, r11, lsl #8     ; |--|--|d1|d0|
    ldrb    r11, [r1, #3]           ; pred p3
    add     r0, r0, r12, lsl #16    ; |--|d2|d1|d0|
    add     r11, r11, r6, asr #19   ; p3 + o3
    ; sign-extend the bottom halfwords so "asr #3" below yields the second row
    sxth    r7, r7                  ;
    sxth    r10, r10                ;
    usat    r11, #8, r11            ; d3 = clip8(p3 + o3)
    sxth    r8, r8                  ;
    sxth    r6, r6                  ;
    add     r0, r0, r11, lsl #24    ; |d3|d2|d1|d0|
    ldrb    r12, [r1, r2]!          ; pred p4
    str     r0, [r3], lr            ; store first row, advance dst by stride
    ldrb    r11, [r1, #1]           ; pred p5
    add     r12, r12, r7, asr #3    ; p4 + o4
    add     r11, r11, r10, asr #3   ; p5 + o5
    usat    r12, #8, r12            ; d4 = clip8(p4 + o4)
    usat    r11, #8, r11            ; d5 = clip8(p5 + o5)
    ldrb    r7, [r1, #2]            ; pred p6
    ldrb    r10, [r1, #3]           ; pred p7
    add     r12, r12, r11, lsl #8   ; |--|--|d5|d4|
    add     r7, r7, r8, asr #3      ; p6 + o6
    add     r10, r10, r6, asr #3    ; p7 + o7
    ldr     r0, [sp]                ; load input pointer
    usat    r7, #8, r7              ; d6 = clip8(p6 + o6)
    usat    r10, #8, r10            ; d7 = clip8(p7 + o7)
    add     r12, r12, r7, lsl #16   ; |--|d6|d5|d4|
    add     r12, r12, r10, lsl #24  ; |d7|d6|d5|d4|
    str     r12, [r3], lr           ; store second row, advance dst by stride
    ; the saved pointer is never updated; base + 16 is only needed once,
    ; for the second (last) iteration
    add     r0, r0, #16
    add     r1, r1, r2              ; pred + pitch
    bcs loop2_dual
    add     sp, sp, #4              ; release the saved-input-pointer slot
    ldmia   sp!, {r4 - r11, pc}
    ENDP
    END
--- a/vp8/common/arm/armv6/iwalsh_v6.asm
+++ b/vp8/common/arm/armv6/iwalsh_v6.asm
@@ -1,136 +0,0 @@
 ;
 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license
 ;  that can be found in the LICENSE file in the root of the source
 ;  tree. An additional intellectual property rights grant can be found
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
    EXPORT |vp8_short_inv_walsh4x4_v6|
    ARM
    REQUIRE8
    PRESERVE8
    AREA    |.text|, CODE, READONLY  ; name this block of code
 ;short vp8_short_inv_walsh4x4_v6(short *input, short *mb_dqcoeff)
 ;
 ; r0    short *input        16 coefficients, read as eight packed words
 ; r1    short *mb_dqcoeff   output; written as 16 halfwords spaced 32 bytes
 ;                           apart (r1 is post-incremented by 32 per store)
 ;
 ; Two butterfly passes using saturating packed 16-bit arithmetic; every
 ; result has 3 added and is shifted right by 3 before being stored.
 |vp8_short_inv_walsh4x4_v6| PROC
    stmdb       sp!, {r4 - r12, lr}
    ldr         r2, [r0, #0]         ; [1  |  0]
    ldr         r3, [r0, #4]         ; [3  |  2]
    ldr         r4, [r0, #8]         ; [5  |  4]
    ldr         r5, [r0, #12]        ; [7  |  6]
    ldr         r6, [r0, #16]        ; [9  |  8]
    ldr         r7, [r0, #20]        ; [11 | 10]
    ldr         r8, [r0, #24]        ; [13 | 12]
    ldr         r9, [r0, #28]        ; [15 | 14]
    ; first pass, columns 0 and 1
    qadd16      r10, r2, r8          ; a1 [1+13  |  0+12]
    qadd16      r11, r4, r6          ; b1 [5+9   |  4+8]
    qsub16      r12, r4, r6          ; c1 [5-9   |  4-8]
    qsub16      lr, r2, r8           ; d1 [1-13  |  0-12]
    qadd16      r2, r10, r11         ; a1 + b1 [1  |  0]
    qadd16      r4, r12, lr          ; c1 + d1 [5  |  4]
    qsub16      r6, r10, r11         ; a1 - b1 [9  |  8]
    qsub16      r8, lr, r12          ; d1 - c1 [13 | 12]
    ; first pass, columns 2 and 3
    qadd16      r10, r3, r9          ; a1 [3+15  |  2+14]
    qadd16      r11, r5, r7          ; b1 [7+11  |  6+10]
    qsub16      r12, r5, r7          ; c1 [7-11  |  6-10]
    qsub16      lr, r3, r9           ; d1 [3-15  |  2-14]
    qadd16      r3, r10, r11         ; a1 + b1 [3  |  2]
    qadd16      r5, r12, lr          ; c1 + d1 [7  |  6]
    qsub16      r7, r10, r11         ; a1 - b1 [11 | 10]
    qsub16      r9, lr, r12          ; d1 - c1 [15 | 14]
    ; first transform complete
    ; second pass, rows 0 and 1
    qsubaddx    r10, r2, r3          ; [c1|a1] [1-2   |   0+3]
    qaddsubx    r11, r2, r3          ; [b1|d1] [1+2   |   0-3]
    qsubaddx    r12, r4, r5          ; [c1|a1] [5-6   |   4+7]
    qaddsubx    lr, r4, r5           ; [b1|d1] [5+6   |   4-7]
    qaddsubx    r2, r10, r11         ; [b2|c2] [c1+d1 | a1-b1]
    qaddsubx    r3, r11, r10         ; [a2|d2] [b1+a1 | d1-c1]
    ; r10 keeps the packed rounding constant 3|3 for the rest of the routine
    ldr         r10, c0x00030003
    qaddsubx    r4, r12, lr          ; [b2|c2] [c1+d1 | a1-b1]
    qaddsubx    r5, lr, r12          ; [a2|d2] [b1+a1 | d1-c1]
    qadd16      r2, r2, r10          ; [b2+3|c2+3]
    qadd16      r3, r3, r10          ; [a2+3|d2+3]
    qadd16      r4, r4, r10          ; [b2+3|c2+3]
    qadd16      r5, r5, r10          ; [a2+3|d2+3]
    ; top halfwords: "asr #19" extracts them and applies the >> 3 in one step;
    ; bottom halfwords: sign-extend with sxth first, then "asr #3"
    asr         r12, r3, #19         ; [0]
    strh        r12, [r1], #32
    asr         lr, r2, #19          ; [1]
    strh        lr, [r1], #32
    sxth        r2, r2
    sxth        r3, r3
    asr         r2, r2, #3           ; [2]
    strh        r2, [r1], #32
    asr         r3, r3, #3           ; [3]
    strh        r3, [r1], #32
    asr         r12, r5, #19         ; [4]
    strh        r12, [r1], #32
    asr         lr, r4, #19          ; [5]
    strh        lr, [r1], #32
    sxth        r4, r4
    sxth        r5, r5
    asr         r4, r4, #3           ; [6]
    strh        r4, [r1], #32
    asr         r5, r5, #3           ; [7]
    strh        r5, [r1], #32
    ; second pass, rows 2 and 3
    qsubaddx    r2, r6, r7           ; [c1|a1] [9-10  |  8+11]
    qaddsubx    r3, r6, r7           ; [b1|d1] [9+10  |  8-11]
    qsubaddx    r4, r8, r9           ; [c1|a1] [13-14 | 12+15]
    qaddsubx    r5, r8, r9           ; [b1|d1] [13+14 | 12-15]
    qaddsubx    r6, r2, r3           ; [b2|c2] [c1+d1 | a1-b1]
    qaddsubx    r7, r3, r2           ; [a2|d2] [b1+a1 | d1-c1]
    qaddsubx    r8, r4, r5           ; [b2|c2] [c1+d1 | a1-b1]
    qaddsubx    r9, r5, r4           ; [a2|d2] [b1+a1 | d1-c1]
    qadd16      r6, r6, r10          ; [b2+3|c2+3]
    qadd16      r7, r7, r10          ; [a2+3|d2+3]
    qadd16      r8, r8, r10          ; [b2+3|c2+3]
    qadd16      r9, r9, r10          ; [a2+3|d2+3]
    asr         r12, r7, #19         ; [8]
    strh        r12, [r1], #32
    asr         lr, r6, #19          ; [9]
    strh        lr, [r1], #32
    sxth        r6, r6
    sxth        r7, r7
    asr         r6, r6, #3           ; [10]
    strh        r6, [r1], #32
    asr         r7, r7, #3           ; [11]
    strh        r7, [r1], #32
    asr         r12, r9, #19         ; [12]
    strh        r12, [r1], #32
    asr         lr, r8, #19          ; [13]
    strh        lr, [r1], #32
    sxth        r8, r8
    sxth        r9, r9
    asr         r8, r8, #3           ; [14]
    strh        r8, [r1], #32
    asr         r9, r9, #3           ; [15]
    strh        r9, [r1], #32
    ldmia       sp!, {r4 - r12, pc}
    ENDP        ; |vp8_short_inv_walsh4x4_v6|
 ; Constant Pool
 c0x00030003 DCD 0x00030003
    END
--- a/vp8/common/arm/armv6/loopfilter_v6.asm
+++ b/vp8/common/arm/armv6/loopfilter_v6.asm
--- a/vp8/common/arm/armv6/simpleloopfilter_v6.asm
+++ b/vp8/common/arm/armv6/simpleloopfilter_v6.asm
@@ -1,286 +0,0 @@
 ;
 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license
 ;  that can be found in the LICENSE file in the root of the source
 ;  tree. An additional intellectual property rights grant can be found
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
    EXPORT |vp8_loop_filter_simple_horizontal_edge_armv6|
    EXPORT |vp8_loop_filter_simple_vertical_edge_armv6|
    AREA    |.text|, CODE, READONLY  ; name this block of code
    MACRO
    TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
    ; Transposes a 4x4 matrix of bytes held in four registers.
    ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
    ; NOTE: the input registers $a0-$a3 are used as scratch and are
    ;       overwritten; the outputs must be different registers.
    ; a0: 03 02 01 00
    ; a1: 13 12 11 10
    ; a2: 23 22 21 20
    ; a3: 33 32 31 30
    ;     b3 b2 b1 b0
    ; step 1: gather the even-numbered bytes of each row
    uxtb16      $b1, $a1                    ; xx 12 xx 10
    uxtb16      $b0, $a0                    ; xx 02 xx 00
    uxtb16      $b3, $a3                    ; xx 32 xx 30
    uxtb16      $b2, $a2                    ; xx 22 xx 20
    orr         $b1, $b0, $b1, lsl #8       ; 12 02 10 00
    orr         $b3, $b2, $b3, lsl #8       ; 32 22 30 20
    ; step 2: gather the odd-numbered bytes of each row (rotate by 8 first)
    uxtb16      $a1, $a1, ror #8            ; xx 13 xx 11
    uxtb16      $a3, $a3, ror #8            ; xx 33 xx 31
    uxtb16      $a0, $a0, ror #8            ; xx 03 xx 01
    uxtb16      $a2, $a2, ror #8            ; xx 23 xx 21
    orr         $a0, $a0, $a1, lsl #8       ; 13 03 11 01
    orr         $a2, $a2, $a3, lsl #8       ; 33 23 31 21
    ; step 3: combine halfwords into the transposed rows
    pkhtb       $b2, $b3, $b1, asr #16      ; 32 22 12 02   -- p1
    pkhbt       $b0, $b1, $b3, lsl #16      ; 30 20 10 00   -- p3
    pkhtb       $b3, $a2, $a0, asr #16      ; 33 23 13 03   -- p0
    pkhbt       $b1, $a0, $a2, lsl #16      ; 31 21 11 01   -- p2
    MEND
 src         RN  r0
 pstep       RN  r1
 ;r0     unsigned char *src_ptr,
 ;r1     int src_pixel_step,
 ;r2     const char *blimit
 ;
 ; Simple loop filter across the edge between the row at src - pstep (p0)
 ; and the row at src (q0). Four adjacent pixels are processed per
 ; iteration as packed bytes, for 4 iterations (16 pixels in total).
 ; Only p0 and q0 are modified; p1 and q1 are read only.
 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
 |vp8_loop_filter_simple_horizontal_edge_armv6| PROC
 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    stmdb       sp!, {r4 - r11, lr}
    ldrb        r12, [r2]                   ; blimit
    ldr         r3, [src, -pstep, lsl #1]   ; p1
    ldr         r4, [src, -pstep]           ; p0
    ldr         r5, [src]                   ; q0
    ldr         r6, [src, pstep]            ; q1
    ; replicate the blimit byte into all four byte lanes of r12
    orr         r12, r12, r12, lsl #8       ; blimit
    ldr         r2, c0x80808080
    orr         r12, r12, r12, lsl #16      ; blimit
    mov         r9, #4                      ; double the count. we're doing 4 at a time
    mov         lr, #0                      ; need 0 in a couple places
 |simple_hnext8|
    ; vp8_simple_filter_mask()
    ; abs() per byte is built from two saturating subtractions OR-ed together
    uqsub8      r7, r3, r6                  ; p1 - q1
    uqsub8      r8, r6, r3                  ; q1 - p1
    uqsub8      r10, r4, r5                 ; p0 - q0
    uqsub8      r11, r5, r4                 ; q0 - p0
    orr         r8, r8, r7                  ; abs(p1 - q1)
    orr         r10, r10, r11               ; abs(p0 - q0)
    uqadd8      r10, r10, r10               ; abs(p0 - q0) * 2
    uhadd8      r8, r8, lr                  ; abs(p1 - q1) >> 1
    uqadd8      r10, r10, r8                ; abs(p0 - q0)*2 + abs(p1 - q1)/2
    mvn         r8, #0
    usub8       r10, r12, r10               ; compare to flimit. usub8 sets GE flags
    ; per byte lane: 0xFF where blimit >= the computed value, else 0x00
    sel         r10, r8, lr                 ; filter mask: F or 0
    cmp         r10, #0
    beq         simple_hskip_filter         ; skip filtering if all masks are 0x00
    ;vp8_simple_filter()
    eor         r3, r3, r2                  ; p1 offset to convert to a signed value
    eor         r6, r6, r2                  ; q1 offset to convert to a signed value
    eor         r4, r4, r2                  ; p0 offset to convert to a signed value
    eor         r5, r5, r2                  ; q0 offset to convert to a signed value
    qsub8       r3, r3, r6                  ; vp8_filter = p1 - q1
    qsub8       r6, r5, r4                  ; q0 - p0
    qadd8       r3, r3, r6                  ; += q0 - p0
    ldr         r7, c0x04040404
    qadd8       r3, r3, r6                  ; += q0 - p0
    ldr         r8, c0x03030303
    qadd8       r3, r3, r6                  ; vp8_filter = p1-q1 + 3*(q0-p0))
    ;STALL
    and         r3, r3, r10                 ; vp8_filter &= mask
    qadd8       r7 , r3 , r7                ; Filter1 = vp8_filter + 4
    qadd8       r8 , r3 , r8                ; Filter2 = vp8_filter + 3
    ; a signed halving add with zero (lr == 0) is a per-byte arithmetic
    ; shift right by one; three of them give >> 3
    shadd8      r7 , r7 , lr
    shadd8      r8 , r8 , lr
    shadd8      r7 , r7 , lr
    shadd8      r8 , r8 , lr
    shadd8      r7 , r7 , lr                ; Filter1 >>= 3
    shadd8      r8 , r8 , lr                ; Filter2 >>= 3
    qsub8       r5 ,r5, r7                  ; u = q0 - Filter1
    qadd8       r4, r4, r8                  ; u = p0 + Filter2
    eor         r5, r5, r2                  ; *oq0 = u^0x80
    str         r5, [src]                   ; store oq0 result
    eor         r4, r4, r2                  ; *op0 = u^0x80
    str         r4, [src, -pstep]           ; store op0 result
 |simple_hskip_filter|
    ; the flags from this subs gate the conditional reloads and the bne
    subs        r9, r9, #1
    addne       src, src, #4                ; advance to the next 4 pixels
    ldrne       r3, [src, -pstep, lsl #1]   ; p1
    ldrne       r4, [src, -pstep]           ; p0
    ldrne       r5, [src]                   ; q0
    ldrne       r6, [src, pstep]            ; q1
    bne         simple_hnext8
    ldmia       sp!, {r4 - r11, pc}
    ENDP        ; |vp8_loop_filter_simple_horizontal_edge_armv6|
 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
 ; r0 (src)    unsigned char *src_ptr
 ; r1 (pstep)  int src_pixel_step
 ; r2          const char *blimit
 ;
 ; Simple loop filter across the edge between the byte at src - 1 (p0) and
 ; the byte at src (q0). Four rows are gathered per iteration (two bytes on
 ; either side of the edge from each row), transposed so the filter can work
 ; on packed bytes, and the filtered p0/q0 bytes are written back row by
 ; row. 4 iterations give 16 rows in total.
 |vp8_loop_filter_simple_vertical_edge_armv6| PROC
 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    stmdb       sp!, {r4 - r11, lr}
    ldrb        r12, [r2]                   ; r12: blimit
    ldr         r2, c0x80808080
    ; replicate the blimit byte into all four byte lanes of r12
    ; (completed by the "lsl #16" orr a few lines below)
    orr         r12, r12, r12, lsl #8
    ; load source data to r7, r8, r9, r10
    ; each row is read as two halfwords: bytes [src-2, src-1] and [src, src+1]
    ldrh        r3, [src, #-2]
    pld         [src, #23]                  ; preload for next block
    ldrh        r4, [src], pstep
    orr         r12, r12, r12, lsl #16
    ldrh        r5, [src, #-2]
    pld         [src, #23]
    ldrh        r6, [src], pstep
    pkhbt       r7, r3, r4, lsl #16
    ldrh        r3, [src, #-2]
    pld         [src, #23]
    ldrh        r4, [src], pstep
    pkhbt       r8, r5, r6, lsl #16
    ldrh        r5, [src, #-2]
    pld         [src, #23]
    ldrh        r6, [src], pstep
    mov         r11, #4                     ; double the count. we're doing 4 at a time
 |simple_vnext8|
    ; vp8_simple_filter_mask() function
    pkhbt       r9, r3, r4, lsl #16
    pkhbt       r10, r5, r6, lsl #16
    ;transpose r7, r8, r9, r10 to r3, r4, r5, r6
    TRANSPOSE_MATRIX r7, r8, r9, r10, r3, r4, r5, r6
    ; abs() per byte is built from two saturating subtractions OR-ed together
    uqsub8      r7, r3, r6                  ; p1 - q1
    uqsub8      r8, r6, r3                  ; q1 - p1
    uqsub8      r9, r4, r5                  ; p0 - q0
    uqsub8      r10, r5, r4                 ; q0 - p0
    orr         r7, r7, r8                  ; abs(p1 - q1)
    orr         r9, r9, r10                 ; abs(p0 - q0)
    mov         r8, #0
    uqadd8      r9, r9, r9                  ; abs(p0 - q0) * 2
    uhadd8      r7, r7, r8                  ; abs(p1 - q1) / 2
    uqadd8      r7, r7, r9                  ; abs(p0 - q0)*2 + abs(p1 - q1)/2
    mvn         r10, #0                     ; r10 == -1
    usub8       r7, r12, r7                 ; compare to flimit
    ; per byte lane: 0xFF where blimit >= the computed value, else 0x00
    sel         lr, r10, r8                 ; filter mask
    cmp         lr, #0
    beq         simple_vskip_filter         ; skip filtering
    ;vp8_simple_filter() function
    eor         r3, r3, r2                  ; p1 offset to convert to a signed value
    eor         r6, r6, r2                  ; q1 offset to convert to a signed value
    eor         r4, r4, r2                  ; p0 offset to convert to a signed value
    eor         r5, r5, r2                  ; q0 offset to convert to a signed value
    qsub8       r3, r3, r6                  ; vp8_filter = p1 - q1
    qsub8       r6, r5, r4                  ; q0 - p0
    qadd8       r3, r3, r6                  ; vp8_filter += q0 - p0
    ldr         r9, c0x03030303             ; r9 = 3
    qadd8       r3, r3, r6                  ; vp8_filter += q0 - p0
    ldr         r7, c0x04040404
    qadd8       r3, r3, r6                  ; vp8_filter = p1-q1 + 3*(q0-p0))
    ;STALL
    and         r3, r3, lr                  ; vp8_filter &= mask
    qadd8       r9 , r3 , r9                ; Filter2 = vp8_filter + 3
    qadd8       r3 , r3 , r7                ; Filter1 = vp8_filter + 4
    ; a signed halving add with zero (r8 == 0) is a per-byte arithmetic
    ; shift right by one; three of them give >> 3
    shadd8      r9 , r9 , r8
    shadd8      r3 , r3 , r8
    shadd8      r9 , r9 , r8
    shadd8      r3 , r3 , r8
    shadd8      r9 , r9 , r8                ; Filter2 >>= 3
    shadd8      r3 , r3 , r8                ; Filter1 >>= 3
    ;calculate output
    ; rewind src by the four rows just loaded; the four post-incremented
    ; stores below advance it again to where the skip path would leave it
    sub         src, src, pstep, lsl #2
    qadd8       r4, r4, r9                  ; u = p0 + Filter2
    qsub8       r5, r5, r3                  ; u = q0 - Filter1
    eor         r4, r4, r2                  ; *op0 = u^0x80
    eor         r5, r5, r2                  ; *oq0 = u^0x80
    ; one byte of r4 (p0) and r5 (q0) per row, lowest byte first
    strb        r4, [src, #-1]              ; store the result
    mov         r4, r4, lsr #8
    strb        r5, [src], pstep
    mov         r5, r5, lsr #8
    strb        r4, [src, #-1]
    mov         r4, r4, lsr #8
    strb        r5, [src], pstep
    mov         r5, r5, lsr #8
    strb        r4, [src, #-1]
    mov         r4, r4, lsr #8
    strb        r5, [src], pstep
    mov         r5, r5, lsr #8
    strb        r4, [src, #-1]
    strb        r5, [src], pstep
 |simple_vskip_filter|
    ; the flags from this subs gate the conditional loads and the bne; the
    ; pld and pkhbt instructions in between run unconditionally
    subs        r11, r11, #1
    ; load source data to r7, r8, r9, r10
    ldrneh      r3, [src, #-2]
    pld         [src, #23]                  ; preload for next block
    ldrneh      r4, [src], pstep
    ldrneh      r5, [src, #-2]
    pld         [src, #23]
    ldrneh      r6, [src], pstep
    pkhbt       r7, r3, r4, lsl #16
    ldrneh      r3, [src, #-2]
    pld         [src, #23]
    ldrneh      r4, [src], pstep
    pkhbt       r8, r5, r6, lsl #16
    ldrneh      r5, [src, #-2]
    pld         [src, #23]
    ldrneh      r6, [src], pstep
    bne         simple_vnext8
    ldmia       sp!, {r4 - r11, pc}
    ENDP        ; |vp8_loop_filter_simple_vertical_edge_armv6|
 ; Constant Pool
 c0x80808080 DCD     0x80808080
 c0x03030303 DCD     0x03030303
 c0x04040404 DCD     0x04040404
    END
--- a/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
+++ b/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
@@ -1,273 +0,0 @@
 ;
 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license
 ;  that can be found in the LICENSE file in the root of the source
 ;  tree. An additional intellectual property rights grant can be found
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
    EXPORT  |vp8_sixtap_predict8x4_armv6|
    AREA    |.text|, CODE, READONLY  ; name this block of code
 ;-------------------------------------
 ; r0    unsigned char *src_ptr,
 ; r1    int  src_pixels_per_line,
 ; r2    int  xoffset,
 ; r3    int  yoffset,
 ; stack unsigned char *dst_ptr,
 ; stack int  dst_pitch
 ;-------------------------------------
 ;note: The first pass stores its result transposed (8 lines x 9 columns) on the stack. Temporary stack size is 184 bytes.
 ;Each line is 20 bytes wide: 9 halfwords of data plus 2 bytes of padding to keep it 4-byte aligned. The second pass
 ;loads the data from the stack and transposes the result back while storing it.
 ; Six-tap sub-pixel prediction for a block 8 pixels wide and 4 rows high.
 ; Pass 1 filters 9 source rows horizontally into a transposed halfword buffer
 ; on the stack; pass 2 filters that buffer vertically and writes bytes to
 ; dst_ptr. A pass is replaced by a plain copy when its offset is zero
 ; (xoffset for pass 1, yoffset for pass 2).
 |vp8_sixtap_predict8x4_armv6| PROC
    stmdb       sp!, {r4 - r11, lr}
    str         r3, [sp, #-184]!            ;reserve space on stack for temporary storage, store yoffset
    cmp         r2, #0                      ;skip first_pass filter if xoffset=0
    add         lr, sp, #4                  ;point to temporary buffer
    beq         skip_firstpass_filter
 ;first-pass filter
    adr         r12, filter8_coeff
    sub         r0, r0, r1, lsl #1          ; src -= 2 * src_pixels_per_line
    add         r3, r1, #10                 ; preload next row
    pld         [r0, r3]
    add         r2, r12, r2, lsl #4         ;calculate filter location (16 bytes per table row)
    add         r0, r0, #3                  ;adjust src only for loading convenience
    ldr         r3, [r2]                    ; load up packed filter coefficients
    ldr         r4, [r2, #4]
    ldr         r5, [r2, #8]
    mov         r2, #0x90000                ; height=9 is top part of counter
    sub         r1, r1, #8                  ; the width loop advances src by 8, so add only the remainder per row
 |first_pass_hloop_v6|
    ldrb        r6, [r0, #-5]               ; load source data
    ldrb        r7, [r0, #-4]
    ldrb        r8, [r0, #-3]
    ldrb        r9, [r0, #-2]
    ldrb        r10, [r0, #-1]
    orr         r2, r2, #0x4                ; construct loop counter. width=8=4x2
    pkhbt       r6, r6, r7, lsl #16         ; r7 | r6
    pkhbt       r7, r7, r8, lsl #16         ; r8 | r7
    pkhbt       r8, r8, r9, lsl #16         ; r9 | r8
    pkhbt       r9, r9, r10, lsl #16        ; r10 | r9
 ; Each iteration of the width loop produces two adjacent outputs (r11, r12).
 |first_pass_wloop_v6|
    smuad       r11, r6, r3                 ; vp8_filter[0], vp8_filter[1]
    smuad       r12, r7, r3
    ldrb        r6, [r0], #1
    smlad       r11, r8, r4, r11            ; vp8_filter[2], vp8_filter[3]
    ldrb        r7, [r0], #1
    smlad       r12, r9, r4, r12
    pkhbt       r10, r10, r6, lsl #16       ; r10 | r9
    pkhbt       r6, r6, r7, lsl #16         ; r11 | r10
    smlad       r11, r10, r5, r11           ; vp8_filter[4], vp8_filter[5]
    smlad       r12, r6, r5, r12
    sub         r2, r2, #1
    add         r11, r11, #0x40             ; round_shift_and_clamp
    tst         r2, #0xff                   ; test loop counter
    usat        r11, #8, r11, asr #7
    add         r12, r12, #0x40
    strh        r11, [lr], #20              ; result is stored transposed: each output goes to the next 20-byte line
    usat        r12, #8, r12, asr #7
    strh        r12, [lr], #20
    ; when more outputs remain in this row, shuffle the packed pixel registers
    ; so the next iteration starts two pixels further along
    movne       r11, r6
    movne       r12, r7
    movne       r6, r8
    movne       r7, r9
    movne       r8, r10
    movne       r9, r11
    movne       r10, r12
    bne         first_pass_wloop_v6
    ;;add       r9, ppl, #30                ; attempt to load 2 adjacent cache lines
    ;;IF ARCHITECTURE=6
    ;pld        [src, ppl]
    ;;pld       [src, r9]
    ;;ENDIF
    subs        r2, r2, #0x10000            ; one source row done (row count lives in the top half of r2)
    sub         lr, lr, #158                ; back 8 lines (160 bytes), forward one halfword
    add         r0, r0, r1                  ; move to next input line
    add         r11, r1, #18                ; preload next row. adding back block width(=8), which is subtracted earlier
    pld         [r0, r11]
    bne         first_pass_hloop_v6
 ;second pass filter
 ; From here on sp itself is used as the read pointer into the temporary buffer.
 secondpass_filter
    ldr         r3, [sp], #4                ; load back yoffset
    ldr         r0, [sp, #216]              ; load dst address from stack 180+36
    ldr         r1, [sp, #220]              ; load dst stride from stack 180+40
    cmp         r3, #0
    beq         skip_secondpass_filter      ; yoffset=0: copy the buffer to dst without filtering
    adr         r12, filter8_coeff
    add         lr, r12, r3, lsl #4         ;calculate filter location
    mov         r2, #0x00080000             ; 8 buffer lines to process, counted in the top half of r2
    ldr         r3, [lr]                    ; load up packed filter coefficients
    ldr         r4, [lr, #4]
    ldr         r5, [lr, #8]
    pkhbt       r12, r4, r3                 ; pack the filter differently
    pkhbt       r11, r5, r4
 second_pass_hloop_v6
    ldr         r6, [sp]                    ; load the data
    ldr         r7, [sp, #4]
    orr         r2, r2, #2                  ; loop counter
 ; Each iteration produces two outputs (lr, r10) and stores them one dst row apart.
 second_pass_wloop_v6
    smuad       lr, r3, r6                  ; apply filter
    smulbt      r10, r3, r6
    ldr         r8, [sp, #8]
    smlad       lr, r4, r7, lr
    smladx      r10, r12, r7, r10
    ldrh        r9, [sp, #12]
    smlad       lr, r5, r8, lr
    smladx      r10, r11, r8, r10
    add         sp, sp, #4
    smlatb      r10, r5, r9, r10
    sub         r2, r2, #1
    add         lr, lr, #0x40               ; round_shift_and_clamp
    tst         r2, #0xff
    usat        lr, #8, lr, asr #7
    add         r10, r10, #0x40
    strb        lr, [r0], r1                ; the result is transposed back and stored
    usat        r10, #8, r10, asr #7
    strb        r10, [r0],r1
    movne       r6, r7
    movne       r7, r8
    bne         second_pass_wloop_v6
    subs        r2, r2, #0x10000
    add         sp, sp, #12                 ; update src for next loop (20-8)
    sub         r0, r0, r1, lsl #2          ; dst back up the 4 rows just written...
    add         r0, r0, #1                  ; ...and one column to the right
    bne         second_pass_hloop_v6
    add         sp, sp, #20                 ; release the remainder of the 184-byte reservation
    ldmia       sp!, {r4 - r11, pc}
 ;--------------------
 ; xoffset=0: copy 9 source rows of 8 pixels into the temporary buffer as
 ; halfwords, using the same transposed layout as the filtered path.
 skip_firstpass_filter
    sub         r0, r0, r1, lsl #1          ; src -= 2 * src_pixels_per_line
    sub         r1, r1, #8                  ; 8 bytes are consumed per row by the post-incremented loads
    mov         r2, #9                      ; 9 source rows
 skip_firstpass_hloop
    ldrb        r4, [r0], #1                ; load data
    subs        r2, r2, #1
    ldrb        r5, [r0], #1
    strh        r4, [lr], #20               ; store it to immediate buffer
    ldrb        r6, [r0], #1                ; load data
    strh        r5, [lr], #20
    ldrb        r7, [r0], #1
    strh        r6, [lr], #20
    ldrb        r8, [r0], #1
    strh        r7, [lr], #20
    ldrb        r9, [r0], #1
    strh        r8, [lr], #20
    ldrb        r10, [r0], #1
    strh        r9, [lr], #20
    ldrb        r11, [r0], #1
    strh        r10, [lr], #20
    add         r0, r0, r1                  ; move to next input line
    strh        r11, [lr], #20
    sub         lr, lr, #158                ; move over to next column
    bne         skip_firstpass_hloop
    b           secondpass_filter
 ;--------------------
 ; yoffset=0: write the low byte of four consecutive halfwords from each of
 ; the 8 buffer lines to dst, one dst row apart.
 skip_secondpass_filter
    mov         r2, #8
    add         sp, sp, #4                  ;start from src[0] instead of src[-2]
 skip_secondpass_hloop
    ldr         r6, [sp], #4
    subs        r2, r2, #1
    ldr         r8, [sp], #4
    mov         r7, r6, lsr #16             ; unpack
    strb        r6, [r0], r1
    mov         r9, r8, lsr #16
    strb        r7, [r0], r1
    add         sp, sp, #12                 ; 20-8
    strb        r8, [r0], r1
    strb        r9, [r0], r1
    sub         r0, r0, r1, lsl #2
    add         r0, r0, #1
    bne         skip_secondpass_hloop
    add         sp, sp, #16                 ; 180 - (160 +4)
    ldmia       sp!, {r4 - r11, pc}
    ENDP
 ;-----------------
 ;Packed six-tap filter table, one 16-byte row per sub-pixel offset (0..7).
 ;Each word holds two 16-bit taps, with the lower-indexed tap in the low halfword:
 ;word 0 = taps 0,1; word 1 = taps 2,3; word 2 = taps 4,5; word 3 is zero padding.
 ;Data address: filter8_coeff, filter8_coeff+4, filter8_coeff+8 ...
 filter8_coeff
    DCD     0x00000000,     0x00000080,     0x00000000,     0x00000000
    DCD     0xfffa0000,     0x000c007b,     0x0000ffff,     0x00000000
    DCD     0xfff50002,     0x0024006c,     0x0001fff8,     0x00000000
    DCD     0xfff70000,     0x0032005d,     0x0000fffa,     0x00000000
    DCD     0xfff00003,     0x004d004d,     0x0003fff0,     0x00000000
    DCD     0xfffa0000,     0x005d0032,     0x0000fff7,     0x00000000
    DCD     0xfff80001,     0x006c0024,     0x0002fff5,     0x00000000
    DCD     0xffff0000,     0x007b000c,     0x0000fffa,     0x00000000
    ;The same taps in unpacked form (tap 0 .. tap 5 per row), kept for reference:
    ;DCD        0,  0,  128,    0,   0,  0
    ;DCD        0, -6,  123,   12,  -1,  0
    ;DCD        2, -11, 108,   36,  -8,  1
    ;DCD        0, -9,   93,   50,  -6,  0
    ;DCD        3, -16,  77,   77, -16,  3
    ;DCD        0, -6,   50,   93,  -9,  0
    ;DCD        1, -8,   36,  108, -11,  2
    ;DCD        0, -1,   12,  123,  -6,  0
    END
--- a/vp8/common/arm/bilinearfilter_arm.c
+++ b/vp8/common/arm/bilinearfilter_arm.c
@@ -1,113 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
 #include <math.h>
 #include "vp8/common/filter.h"
 #include "bilinearfilter_arm.h"
 /* Two-pass bilinear filter: a horizontal pass into a 16-bit scratch buffer,
  * followed by a vertical pass from that buffer into dst_ptr. */
 void vp8_filter_block2d_bil_armv6(unsigned char *src_ptr,
                                   unsigned char *dst_ptr,
                                   unsigned int src_pitch,
                                   unsigned int dst_pitch,
                                   const short *HFilter,
                                   const short *VFilter,
                                   int Width,
                                   int Height)
 {
    /* Scratch storage holding the output of the horizontal pass. */
    unsigned short horiz_out[36 * 16];
    /* The horizontal pass is asked for one row more than the output height. */
    vp8_filter_block2d_bil_first_pass_armv6(src_ptr, horiz_out, src_pitch,
                                            Height + 1, Width, HFilter);
    /* The vertical pass turns the scratch rows into the final block. */
    vp8_filter_block2d_bil_second_pass_armv6(horiz_out, dst_ptr, dst_pitch,
                                             Height, Width, VFilter);
 }
 /* Bilinear prediction with Width = 4 and Height = 4; xoffset and yoffset
  * select the horizontal and vertical rows of vp8_bilinear_filters. */
 void vp8_bilinear_predict4x4_armv6(unsigned char *src_ptr,
                                    int src_pixels_per_line,
                                    int xoffset,
                                    int yoffset,
                                    unsigned char *dst_ptr,
                                    int dst_pitch)
 {
    const short *const horiz_taps = vp8_bilinear_filters[xoffset];
    const short *const vert_taps = vp8_bilinear_filters[yoffset];
    vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch,
                                 horiz_taps, vert_taps, 4, 4);
 }
 /* Bilinear prediction with Width = 8 and Height = 8; xoffset and yoffset
  * select the horizontal and vertical rows of vp8_bilinear_filters. */
 void vp8_bilinear_predict8x8_armv6(unsigned char *src_ptr,
                                    int src_pixels_per_line,
                                    int xoffset,
                                    int yoffset,
                                    unsigned char *dst_ptr,
                                    int dst_pitch)
 {
    const short *const horiz_taps = vp8_bilinear_filters[xoffset];
    const short *const vert_taps = vp8_bilinear_filters[yoffset];
    vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch,
                                 horiz_taps, vert_taps, 8, 8);
 }
 /* Bilinear prediction with Width = 8 and Height = 4; xoffset and yoffset
  * select the horizontal and vertical rows of vp8_bilinear_filters. */
 void vp8_bilinear_predict8x4_armv6(unsigned char *src_ptr,
                                    int src_pixels_per_line,
                                    int xoffset,
                                    int yoffset,
                                    unsigned char *dst_ptr,
                                    int dst_pitch)
 {
    const short *const horiz_taps = vp8_bilinear_filters[xoffset];
    const short *const vert_taps = vp8_bilinear_filters[yoffset];
    vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch,
                                 horiz_taps, vert_taps, 8, 4);
 }
 /* Bilinear prediction with Width = 16 and Height = 16; xoffset and yoffset
  * select the horizontal and vertical rows of vp8_bilinear_filters. */
 void vp8_bilinear_predict16x16_armv6(unsigned char *src_ptr,
                                      int src_pixels_per_line,
                                      int xoffset,
                                      int yoffset,
                                      unsigned char *dst_ptr,
                                      int dst_pitch)
 {
    const short *const horiz_taps = vp8_bilinear_filters[xoffset];
    const short *const vert_taps = vp8_bilinear_filters[yoffset];
    vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch,
                                 horiz_taps, vert_taps, 16, 16);
 }
--- a/vp8/common/arm/bilinearfilter_arm.h
+++ b/vp8/common/arm/bilinearfilter_arm.h
@@ -1,43 +0,0 @@
 /*
 *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #ifndef VP8_COMMON_ARM_BILINEARFILTER_ARM_H_
 #define VP8_COMMON_ARM_BILINEARFILTER_ARM_H_
 #ifdef __cplusplus
 extern "C" {
 #endif
 /* First pass of the two-pass bilinear filter. Declared here only; the
  * implementation is not part of this header. Reads 8-bit samples from
  * src_ptr and writes 16-bit values to dst_ptr.
  * NOTE(review): presumably the horizontal pass - the C caller passes it the
  * horizontal taps and asks for Height + 1 rows; confirm against the assembly. */
 extern void vp8_filter_block2d_bil_first_pass_armv6
 (
    const unsigned char  *src_ptr,
    unsigned short       *dst_ptr,
    unsigned int          src_pitch,
    unsigned int          height,
    unsigned int          width,
    const short          *vp8_filter
 );
 /* Second pass of the two-pass bilinear filter. Declared here only; the
  * implementation is not part of this header. Reads the 16-bit values
  * produced by the first pass from src_ptr and writes 8-bit samples to
  * dst_ptr using a row pitch of dst_pitch.
  * NOTE(review): presumably the vertical pass - the C caller passes it the
  * vertical taps; confirm against the assembly. */
 extern void vp8_filter_block2d_bil_second_pass_armv6
 (
    const unsigned short *src_ptr,
    unsigned char        *dst_ptr,
    int                   dst_pitch,
    unsigned int          height,
    unsigned int          width,
    const short         *vp8_filter
 );
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 #endif  // VP8_COMMON_ARM_BILINEARFILTER_ARM_H_
--- a/vp8/common/arm/dequantize_arm.c
+++ b/vp8/common/arm/dequantize_arm.c
@@ -1,25 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "vpx_config.h"
 #include "vp8/common/blockd.h"
 #if HAVE_MEDIA
 extern void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
 /* Thin wrapper: hands the block's quantized coefficients (qcoeff), the
  * dequantization constants DQC and the output buffer (dqcoeff) to the
  * assembly loop. */
 void vp8_dequantize_b_v6(BLOCKD *d, short *DQC)
 {
    vp8_dequantize_b_loop_v6(d->qcoeff, DQC, d->dqcoeff);
 }
 #endif
--- a/vp8/common/arm/filter_arm.c
+++ b/vp8/common/arm/filter_arm.c
@@ -1,221 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
 #include <math.h>
 #include "vp8/common/filter.h"
 #include "vpx_ports/mem.h"
 /* Prototypes of the filter kernels used by the vp8_sixtap_predict*_armv6
  * wrappers below. Only the declarations are visible in this file. */
 /* First pass writing 16-bit intermediates; used by the 4x4 predictor. */
 extern void vp8_filter_block2d_first_pass_armv6
 (
    unsigned char *src_ptr,
    short         *output_ptr,
    unsigned int src_pixels_per_line,
    unsigned int output_width,
    unsigned int output_height,
    const short *vp8_filter
 );
 // 8x8: first-pass variant called by the 8x8 predictor
 extern void vp8_filter_block2d_first_pass_8x8_armv6
 (
    unsigned char *src_ptr,
    short         *output_ptr,
    unsigned int src_pixels_per_line,
    unsigned int output_width,
    unsigned int output_height,
    const short *vp8_filter
 );
 // 16x16: first-pass variant called by the 16x16 predictor
 extern void vp8_filter_block2d_first_pass_16x16_armv6
 (
    unsigned char *src_ptr,
    short         *output_ptr,
    unsigned int src_pixels_per_line,
    unsigned int output_width,
    unsigned int output_height,
    const short *vp8_filter
 );
 /* Second pass from the 16-bit intermediates to 8-bit output; the wrappers
  * call it when yoffset is even (their 6-tap vertical path). */
 extern void vp8_filter_block2d_second_pass_armv6
 (
    short         *src_ptr,
    unsigned char *output_ptr,
    unsigned int output_pitch,
    unsigned int cnt,
    const short *vp8_filter
 );
 /* Second-pass variant the wrappers call when yoffset is odd (their 4-tap
  * vertical path). */
 extern void vp8_filter4_block2d_second_pass_armv6
 (
    short         *src_ptr,
    unsigned char *output_ptr,
    unsigned int output_pitch,
    unsigned int cnt,
    const short *vp8_filter
 );
 /* Single-pass kernel from 8-bit source to 8-bit output; the wrappers call it
  * with the horizontal taps when yoffset is zero. */
 extern void vp8_filter_block2d_first_pass_only_armv6
 (
    unsigned char *src_ptr,
    unsigned char *output_ptr,
    unsigned int src_pixels_per_line,
    unsigned int cnt,
    unsigned int output_pitch,
    const short *vp8_filter
 );
 /* Single-pass kernel from 8-bit source to 8-bit output; the wrappers call it
  * with the vertical taps when xoffset is zero. */
 extern void vp8_filter_block2d_second_pass_only_armv6
 (
    unsigned char *src_ptr,
    unsigned char *output_ptr,
    unsigned int src_pixels_per_line,
    unsigned int cnt,
    unsigned int output_pitch,
    const short *vp8_filter
 );
 #if HAVE_MEDIA
 void vp8_sixtap_predict4x4_armv6
 (
    unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    unsigned char *dst_ptr,
    int  dst_pitch
 )
 {
    const short  *HFilter;
    const short  *VFilter;
    DECLARE_ALIGNED(4, short, FData[12*4]); /* Temp data buffer used in filtering */
    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
    /* Vfilter is null. First pass only */
    if (xoffset && !yoffset)
    {
        /*vp8_filter_block2d_first_pass_armv6 ( src_ptr, FData+2, src_pixels_per_line, 4, 4, HFilter );
        vp8_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, VFilter );*/
        vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, HFilter);
    }
    /* Hfilter is null. Second pass only */
    else if (!xoffset && yoffset)
    {
        vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, VFilter);
    }
    else
    {
        /* Vfilter is a 4 tap filter */
        if (yoffset & 0x1)
        {
            vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 4, 7, HFilter);
            vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
        }
        /* Vfilter is 6 tap filter */
        else
        {
            vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 4, 9, HFilter);
            vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
        }
    }
 }
 void vp8_sixtap_predict8x8_armv6
 (
    unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    unsigned char *dst_ptr,
    int  dst_pitch
 )
 {
    const short  *HFilter;
    const short  *VFilter;
    DECLARE_ALIGNED(4, short, FData[16*8]); /* Temp data buffer used in filtering */
    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
    if (xoffset && !yoffset)
    {
        vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter);
    }
    /* Hfilter is null. Second pass only */
    else if (!xoffset && yoffset)
    {
        vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter);
    }
    else
    {
        if (yoffset & 0x1)
        {
            vp8_filter_block2d_first_pass_8x8_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 8, 11, HFilter);
            vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
        }
        else
        {
            vp8_filter_block2d_first_pass_8x8_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 8, 13, HFilter);
            vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
        }
    }
 }
 void vp8_sixtap_predict16x16_armv6
 (
    unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    unsigned char *dst_ptr,
    int  dst_pitch
 )
 {
    const short  *HFilter;
    const short  *VFilter;
    DECLARE_ALIGNED(4, short, FData[24*16]);    /* Temp data buffer used in filtering */
    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
    if (xoffset && !yoffset)
    {
        vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, HFilter);
    }
    /* Hfilter is null. Second pass only */
    else if (!xoffset && yoffset)
    {
        vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, VFilter);
    }
    else
    {
        if (yoffset & 0x1)
        {
            vp8_filter_block2d_first_pass_16x16_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 16, 19, HFilter);
            vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
        }
        else
        {
            vp8_filter_block2d_first_pass_16x16_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 16, 21, HFilter);
            vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
        }
    }
 }
 #endif
--- a/vp8/common/arm/loopfilter_arm.c
+++ b/vp8/common/arm/loopfilter_arm.c
@@ -1,181 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
 #include "vp8/common/loopfilter.h"
 #include "vp8/common/onyxc_int.h"
 /* Expands to the common prototype shared by the ARMv6 edge filters: the
  * three thresholds are passed by pointer, followed by a count argument. */
 #define prototype_loopfilter(sym) \
    void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
             const unsigned char *limit, const unsigned char *thresh, int count)
 #if HAVE_MEDIA
 /* ARMv6 edge filters, declared via the macro above. */
 extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6);
 extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6);
 extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6);
 extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6);
 #endif
 #if HAVE_NEON
 /* NEON edge filters take the thresholds by value. The luma (y) variant
  * works on a single plane pointer; the chroma (uv) variant receives the u
  * and v plane pointers in one call. */
 typedef void loopfilter_y_neon(unsigned char *src, int pitch,
        unsigned char blimit, unsigned char limit, unsigned char thresh);
 typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
        unsigned char blimit, unsigned char limit, unsigned char thresh,
        unsigned char *v);
 extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon;
 extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon;
 extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon;
 extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon;
 extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
 extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;
 extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon;
 extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon;
 #endif
 #if HAVE_MEDIA
 /* ARMV6/MEDIA loopfilter functions*/
 /* Horizontal MB filtering: runs the macroblock horizontal-edge filter on
  * the luma plane (count 2) and on each chroma plane that is present
  * (count 1). */
 void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr,
                                unsigned char *u_ptr,
                                unsigned char *v_ptr,
                                int y_stride,
                                int uv_stride,
                                loop_filter_info *lfi)
 {
    vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride,
                                            lfi->mblim, lfi->lim, lfi->hev_thr, 2);
    /* Chroma planes are optional; a null pointer skips that plane. */
    if (u_ptr)
    {
        vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride,
                                                lfi->mblim, lfi->lim, lfi->hev_thr, 1);
    }
    if (v_ptr)
    {
        vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride,
                                                lfi->mblim, lfi->lim, lfi->hev_thr, 1);
    }
 }
 /* Vertical MB filtering: runs the macroblock vertical-edge filter on the
  * luma plane (count 2) and on each chroma plane that is present (count 1). */
 void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr,
                                unsigned char *u_ptr,
                                unsigned char *v_ptr,
                                int y_stride,
                                int uv_stride,
                                loop_filter_info *lfi)
 {
    vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride,
                                          lfi->mblim, lfi->lim, lfi->hev_thr, 2);
    /* Chroma planes are optional; a null pointer skips that plane. */
    if (u_ptr)
    {
        vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride,
                                              lfi->mblim, lfi->lim, lfi->hev_thr, 1);
    }
    if (v_ptr)
    {
        vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride,
                                              lfi->mblim, lfi->lim, lfi->hev_thr, 1);
    }
 }
 /* Horizontal B filtering: filters the luma edges 4, 8 and 12 rows below
  * y_ptr, then the edge 4 rows below each chroma plane that is present. */
 void vp8_loop_filter_bh_armv6(unsigned char *y_ptr,
                               unsigned char *u_ptr,
                               unsigned char *v_ptr,
                               int y_stride,
                               int uv_stride,
                               loop_filter_info *lfi)
 {
    int row;
    for (row = 4; row <= 12; row += 4)
    {
        vp8_loop_filter_horizontal_edge_armv6(y_ptr + row * y_stride, y_stride,
                                              lfi->blim, lfi->lim, lfi->hev_thr, 2);
    }
    /* Chroma planes are optional; a null pointer skips that plane. */
    if (u_ptr)
    {
        vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride,
                                              lfi->blim, lfi->lim, lfi->hev_thr, 1);
    }
    if (v_ptr)
    {
        vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride,
                                              lfi->blim, lfi->lim, lfi->hev_thr, 1);
    }
 }
 /* Simple horizontal B filtering: applies the simple horizontal-edge filter
  * to the luma edges 4, 8 and 12 rows below y_ptr. */
 void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride,
                                const unsigned char *blimit)
 {
    int row;
    for (row = 4; row <= 12; row += 4)
    {
        vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + row * y_stride, y_stride, blimit);
    }
 }
 /* Vertical B filtering: filters the luma edges 4, 8 and 12 bytes to the
  * right of y_ptr, then the edge 4 bytes to the right of each chroma plane
  * that is present. */
 void vp8_loop_filter_bv_armv6(unsigned char *y_ptr,
                               unsigned char *u_ptr,
                               unsigned char *v_ptr,
                               int y_stride,
                               int uv_stride,
                               loop_filter_info *lfi)
 {
    int col;
    for (col = 4; col <= 12; col += 4)
    {
        vp8_loop_filter_vertical_edge_armv6(y_ptr + col, y_stride,
                                            lfi->blim, lfi->lim, lfi->hev_thr, 2);
    }
    /* Chroma planes are optional; a null pointer skips that plane. */
    if (u_ptr)
    {
        vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride,
                                            lfi->blim, lfi->lim, lfi->hev_thr, 1);
    }
    if (v_ptr)
    {
        vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride,
                                            lfi->blim, lfi->lim, lfi->hev_thr, 1);
    }
 }
 /* Simple vertical B filtering: applies the simple vertical-edge filter to
  * the luma edges 4, 8 and 12 bytes to the right of y_ptr. */
 void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride,
                                const unsigned char *blimit)
 {
    int col;
    for (col = 4; col <= 12; col += 4)
    {
        vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + col, y_stride, blimit);
    }
 }
 #endif
 #if HAVE_NEON
 /* NEON loopfilter functions */
 /* Horizontal MB filtering (NEON): filters the luma plane, then both chroma
  * planes in a single call when u_ptr is set. */
 void vp8_loop_filter_mbh_neon(unsigned char *y_ptr,
                               unsigned char *u_ptr,
                               unsigned char *v_ptr,
                               int y_stride,
                               int uv_stride,
                               loop_filter_info *lfi)
 {
    /* The NEON kernels take the thresholds by value: read the first byte of each. */
    const unsigned char mb_limit = lfi->mblim[0];
    const unsigned char limit = lfi->lim[0];
    const unsigned char hev_threshold = lfi->hev_thr[0];
    vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, mb_limit, limit, hev_threshold);
    /* Only u_ptr is tested; v_ptr is passed along unchecked. */
    if (u_ptr)
    {
        vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, mb_limit, limit,
                                                  hev_threshold, v_ptr);
    }
 }
 /* Vertical MB filtering (NEON): filters the luma plane, then both chroma
  * planes in a single call when u_ptr is set. */
 void vp8_loop_filter_mbv_neon(unsigned char *y_ptr,
                               unsigned char *u_ptr,
                               unsigned char *v_ptr,
                               int y_stride,
                               int uv_stride,
                               loop_filter_info *lfi)
 {
    /* The NEON kernels take the thresholds by value: read the first byte of each. */
    const unsigned char mb_limit = lfi->mblim[0];
    const unsigned char limit = lfi->lim[0];
    const unsigned char hev_threshold = lfi->hev_thr[0];
    vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, mb_limit, limit, hev_threshold);
    /* Only u_ptr is tested; v_ptr is passed along unchecked. */
    if (u_ptr)
    {
        vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mb_limit, limit,
                                                hev_threshold, v_ptr);
    }
 }
 /* Horizontal B filtering (NEON): filters the luma edges 4, 8 and 12 rows
  * below y_ptr, then the edge 4 rows below both chroma planes in a single
  * call when u_ptr is set. */
 void vp8_loop_filter_bh_neon(unsigned char *y_ptr,
                              unsigned char *u_ptr,
                              unsigned char *v_ptr,
                              int y_stride,
                              int uv_stride,
                              loop_filter_info *lfi)
 {
    /* The NEON kernels take the thresholds by value: read the first byte of each. */
    const unsigned char b_limit = lfi->blim[0];
    const unsigned char limit = lfi->lim[0];
    const unsigned char hev_threshold = lfi->hev_thr[0];
    int row;
    for (row = 4; row <= 12; row += 4)
    {
        vp8_loop_filter_horizontal_edge_y_neon(y_ptr + row * y_stride, y_stride,
                                               b_limit, limit, hev_threshold);
    }
    /* Only u_ptr is tested; v_ptr is offset and passed along unchecked. */
    if (u_ptr)
    {
        vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride,
                                                b_limit, limit, hev_threshold,
                                                v_ptr + 4 * uv_stride);
    }
 }
 /* Vertical B filtering (NEON): filters the luma edges 4, 8 and 12 bytes to
  * the right of y_ptr, then the edge 4 bytes to the right of both chroma
  * planes in a single call when u_ptr is set. */
 void vp8_loop_filter_bv_neon(unsigned char *y_ptr,
                              unsigned char *u_ptr,
                              unsigned char *v_ptr,
                              int y_stride,
                              int uv_stride,
                              loop_filter_info *lfi)
 {
    /* The NEON kernels take the thresholds by value: read the first byte of each. */
    const unsigned char b_limit = lfi->blim[0];
    const unsigned char limit = lfi->lim[0];
    const unsigned char hev_threshold = lfi->hev_thr[0];
    int col;
    for (col = 4; col <= 12; col += 4)
    {
        vp8_loop_filter_vertical_edge_y_neon(y_ptr + col, y_stride,
                                             b_limit, limit, hev_threshold);
    }
    /* Only u_ptr is tested; v_ptr is offset and passed along unchecked. */
    if (u_ptr)
    {
        vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride,
                                              b_limit, limit, hev_threshold, v_ptr + 4);
    }
 }
 #endif
--- a/vp8/common/arm/neon/bilinearpredict_neon.c
+++ b/vp8/common/arm/neon/bilinearpredict_neon.c
@@ -1,591 +0,0 @@
 /*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include <arm_neon.h>
 /* Two-tap bilinear filter coefficient pairs, one row per offset value.
  * Row i holds {128 - 16 * i, 16 * i}; every pair sums to 128, which matches
  * the rounding right shift by 7 applied after each filter pass in this file.
  * The predictors index this table directly with xoffset / yoffset, so only
  * indices 0..7 are in bounds.
  */
 static const uint8_t bifilter4_coeff[8][2] = {
    {128,   0},
    {112,  16},
    { 96,  32},
    { 80,  48},
    { 64,  64},
    { 48,  80},
    { 32,  96},
    { 16, 112}
 };
 /* Bilinear prediction of a block 8 pixels wide and 4 rows high.
  *
  * src_ptr, src_pixels_per_line: source pixels and the step between rows.
  * xoffset, yoffset: row indices into bifilter4_coeff (valid range 0..7)
  *     selecting the horizontal and vertical tap pairs; a value of 0 skips
  *     the corresponding pass entirely.
  * dst_ptr, dst_pitch: destination pixels and the step between output rows.
  *
  * Five source rows are read because the vertical pass blends each row with
  * the row below it. When xoffset != 0 every row is fetched with a 16-byte
  * load although only bytes 0..8 of the row contribute to the output.
  */
 void vp8_bilinear_predict8x4_neon(
        unsigned char *src_ptr,
        int src_pixels_per_line,
        int xoffset,
        int yoffset,
        unsigned char *dst_ptr,
        int dst_pitch) {
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8;
    uint8x8_t d7u8, d9u8, d11u8, d22u8, d23u8, d24u8, d25u8, d26u8;
    uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8;
    uint16x8_t q1u16, q2u16, q3u16, q4u16;
    uint16x8_t q6u16, q7u16, q8u16, q9u16, q10u16;
    if (xoffset == 0) {  // skip_1stpass_filter
        // No horizontal filtering: take 8 bytes of each of the 5 rows as-is.
        d22u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
        d23u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
        d24u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
        d25u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
        d26u8 = vld1_u8(src_ptr);
    } else {
        // Horizontal pass over 5 rows; 16 bytes are loaded per row so that
        // the pixel to the right of column 7 is available.
        q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
        q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
        q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
        q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
        q5u8 = vld1q_u8(src_ptr);
        // Broadcast the two horizontal taps.
        d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
        d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
        // acc = tap0 * pixel[x]
        q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
        q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
        q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
        q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
        q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
        // Bytes 1..8 of each row, i.e. pixel[x + 1].
        d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1);
        d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1);
        d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
        d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
        d11u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
        // acc += tap1 * pixel[x + 1]
        q6u16 = vmlal_u8(q6u16, d3u8, d1u8);
        q7u16 = vmlal_u8(q7u16, d5u8, d1u8);
        q8u16 = vmlal_u8(q8u16, d7u8, d1u8);
        q9u16 = vmlal_u8(q9u16, d9u8, d1u8);
        q10u16 = vmlal_u8(q10u16, d11u8, d1u8);
        // Rounding shift right by 7 and saturating narrow back to 8 bits.
        d22u8 = vqrshrn_n_u16(q6u16, 7);
        d23u8 = vqrshrn_n_u16(q7u16, 7);
        d24u8 = vqrshrn_n_u16(q8u16, 7);
        d25u8 = vqrshrn_n_u16(q9u16, 7);
        d26u8 = vqrshrn_n_u16(q10u16, 7);
    }
    // secondpass_filter
    if (yoffset == 0) {  // skip_2ndpass_filter
        // No vertical filtering: store the first 4 rows; the fifth row
        // (d26u8) is not used on this path.
        vst1_u8((uint8_t *)dst_ptr, d22u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d23u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d24u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d25u8);
    } else {
        // Broadcast the two vertical taps.
        d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
        d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
        // acc = tap0 * row[y]
        q1u16 = vmull_u8(d22u8, d0u8);
        q2u16 = vmull_u8(d23u8, d0u8);
        q3u16 = vmull_u8(d24u8, d0u8);
        q4u16 = vmull_u8(d25u8, d0u8);
        // acc += tap1 * row[y + 1]
        q1u16 = vmlal_u8(q1u16, d23u8, d1u8);
        q2u16 = vmlal_u8(q2u16, d24u8, d1u8);
        q3u16 = vmlal_u8(q3u16, d25u8, d1u8);
        q4u16 = vmlal_u8(q4u16, d26u8, d1u8);
        // Rounding shift right by 7 and saturating narrow back to 8 bits.
        d2u8 = vqrshrn_n_u16(q1u16, 7);
        d3u8 = vqrshrn_n_u16(q2u16, 7);
        d4u8 = vqrshrn_n_u16(q3u16, 7);
        d5u8 = vqrshrn_n_u16(q4u16, 7);
        vst1_u8((uint8_t *)dst_ptr, d2u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d3u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d4u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d5u8);
    }
    return;
 }
 /* Bilinear prediction of a block 8 pixels wide and 8 rows high.
  *
  * src_ptr, src_pixels_per_line: source pixels and the step between rows.
  * xoffset, yoffset: row indices into bifilter4_coeff (valid range 0..7)
  *     selecting the horizontal and vertical tap pairs; a value of 0 skips
  *     the corresponding pass entirely.
  * dst_ptr, dst_pitch: destination pixels and the step between output rows.
  *
  * Nine source rows are read because the vertical pass blends each row with
  * the row below it. When xoffset != 0 every row is fetched with a 16-byte
  * load although only bytes 0..8 of the row contribute to the output.
  */
 void vp8_bilinear_predict8x8_neon(
        unsigned char *src_ptr,
        int src_pixels_per_line,
        int xoffset,
        int yoffset,
        unsigned char *dst_ptr,
        int dst_pitch) {
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8, d11u8;
    uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8, d30u8;
    uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8;
    uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16;
    uint16x8_t q6u16, q7u16, q8u16, q9u16, q10u16;
    if (xoffset == 0) {  // skip_1stpass_filter
        // No horizontal filtering: take 8 bytes of each of the 9 rows as-is.
        d22u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
        d23u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
        d24u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
        d25u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
        d26u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
        d27u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
        d28u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
        d29u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
        d30u8 = vld1_u8(src_ptr);
    } else {
        // Horizontal pass, first 4 rows.
        q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
        q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
        q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
        q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
        // Broadcast the two horizontal taps.
        d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
        d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
        // acc = tap0 * pixel[x]
        q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
        q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
        q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
        q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
        // Bytes 1..8 of each row, i.e. pixel[x + 1].
        d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1);
        d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1);
        d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
        d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
        // acc += tap1 * pixel[x + 1]
        q6u16 = vmlal_u8(q6u16, d3u8, d1u8);
        q7u16 = vmlal_u8(q7u16, d5u8, d1u8);
        q8u16 = vmlal_u8(q8u16, d7u8, d1u8);
        q9u16 = vmlal_u8(q9u16, d9u8, d1u8);
        // Rounding shift right by 7 and saturating narrow back to 8 bits.
        d22u8 = vqrshrn_n_u16(q6u16, 7);
        d23u8 = vqrshrn_n_u16(q7u16, 7);
        d24u8 = vqrshrn_n_u16(q8u16, 7);
        d25u8 = vqrshrn_n_u16(q9u16, 7);
        // First-pass filtering of the remaining 5 rows (same steps as above).
        q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
        q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
        q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
        q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
        q5u8 = vld1q_u8(src_ptr);
        q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
        q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
        q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
        q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
        q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
        d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1);
        d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1);
        d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
        d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
        d11u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
        q6u16 = vmlal_u8(q6u16, d3u8, d1u8);
        q7u16 = vmlal_u8(q7u16, d5u8, d1u8);
        q8u16 = vmlal_u8(q8u16, d7u8, d1u8);
        q9u16 = vmlal_u8(q9u16, d9u8, d1u8);
        q10u16 = vmlal_u8(q10u16, d11u8, d1u8);
        d26u8 = vqrshrn_n_u16(q6u16, 7);
        d27u8 = vqrshrn_n_u16(q7u16, 7);
        d28u8 = vqrshrn_n_u16(q8u16, 7);
        d29u8 = vqrshrn_n_u16(q9u16, 7);
        d30u8 = vqrshrn_n_u16(q10u16, 7);
    }
    // secondpass_filter
    if (yoffset == 0) {  // skip_2ndpass_filter
        // No vertical filtering: store the first 8 rows; the ninth row
        // (d30u8) is not used on this path.
        vst1_u8((uint8_t *)dst_ptr, d22u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d23u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d24u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d25u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d26u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d27u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d28u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d29u8);
    } else {
        // Broadcast the two vertical taps.
        d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
        d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
        // acc = tap0 * row[y]
        q1u16 = vmull_u8(d22u8, d0u8);
        q2u16 = vmull_u8(d23u8, d0u8);
        q3u16 = vmull_u8(d24u8, d0u8);
        q4u16 = vmull_u8(d25u8, d0u8);
        q5u16 = vmull_u8(d26u8, d0u8);
        q6u16 = vmull_u8(d27u8, d0u8);
        q7u16 = vmull_u8(d28u8, d0u8);
        q8u16 = vmull_u8(d29u8, d0u8);
        // acc += tap1 * row[y + 1]
        q1u16 = vmlal_u8(q1u16, d23u8, d1u8);
        q2u16 = vmlal_u8(q2u16, d24u8, d1u8);
        q3u16 = vmlal_u8(q3u16, d25u8, d1u8);
        q4u16 = vmlal_u8(q4u16, d26u8, d1u8);
        q5u16 = vmlal_u8(q5u16, d27u8, d1u8);
        q6u16 = vmlal_u8(q6u16, d28u8, d1u8);
        q7u16 = vmlal_u8(q7u16, d29u8, d1u8);
        q8u16 = vmlal_u8(q8u16, d30u8, d1u8);
        // Rounding shift right by 7 and saturating narrow back to 8 bits.
        d2u8 = vqrshrn_n_u16(q1u16, 7);
        d3u8 = vqrshrn_n_u16(q2u16, 7);
        d4u8 = vqrshrn_n_u16(q3u16, 7);
        d5u8 = vqrshrn_n_u16(q4u16, 7);
        d6u8 = vqrshrn_n_u16(q5u16, 7);
        d7u8 = vqrshrn_n_u16(q6u16, 7);
        d8u8 = vqrshrn_n_u16(q7u16, 7);
        d9u8 = vqrshrn_n_u16(q8u16, 7);
        vst1_u8((uint8_t *)dst_ptr, d2u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d3u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d4u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d5u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d6u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d7u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d8u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d9u8);
    }
    return;
 }
 /* Bilinear prediction of a 16x16 block.
  *
  * src_ptr, src_pixels_per_line: source pixels and the step between rows.
  * xoffset, yoffset: row indices into bifilter4_coeff (valid range 0..7)
  *     selecting the horizontal and vertical tap pairs.
  * dst_ptr, dst_pitch: destination pixels and the step between output rows.
  *
  * Three code paths:
  *   - xoffset == 0: vertical pass only, straight from source to destination
  *     (also taken when yoffset is 0 as well; the taps are then {128, 0}).
  *   - yoffset == 0: horizontal pass only, straight to destination.
  *   - otherwise: horizontal pass over 17 rows into the 17 * 16 byte scratch
  *     buffer tmp, followed by a vertical pass from tmp to destination.
  * The horizontal passes load 24 bytes per source row (three 8-byte loads),
  * of which bytes 0..16 contribute to the output.
  */
 void vp8_bilinear_predict16x16_neon(
        unsigned char *src_ptr,
        int src_pixels_per_line,
        int xoffset,
        int yoffset,
        unsigned char *dst_ptr,
        int dst_pitch) {
    int i;
    // Scratch for the two-pass path: 17 rows of 16 bytes.
    unsigned char tmp[272];
    unsigned char *tmpp;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
    uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8;
    uint8x8_t d19u8, d20u8, d21u8;
    uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8;
    uint8x16_t q11u8, q12u8, q13u8, q14u8, q15u8;
    uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16;
    uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16;
    if (xoffset == 0) {  // secondpass_bfilter16x16_only
        d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
        d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
        // q11u8 always holds the row above the group of 4 being produced.
        q11u8 = vld1q_u8(src_ptr);
        src_ptr += src_pixels_per_line;
        // 4 iterations x 4 output rows.
        for (i = 4; i > 0; i--) {
            q12u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
            q13u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
            q14u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
            q15u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
            // acc = tap0 * row[y], low and high halves separately.
            q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
            q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
            q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
            q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
            q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
            q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
            q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
            q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);
            // acc += tap1 * row[y + 1]
            q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
            q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
            q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
            q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
            q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
            q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
            q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
            q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);
            // Rounding shift right by 7, saturating narrow to 8 bits.
            d2u8 = vqrshrn_n_u16(q1u16, 7);
            d3u8 = vqrshrn_n_u16(q2u16, 7);
            d4u8 = vqrshrn_n_u16(q3u16, 7);
            d5u8 = vqrshrn_n_u16(q4u16, 7);
            d6u8 = vqrshrn_n_u16(q5u16, 7);
            d7u8 = vqrshrn_n_u16(q6u16, 7);
            d8u8 = vqrshrn_n_u16(q7u16, 7);
            d9u8 = vqrshrn_n_u16(q8u16, 7);
            q1u8 = vcombine_u8(d2u8, d3u8);
            q2u8 = vcombine_u8(d4u8, d5u8);
            q3u8 = vcombine_u8(d6u8, d7u8);
            q4u8 = vcombine_u8(d8u8, d9u8);
            // The last row loaded becomes the top row of the next group.
            q11u8 = q15u8;
            vst1q_u8((uint8_t *)dst_ptr, q1u8); dst_ptr += dst_pitch;
            vst1q_u8((uint8_t *)dst_ptr, q2u8); dst_ptr += dst_pitch;
            vst1q_u8((uint8_t *)dst_ptr, q3u8); dst_ptr += dst_pitch;
            vst1q_u8((uint8_t *)dst_ptr, q4u8); dst_ptr += dst_pitch;
        }
        return;
    }
    if (yoffset == 0) {  // firstpass_bfilter16x16_only
        d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
        d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
        // 4 iterations x 4 rows; each row is loaded as three 8-byte pieces.
        for (i = 4; i > 0 ; i--) {
            d2u8 = vld1_u8(src_ptr);
            d3u8 = vld1_u8(src_ptr + 8);
            d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
            d5u8 = vld1_u8(src_ptr);
            d6u8 = vld1_u8(src_ptr + 8);
            d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
            d8u8 = vld1_u8(src_ptr);
            d9u8 = vld1_u8(src_ptr + 8);
            d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
            d11u8 = vld1_u8(src_ptr);
            d12u8 = vld1_u8(src_ptr + 8);
            d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
            // acc = tap0 * pixel[x] for columns 0..7 and 8..15 of each row.
            q7u16  = vmull_u8(d2u8, d0u8);
            q8u16  = vmull_u8(d3u8, d0u8);
            q9u16  = vmull_u8(d5u8, d0u8);
            q10u16 = vmull_u8(d6u8, d0u8);
            q11u16 = vmull_u8(d8u8, d0u8);
            q12u16 = vmull_u8(d9u8, d0u8);
            q13u16 = vmull_u8(d11u8, d0u8);
            q14u16 = vmull_u8(d12u8, d0u8);
            // Left half: bytes 1..8 of each row, then acc += tap1 * pixel[x + 1].
            d2u8  = vext_u8(d2u8, d3u8, 1);
            d5u8  = vext_u8(d5u8, d6u8, 1);
            d8u8  = vext_u8(d8u8, d9u8, 1);
            d11u8 = vext_u8(d11u8, d12u8, 1);
            q7u16  = vmlal_u8(q7u16, d2u8, d1u8);
            q9u16  = vmlal_u8(q9u16, d5u8, d1u8);
            q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
            q13u16 = vmlal_u8(q13u16, d11u8, d1u8);
            // Right half: bytes 9..16 of each row, then the same accumulate.
            d3u8  = vext_u8(d3u8, d4u8, 1);
            d6u8  = vext_u8(d6u8, d7u8, 1);
            d9u8  = vext_u8(d9u8, d10u8, 1);
            d12u8 = vext_u8(d12u8, d13u8, 1);
            q8u16  = vmlal_u8(q8u16,  d3u8, d1u8);
            q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
            q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
            q14u16 = vmlal_u8(q14u16, d12u8, d1u8);
            // Rounding shift right by 7, saturating narrow to 8 bits.
            d14u8 = vqrshrn_n_u16(q7u16, 7);
            d15u8 = vqrshrn_n_u16(q8u16, 7);
            d16u8 = vqrshrn_n_u16(q9u16, 7);
            d17u8 = vqrshrn_n_u16(q10u16, 7);
            d18u8 = vqrshrn_n_u16(q11u16, 7);
            d19u8 = vqrshrn_n_u16(q12u16, 7);
            d20u8 = vqrshrn_n_u16(q13u16, 7);
            d21u8 = vqrshrn_n_u16(q14u16, 7);
            q7u8 = vcombine_u8(d14u8, d15u8);
            q8u8 = vcombine_u8(d16u8, d17u8);
            q9u8 = vcombine_u8(d18u8, d19u8);
            q10u8 =vcombine_u8(d20u8, d21u8);
            vst1q_u8((uint8_t *)dst_ptr, q7u8); dst_ptr += dst_pitch;
            vst1q_u8((uint8_t *)dst_ptr, q8u8); dst_ptr += dst_pitch;
            vst1q_u8((uint8_t *)dst_ptr, q9u8); dst_ptr += dst_pitch;
            vst1q_u8((uint8_t *)dst_ptr, q10u8); dst_ptr += dst_pitch;
        }
        return;
    }
    // Two-pass path. Preload source rows 0..3 before entering the loop.
    d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
    d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
    d2u8 = vld1_u8(src_ptr);
    d3u8 = vld1_u8(src_ptr + 8);
    d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
    d5u8 = vld1_u8(src_ptr);
    d6u8 = vld1_u8(src_ptr + 8);
    d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
    d8u8 = vld1_u8(src_ptr);
    d9u8 = vld1_u8(src_ptr + 8);
    d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
    d11u8 = vld1_u8(src_ptr);
    d12u8 = vld1_u8(src_ptr + 8);
    d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
    // First Pass: output_height lines x output_width columns (17x16)
    tmpp = tmp;
    // 3 iterations x 4 rows = rows 0..11; each iteration filters the rows
    // already held in d2..d13 and then loads the next 4 rows.
    for (i = 3; i > 0; i--) {
        q7u16  = vmull_u8(d2u8, d0u8);
        q8u16  = vmull_u8(d3u8, d0u8);
        q9u16  = vmull_u8(d5u8, d0u8);
        q10u16 = vmull_u8(d6u8, d0u8);
        q11u16 = vmull_u8(d8u8, d0u8);
        q12u16 = vmull_u8(d9u8, d0u8);
        q13u16 = vmull_u8(d11u8, d0u8);
        q14u16 = vmull_u8(d12u8, d0u8);
        d2u8  = vext_u8(d2u8, d3u8, 1);
        d5u8  = vext_u8(d5u8, d6u8, 1);
        d8u8  = vext_u8(d8u8, d9u8, 1);
        d11u8 = vext_u8(d11u8, d12u8, 1);
        q7u16  = vmlal_u8(q7u16, d2u8, d1u8);
        q9u16  = vmlal_u8(q9u16, d5u8, d1u8);
        q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
        q13u16 = vmlal_u8(q13u16, d11u8, d1u8);
        d3u8  = vext_u8(d3u8, d4u8, 1);
        d6u8  = vext_u8(d6u8, d7u8, 1);
        d9u8  = vext_u8(d9u8, d10u8, 1);
        d12u8 = vext_u8(d12u8, d13u8, 1);
        q8u16  = vmlal_u8(q8u16,  d3u8, d1u8);
        q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
        q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
        q14u16 = vmlal_u8(q14u16, d12u8, d1u8);
        d14u8 = vqrshrn_n_u16(q7u16, 7);
        d15u8 = vqrshrn_n_u16(q8u16, 7);
        d16u8 = vqrshrn_n_u16(q9u16, 7);
        d17u8 = vqrshrn_n_u16(q10u16, 7);
        d18u8 = vqrshrn_n_u16(q11u16, 7);
        d19u8 = vqrshrn_n_u16(q12u16, 7);
        d20u8 = vqrshrn_n_u16(q13u16, 7);
        d21u8 = vqrshrn_n_u16(q14u16, 7);
        // Load the next 4 source rows for the following iteration (or for
        // the 5-row tail after the loop).
        d2u8 = vld1_u8(src_ptr);
        d3u8 = vld1_u8(src_ptr + 8);
        d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
        d5u8 = vld1_u8(src_ptr);
        d6u8 = vld1_u8(src_ptr + 8);
        d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
        d8u8 = vld1_u8(src_ptr);
        d9u8 = vld1_u8(src_ptr + 8);
        d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
        d11u8 = vld1_u8(src_ptr);
        d12u8 = vld1_u8(src_ptr + 8);
        d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
        q7u8 = vcombine_u8(d14u8, d15u8);
        q8u8 = vcombine_u8(d16u8, d17u8);
        q9u8 = vcombine_u8(d18u8, d19u8);
        q10u8 = vcombine_u8(d20u8, d21u8);
        vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16;
        vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16;
        vst1q_u8((uint8_t *)tmpp, q9u8); tmpp += 16;
        vst1q_u8((uint8_t *)tmpp, q10u8); tmpp += 16;
    }
    // First-pass filtering for the remaining 5 rows: rows 12..15 are already
    // in d2..d13, and row 16 is loaded here into d14..d16.
    d14u8 = vld1_u8(src_ptr);
    d15u8 = vld1_u8(src_ptr + 8);
    d16u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
    q9u16  = vmull_u8(d2u8, d0u8);
    q10u16 = vmull_u8(d3u8, d0u8);
    q11u16 = vmull_u8(d5u8, d0u8);
    q12u16 = vmull_u8(d6u8, d0u8);
    q13u16 = vmull_u8(d8u8, d0u8);
    q14u16 = vmull_u8(d9u8, d0u8);
    d2u8  = vext_u8(d2u8, d3u8, 1);
    d5u8  = vext_u8(d5u8, d6u8, 1);
    d8u8  = vext_u8(d8u8, d9u8, 1);
    q9u16  = vmlal_u8(q9u16, d2u8, d1u8);
    q11u16 = vmlal_u8(q11u16, d5u8, d1u8);
    q13u16 = vmlal_u8(q13u16, d8u8, d1u8);
    d3u8  = vext_u8(d3u8, d4u8, 1);
    d6u8  = vext_u8(d6u8, d7u8, 1);
    d9u8  = vext_u8(d9u8, d10u8, 1);
    q10u16 = vmlal_u8(q10u16, d3u8, d1u8);
    q12u16 = vmlal_u8(q12u16, d6u8, d1u8);
    q14u16 = vmlal_u8(q14u16, d9u8, d1u8);
    q1u16 = vmull_u8(d11u8, d0u8);
    q2u16 = vmull_u8(d12u8, d0u8);
    q3u16 = vmull_u8(d14u8, d0u8);
    q4u16 = vmull_u8(d15u8, d0u8);
    d11u8 = vext_u8(d11u8, d12u8, 1);
    d14u8 = vext_u8(d14u8, d15u8, 1);
    q1u16 = vmlal_u8(q1u16, d11u8, d1u8);
    q3u16 = vmlal_u8(q3u16, d14u8, d1u8);
    d12u8 = vext_u8(d12u8, d13u8, 1);
    d15u8 = vext_u8(d15u8, d16u8, 1);
    q2u16 = vmlal_u8(q2u16, d12u8, d1u8);
    q4u16 = vmlal_u8(q4u16, d15u8, d1u8);
    d10u8 = vqrshrn_n_u16(q9u16, 7);
    d11u8 = vqrshrn_n_u16(q10u16, 7);
    d12u8 = vqrshrn_n_u16(q11u16, 7);
    d13u8 = vqrshrn_n_u16(q12u16, 7);
    d14u8 = vqrshrn_n_u16(q13u16, 7);
    d15u8 = vqrshrn_n_u16(q14u16, 7);
    d16u8 = vqrshrn_n_u16(q1u16, 7);
    d17u8 = vqrshrn_n_u16(q2u16, 7);
    d18u8 = vqrshrn_n_u16(q3u16, 7);
    d19u8 = vqrshrn_n_u16(q4u16, 7);
    q5u8 = vcombine_u8(d10u8, d11u8);
    q6u8 = vcombine_u8(d12u8, d13u8);
    q7u8 = vcombine_u8(d14u8, d15u8);
    q8u8 = vcombine_u8(d16u8, d17u8);
    q9u8 = vcombine_u8(d18u8, d19u8);
    vst1q_u8((uint8_t *)tmpp, q5u8); tmpp += 16;
    vst1q_u8((uint8_t *)tmpp, q6u8); tmpp += 16;
    vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16;
    vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16;
    vst1q_u8((uint8_t *)tmpp, q9u8);
    // secondpass_filter: vertical pass over the 17 rows held in tmp, using
    // the same 4-rows-per-iteration scheme as the xoffset == 0 path.
    d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
    d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
    tmpp = tmp;
    q11u8 = vld1q_u8(tmpp);
    tmpp += 16;
    for (i = 4; i > 0; i--) {
        q12u8 = vld1q_u8(tmpp); tmpp += 16;
        q13u8 = vld1q_u8(tmpp); tmpp += 16;
        q14u8 = vld1q_u8(tmpp); tmpp += 16;
        q15u8 = vld1q_u8(tmpp); tmpp += 16;
        q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
        q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
        q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
        q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
        q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
        q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
        q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
        q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);
        q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
        q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
        q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
        q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
        q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
        q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
        q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
        q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);
        d2u8 = vqrshrn_n_u16(q1u16, 7);
        d3u8 = vqrshrn_n_u16(q2u16, 7);
        d4u8 = vqrshrn_n_u16(q3u16, 7);
        d5u8 = vqrshrn_n_u16(q4u16, 7);
        d6u8 = vqrshrn_n_u16(q5u16, 7);
        d7u8 = vqrshrn_n_u16(q6u16, 7);
        d8u8 = vqrshrn_n_u16(q7u16, 7);
        d9u8 = vqrshrn_n_u16(q8u16, 7);
        q1u8 = vcombine_u8(d2u8, d3u8);
        q2u8 = vcombine_u8(d4u8, d5u8);
        q3u8 = vcombine_u8(d6u8, d7u8);
        q4u8 = vcombine_u8(d8u8, d9u8);
        // The last row loaded becomes the top row of the next group.
        q11u8 = q15u8;
        vst1q_u8((uint8_t *)dst_ptr, q1u8); dst_ptr += dst_pitch;
        vst1q_u8((uint8_t *)dst_ptr, q2u8); dst_ptr += dst_pitch;
        vst1q_u8((uint8_t *)dst_ptr, q3u8); dst_ptr += dst_pitch;
        vst1q_u8((uint8_t *)dst_ptr, q4u8); dst_ptr += dst_pitch;
    }
    return;
 }
--- a/vp8/common/arm/neon/copymem_neon.c
+++ b/vp8/common/arm/neon/copymem_neon.c
@@ -1,59 +0,0 @@
 /*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include <arm_neon.h>
 /* Copies 4 rows of 8 bytes from src to dst; src_stride and dst_stride are
  * the byte steps between consecutive rows of each buffer. */
 void vp8_copy_mem8x4_neon(
        unsigned char *src,
        int src_stride,
        unsigned char *dst,
        int dst_stride) {
    int rows_left = 4;
    while (rows_left-- > 0) {
        /* One 8-byte row per iteration. */
        vst1_u8(dst, vld1_u8(src));
        src += src_stride;
        dst += dst_stride;
    }
 }
 /* Copies 8 rows of 8 bytes from src to dst; src_stride and dst_stride are
  * the byte steps between consecutive rows of each buffer. */
 void vp8_copy_mem8x8_neon(
        unsigned char *src,
        int src_stride,
        unsigned char *dst,
        int dst_stride) {
    int rows_left = 8;
    while (rows_left-- > 0) {
        /* One 8-byte row per iteration. */
        vst1_u8(dst, vld1_u8(src));
        src += src_stride;
        dst += dst_stride;
    }
 }
 /* Copies 16 rows of 16 bytes from src to dst; src_stride and dst_stride are
  * the byte steps between consecutive rows of each buffer. */
 void vp8_copy_mem16x16_neon(
        unsigned char *src,
        int src_stride,
        unsigned char *dst,
        int dst_stride) {
    int rows_left = 16;
    while (rows_left-- > 0) {
        /* One 16-byte row per iteration. */
        vst1q_u8(dst, vld1q_u8(src));
        src += src_stride;
        dst += dst_stride;
    }
 }
--- a/vp8/common/arm/neon/dc_only_idct_add_neon.c
+++ b/vp8/common/arm/neon/dc_only_idct_add_neon.c
@@ -1,42 +0,0 @@
 /*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include <arm_neon.h>
 void vp8_dc_only_idct_add_neon(
        int16_t input_dc,
        unsigned char *pred_ptr,
        int pred_stride,
        unsigned char *dst_ptr,
        int dst_stride) {
    int i;
    uint16_t a1 = ((input_dc + 4) >> 3);
    uint32x2_t d2u32 = vdup_n_u32(0);
    uint8x8_t d2u8;
    uint16x8_t q1u16;
    uint16x8_t qAdd;
    qAdd = vdupq_n_u16(a1);
    for (i = 0; i < 2; i++) {
        d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 0);
        pred_ptr += pred_stride;
        d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 1);
        pred_ptr += pred_stride;
        q1u16 = vaddw_u8(qAdd, vreinterpret_u8_u32(d2u32));
        d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 0);
        dst_ptr += dst_stride;
        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 1);
        dst_ptr += dst_stride;
    }
 }
--- a/vp8/common/arm/neon/dequant_idct_neon.c
+++ b/vp8/common/arm/neon/dequant_idct_neon.c
@@ -1,142 +0,0 @@
 /*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include <arm_neon.h>
 /* Fixed-point multipliers passed to vqdmulhq_n_s16 by
  * vp8_dequant_idct_add_neon.
  *
  * NOTE(review): 35468 does not fit in int16_t (maximum 32767), so the value
  * actually stored is the result of an implementation-defined conversion
  * (-30068 where the conversion wraps modulo 2^16). The function below shifts
  * each vqdmulh result right by one and then adds the original input back;
  * for sinpi8sqrt2 that add appears to be what compensates for the wrapped
  * constant. Confirm before changing either the value or the type.
  */
 static const int16_t cospi8sqrt2minus1 = 20091;
 static const int16_t sinpi8sqrt2       = 35468;
 /* Dequantizes 16 coefficients, runs a two-pass 4x4 inverse transform on them
  * and adds the result to the 4x4 pixel block at dst, in place.
  *
  * input:  16 int16_t coefficients; they are overwritten with zeros here.
  * dq:     16 int16_t dequantization factors, multiplied element-wise.
  * dst:    4 rows of 4 bytes, read as the predictor and then overwritten.
  * stride: step between consecutive rows of dst.
  */
 void vp8_dequant_idct_add_neon(
        int16_t *input,
        int16_t *dq,
        unsigned char *dst,
        int stride) {
    unsigned char *dst0;
    int32x2_t d14, d15;
    int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
    int16x8_t q1, q2, q3, q4, q5, q6;
    int16x8_t qEmpty = vdupq_n_s16(0);
    int32x2x2_t d2tmp0, d2tmp1;
    int16x4x2_t d2tmp2, d2tmp3;
    d14 = d15 = vdup_n_s32(0);
    // load input, clearing each half of the coefficient buffer once it is read
    q3 = vld1q_s16(input);
    vst1q_s16(input, qEmpty);
    input += 8;
    q4 = vld1q_s16(input);
    vst1q_s16(input, qEmpty);
    // load dq
    q5 = vld1q_s16(dq);
    dq += 8;
    q6 = vld1q_s16(dq);
    // load src from dst: four 4-byte rows, two rows per 64-bit vector
    dst0 = dst;
    d14 = vld1_lane_s32((const int32_t *)dst0, d14, 0);
    dst0 += stride;
    d14 = vld1_lane_s32((const int32_t *)dst0, d14, 1);
    dst0 += stride;
    d15 = vld1_lane_s32((const int32_t *)dst0, d15, 0);
    dst0 += stride;
    d15 = vld1_lane_s32((const int32_t *)dst0, d15, 1);
    // dequantize: element-wise product, keeping the low 16 bits
    q1 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q3),
                                         vreinterpretq_u16_s16(q5)));
    q2 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q4),
                                         vreinterpretq_u16_s16(q6)));
    // pass 1: sum and difference of elements 0..3 and 8..11
    d12 = vqadd_s16(vget_low_s16(q1), vget_low_s16(q2));
    d13 = vqsub_s16(vget_low_s16(q1), vget_low_s16(q2));
    // elements 4..7 and 12..15 go through the two constant multiplies
    q2 = vcombine_s16(vget_high_s16(q1), vget_high_s16(q2));
    q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
    q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
    // halve the doubled vqdmulh result, then add the input back
    q3 = vshrq_n_s16(q3, 1);
    q4 = vshrq_n_s16(q4, 1);
    q3 = vqaddq_s16(q3, q2);
    q4 = vqaddq_s16(q4, q2);
    d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
    d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));
    d2 = vqadd_s16(d12, d11);
    d3 = vqadd_s16(d13, d10);
    d4 = vqsub_s16(d13, d10);
    d5 = vqsub_s16(d12, d11);
    // transpose the 4x4 intermediate (32-bit then 16-bit interleaves)
    d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
    d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
    d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
                      vreinterpret_s16_s32(d2tmp1.val[0]));
    d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
                      vreinterpret_s16_s32(d2tmp1.val[1]));
    // loop 2: the same butterfly applied to the transposed data
    q2 = vcombine_s16(d2tmp2.val[1], d2tmp3.val[1]);
    q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
    q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
    d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]);
    d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]);
    q3 = vshrq_n_s16(q3, 1);
    q4 = vshrq_n_s16(q4, 1);
    q3 = vqaddq_s16(q3, q2);
    q4 = vqaddq_s16(q4, q2);
    d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
    d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));
    d2 = vqadd_s16(d12, d11);
    d3 = vqadd_s16(d13, d10);
    d4 = vqsub_s16(d13, d10);
    d5 = vqsub_s16(d12, d11);
    // final scaling: rounding shift right by 3
    d2 = vrshr_n_s16(d2, 3);
    d3 = vrshr_n_s16(d3, 3);
    d4 = vrshr_n_s16(d4, 3);
    d5 = vrshr_n_s16(d5, 3);
    // transpose again so each vector lines up with the pixel rows loaded above
    d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
    d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
    d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
                      vreinterpret_s16_s32(d2tmp1.val[0]));
    d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
                      vreinterpret_s16_s32(d2tmp1.val[1]));
    q1 = vcombine_s16(d2tmp2.val[0], d2tmp2.val[1]);
    q2 = vcombine_s16(d2tmp3.val[0], d2tmp3.val[1]);
    // add the widened predictor pixels to the residual
    q1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q1),
                                        vreinterpret_u8_s32(d14)));
    q2 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2),
                                        vreinterpret_u8_s32(d15)));
    // narrow to 8 bits with unsigned saturation
    d14 = vreinterpret_s32_u8(vqmovun_s16(q1));
    d15 = vreinterpret_s32_u8(vqmovun_s16(q2));
    // store the four 4-byte rows back to dst
    dst0 = dst;
    vst1_lane_s32((int32_t *)dst0, d14, 0);
    dst0 += stride;
    vst1_lane_s32((int32_t *)dst0, d14, 1);
    dst0 += stride;
    vst1_lane_s32((int32_t *)dst0, d15, 0);
    dst0 += stride;
    vst1_lane_s32((int32_t *)dst0, d15, 1);
    return;
 }
--- a/vp8/common/arm/neon/dequantizeb_neon.c
+++ b/vp8/common/arm/neon/dequantizeb_neon.c
@@ -1,25 +0,0 @@
 /*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include <arm_neon.h>
 #include "vp8/common/blockd.h"
 /* Multiplies the 16 values at d->qcoeff element-wise by the 16 factors at
  * DQC and stores the products at d->dqcoeff. The data is de-interleaved on
  * load and re-interleaved on store, so element order is preserved. */
 void vp8_dequantize_b_neon(BLOCKD *d, short *DQC) {
    const int16x8x2_t coeffs = vld2q_s16(d->qcoeff);
    const int16x8x2_t factors = vld2q_s16(DQC);
    int16x8x2_t products;
    products.val[0] = vmulq_s16(coeffs.val[0], factors.val[0]);
    products.val[1] = vmulq_s16(coeffs.val[1], factors.val[1]);
    vst2q_s16(d->dqcoeff, products);
 }
--- a/vp8/common/arm/neon/idct_blk_neon.c
+++ b/vp8/common/arm/neon/idct_blk_neon.c
@@ -1,96 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
 /* place these declarations here because we don't want to maintain them
  * outside of this scope
  */
 void idct_dequant_full_2x_neon(short *q, short *dq,
                                unsigned char *dst, int stride);
 void idct_dequant_0_2x_neon(short *q, short dq,
                             unsigned char *dst, int stride);
 /* Dequantize / inverse-transform / add for four rows of block pairs.
  *
  * Each of the 4 iterations consumes 64 coefficients from q, four entries
  * from eobs and a band of 4 pixel rows from dst. The entries are examined
  * in pairs: eobs[0..1] drive the pair at (q, dst) and eobs[2..3] the pair
  * at (q + 32, dst + 8). For each pair:
  *   - both entries zero       -> nothing is done;
  *   - any bit above bit 0 set -> idct_dequant_full_2x_neon with all of dq;
  *   - otherwise               -> idct_dequant_0_2x_neon with dq[0] only.
  *
  * The pair is tested byte by byte rather than through a (short *) cast of
  * the char buffer: the result is the same on either endianness, and it
  * avoids the misaligned / type-punned 16-bit read.
  */
 void vp8_dequant_idct_add_y_block_neon(short *q, short *dq,
                                        unsigned char *dst,
                                        int stride, char *eobs)
 {
     int i;
     for (i = 0; i < 4; i++)
     {
         int pair;
         /* left pair of this row */
         pair = eobs[0] | eobs[1];
         if (pair)
         {
             if (pair & 0xfe)
                 idct_dequant_full_2x_neon (q, dq, dst, stride);
             else
                 idct_dequant_0_2x_neon (q, dq[0], dst, stride);
         }
         /* right pair of this row */
         pair = eobs[2] | eobs[3];
         if (pair)
         {
             if (pair & 0xfe)
                 idct_dequant_full_2x_neon (q+32, dq, dst+8, stride);
             else
                 idct_dequant_0_2x_neon (q+32, dq[0], dst+8, stride);
         }
         q    += 64;
         dst  += 4*stride;
         eobs += 4;
     }
 }
 /* Dequantize / inverse-transform / add for the two chroma planes.
  *
  * Four block pairs are processed, each using 32 coefficients from q and two
  * consecutive eobs entries:
  *   eobs[0..1] -> dstu,              eobs[2..3] -> dstu + 4*stride,
  *   eobs[4..5] -> dstv,              eobs[6..7] -> dstv + 4*stride.
  * For each pair: if both entries are zero nothing is done; if any entry has
  * a bit above bit 0 set, idct_dequant_full_2x_neon is used with all of dq;
  * otherwise idct_dequant_0_2x_neon is used with dq[0] only.
  *
  * The pair is tested byte by byte rather than through a (short *) cast of
  * the char buffer: the result is the same on either endianness, and it
  * avoids the misaligned / type-punned 16-bit read.
  */
 void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq,
                                         unsigned char *dstu,
                                         unsigned char *dstv,
                                         int stride, char *eobs)
 {
     int pair;
     /* U plane, upper pair */
     pair = eobs[0] | eobs[1];
     if (pair)
     {
         if (pair & 0xfe)
             idct_dequant_full_2x_neon (q, dq, dstu, stride);
         else
             idct_dequant_0_2x_neon (q, dq[0], dstu, stride);
     }
     q    += 32;
     dstu += 4*stride;
     /* U plane, lower pair */
     pair = eobs[2] | eobs[3];
     if (pair)
     {
         if (pair & 0xfe)
             idct_dequant_full_2x_neon (q, dq, dstu, stride);
         else
             idct_dequant_0_2x_neon (q, dq[0], dstu, stride);
     }
     q += 32;
     /* V plane, upper pair */
     pair = eobs[4] | eobs[5];
     if (pair)
     {
         if (pair & 0xfe)
             idct_dequant_full_2x_neon (q, dq, dstv, stride);
         else
             idct_dequant_0_2x_neon (q, dq[0], dstv, stride);
     }
     q    += 32;
     dstv += 4*stride;
     /* V plane, lower pair */
     pair = eobs[6] | eobs[7];
     if (pair)
     {
         if (pair & 0xfe)
             idct_dequant_full_2x_neon (q, dq, dstv, stride);
         else
             idct_dequant_0_2x_neon (q, dq[0], dstv, stride);
     }
 }
--- a/vp8/common/arm/neon/idct_dequant_0_2x_neon.c
+++ b/vp8/common/arm/neon/idct_dequant_0_2x_neon.c
@@ -1,63 +0,0 @@
 /*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include <arm_neon.h>
 /* DC-only dequantize + inverse transform + add for two side-by-side
  * 4-pixel-wide, 4-row areas.
  *
  * The left area uses q[0], the right area (at dst + 4) uses q[16]. Each
  * coefficient is scaled by dq, rounded and divided by 8, and the resulting
  * constant is added to every pixel of its area with unsigned 8-bit
  * saturation. q[0] and q[16] are cleared afterwards.
  */
 void idct_dequant_0_2x_neon(
         int16_t *q,
         int16_t dq,
         unsigned char *dst,
         int stride) {
     int16x8x2_t dc;
     int blk;
     const int dc_left = ((q[0] * dq) + 4) >> 3;
     const int dc_right = ((q[16] * dq) + 4) >> 3;
     q[0] = q[16] = 0;
     dc.val[0] = vdupq_n_s16((int16_t)dc_left);
     dc.val[1] = vdupq_n_s16((int16_t)dc_right);
     for (blk = 0; blk < 2; ++blk) {
         unsigned char *const base = dst + 4 * blk;
         unsigned char *row;
         int32x2_t rows01 = vdup_n_s32(0);
         int32x2_t rows23 = vdup_n_s32(0);
         uint16x8_t sum01, sum23;
         /* gather four 4-byte rows: two rows per 64-bit register */
         row = base;
         rows01 = vld1_lane_s32((const int32_t *)row, rows01, 0);
         row += stride;
         rows01 = vld1_lane_s32((const int32_t *)row, rows01, 1);
         row += stride;
         rows23 = vld1_lane_s32((const int32_t *)row, rows23, 0);
         row += stride;
         rows23 = vld1_lane_s32((const int32_t *)row, rows23, 1);
         /* widen the pixels, add the DC term, saturate back to 8 bits */
         sum01 = vaddw_u8(vreinterpretq_u16_s16(dc.val[blk]),
                          vreinterpret_u8_s32(rows01));
         sum23 = vaddw_u8(vreinterpretq_u16_s16(dc.val[blk]),
                          vreinterpret_u8_s32(rows23));
         rows01 = vreinterpret_s32_u8(
                      vqmovun_s16(vreinterpretq_s16_u16(sum01)));
         rows23 = vreinterpret_s32_u8(
                      vqmovun_s16(vreinterpretq_s16_u16(sum23)));
         /* scatter the rows back to where they came from */
         row = base;
         vst1_lane_s32((int32_t *)row, rows01, 0);
         row += stride;
         vst1_lane_s32((int32_t *)row, rows01, 1);
         row += stride;
         vst1_lane_s32((int32_t *)row, rows23, 0);
         row += stride;
         vst1_lane_s32((int32_t *)row, rows23, 1);
     }
 }
--- a/vp8/common/arm/neon/idct_dequant_full_2x_neon.c
+++ b/vp8/common/arm/neon/idct_dequant_full_2x_neon.c
@@ -1,185 +0,0 @@
 /*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include <arm_neon.h>
 static const int16_t cospi8sqrt2minus1 = 20091;
 static const int16_t sinpi8sqrt2       = 17734;
 // because the lowest bit in 0x8a8c is 0, we can pre-shift this
 // (sinpi8sqrt2 above is 0x8a8c >> 1 = 17734, which fits in int16_t; the
 // doubling performed by vqdmulhq_n_s16 supplies the missing factor of two.
 // Products with cospi8sqrt2minus1 are instead halved after vqdmulhq_n_s16.)
 /* Dequantizes two horizontally adjacent 4x4 coefficient blocks, applies a
  * two-pass 4x4 inverse transform to each, and adds the result to the pixels
  * at dst with unsigned 8-bit saturation.
  *
  *   q:      32 coefficients; q[0..15] is the left block, q[16..31] the right
  *           block. All 32 entries are overwritten with zero.
  *   dq:     16 dequantization factors, applied to both blocks.
  *   dst:    4 rows of 8 pixels (left block at dst, right block at dst + 4),
  *           read and then overwritten in place.
  *   stride: byte distance between successive rows of dst.
  */
 void idct_dequant_full_2x_neon(
         int16_t *q,
         int16_t *dq,
         unsigned char *dst,
         int stride) {
     unsigned char *dst0, *dst1;
     int32x2_t d28, d29, d30, d31;
     int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
     int16x8_t qEmpty = vdupq_n_s16(0);
     int32x4x2_t q2tmp0, q2tmp1;
     int16x8x2_t q2tmp2, q2tmp3;
     int16x4_t dLow0, dLow1, dHigh0, dHigh1;
     d28 = d29 = d30 = d31 = vdup_n_s32(0);
     // load dq
     q0 = vld1q_s16(dq);
     dq += 8;
     q1 = vld1q_s16(dq);
     // load q
     // (each 8-coefficient group is zeroed in memory right after it is read)
     q2 = vld1q_s16(q);
     vst1q_s16(q, qEmpty);
     q += 8;
     q3 = vld1q_s16(q);
     vst1q_s16(q, qEmpty);
     q += 8;
     q4 = vld1q_s16(q);
     vst1q_s16(q, qEmpty);
     q += 8;
     q5 = vld1q_s16(q);
     vst1q_s16(q, qEmpty);
     // load src from dst
     // (d28..d31 each hold one 8-pixel row: lane 0 = left block's 4 pixels,
     //  lane 1 = right block's 4 pixels)
     dst0 = dst;
     dst1 = dst + 4;
     d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0);
     dst0 += stride;
     d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1);
     dst1 += stride;
     d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0);
     dst0 += stride;
     d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1);
     dst1 += stride;
     d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0);
     dst0 += stride;
     d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1);
     dst1 += stride;
     d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0);
     d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1);
     // dequantize: q2/q3 = left block rows 0-1/2-3, q4/q5 = right block
     q2 = vmulq_s16(q2, q0);
     q3 = vmulq_s16(q3, q1);
     q4 = vmulq_s16(q4, q0);
     q5 = vmulq_s16(q5, q1);
     // vswp
     // (regroup so each register holds the same row of both blocks:
     //  q2 = row 0, q4 = row 1, q3 = row 2, q5 = row 3; low half = left
     //  block, high half = right block)
     dLow0 = vget_low_s16(q2);
     dHigh0 = vget_high_s16(q2);
     dLow1 = vget_low_s16(q4);
     dHigh1 = vget_high_s16(q4);
     q2 = vcombine_s16(dLow0, dLow1);
     q4 = vcombine_s16(dHigh0, dHigh1);
     dLow0 = vget_low_s16(q3);
     dHigh0 = vget_high_s16(q3);
     dLow1 = vget_low_s16(q5);
     dHigh1 = vget_high_s16(q5);
     q3 = vcombine_s16(dLow0, dLow1);
     q5 = vcombine_s16(dHigh0, dHigh1);
     // first pass: butterfly across rows 0..3
     q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2);
     q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2);
     q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1);
     q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1);
     q10 = vqaddq_s16(q2, q3);
     q11 = vqsubq_s16(q2, q3);
     // undo the doubling of vqdmulh for the cospi8sqrt2minus1 products
     q8 = vshrq_n_s16(q8, 1);
     q9 = vshrq_n_s16(q9, 1);
     // x + x * cospi8sqrt2minus1 / 65536
     q4 = vqaddq_s16(q4, q8);
     q5 = vqaddq_s16(q5, q9);
     q2 = vqsubq_s16(q6, q5);
     q3 = vqaddq_s16(q7, q4);
     q4 = vqaddq_s16(q10, q3);
     q5 = vqaddq_s16(q11, q2);
     q6 = vqsubq_s16(q11, q2);
     q7 = vqsubq_s16(q10, q3);
     // transpose each 4x4 block (independently within each 64-bit half)
     q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
     q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
     q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
                        vreinterpretq_s16_s32(q2tmp1.val[0]));
     q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
                        vreinterpretq_s16_s32(q2tmp1.val[1]));
     // loop 2
     // (same butterfly as above, applied to the transposed data)
     q8  = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2);
     q9  = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2);
     q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1);
     q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1);
     q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]);
     q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]);
     q10 = vshrq_n_s16(q10, 1);
     q11 = vshrq_n_s16(q11, 1);
     q10 = vqaddq_s16(q2tmp2.val[1], q10);
     q11 = vqaddq_s16(q2tmp3.val[1], q11);
     q8 = vqsubq_s16(q8, q11);
     q9 = vqaddq_s16(q9, q10);
     q4 = vqaddq_s16(q2, q9);
     q5 = vqaddq_s16(q3, q8);
     q6 = vqsubq_s16(q3, q8);
     q7 = vqsubq_s16(q2, q9);
     // final scaling: rounding shift right by 3
     q4 = vrshrq_n_s16(q4, 3);
     q5 = vrshrq_n_s16(q5, 3);
     q6 = vrshrq_n_s16(q6, 3);
     q7 = vrshrq_n_s16(q7, 3);
     // transpose back so each register is again one 8-pixel row
     q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
     q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
     q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
                        vreinterpretq_s16_s32(q2tmp1.val[0]));
     q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
                        vreinterpretq_s16_s32(q2tmp1.val[1]));
     // add the residual rows to the pixel rows loaded from dst
     q4 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]),
                                           vreinterpret_u8_s32(d28)));
     q5 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]),
                                           vreinterpret_u8_s32(d29)));
     q6 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]),
                                           vreinterpret_u8_s32(d30)));
     q7 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]),
                                           vreinterpret_u8_s32(d31)));
     // saturate to unsigned 8 bits
     d28 = vreinterpret_s32_u8(vqmovun_s16(q4));
     d29 = vreinterpret_s32_u8(vqmovun_s16(q5));
     d30 = vreinterpret_s32_u8(vqmovun_s16(q6));
     d31 = vreinterpret_s32_u8(vqmovun_s16(q7));
     // store in the same lane/row layout used for the loads
     dst0 = dst;
     dst1 = dst + 4;
     vst1_lane_s32((int32_t *)dst0, d28, 0);
     dst0 += stride;
     vst1_lane_s32((int32_t *)dst1, d28, 1);
     dst1 += stride;
     vst1_lane_s32((int32_t *)dst0, d29, 0);
     dst0 += stride;
     vst1_lane_s32((int32_t *)dst1, d29, 1);
     dst1 += stride;
     vst1_lane_s32((int32_t *)dst0, d30, 0);
     dst0 += stride;
     vst1_lane_s32((int32_t *)dst1, d30, 1);
     dst1 += stride;
     vst1_lane_s32((int32_t *)dst0, d31, 0);
     vst1_lane_s32((int32_t *)dst1, d31, 1);
     return;
 }
--- a/vp8/common/arm/neon/iwalsh_neon.c
+++ b/vp8/common/arm/neon/iwalsh_neon.c
@@ -1,102 +0,0 @@
 /*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include <arm_neon.h>
 /* 4x4 inverse Walsh-Hadamard transform.
  *
  * Reads 16 values from input, runs the add/subtract butterfly over the four
  * rows, transposes, runs the same butterfly again, then adds 3 and shifts
  * right by 3. The 16 results are scattered into mb_dqcoeff at element
  * offsets 0, 16, 32, ..., 240 (one result per group of 16 elements).
  */
 void vp8_short_inv_walsh4x4_neon(
         int16_t *input,
         int16_t *mb_dqcoeff) {
     int16x8_t rows01, rows23, ad, bc;
     int16x4_t a1, b1, c1, d1;
     int16x4_t out0, out1, out2, out3;
     int16x4x2_t t16a, t16b;
     int32x2x2_t t32lo, t32hi;
     const int16x8_t round3 = vdupq_n_s16(3);
     rows01 = vld1q_s16(input);
     rows23 = vld1q_s16(input + 8);
     /* first pass: a1 = r0 + r3, b1 = r1 + r2, c1 = r1 - r2, d1 = r0 - r3 */
     a1 = vadd_s16(vget_low_s16(rows01), vget_high_s16(rows23));
     b1 = vadd_s16(vget_high_s16(rows01), vget_low_s16(rows23));
     d1 = vsub_s16(vget_low_s16(rows01), vget_high_s16(rows23));
     c1 = vsub_s16(vget_high_s16(rows01), vget_low_s16(rows23));
     ad = vcombine_s16(a1, d1);
     bc = vcombine_s16(b1, c1);
     rows01 = vaddq_s16(ad, bc);
     rows23 = vsubq_s16(ad, bc);
     /* 4x4 transpose: 32-bit interleave followed by 16-bit interleave */
     t32lo = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(rows01)),
                      vreinterpret_s32_s16(vget_low_s16(rows23)));
     t32hi = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(rows01)),
                      vreinterpret_s32_s16(vget_high_s16(rows23)));
     t16a = vtrn_s16(vreinterpret_s16_s32(t32lo.val[0]),
                     vreinterpret_s16_s32(t32hi.val[0]));
     t16b = vtrn_s16(vreinterpret_s16_s32(t32lo.val[1]),
                     vreinterpret_s16_s32(t32hi.val[1]));
     /* second pass: same butterfly on the transposed data */
     a1 = vadd_s16(t16a.val[0], t16b.val[1]);
     b1 = vadd_s16(t16a.val[1], t16b.val[0]);
     d1 = vsub_s16(t16a.val[0], t16b.val[1]);
     c1 = vsub_s16(t16a.val[1], t16b.val[0]);
     ad = vcombine_s16(a1, d1);
     bc = vcombine_s16(b1, c1);
     /* (x + 3) >> 3 */
     rows01 = vshrq_n_s16(vaddq_s16(vaddq_s16(ad, bc), round3), 3);
     rows23 = vshrq_n_s16(vaddq_s16(vsubq_s16(ad, bc), round3), 3);
     /* scatter: lane L of out0..out3 goes to offsets 16 * (4 * L + 0..3) */
     out0 = vget_low_s16(rows01);
     out1 = vget_high_s16(rows01);
     out2 = vget_low_s16(rows23);
     out3 = vget_high_s16(rows23);
     vst1_lane_s16(mb_dqcoeff + 16 * 0, out0, 0);
     vst1_lane_s16(mb_dqcoeff + 16 * 1, out1, 0);
     vst1_lane_s16(mb_dqcoeff + 16 * 2, out2, 0);
     vst1_lane_s16(mb_dqcoeff + 16 * 3, out3, 0);
     vst1_lane_s16(mb_dqcoeff + 16 * 4, out0, 1);
     vst1_lane_s16(mb_dqcoeff + 16 * 5, out1, 1);
     vst1_lane_s16(mb_dqcoeff + 16 * 6, out2, 1);
     vst1_lane_s16(mb_dqcoeff + 16 * 7, out3, 1);
     vst1_lane_s16(mb_dqcoeff + 16 * 8, out0, 2);
     vst1_lane_s16(mb_dqcoeff + 16 * 9, out1, 2);
     vst1_lane_s16(mb_dqcoeff + 16 * 10, out2, 2);
     vst1_lane_s16(mb_dqcoeff + 16 * 11, out3, 2);
     vst1_lane_s16(mb_dqcoeff + 16 * 12, out0, 3);
     vst1_lane_s16(mb_dqcoeff + 16 * 13, out1, 3);
     vst1_lane_s16(mb_dqcoeff + 16 * 14, out2, 3);
     vst1_lane_s16(mb_dqcoeff + 16 * 15, out3, 3);
 }
--- a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c
+++ b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c
@@ -1,111 +0,0 @@
 /*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include <arm_neon.h>
 #include "./vpx_config.h"
 /* Simple loop filter across one horizontal edge, 16 pixels wide.
  *
  *   s:      first pixel of the row just below the edge.
  *   p:      byte distance between successive rows.
  *   blimit: pointer to the edge threshold; only *blimit is read.
  *
  * Four rows are read: p1 = s - 2p, p0 = s - p, q0 = s, q1 = s + p (names
  * used in the comments below). Only the p0 and q0 rows are written back.
  */
 static INLINE void vp8_loop_filter_simple_horizontal_edge_neon(
         unsigned char *s,
         int p,
         const unsigned char *blimit) {
     uint8_t *sp;
     uint8x16_t qblimit, q0u8;
     uint8x16_t q5u8, q6u8, q7u8, q8u8, q9u8, q10u8, q14u8, q15u8;
     int16x8_t q2s16, q3s16, q13s16;
     int8x8_t d8s8, d9s8;
     int8x16_t q2s8, q3s8, q4s8, q10s8, q11s8, q14s8;
     qblimit = vdupq_n_u8(*blimit);
     // load p1, p0, q0, q1
     sp = s - (p << 1);
     q5u8 = vld1q_u8(sp);
     sp += p;
     q6u8 = vld1q_u8(sp);
     sp += p;
     q7u8 = vld1q_u8(sp);
     sp += p;
     q8u8 = vld1q_u8(sp);
     // edge measure: |p0 - q0| * 2 + |p1 - q1| / 2, with saturating adds
     q15u8 = vabdq_u8(q6u8, q7u8);
     q14u8 = vabdq_u8(q5u8, q8u8);
     q15u8 = vqaddq_u8(q15u8, q15u8);
     q14u8 = vshrq_n_u8(q14u8, 1);
     q0u8 = vdupq_n_u8(0x80);
     q13s16 = vdupq_n_s16(3);
     q15u8 = vqaddq_u8(q15u8, q14u8);
     // flip the top bit so the pixels can be treated as signed 8-bit values
     q5u8 = veorq_u8(q5u8, q0u8);
     q6u8 = veorq_u8(q6u8, q0u8);
     q7u8 = veorq_u8(q7u8, q0u8);
     q8u8 = veorq_u8(q8u8, q0u8);
     // mask: all ones in lanes where blimit >= edge measure
     q15u8 = vcgeq_u8(qblimit, q15u8);
     // filter value = clamp(3 * (q0 - p0) + clamp(p1 - q1)), in 16 bits
     q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7u8)),
                      vget_low_s8(vreinterpretq_s8_u8(q6u8)));
     q3s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7u8)),
                      vget_high_s8(vreinterpretq_s8_u8(q6u8)));
     q4s8 = vqsubq_s8(vreinterpretq_s8_u8(q5u8),
                      vreinterpretq_s8_u8(q8u8));
     q2s16 = vmulq_s16(q2s16, q13s16);
     q3s16 = vmulq_s16(q3s16, q13s16);
     q10u8 = vdupq_n_u8(3);
     q9u8 = vdupq_n_u8(4);
     q2s16 = vaddw_s8(q2s16, vget_low_s8(q4s8));
     q3s16 = vaddw_s8(q3s16, vget_high_s8(q4s8));
     // saturate back to signed 8 bits and zero the lanes that fail the mask
     d8s8 = vqmovn_s16(q2s16);
     d9s8 = vqmovn_s16(q3s16);
     q4s8 = vcombine_s8(d8s8, d9s8);
     q14s8 = vandq_s8(q4s8, vreinterpretq_s8_u8(q15u8));
     // two roundings of the filter value: (f + 3) >> 3 and (f + 4) >> 3
     q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q10u8));
     q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q9u8));
     q2s8 = vshrq_n_s8(q2s8, 3);
     q3s8 = vshrq_n_s8(q3s8, 3);
     // p0 += (f + 3) >> 3;  q0 -= (f + 4) >> 3  (saturating)
     q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6u8), q2s8);
     q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7u8), q3s8);
     // flip the top bit back to return to unsigned pixels
     q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
     q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
     // store q0 at s, then p0 at s - p
     vst1q_u8(s, q7u8);
     s -= p;
     vst1q_u8(s, q6u8);
     return;
 }
 /* Applies the simple horizontal-edge filter to the three edges located 4,
  * 8 and 12 rows below y_ptr.
  */
 void vp8_loop_filter_bhs_neon(
         unsigned char *y_ptr,
         int y_stride,
         const unsigned char *blimit) {
     int edge;
     for (edge = 0; edge < 3; ++edge) {
         y_ptr += y_stride * 4;
         vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
     }
 }
 /* Applies the simple horizontal-edge filter to the single edge at y_ptr. */
 void vp8_loop_filter_mbhs_neon(
         unsigned char *y_ptr,
         int y_stride,
         const unsigned char *blimit) {
     vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
 }
--- a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
+++ b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
@@ -1,283 +0,0 @@
 /*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include <arm_neon.h>
 #include "./vpx_config.h"
 #include "vpx_ports/arm.h"
 #ifdef VPX_INCOMPATIBLE_GCC
 /* Writes eight 2-byte rows starting at dst, pitch bytes apart.
  * Row k receives the byte pair (result.val[0][k], result.val[1][k]).
  *
  * vtrn_u8 interleaves the two vectors so that each 16-bit lane holds one
  * such pair: even-numbered rows end up in the first output vector and
  * odd-numbered rows in the second.
  */
 static INLINE void write_2x4(unsigned char *dst, int pitch,
                              const uint8x8x2_t result) {
     const uint8x8x2_t paired = vtrn_u8(result.val[0], result.val[1]);
     const uint16x4_t even_rows = vreinterpret_u16_u8(paired.val[0]);
     const uint16x4_t odd_rows = vreinterpret_u16_u8(paired.val[1]);
     unsigned char *row = dst;
     vst1_lane_u16((uint16_t *)row, even_rows, 0);
     row += pitch;
     vst1_lane_u16((uint16_t *)row, odd_rows, 0);
     row += pitch;
     vst1_lane_u16((uint16_t *)row, even_rows, 1);
     row += pitch;
     vst1_lane_u16((uint16_t *)row, odd_rows, 1);
     row += pitch;
     vst1_lane_u16((uint16_t *)row, even_rows, 2);
     row += pitch;
     vst1_lane_u16((uint16_t *)row, odd_rows, 2);
     row += pitch;
     vst1_lane_u16((uint16_t *)row, even_rows, 3);
     row += pitch;
     vst1_lane_u16((uint16_t *)row, odd_rows, 3);
 }
 /* Writes sixteen 2-byte rows: the first eight from result, the next eight
  * (starting pitch * 8 bytes further on) from result2.
  */
 static INLINE void write_2x8(unsigned char *dst, int pitch,
                              const uint8x8x2_t result,
                              const uint8x8x2_t result2) {
   unsigned char *const lower_half = dst + pitch * 8;
   write_2x4(dst, pitch, result);
   write_2x4(lower_half, pitch, result2);
 }
 #else
 /* Writes sixteen 2-byte rows starting at dst, pitch bytes apart.
  * Rows 0..7 take lanes 0..7 of result, rows 8..15 take lanes 0..7 of
  * result2; each row is the pair (val[0][lane], val[1][lane]).
  */
 static INLINE void write_2x8(unsigned char *dst, int pitch,
                              const uint8x8x2_t result,
                              const uint8x8x2_t result2) {
   unsigned char *row = dst;
   /* rows 0..7 */
   vst2_lane_u8(row, result, 0);
   row += pitch;
   vst2_lane_u8(row, result, 1);
   row += pitch;
   vst2_lane_u8(row, result, 2);
   row += pitch;
   vst2_lane_u8(row, result, 3);
   row += pitch;
   vst2_lane_u8(row, result, 4);
   row += pitch;
   vst2_lane_u8(row, result, 5);
   row += pitch;
   vst2_lane_u8(row, result, 6);
   row += pitch;
   vst2_lane_u8(row, result, 7);
   row += pitch;
   /* rows 8..15 */
   vst2_lane_u8(row, result2, 0);
   row += pitch;
   vst2_lane_u8(row, result2, 1);
   row += pitch;
   vst2_lane_u8(row, result2, 2);
   row += pitch;
   vst2_lane_u8(row, result2, 3);
   row += pitch;
   vst2_lane_u8(row, result2, 4);
   row += pitch;
   vst2_lane_u8(row, result2, 5);
   row += pitch;
   vst2_lane_u8(row, result2, 6);
   row += pitch;
   vst2_lane_u8(row, result2, 7);
 }
 #endif  // VPX_INCOMPATIBLE_GCC
 #ifdef VPX_INCOMPATIBLE_GCC
 /* Reads a block 4 bytes wide and 8 rows tall starting at src (rows pitch
  * bytes apart) and returns it transposed: x.val[c] holds column c, one
  * byte per row. Each row is loaded as 8 bytes, but only the first 4 bytes
  * of each row reach the result. This variant performs the transpose with
  * vtrn instead of lane loads.
  */
 static INLINE
 uint8x8x4_t read_4x8(unsigned char *src, int pitch) {
     uint8x8x4_t x;
     const uint8x8_t a = vld1_u8(src);
     const uint8x8_t b = vld1_u8(src + pitch * 1);
     const uint8x8_t c = vld1_u8(src + pitch * 2);
     const uint8x8_t d = vld1_u8(src + pitch * 3);
     const uint8x8_t e = vld1_u8(src + pitch * 4);
     const uint8x8_t f = vld1_u8(src + pitch * 5);
     const uint8x8_t g = vld1_u8(src + pitch * 6);
     const uint8x8_t h = vld1_u8(src + pitch * 7);
     // pair row n with row n + 4; only .val[0] (the low 4 bytes of each) is
     // used below
     const uint32x2x2_t r04_u32 = vtrn_u32(vreinterpret_u32_u8(a),
                                           vreinterpret_u32_u8(e));
     const uint32x2x2_t r15_u32 = vtrn_u32(vreinterpret_u32_u8(b),
                                           vreinterpret_u32_u8(f));
     const uint32x2x2_t r26_u32 = vtrn_u32(vreinterpret_u32_u8(c),
                                           vreinterpret_u32_u8(g));
     const uint32x2x2_t r37_u32 = vtrn_u32(vreinterpret_u32_u8(d),
                                           vreinterpret_u32_u8(h));
     const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u32(r04_u32.val[0]),
                                           vreinterpret_u16_u32(r26_u32.val[0]));
     const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u32(r15_u32.val[0]),
                                           vreinterpret_u16_u32(r37_u32.val[0]));
     const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
                                        vreinterpret_u8_u16(r13_u16.val[0]));
     const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
                                        vreinterpret_u8_u16(r13_u16.val[1]));
     /*
      * Entries below are "<row><column>" of the source block.
      * after vtrn_u32
     00 01 02 03 | 40 41 42 43
     10 11 12 13 | 50 51 52 53
     20 21 22 23 | 60 61 62 63
     30 31 32 33 | 70 71 72 73
     ---
     * after vtrn_u16
     00 01 20 21 | 40 41 60 61
     02 03 22 23 | 42 43 62 63
     10 11 30 31 | 50 51 70 71
     12 13 32 33 | 52 53 72 73
     (the same four vectors, reordered as they are fed to vtrn_u8)
     00 01 20 21 | 40 41 60 61
     10 11 30 31 | 50 51 70 71
     02 03 22 23 | 42 43 62 63
     12 13 32 33 | 52 53 72 73
     ---
     * after vtrn_u8
     00 10 20 30 | 40 50 60 70
     01 11 21 31 | 41 51 61 71
     02 12 22 32 | 42 52 62 72
     03 13 23 33 | 43 53 63 73
     */
     x.val[0] = r01_u8.val[0];
     x.val[1] = r01_u8.val[1];
     x.val[2] = r23_u8.val[0];
     x.val[3] = r23_u8.val[1];
     return x;
 }
 #else
 /* Reads a block 4 bytes wide and 8 rows tall starting at src (rows pitch
  * bytes apart) and returns it transposed: val[c] of the result holds
  * column c, with lane r taken from row r.
  */
 static INLINE
 uint8x8x4_t read_4x8(unsigned char *src, int pitch) {
     uint8x8x4_t cols;
     unsigned char *row = src;
     /* start from all-zero vectors; every lane is overwritten below */
     cols.val[0] = vdup_n_u8(0);
     cols.val[1] = cols.val[0];
     cols.val[2] = cols.val[0];
     cols.val[3] = cols.val[0];
     /* each lane load takes 4 consecutive bytes of one row and drops one
      * byte into the same lane of each of the four column vectors */
     cols = vld4_lane_u8(row, cols, 0);
     row += pitch;
     cols = vld4_lane_u8(row, cols, 1);
     row += pitch;
     cols = vld4_lane_u8(row, cols, 2);
     row += pitch;
     cols = vld4_lane_u8(row, cols, 3);
     row += pitch;
     cols = vld4_lane_u8(row, cols, 4);
     row += pitch;
     cols = vld4_lane_u8(row, cols, 5);
     row += pitch;
     cols = vld4_lane_u8(row, cols, 6);
     row += pitch;
     cols = vld4_lane_u8(row, cols, 7);
     return cols;
 }
 #endif  // VPX_INCOMPATIBLE_GCC
 /* Simple loop filter across one vertical edge, 16 rows tall.
  *
  *   s:      first pixel to the right of the edge, in the top row.
  *   p:      byte distance between successive rows.
  *   blimit: pointer to the edge threshold; only *blimit is read.
  *
  * Four columns are read (via read_4x8, which returns them transposed):
  * p1 = s - 2, p0 = s - 1, q0 = s, q1 = s + 1 (names used in the comments
  * below). Only the p0 and q0 columns are written back. The dNN / qNN notes
  * in trailing comments are register names, not C identifiers.
  */
 static INLINE void vp8_loop_filter_simple_vertical_edge_neon(
         unsigned char *s,
         int p,
         const unsigned char *blimit) {
     unsigned char *src1;
     uint8x16_t qblimit, q0u8;
     uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q11u8, q12u8, q14u8, q15u8;
     int16x8_t q2s16, q13s16, q11s16;
     int8x8_t d28s8, d29s8;
     int8x16_t q2s8, q3s8, q10s8, q11s8, q14s8;
     uint8x8x4_t d0u8x4;  // d6, d7, d8, d9
     uint8x8x4_t d1u8x4;  // d10, d11, d12, d13
     uint8x8x2_t d2u8x2;  // d12, d14
     uint8x8x2_t d3u8x2;  // d13, d15
     qblimit = vdupq_n_u8(*blimit);
     // load 4 columns x 16 rows, 8 rows at a time
     src1 = s - 2;
     d0u8x4 = read_4x8(src1, p);
     src1 += p * 8;
     d1u8x4 = read_4x8(src1, p);
     // q3u8 = p1, q5u8 = p0, q4u8 = q0, q6u8 = q1 (all 16 rows each)
     q3u8 = vcombine_u8(d0u8x4.val[0], d1u8x4.val[0]);  // d6 d10
     q4u8 = vcombine_u8(d0u8x4.val[2], d1u8x4.val[2]);  // d8 d12
     q5u8 = vcombine_u8(d0u8x4.val[1], d1u8x4.val[1]);  // d7 d11
     q6u8 = vcombine_u8(d0u8x4.val[3], d1u8x4.val[3]);  // d9 d13
     // edge measure: |p0 - q0| * 2 + |p1 - q1| / 2, with saturating adds
     q15u8 = vabdq_u8(q5u8, q4u8);
     q14u8 = vabdq_u8(q3u8, q6u8);
     q15u8 = vqaddq_u8(q15u8, q15u8);
     q14u8 = vshrq_n_u8(q14u8, 1);
     q0u8 = vdupq_n_u8(0x80);
     q11s16 = vdupq_n_s16(3);
     q15u8 = vqaddq_u8(q15u8, q14u8);
     // flip the top bit so the pixels can be treated as signed 8-bit values
     q3u8 = veorq_u8(q3u8, q0u8);
     q4u8 = veorq_u8(q4u8, q0u8);
     q5u8 = veorq_u8(q5u8, q0u8);
     q6u8 = veorq_u8(q6u8, q0u8);
     // mask: all ones in lanes where blimit >= edge measure
     q15u8 = vcgeq_u8(qblimit, q15u8);
     // filter value = clamp(3 * (q0 - p0) + clamp(p1 - q1)), in 16 bits
     q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q4u8)),
                      vget_low_s8(vreinterpretq_s8_u8(q5u8)));
     q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q4u8)),
                       vget_high_s8(vreinterpretq_s8_u8(q5u8)));
     q14s8 = vqsubq_s8(vreinterpretq_s8_u8(q3u8),
                       vreinterpretq_s8_u8(q6u8));
     q2s16 = vmulq_s16(q2s16, q11s16);
     q13s16 = vmulq_s16(q13s16, q11s16);
     q11u8 = vdupq_n_u8(3);
     q12u8 = vdupq_n_u8(4);
     q2s16 = vaddw_s8(q2s16, vget_low_s8(q14s8));
     q13s16 = vaddw_s8(q13s16, vget_high_s8(q14s8));
     // saturate back to signed 8 bits and zero the lanes that fail the mask
     d28s8 = vqmovn_s16(q2s16);
     d29s8 = vqmovn_s16(q13s16);
     q14s8 = vcombine_s8(d28s8, d29s8);
     q14s8 = vandq_s8(q14s8, vreinterpretq_s8_u8(q15u8));
     // two roundings of the filter value: (f + 3) >> 3 and (f + 4) >> 3
     q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q11u8));
     q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q12u8));
     q2s8 = vshrq_n_s8(q2s8, 3);
     q14s8 = vshrq_n_s8(q3s8, 3);
     // p0 += (f + 3) >> 3;  q0 -= (f + 4) >> 3  (saturating)
     q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q5u8), q2s8);
     q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q4u8), q14s8);
     // flip the top bit back: q6u8 = new p0 column, q7u8 = new q0 column
     q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
     q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
     // split into rows 0..7 (d2u8x2) and rows 8..15 (d3u8x2), each holding
     // the (p0, q0) column pair
     d2u8x2.val[0] = vget_low_u8(q6u8);   // d12
     d2u8x2.val[1] = vget_low_u8(q7u8);   // d14
     d3u8x2.val[0] = vget_high_u8(q6u8);  // d13
     d3u8x2.val[1] = vget_high_u8(q7u8);  // d15
     // write the two filtered columns back, starting one byte left of s
     src1 = s - 1;
     write_2x8(src1, p, d2u8x2, d3u8x2);
 }
 /* Applies the simple vertical-edge filter to the three edges located 4, 8
  * and 12 bytes to the right of y_ptr.
  */
 void vp8_loop_filter_bvs_neon(
         unsigned char *y_ptr,
         int y_stride,
         const unsigned char *blimit) {
     int edge;
     for (edge = 0; edge < 3; ++edge) {
         y_ptr += 4;
         vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
     }
 }
 /* Applies the simple vertical-edge filter to the single edge at y_ptr. */
 void vp8_loop_filter_mbvs_neon(
         unsigned char *y_ptr,
         int y_stride,
         const unsigned char *blimit) {
     vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
 }
--- a/vp8/common/arm/neon/mbloopfilter_neon.c
+++ b/vp8/common/arm/neon/mbloopfilter_neon.c
@@ -1,625 +0,0 @@
 /*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include <arm_neon.h>
 #include "./vpx_config.h"
 // Macroblock-edge loop filter kernel operating on 16 byte lanes at once.
 //
 // Inputs q3..q10 are the eight samples across the edge (p3 p2 p1 p0 | q0 q1
 // q2 q3), one vector per sample position. qblimit/qlimit/qthresh hold the
 // thresholds replicated in every lane. The six filtered samples p2..q2 are
 // written through q4r..q9r; p3 and q3 are only read.
 //
 // The vector arguments are taken by value, so a caller may pass the same
 // variable as both an input and (by address) an output.
 static INLINE void vp8_mbloop_filter_neon(
        uint8x16_t qblimit,  // mblimit
        uint8x16_t qlimit,   // limit
        uint8x16_t qthresh,  // thresh
        uint8x16_t q3,       // p3
        uint8x16_t q4,       // p2
        uint8x16_t q5,       // p1
        uint8x16_t q6,       // p0
        uint8x16_t q7,       // q0
        uint8x16_t q8,       // q1
        uint8x16_t q9,       // q2
        uint8x16_t q10,      // q3
        uint8x16_t *q4r,     // p2 (out)
        uint8x16_t *q5r,     // p1 (out)
        uint8x16_t *q6r,     // p0 (out)
        uint8x16_t *q7r,     // q0 (out)
        uint8x16_t *q8r,     // q1 (out)
        uint8x16_t *q9r) {   // q2 (out)
    uint8x16_t q0u8, q1u8, q11u8, q12u8, q13u8, q14u8, q15u8;
    int16x8_t q0s16, q2s16, q11s16, q12s16, q13s16, q14s16, q15s16;
    int8x16_t q1s8, q6s8, q7s8, q2s8, q11s8, q13s8;
    uint16x8_t q0u16, q11u16, q12u16, q13u16, q14u16, q15u16;
    int8x16_t q0s8, q12s8, q14s8, q15s8;
    int8x8_t d0, d1, d2, d3, d4, d5, d24, d25, d28, d29;
    // Absolute differences between neighbouring samples on each side.
    q11u8 = vabdq_u8(q3, q4);    // |p3 - p2|
    q12u8 = vabdq_u8(q4, q5);    // |p2 - p1|
    q13u8 = vabdq_u8(q5, q6);    // |p1 - p0|
    q14u8 = vabdq_u8(q8, q7);    // |q1 - q0|
    q1u8  = vabdq_u8(q9, q8);    // |q2 - q1|
    q0u8  = vabdq_u8(q10, q9);   // |q3 - q2|
    // Reduce the six differences to their per-lane maximum (finished below).
    q11u8 = vmaxq_u8(q11u8, q12u8);
    q12u8 = vmaxq_u8(q13u8, q14u8);
    q1u8  = vmaxq_u8(q1u8, q0u8);
    q15u8 = vmaxq_u8(q11u8, q12u8);
    q12u8 = vabdq_u8(q6, q7);    // |p0 - q0|
    // vp8_hevmask
    // High-edge-variance test: |p1 - p0| > thresh, |q1 - q0| > thresh.
    q13u8 = vcgtq_u8(q13u8, qthresh);
    q14u8 = vcgtq_u8(q14u8, qthresh);
    q15u8 = vmaxq_u8(q15u8, q1u8);
    q15u8 = vcgeq_u8(qlimit, q15u8);   // all neighbour differences <= limit
    q1u8 = vabdq_u8(q5, q8);           // |p1 - q1|
    q12u8 = vqaddq_u8(q12u8, q12u8);   // |p0 - q0| * 2 (saturating)
    // vp8_filter() function
    // convert to signed
    // XOR with 0x80 maps unsigned 0..255 onto signed -128..127.
    q0u8 = vdupq_n_u8(0x80);
    q9 = veorq_u8(q9, q0u8);
    q8 = veorq_u8(q8, q0u8);
    q7 = veorq_u8(q7, q0u8);
    q6 = veorq_u8(q6, q0u8);
    q5 = veorq_u8(q5, q0u8);
    q4 = veorq_u8(q4, q0u8);
    q1u8 = vshrq_n_u8(q1u8, 1);          // |p1 - q1| / 2
    q12u8 = vqaddq_u8(q12u8, q1u8);      // |p0 - q0| * 2 + |p1 - q1| / 2
    q14u8 = vorrq_u8(q13u8, q14u8);      // hev mask
    q12u8 = vcgeq_u8(qblimit, q12u8);    // edge difference within blimit
    // Widen (qs0 - ps0) to 16 bits so the multiply by 3 cannot overflow.
    q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
                     vget_low_s8(vreinterpretq_s8_u8(q6)));
    q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
                      vget_high_s8(vreinterpretq_s8_u8(q6)));
    // clamp(ps1 - qs1) via saturating subtract.
    q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
                     vreinterpretq_s8_u8(q8));
    q11s16 = vdupq_n_s16(3);
    q2s16  = vmulq_s16(q2s16, q11s16);
    q13s16 = vmulq_s16(q13s16, q11s16);
    q15u8 = vandq_u8(q15u8, q12u8);      // final filter mask
    q2s16  = vaddw_s8(q2s16, vget_low_s8(q1s8));
    q13s16 = vaddw_s8(q13s16, vget_high_s8(q1s8));
    q12u8 = vdupq_n_u8(3);
    q11u8 = vdupq_n_u8(4);
    // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
    d2 = vqmovn_s16(q2s16);
    d3 = vqmovn_s16(q13s16);
    q1s8 = vcombine_s8(d2, d3);
    q1s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q15u8));   // apply filter mask
    // Lanes with high edge variance: adjust only p0/q0 with the
    // (filter + 4) >> 3 and (filter + 3) >> 3 pair.
    q13s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
    q2s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q11u8));
    q13s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q12u8));
    q2s8 = vshrq_n_s8(q2s8, 3);
    q13s8 = vshrq_n_s8(q13s8, 3);
    q7s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q2s8);     // qs0 - Filter1
    q6s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q13s8);    // ps0 + Filter2
    // Remaining lanes (hev mask clear) get the wide filter below.
    q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
    // Accumulators start at the rounding bias 63; the taps are 9, 18 and 27
    // and each result is (63 + filter * tap) >> 7 with saturating narrowing.
    q0u16 = q11u16 = q12u16 = q13u16 = q14u16 = q15u16 = vdupq_n_u16(63);
    d5 = vdup_n_s8(9);
    d4 = vdup_n_s8(18);
    q0s16  = vmlal_s8(vreinterpretq_s16_u16(q0u16),  vget_low_s8(q1s8),  d5);
    q11s16 = vmlal_s8(vreinterpretq_s16_u16(q11u16), vget_high_s8(q1s8), d5);
    d5 = vdup_n_s8(27);
    q12s16 = vmlal_s8(vreinterpretq_s16_u16(q12u16), vget_low_s8(q1s8),  d4);
    q13s16 = vmlal_s8(vreinterpretq_s16_u16(q13u16), vget_high_s8(q1s8), d4);
    q14s16 = vmlal_s8(vreinterpretq_s16_u16(q14u16), vget_low_s8(q1s8),  d5);
    q15s16 = vmlal_s8(vreinterpretq_s16_u16(q15u16), vget_high_s8(q1s8), d5);
    d0  = vqshrn_n_s16(q0s16 , 7);
    d1  = vqshrn_n_s16(q11s16, 7);
    d24 = vqshrn_n_s16(q12s16, 7);
    d25 = vqshrn_n_s16(q13s16, 7);
    d28 = vqshrn_n_s16(q14s16, 7);
    d29 = vqshrn_n_s16(q15s16, 7);
    q0s8  = vcombine_s8(d0, d1);      // tap 9  -> applied to p2 / q2
    q12s8 = vcombine_s8(d24, d25);    // tap 18 -> applied to p1 / q1
    q14s8 = vcombine_s8(d28, d29);    // tap 27 -> applied to p0 / q0
    // q side subtracts the adjustment, p side adds it (saturating).
    q11s8 = vqsubq_s8(vreinterpretq_s8_u8(q9), q0s8);
    q0s8  = vqaddq_s8(vreinterpretq_s8_u8(q4), q0s8);
    q13s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q12s8);
    q12s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q12s8);
    q15s8 = vqsubq_s8((q7s8), q14s8);
    q14s8 = vqaddq_s8((q6s8), q14s8);
    // Convert back to unsigned and hand the six results to the caller.
    q1u8 = vdupq_n_u8(0x80);
    *q9r = veorq_u8(vreinterpretq_u8_s8(q11s8), q1u8);
    *q8r = veorq_u8(vreinterpretq_u8_s8(q13s8), q1u8);
    *q7r = veorq_u8(vreinterpretq_u8_s8(q15s8), q1u8);
    *q6r = veorq_u8(vreinterpretq_u8_s8(q14s8), q1u8);
    *q5r = veorq_u8(vreinterpretq_u8_s8(q12s8), q1u8);
    *q4r = veorq_u8(vreinterpretq_u8_s8(q0s8), q1u8);
    return;
 }
 // Macroblock loop filter across a horizontal edge, 16 pixels wide.
 // Loads the eight 16-byte rows from src - 4 * pitch to src + 3 * pitch,
 // runs vp8_mbloop_filter_neon() on them, and writes back the six rows
 // from src - 3 * pitch to src + 2 * pitch.
 void vp8_mbloop_filter_horizontal_edge_y_neon(
        unsigned char *src,
        int pitch,
        unsigned char blimit,
        unsigned char limit,
        unsigned char thresh) {
    uint8x16_t rows[8];  // filter inputs, top row first
    unsigned char *const top = src - (pitch << 2);
    int r;
    for (r = 0; r < 8; ++r) {
        rows[r] = vld1q_u8(top + r * pitch);
    }
    // Inputs are passed by value, so reusing rows[1..6] as outputs is safe.
    vp8_mbloop_filter_neon(vdupq_n_u8(blimit), vdupq_n_u8(limit),
                           vdupq_n_u8(thresh),
                           rows[0], rows[1], rows[2], rows[3],
                           rows[4], rows[5], rows[6], rows[7],
                           &rows[1], &rows[2], &rows[3],
                           &rows[4], &rows[5], &rows[6]);
    // The outermost rows are never written back.
    for (r = 1; r <= 6; ++r) {
        vst1q_u8(top + r * pitch, rows[r]);
    }
 }
 // Macroblock loop filter across a horizontal edge for two 8-pixel-wide
 // planes at once. Each vector carries 8 bytes from u in its low half and
 // 8 bytes from v in its high half, so one filter call covers both planes.
 // Rows -4..+3 (relative to u / v) are read; rows -3..+2 are written back.
 void vp8_mbloop_filter_horizontal_edge_uv_neon(
        unsigned char *u,
        int pitch,
        unsigned char blimit,
        unsigned char limit,
        unsigned char thresh,
        unsigned char *v) {
    uint8x16_t rows[8];  // filter inputs, top row first
    unsigned char *const u_top = u - (pitch << 2);
    unsigned char *const v_top = v - (pitch << 2);
    int r;
    for (r = 0; r < 8; ++r) {
        const uint8x8_t u_row = vld1_u8(u_top + r * pitch);
        const uint8x8_t v_row = vld1_u8(v_top + r * pitch);
        rows[r] = vcombine_u8(u_row, v_row);
    }
    // Inputs are passed by value, so reusing rows[1..6] as outputs is safe.
    vp8_mbloop_filter_neon(vdupq_n_u8(blimit), vdupq_n_u8(limit),
                           vdupq_n_u8(thresh),
                           rows[0], rows[1], rows[2], rows[3],
                           rows[4], rows[5], rows[6], rows[7],
                           &rows[1], &rows[2], &rows[3],
                           &rows[4], &rows[5], &rows[6]);
    // Split each filtered vector back into its u (low) and v (high) halves.
    for (r = 1; r <= 6; ++r) {
        vst1_u8(u_top + r * pitch, vget_low_u8(rows[r]));
        vst1_u8(v_top + r * pitch, vget_high_u8(rows[r]));
    }
 }
 // Macroblock loop filter across a vertical edge, 16 rows tall.
 // Reads 8 bytes starting at src - 4 from each of 16 consecutive rows,
 // transposes them so each vector holds one column position for all 16
 // rows, filters, transposes back and rewrites the same 8 bytes per row.
 void vp8_mbloop_filter_vertical_edge_y_neon(
        unsigned char *src,
        int pitch,
        unsigned char blimit,
        unsigned char limit,
        unsigned char thresh) {
    unsigned char *const upper = src - 4;            // rows 0..7
    unsigned char *const lower = upper + 8 * pitch;  // rows 8..15
    uint8x16_t vec[8];
    uint32x4x2_t t32[4];
    uint16x8x2_t t16[4];
    uint8x16x2_t t8[4];
    int k, pass;
    // vec[k] = row k in the low half, row k + 8 in the high half.
    for (k = 0; k < 8; ++k) {
        vec[k] = vcombine_u8(vld1_u8(upper + k * pitch),
                             vld1_u8(lower + k * pitch));
    }
    // Pass 0 transposes rows into columns and filters; pass 1 applies the
    // identical transpose network again to restore row order.
    for (pass = 0; pass < 2; ++pass) {
        for (k = 0; k < 4; ++k) {
            t32[k] = vtrnq_u32(vreinterpretq_u32_u8(vec[k]),
                               vreinterpretq_u32_u8(vec[k + 4]));
        }
        t16[0] = vtrnq_u16(vreinterpretq_u16_u32(t32[0].val[0]),
                           vreinterpretq_u16_u32(t32[2].val[0]));
        t16[1] = vtrnq_u16(vreinterpretq_u16_u32(t32[1].val[0]),
                           vreinterpretq_u16_u32(t32[3].val[0]));
        t16[2] = vtrnq_u16(vreinterpretq_u16_u32(t32[0].val[1]),
                           vreinterpretq_u16_u32(t32[2].val[1]));
        t16[3] = vtrnq_u16(vreinterpretq_u16_u32(t32[1].val[1]),
                           vreinterpretq_u16_u32(t32[3].val[1]));
        t8[0] = vtrnq_u8(vreinterpretq_u8_u16(t16[0].val[0]),
                         vreinterpretq_u8_u16(t16[1].val[0]));
        t8[1] = vtrnq_u8(vreinterpretq_u8_u16(t16[0].val[1]),
                         vreinterpretq_u8_u16(t16[1].val[1]));
        t8[2] = vtrnq_u8(vreinterpretq_u8_u16(t16[2].val[0]),
                         vreinterpretq_u8_u16(t16[3].val[0]));
        t8[3] = vtrnq_u8(vreinterpretq_u8_u16(t16[2].val[1]),
                         vreinterpretq_u8_u16(t16[3].val[1]));
        for (k = 0; k < 4; ++k) {
            vec[2 * k] = t8[k].val[0];
            vec[2 * k + 1] = t8[k].val[1];
        }
        if (pass == 0) {
            // Inputs are passed by value, so vec[1..6] can double as
            // outputs.
            vp8_mbloop_filter_neon(vdupq_n_u8(blimit), vdupq_n_u8(limit),
                                   vdupq_n_u8(thresh),
                                   vec[0], vec[1], vec[2], vec[3],
                                   vec[4], vec[5], vec[6], vec[7],
                                   &vec[1], &vec[2], &vec[3],
                                   &vec[4], &vec[5], &vec[6]);
        }
    }
    // Low halves go back to rows 0..7, high halves to rows 8..15.
    for (k = 0; k < 8; ++k) {
        vst1_u8(upper + k * pitch, vget_low_u8(vec[k]));
        vst1_u8(lower + k * pitch, vget_high_u8(vec[k]));
    }
 }
 // Macroblock loop filter across a vertical edge for two 8-row planes at
 // once. For each of 8 rows, 8 bytes starting at u - 4 fill the low half
 // of a vector and 8 bytes starting at v - 4 fill the high half. The data
 // is transposed, filtered, transposed back and stored to the same places.
 void vp8_mbloop_filter_vertical_edge_uv_neon(
        unsigned char *u,
        int pitch,
        unsigned char blimit,
        unsigned char limit,
        unsigned char thresh,
        unsigned char *v) {
    unsigned char *const u_left = u - 4;
    unsigned char *const v_left = v - 4;
    uint8x16_t vec[8];
    uint32x4x2_t t32[4];
    uint16x8x2_t t16[4];
    uint8x16x2_t t8[4];
    int k, pass;
    for (k = 0; k < 8; ++k) {
        const uint8x8_t u_row = vld1_u8(u_left + k * pitch);
        const uint8x8_t v_row = vld1_u8(v_left + k * pitch);
        vec[k] = vcombine_u8(u_row, v_row);
    }
    // Pass 0 transposes rows into columns and filters; pass 1 applies the
    // identical transpose network again to restore row order.
    for (pass = 0; pass < 2; ++pass) {
        for (k = 0; k < 4; ++k) {
            t32[k] = vtrnq_u32(vreinterpretq_u32_u8(vec[k]),
                               vreinterpretq_u32_u8(vec[k + 4]));
        }
        t16[0] = vtrnq_u16(vreinterpretq_u16_u32(t32[0].val[0]),
                           vreinterpretq_u16_u32(t32[2].val[0]));
        t16[1] = vtrnq_u16(vreinterpretq_u16_u32(t32[1].val[0]),
                           vreinterpretq_u16_u32(t32[3].val[0]));
        t16[2] = vtrnq_u16(vreinterpretq_u16_u32(t32[0].val[1]),
                           vreinterpretq_u16_u32(t32[2].val[1]));
        t16[3] = vtrnq_u16(vreinterpretq_u16_u32(t32[1].val[1]),
                           vreinterpretq_u16_u32(t32[3].val[1]));
        t8[0] = vtrnq_u8(vreinterpretq_u8_u16(t16[0].val[0]),
                         vreinterpretq_u8_u16(t16[1].val[0]));
        t8[1] = vtrnq_u8(vreinterpretq_u8_u16(t16[0].val[1]),
                         vreinterpretq_u8_u16(t16[1].val[1]));
        t8[2] = vtrnq_u8(vreinterpretq_u8_u16(t16[2].val[0]),
                         vreinterpretq_u8_u16(t16[3].val[0]));
        t8[3] = vtrnq_u8(vreinterpretq_u8_u16(t16[2].val[1]),
                         vreinterpretq_u8_u16(t16[3].val[1]));
        for (k = 0; k < 4; ++k) {
            vec[2 * k] = t8[k].val[0];
            vec[2 * k + 1] = t8[k].val[1];
        }
        if (pass == 0) {
            // Inputs are passed by value, so vec[1..6] can double as
            // outputs.
            vp8_mbloop_filter_neon(vdupq_n_u8(blimit), vdupq_n_u8(limit),
                                   vdupq_n_u8(thresh),
                                   vec[0], vec[1], vec[2], vec[3],
                                   vec[4], vec[5], vec[6], vec[7],
                                   &vec[1], &vec[2], &vec[3],
                                   &vec[4], &vec[5], &vec[6]);
        }
    }
    // Write every u row first, then every v row.
    for (k = 0; k < 8; ++k) {
        vst1_u8(u_left + k * pitch, vget_low_u8(vec[k]));
    }
    for (k = 0; k < 8; ++k) {
        vst1_u8(v_left + k * pitch, vget_high_u8(vec[k]));
    }
 }
--- a/vp8/common/arm/neon/shortidct4x4llm_neon.c
+++ b/vp8/common/arm/neon/shortidct4x4llm_neon.c
@@ -1,123 +0,0 @@
 /*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include <arm_neon.h>
 // Fixed-point IDCT multipliers, scaled by 65536:
 //   20091 ~= (sqrt(2) * cos(pi / 8) - 1) * 65536
 //   35468 ~= sqrt(2) * sin(pi / 8) * 65536
 // NOTE: 35468 is larger than INT16_MAX (32767), so the value stored in
 // sinpi8sqrt2 is the result of an out-of-range conversion to int16_t
 // (implementation-defined in C; 35468 - 65536 = -30068 where the conversion
 // wraps modulo 2^16). The IDCT below halves the vqdmulh product and then
 // adds the input back, which yields x * 35468 / 65536 for the wrapped
 // constant and x * (65536 + 20091) / 65536 for the cosine one.
 static const int16_t cospi8sqrt2minus1 = 20091;
 static const int16_t sinpi8sqrt2       = 35468;
 // 4x4 inverse DCT followed by adding the result to a prediction block.
 //
 // input      - 16 int16_t coefficients, read as four groups of four.
 // pred_ptr   - prediction pixels, 4 bytes read from each of 4 rows that are
 //              pred_stride bytes apart.
 // dst_ptr    - output pixels, 4 bytes written to each of 4 rows that are
 //              dst_stride bytes apart.
 //
 // Two butterfly passes are run with a 4x4 transpose after each; the second
 // pass is followed by a rounding shift right by 3.
 // NOTE(review): pred_ptr and dst_ptr are accessed through uint32_t casts;
 // presumably callers guarantee this is acceptable for the target - confirm.
 void vp8_short_idct4x4llm_neon(
        int16_t *input,
        unsigned char *pred_ptr,
        int pred_stride,
        unsigned char *dst_ptr,
        int dst_stride) {
    int i;
    uint32x2_t d6u32 = vdup_n_u32(0);
    uint8x8_t d1u8;
    int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
    uint16x8_t q1u16;
    int16x8_t q1s16, q2s16, q3s16, q4s16;
    int32x2x2_t v2tmp0, v2tmp1;
    int16x4x2_t v2tmp2, v2tmp3;
    d2 = vld1_s16(input);
    d3 = vld1_s16(input + 4);
    d4 = vld1_s16(input + 8);
    d5 = vld1_s16(input + 12);
    // 1st for loop
    // Groups 0 and 2 go into q1s16 and groups 1 and 3 into q2s16, so the
    // a1/b1 terms and the c1/d1 terms can each be formed from one register.
    q1s16 = vcombine_s16(d2, d4);  // Swap d3 d4 here
    q2s16 = vcombine_s16(d3, d5);
    // vqdmulh returns the high half of the doubled product; the shift by 1
    // below undoes the doubling and the add of q2s16 restores the "+ x" term.
    q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
    q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);
    d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // a1
    d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // b1
    q3s16 = vshrq_n_s16(q3s16, 1);
    q4s16 = vshrq_n_s16(q4s16, 1);
    q3s16 = vqaddq_s16(q3s16, q2s16);
    q4s16 = vqaddq_s16(q4s16, q2s16);
    d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16));  // c1
    d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16));  // d1
    // Butterfly outputs: a1 + d1, b1 + c1, b1 - c1, a1 - d1.
    d2 = vqadd_s16(d12, d11);
    d3 = vqadd_s16(d13, d10);
    d4 = vqsub_s16(d13, d10);
    d5 = vqsub_s16(d12, d11);
    // 4x4 transpose: 32-bit interleave followed by 16-bit interleave.
    v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
    v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
    v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
                      vreinterpret_s16_s32(v2tmp1.val[0]));
    v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
                      vreinterpret_s16_s32(v2tmp1.val[1]));
    // 2nd for loop
    // Same butterfly as the first pass, now on the transposed data.
    q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp3.val[0]);
    q2s16 = vcombine_s16(v2tmp2.val[1], v2tmp3.val[1]);
    q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
    q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);
    d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // a1
    d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // b1
    q3s16 = vshrq_n_s16(q3s16, 1);
    q4s16 = vshrq_n_s16(q4s16, 1);
    q3s16 = vqaddq_s16(q3s16, q2s16);
    q4s16 = vqaddq_s16(q4s16, q2s16);
    d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16));  // c1
    d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16));  // d1
    d2 = vqadd_s16(d12, d11);
    d3 = vqadd_s16(d13, d10);
    d4 = vqsub_s16(d13, d10);
    d5 = vqsub_s16(d12, d11);
    // Final scaling: rounding shift right by 3.
    d2 = vrshr_n_s16(d2, 3);
    d3 = vrshr_n_s16(d3, 3);
    d4 = vrshr_n_s16(d4, 3);
    d5 = vrshr_n_s16(d5, 3);
    v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
    v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
    v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
                      vreinterpret_s16_s32(v2tmp1.val[0]));
    v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
                      vreinterpret_s16_s32(v2tmp1.val[1]));
    // q1s16 carries the residual for output rows 0-1, q2s16 for rows 2-3.
    q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp2.val[1]);
    q2s16 = vcombine_s16(v2tmp3.val[0], v2tmp3.val[1]);
    // Add the residual to the prediction and store, two rows per iteration.
    // The loop increment swaps in q2s16 for the second iteration.
    for (i = 0; i < 2; i++, q1s16 = q2s16) {
        d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 0);
        pred_ptr += pred_stride;
        d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 1);
        pred_ptr += pred_stride;
        // Widening add of the 8 prediction bytes onto the 16-bit residual.
        q1u16 = vaddw_u8(vreinterpretq_u16_s16(q1s16),
                         vreinterpret_u8_u32(d6u32));
        // Saturate the signed sums to the 0..255 pixel range.
        d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 0);
        dst_ptr += dst_stride;
        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 1);
        dst_ptr += dst_stride;
    }
    return;
 }
--- a/vp8/common/arm/neon/sixtappredict_neon.c
+++ b/vp8/common/arm/neon/sixtappredict_neon.c
--- a/vp8/common/arm/neon/vp8_loopfilter_neon.c
+++ b/vp8/common/arm/neon/vp8_loopfilter_neon.c
@@ -1,550 +0,0 @@
 /*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include <arm_neon.h>
 #include "./vpx_config.h"
 #include "vpx_ports/arm.h"
 // Normal (inner-edge) loop filter kernel operating on 16 byte lanes at once.
 //
 // Inputs q3..q10 are the eight samples across the edge (p3 p2 p1 p0 | q0 q1
 // q2 q3), one vector per sample position. qblimit/qlimit/qthresh hold the
 // thresholds replicated in every lane. Only the four samples nearest the
 // edge are modified; they are written through q5r..q8r.
 //
 // The vector arguments are taken by value (several are reused as scratch
 // inside), so a caller may pass the same variable as input and output.
 static INLINE void vp8_loop_filter_neon(
        uint8x16_t qblimit,  // flimit
        uint8x16_t qlimit,   // limit
        uint8x16_t qthresh,  // thresh
        uint8x16_t q3,       // p3
        uint8x16_t q4,       // p2
        uint8x16_t q5,       // p1
        uint8x16_t q6,       // p0
        uint8x16_t q7,       // q0
        uint8x16_t q8,       // q1
        uint8x16_t q9,       // q2
        uint8x16_t q10,      // q3
        uint8x16_t *q5r,     // p1
        uint8x16_t *q6r,     // p0
        uint8x16_t *q7r,     // q0
        uint8x16_t *q8r) {   // q1
    uint8x16_t q0u8, q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
    int16x8_t q2s16, q11s16;
    uint16x8_t q4u16;
    int8x16_t q1s8, q2s8, q10s8, q11s8, q12s8, q13s8;
    int8x8_t d2s8, d3s8;
    // Absolute differences between neighbouring samples on each side.
    q11u8 = vabdq_u8(q3, q4);    // |p3 - p2|
    q12u8 = vabdq_u8(q4, q5);    // |p2 - p1|
    q13u8 = vabdq_u8(q5, q6);    // |p1 - p0|
    q14u8 = vabdq_u8(q8, q7);    // |q1 - q0|
    q3    = vabdq_u8(q9, q8);    // |q2 - q1| (q3 reused as scratch)
    q4    = vabdq_u8(q10, q9);   // |q3 - q2| (q4 reused as scratch)
    // Reduce the six differences to their per-lane maximum (finished below).
    q11u8 = vmaxq_u8(q11u8, q12u8);
    q12u8 = vmaxq_u8(q13u8, q14u8);
    q3    = vmaxq_u8(q3, q4);
    q15u8 = vmaxq_u8(q11u8, q12u8);
    q9 = vabdq_u8(q6, q7);       // |p0 - q0| (q9 reused as scratch)
    // vp8_hevmask
    // High-edge-variance test: |p1 - p0| > thresh, |q1 - q0| > thresh.
    q13u8 = vcgtq_u8(q13u8, qthresh);
    q14u8 = vcgtq_u8(q14u8, qthresh);
    q15u8 = vmaxq_u8(q15u8, q3);
    q2u8 = vabdq_u8(q5, q8);            // |p1 - q1|
    q9 = vqaddq_u8(q9, q9);             // |p0 - q0| * 2 (saturating)
    q15u8 = vcgeq_u8(qlimit, q15u8);    // all neighbour differences <= limit
    // vp8_filter() function
    // convert to signed
    // XOR with 0x80 maps unsigned 0..255 onto signed -128..127.
    q10 = vdupq_n_u8(0x80);
    q8 = veorq_u8(q8, q10);
    q7 = veorq_u8(q7, q10);
    q6 = veorq_u8(q6, q10);
    q5 = veorq_u8(q5, q10);
    q2u8 = vshrq_n_u8(q2u8, 1);         // |p1 - q1| / 2
    q9 = vqaddq_u8(q9, q2u8);           // |p0 - q0| * 2 + |p1 - q1| / 2
    q10 = vdupq_n_u8(3);
    // Widen (qs0 - ps0) to 16 bits so the multiply by 3 cannot overflow.
    q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
                     vget_low_s8(vreinterpretq_s8_u8(q6)));
    q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
                      vget_high_s8(vreinterpretq_s8_u8(q6)));
    q9 = vcgeq_u8(qblimit, q9);         // edge difference within blimit
    // clamp(ps1 - qs1) via saturating subtract.
    q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
                    vreinterpretq_s8_u8(q8));
    q14u8 = vorrq_u8(q13u8, q14u8);     // hev mask
    q4u16 = vmovl_u8(vget_low_u8(q10)); // the constant 3, widened to 16 bits
    q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
    q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));
    // The (ps1 - qs1) term only contributes where hev is set.
    q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
    q15u8 = vandq_u8(q15u8, q9);        // final filter mask
    q1s8 = vreinterpretq_s8_u8(q1u8);
    q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
    q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));
    q9 = vdupq_n_u8(4);
    // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
    d2s8 = vqmovn_s16(q2s16);
    d3s8 = vqmovn_s16(q11s16);
    q1s8 = vcombine_s8(d2s8, d3s8);
    q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);   // apply filter mask
    q1s8 = vreinterpretq_s8_u8(q1u8);
    q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q10));    // filter + 3
    q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));     // filter + 4
    q2s8 = vshrq_n_s8(q2s8, 3);
    q1s8 = vshrq_n_s8(q1s8, 3);
    q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);    // ps0 + Filter2
    q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);    // qs0 - Filter1
    // Outer taps: rounding-halve Filter1 and apply it to p1/q1 only where
    // hev is clear.
    q1s8 = vrshrq_n_s8(q1s8, 1);
    q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
    q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
    q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);
    // Convert back to unsigned and hand the four results to the caller.
    q0u8 = vdupq_n_u8(0x80);
    *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q0u8);
    *q7r = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
    *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
    *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q0u8);
    return;
 }
 // Normal loop filter across a horizontal edge, 16 pixels wide.
 // Loads the eight 16-byte rows from src - 4 * pitch to src + 3 * pitch,
 // runs vp8_loop_filter_neon() on them, and writes back the four rows
 // from src - 2 * pitch to src + 1 * pitch.
 void vp8_loop_filter_horizontal_edge_y_neon(
        unsigned char *src,
        int pitch,
        unsigned char blimit,
        unsigned char limit,
        unsigned char thresh) {
    uint8x16_t rows[8];  // p3, p2, p1, p0, q0, q1, q2, q3
    unsigned char *const top = src - (pitch << 2);
    int r;
    for (r = 0; r < 8; ++r) {
        rows[r] = vld1q_u8(top + r * pitch);
    }
    // Inputs are passed by value, so reusing rows[2..5] as outputs is safe.
    vp8_loop_filter_neon(vdupq_n_u8(blimit), vdupq_n_u8(limit),
                         vdupq_n_u8(thresh),
                         rows[0], rows[1], rows[2], rows[3],
                         rows[4], rows[5], rows[6], rows[7],
                         &rows[2], &rows[3], &rows[4], &rows[5]);
    // Only p1, p0, q0 and q1 are written back.
    for (r = 2; r <= 5; ++r) {
        vst1q_u8(top + r * pitch, rows[r]);
    }
 }
 // Normal loop filter across a horizontal edge for two 8-pixel-wide planes
 // at once. Each vector carries 8 bytes from u in its low half and 8 bytes
 // from v in its high half, so one filter call covers both planes.
 // Rows -4..+3 (relative to u / v) are read; rows -2..+1 are written back.
 void vp8_loop_filter_horizontal_edge_uv_neon(
        unsigned char *u,
        int pitch,
        unsigned char blimit,
        unsigned char limit,
        unsigned char thresh,
        unsigned char *v) {
    uint8x16_t rows[8];  // p3, p2, p1, p0, q0, q1, q2, q3
    unsigned char *const u_top = u - (pitch << 2);
    unsigned char *const v_top = v - (pitch << 2);
    int r;
    for (r = 0; r < 8; ++r) {
        const uint8x8_t u_row = vld1_u8(u_top + r * pitch);
        const uint8x8_t v_row = vld1_u8(v_top + r * pitch);
        rows[r] = vcombine_u8(u_row, v_row);
    }
    // Inputs are passed by value, so reusing rows[2..5] as outputs is safe.
    vp8_loop_filter_neon(vdupq_n_u8(blimit), vdupq_n_u8(limit),
                         vdupq_n_u8(thresh),
                         rows[0], rows[1], rows[2], rows[3],
                         rows[4], rows[5], rows[6], rows[7],
                         &rows[2], &rows[3], &rows[4], &rows[5]);
    // Write every u row first (low halves), then every v row (high halves).
    for (r = 2; r <= 5; ++r) {
        vst1_u8(u_top + r * pitch, vget_low_u8(rows[r]));
    }
    for (r = 2; r <= 5; ++r) {
        vst1_u8(v_top + r * pitch, vget_high_u8(rows[r]));
    }
 }
 // Stores a 4-column by 8-row byte tile: row k (at dst + k * pitch) receives
 // lane k of result.val[0], val[1], val[2] and val[3] as 4 consecutive bytes.
 static INLINE void write_4x8(unsigned char *dst, int pitch,
                             const uint8x8x4_t result) {
 #ifdef VPX_INCOMPATIBLE_GCC
    /*
     * This path avoids vst4_lane_u8 by transposing in registers first.
     *
     * Input vectors (byte "vl" = lane l of result.val[v]):
     *   00 01 02 03 | 04 05 06 07
     *   10 11 12 13 | 14 15 16 17
     *   20 21 22 23 | 24 25 26 27
     *   30 31 32 33 | 34 35 36 37
     * After the 16-bit interleave (val[0] with val[2], val[1] with val[3]):
     *   00 01 20 21 | 04 05 24 25
     *   02 03 22 23 | 06 07 26 27
     *   10 11 30 31 | 14 15 34 35
     *   12 13 32 33 | 16 17 36 37
     * After the 8-bit interleave each 32-bit half is one output row:
     *   00 10 20 30 | 04 14 24 34
     *   01 11 21 31 | 05 15 25 35
     *   02 12 22 32 | 06 16 26 36
     *   03 13 23 33 | 07 17 27 37
     */
    const uint16x4x2_t even16 = vtrn_u16(vreinterpret_u16_u8(result.val[0]),
                                         vreinterpret_u16_u8(result.val[2]));
    const uint16x4x2_t odd16 = vtrn_u16(vreinterpret_u16_u8(result.val[1]),
                                        vreinterpret_u16_u8(result.val[3]));
    const uint8x8x2_t lo8 = vtrn_u8(vreinterpret_u8_u16(even16.val[0]),
                                    vreinterpret_u8_u16(odd16.val[0]));
    const uint8x8x2_t hi8 = vtrn_u8(vreinterpret_u8_u16(even16.val[1]),
                                    vreinterpret_u8_u16(odd16.val[1]));
    // rows_a_b: 32-bit lane 0 is output row a, lane 1 is output row b.
    const uint32x2_t rows_0_4 = vreinterpret_u32_u8(lo8.val[0]);
    const uint32x2_t rows_1_5 = vreinterpret_u32_u8(lo8.val[1]);
    const uint32x2_t rows_2_6 = vreinterpret_u32_u8(hi8.val[0]);
    const uint32x2_t rows_3_7 = vreinterpret_u32_u8(hi8.val[1]);
    vst1_lane_u32((uint32_t *)(dst + 0 * pitch), rows_0_4, 0);
    vst1_lane_u32((uint32_t *)(dst + 1 * pitch), rows_1_5, 0);
    vst1_lane_u32((uint32_t *)(dst + 2 * pitch), rows_2_6, 0);
    vst1_lane_u32((uint32_t *)(dst + 3 * pitch), rows_3_7, 0);
    vst1_lane_u32((uint32_t *)(dst + 4 * pitch), rows_0_4, 1);
    vst1_lane_u32((uint32_t *)(dst + 5 * pitch), rows_1_5, 1);
    vst1_lane_u32((uint32_t *)(dst + 6 * pitch), rows_2_6, 1);
    vst1_lane_u32((uint32_t *)(dst + 7 * pitch), rows_3_7, 1);
 #else
    // The lane argument must be a compile-time constant, so the eight
    // stores are written out rather than looped.
    vst4_lane_u8(dst + 0 * pitch, result, 0);
    vst4_lane_u8(dst + 1 * pitch, result, 1);
    vst4_lane_u8(dst + 2 * pitch, result, 2);
    vst4_lane_u8(dst + 3 * pitch, result, 3);
    vst4_lane_u8(dst + 4 * pitch, result, 4);
    vst4_lane_u8(dst + 5 * pitch, result, 5);
    vst4_lane_u8(dst + 6 * pitch, result, 6);
    vst4_lane_u8(dst + 7 * pitch, result, 7);
 #endif  // VPX_INCOMPATIBLE_GCC
 }
 /*
  * Runs vp8_loop_filter_neon() across a vertical edge spanning 16 rows.
  *
  * Reads 16 rows of 8 bytes (src[-4] .. src[3] of each row), transposes them so
  * that every 128-bit register holds one byte column (lane k = row k), calls
  * the filter, and writes the four returned columns back to src[-2] .. src[1]
  * of each row.
  *
  * src    - address such that the 8 bytes read from row 0 are src[-4..3]
  *          (presumably a luma plane, going by the _y suffix -- confirm).
  * pitch  - byte distance between successive rows.
  * blimit, limit, thresh - scalar thresholds; each is broadcast to all 16
  *          lanes and forwarded to vp8_loop_filter_neon().
  */
 void vp8_loop_filter_vertical_edge_y_neon(
        unsigned char *src,
        int pitch,
        unsigned char blimit,
        unsigned char limit,
        unsigned char thresh) {
    unsigned char *s, *d;
    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
    uint8x16_t q5, q6, q7, q8, q9, q10;
    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
    uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
    uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
    uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
    uint8x8x4_t q4ResultH, q4ResultL;
    qblimit = vdupq_n_u8(blimit);
    qlimit  = vdupq_n_u8(limit);
    qthresh = vdupq_n_u8(thresh);
    /* Rows 0-7 are loaded into the even-numbered d registers (d6, d8, ...,
     * d20), rows 8-15 into the odd-numbered ones (d7, d9, ..., d21). */
    s = src - 4;
    d6  = vld1_u8(s);
    s += pitch;
    d8  = vld1_u8(s);
    s += pitch;
    d10 = vld1_u8(s);
    s += pitch;
    d12 = vld1_u8(s);
    s += pitch;
    d14 = vld1_u8(s);
    s += pitch;
    d16 = vld1_u8(s);
    s += pitch;
    d18 = vld1_u8(s);
    s += pitch;
    d20 = vld1_u8(s);
    s += pitch;
    d7  = vld1_u8(s);
    s += pitch;
    d9  = vld1_u8(s);
    s += pitch;
    d11 = vld1_u8(s);
    s += pitch;
    d13 = vld1_u8(s);
    s += pitch;
    d15 = vld1_u8(s);
    s += pitch;
    d17 = vld1_u8(s);
    s += pitch;
    d19 = vld1_u8(s);
    s += pitch;
    d21 = vld1_u8(s);
    /* Pair row k (low half) with row k + 8 (high half): q3 = rows 0|8,
     * q4 = rows 1|9, ..., q10 = rows 7|15. */
    q3 = vcombine_u8(d6, d7);
    q4 = vcombine_u8(d8, d9);
    q5 = vcombine_u8(d10, d11);
    q6 = vcombine_u8(d12, d13);
    q7 = vcombine_u8(d14, d15);
    q8 = vcombine_u8(d16, d17);
    q9 = vcombine_u8(d18, d19);
    q10 = vcombine_u8(d20, d21);
    /* Transpose in three vtrn passes (32-bit, then 16-bit, then 8-bit
     * elements). Afterwards q3..q10 hold byte columns 0..7, each with one
     * byte from every one of the 16 rows. */
    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
                       vreinterpretq_u16_u32(q2tmp2.val[0]));
    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
                       vreinterpretq_u16_u32(q2tmp3.val[0]));
    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
                       vreinterpretq_u16_u32(q2tmp2.val[1]));
    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
                       vreinterpretq_u16_u32(q2tmp3.val[1]));
    q2tmp8  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
                       vreinterpretq_u8_u16(q2tmp5.val[0]));
    q2tmp9  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
                       vreinterpretq_u8_u16(q2tmp5.val[1]));
    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
                       vreinterpretq_u8_u16(q2tmp7.val[0]));
    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
                       vreinterpretq_u8_u16(q2tmp7.val[1]));
    q3 = q2tmp8.val[0];
    q4 = q2tmp8.val[1];
    q5 = q2tmp9.val[0];
    q6 = q2tmp9.val[1];
    q7 = q2tmp10.val[0];
    q8 = q2tmp10.val[1];
    q9 = q2tmp11.val[0];
    q10 = q2tmp11.val[1];
    /* All eight columns go in; the filter's four outputs are received in
     * q5..q8 (columns 2..5), replacing the inputs held there. */
    vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
                         q5, q6, q7, q8, q9, q10,
                         &q5, &q6, &q7, &q8);
    /* Low halves carry rows 0-7, high halves rows 8-15. */
    q4ResultL.val[0] = vget_low_u8(q5);   // d10
    q4ResultL.val[1] = vget_low_u8(q6);   // d12
    q4ResultL.val[2] = vget_low_u8(q7);   // d14
    q4ResultL.val[3] = vget_low_u8(q8);   // d16
    q4ResultH.val[0] = vget_high_u8(q5);  // d11
    q4ResultH.val[1] = vget_high_u8(q6);  // d13
    q4ResultH.val[2] = vget_high_u8(q7);  // d15
    q4ResultH.val[3] = vget_high_u8(q8);  // d17
    /* Columns 2..5 of the 8 loaded bytes sit at src - 2 .. src + 1. */
    d = src - 2;
    write_4x8(d, pitch, q4ResultL);
    d += pitch * 8;
    write_4x8(d, pitch, q4ResultH);
 }
 /*
  * Runs vp8_loop_filter_neon() across a vertical edge of two 8-row blocks in
  * one pass.
  *
  * Reads 8 rows of 8 bytes from each of u and v (bytes [-4] .. [3] of every
  * row), packs the u rows into the low halves and the v rows into the high
  * halves of the working registers, transposes so that each register holds one
  * byte column, calls the filter, and writes the four returned columns back to
  * bytes [-2] .. [1] of every row of u and of v.
  *
  * u, v   - addresses such that the 8 bytes read from row 0 are [-4..3]
  *          (presumably the two chroma planes -- confirm).
  * pitch  - byte distance between successive rows; used for both u and v.
  * blimit, limit, thresh - scalar thresholds; each is broadcast to all 16
  *          lanes and forwarded to vp8_loop_filter_neon().
  */
 void vp8_loop_filter_vertical_edge_uv_neon(
        unsigned char *u,
        int pitch,
        unsigned char blimit,
        unsigned char limit,
        unsigned char thresh,
        unsigned char *v) {
    unsigned char *us, *ud;
    unsigned char *vs, *vd;
    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
    uint8x16_t q5, q6, q7, q8, q9, q10;
    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
    uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
    uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
    uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
    uint8x8x4_t q4ResultH, q4ResultL;
    qblimit = vdupq_n_u8(blimit);
    qlimit  = vdupq_n_u8(limit);
    qthresh = vdupq_n_u8(thresh);
    /* The 8 rows of u go into the even-numbered d registers ... */
    us = u - 4;
    d6 = vld1_u8(us);
    us += pitch;
    d8 = vld1_u8(us);
    us += pitch;
    d10 = vld1_u8(us);
    us += pitch;
    d12 = vld1_u8(us);
    us += pitch;
    d14 = vld1_u8(us);
    us += pitch;
    d16 = vld1_u8(us);
    us += pitch;
    d18 = vld1_u8(us);
    us += pitch;
    d20 = vld1_u8(us);
    /* ... and the 8 rows of v into the odd-numbered ones. */
    vs = v - 4;
    d7 = vld1_u8(vs);
    vs += pitch;
    d9 = vld1_u8(vs);
    vs += pitch;
    d11 = vld1_u8(vs);
    vs += pitch;
    d13 = vld1_u8(vs);
    vs += pitch;
    d15 = vld1_u8(vs);
    vs += pitch;
    d17 = vld1_u8(vs);
    vs += pitch;
    d19 = vld1_u8(vs);
    vs += pitch;
    d21 = vld1_u8(vs);
    /* Pair u row k (low half) with v row k (high half). */
    q3 = vcombine_u8(d6, d7);
    q4 = vcombine_u8(d8, d9);
    q5 = vcombine_u8(d10, d11);
    q6 = vcombine_u8(d12, d13);
    q7 = vcombine_u8(d14, d15);
    q8 = vcombine_u8(d16, d17);
    q9 = vcombine_u8(d18, d19);
    q10 = vcombine_u8(d20, d21);
    /* Transpose in three vtrn passes (32-bit, then 16-bit, then 8-bit
     * elements). Afterwards q3..q10 hold byte columns 0..7: lanes 0-7 come
     * from u rows 0-7, lanes 8-15 from v rows 0-7. */
    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
                       vreinterpretq_u16_u32(q2tmp2.val[0]));
    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
                       vreinterpretq_u16_u32(q2tmp3.val[0]));
    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
                       vreinterpretq_u16_u32(q2tmp2.val[1]));
    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
                       vreinterpretq_u16_u32(q2tmp3.val[1]));
    q2tmp8  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
                       vreinterpretq_u8_u16(q2tmp5.val[0]));
    q2tmp9  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
                       vreinterpretq_u8_u16(q2tmp5.val[1]));
    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
                       vreinterpretq_u8_u16(q2tmp7.val[0]));
    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
                       vreinterpretq_u8_u16(q2tmp7.val[1]));
    q3 = q2tmp8.val[0];
    q4 = q2tmp8.val[1];
    q5 = q2tmp9.val[0];
    q6 = q2tmp9.val[1];
    q7 = q2tmp10.val[0];
    q8 = q2tmp10.val[1];
    q9 = q2tmp11.val[0];
    q10 = q2tmp11.val[1];
    /* All eight columns go in; the filter's four outputs are received in
     * q5..q8 (columns 2..5), replacing the inputs held there. */
    vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
                         q5, q6, q7, q8, q9, q10,
                         &q5, &q6, &q7, &q8);
    /* Low halves belong to u: write them back at u - 2. */
    q4ResultL.val[0] = vget_low_u8(q5);   // d10
    q4ResultL.val[1] = vget_low_u8(q6);   // d12
    q4ResultL.val[2] = vget_low_u8(q7);   // d14
    q4ResultL.val[3] = vget_low_u8(q8);   // d16
    ud = u - 2;
    write_4x8(ud, pitch, q4ResultL);
    /* High halves belong to v: write them back at v - 2. */
    q4ResultH.val[0] = vget_high_u8(q5);  // d11
    q4ResultH.val[1] = vget_high_u8(q6);  // d13
    q4ResultH.val[2] = vget_high_u8(q7);  // d15
    q4ResultH.val[3] = vget_high_u8(q8);  // d17
    vd = v - 2;
    write_4x8(vd, pitch, q4ResultH);
 }
--- a/vp8/common/blockd.c
+++ b/vp8/common/blockd.c
@@ -1,22 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "blockd.h"
 #include "vpx_mem/vpx_mem.h"
 /*
  * Lookup table: block index (0..24) -> value in 0..8.
  *
  * Within each group of four consecutive blocks the value is constant for the
  * first sixteen entries; the remaining entries pair up two by two.
  * Presumably the value is the slot of the block's left-neighbour context when
  * ENTROPY_CONTEXT_PLANES (y1[4], u[2], v[2], y2) is addressed as nine
  * consecutive entries -- confirm against the users of this table.
  */
 const unsigned char vp8_block2left[25] = {
    /* blocks  0..15 */
    0, 0, 0, 0,
    1, 1, 1, 1,
    2, 2, 2, 2,
    3, 3, 3, 3,
    /* blocks 16..19 */
    4, 4, 5, 5,
    /* blocks 20..23 */
    6, 6, 7, 7,
    /* block  24 */
    8
 };
 /*
  * Lookup table: block index (0..24) -> value in 0..8.
  *
  * For the first sixteen entries the value cycles 0,1,2,3 with the block
  * index; the remaining entries alternate within pairs.
  * Presumably the value is the slot of the block's above-neighbour context
  * when ENTROPY_CONTEXT_PLANES (y1[4], u[2], v[2], y2) is addressed as nine
  * consecutive entries -- confirm against the users of this table.
  */
 const unsigned char vp8_block2above[25] = {
    /* blocks  0..15 */
    0, 1, 2, 3,
    0, 1, 2, 3,
    0, 1, 2, 3,
    0, 1, 2, 3,
    /* blocks 16..19 */
    4, 5, 4, 5,
    /* blocks 20..23 */
    6, 7, 6, 7,
    /* block  24 */
    8
 };
--- a/vp8/common/blockd.h
+++ b/vp8/common/blockd.h
@@ -1,312 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #ifndef VP8_COMMON_BLOCKD_H_
 #define VP8_COMMON_BLOCKD_H_
 void vpx_log(const char *format, ...);
 #include "vpx_config.h"
 #include "vpx_scale/yv12config.h"
 #include "mv.h"
 #include "treecoder.h"
 #include "vpx_ports/mem.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 /* DCPRED is left disabled; the two thresholds below remain defined. */
 /*#define DCPRED 1*/
 #define DCPREDSIMTHRESH 0
 #define DCPREDCNTTHRESH 3
 /* Length of MACROBLOCKD::mb_segment_tree_probs. */
 #define MB_FEATURE_TREE_PROBS   3
 /* Second dimension of MACROBLOCKD::segment_feature_data. */
 #define MAX_MB_SEGMENTS         4
 /* Length of MACROBLOCKD::ref_lf_deltas and last_ref_lf_deltas. */
 #define MAX_REF_LF_DELTAS       4
 /* Length of MACROBLOCKD::mode_lf_deltas and last_mode_lf_deltas. */
 #define MAX_MODE_LF_DELTAS      4
 /* Segment data modes. These are the plain values 0 and 1, not bit masks
  * (presumably the values stored in MACROBLOCKD::mb_segement_abs_delta --
  * confirm against its users). */
 #define SEGMENT_DELTADATA   0
 #define SEGMENT_ABSDATA     1
 /* Pair of integer coordinates (presumably row and column -- confirm). */
 typedef struct {
    int r;
    int c;
 } POS;
 /* Plane/block type identifiers, values 0..3 (presumably the first index of
  * the [BLOCK_TYPES] coefficient probability tables -- confirm). */
 #define PLANE_TYPE_Y_NO_DC    0
 #define PLANE_TYPE_Y2         1
 #define PLANE_TYPE_UV         2
 #define PLANE_TYPE_Y_WITH_DC  3
 /* A single entropy-context entry. */
 typedef char ENTROPY_CONTEXT;
 /*
  * Entropy-context entries grouped by plane: four for y1, two each for u and
  * v, one for y2 -- nine ENTROPY_CONTEXT entries in total.
  */
 typedef struct {
    ENTROPY_CONTEXT y1[4];
    ENTROPY_CONTEXT u[2];
    ENTROPY_CONTEXT v[2];
    ENTROPY_CONTEXT y2;
 } ENTROPY_CONTEXT_PLANES;
 /* Block-index lookup tables with 25 entries each, the same count as
  * MACROBLOCKD::block. */
 extern const unsigned char vp8_block2left[25];
 extern const unsigned char vp8_block2above[25];
 /*
  * Assigns (A)+(B) to Dest.
  *
  * The expansion is a complete statement that already ends in ';' and Dest is
  * not parenthesised, so use it only as a stand-alone statement with a plain
  * lvalue -- not inside an expression, and not as the unbraced body of an
  * if/else.
  */
 #define VP8_COMBINEENTROPYCONTEXTS( Dest, A, B) \
    Dest = (A)+(B);
 /* Frame coding type; the type of MACROBLOCKD::frame_type. */
 typedef enum {
    KEY_FRAME = 0,
    INTER_FRAME = 1
 } FRAME_TYPE;
 /*
  * Macroblock prediction modes.
  *
  * The numeric values are spelled out because other macros depend on the
  * ordering: VP8_UV_MODES counts DC_PRED..TM_PRED, VP8_YMODES counts
  * DC_PRED..B_PRED and VP8_MVREFS counts NEARESTMV..SPLITMV.
  */
 typedef enum {
    DC_PRED = 0,        /* average of above and left pixels */
    V_PRED = 1,         /* vertical prediction */
    H_PRED = 2,         /* horizontal prediction */
    TM_PRED = 3,        /* Truemotion prediction */
    B_PRED = 4,         /* block based prediction, each block has its own
                           prediction mode */
    NEARESTMV = 5,
    NEARMV = 6,
    ZEROMV = 7,
    NEWMV = 8,
    SPLITMV = 9,
    MB_MODE_COUNT = 10  /* number of modes listed above */
 } MB_PREDICTION_MODE;
 /*
  * Macroblock-level features. MB_LVL_MAX is the first dimension of
  * MACROBLOCKD::segment_feature_data.
  */
 typedef enum {
    MB_LVL_ALT_Q = 0,   /* Use alternate Quantizer .... */
    MB_LVL_ALT_LF = 1,  /* Use alternate loop filter value... */
    MB_LVL_MAX = 2      /* Number of MB level features supported */
 } MB_LVL_FEATURES;
 /* Segment feature bit masks (presumably tested against
  * MACROBLOCKD::segmentation_enabled, whose bits are documented as marking the
  * active features -- confirm). */
 #define SEGMENT_ALTQ    0x01
 #define SEGMENT_ALT_LF  0x02
 /* Mode counts derived from the order of MB_PREDICTION_MODE: */
 /* DC_PRED .. B_PRED = 5 modes */
 #define VP8_YMODES  (B_PRED + 1)
 /* DC_PRED .. TM_PRED = 4 modes */
 #define VP8_UV_MODES (TM_PRED + 1)
 /* NEARESTMV .. SPLITMV = 5 modes */
 #define VP8_MVREFS (1 + SPLITMV - NEARESTMV)
 /*
  * Per-block prediction modes.
  *
  * Values are spelled out because the two count macros below depend on the
  * ordering of the enumerators.
  */
 typedef enum {
    B_DC_PRED = 0,      /* average of above and left pixels */
    B_TM_PRED = 1,
    B_VE_PRED = 2,      /* vertical prediction */
    B_HE_PRED = 3,      /* horizontal prediction */
    B_LD_PRED = 4,
    B_RD_PRED = 5,
    B_VR_PRED = 6,
    B_VL_PRED = 7,
    B_HD_PRED = 8,
    B_HU_PRED = 9,
    LEFT4X4 = 10,
    ABOVE4X4 = 11,
    ZERO4X4 = 12,
    NEW4X4 = 13,
    B_MODE_COUNT = 14   /* number of modes listed above */
 } B_PREDICTION_MODE;
 /* B_DC_PRED .. B_HU_PRED = 10 modes */
 #define VP8_BINTRAMODES (B_HU_PRED + 1)
 /* LEFT4X4 .. NEW4X4 = 4 modes */
 #define VP8_SUBMVREFS (1 + NEW4X4 - LEFT4X4)
 /*
  * Per-block mode data: either a B_PREDICTION_MODE or a motion vector, sharing
  * the same storage.
  *
  * On keyframes, intra block modes are predicted from the (already decoded)
  * modes of the Y blocks to the left and above; on interframes a single
  * probability table is used.
  */
 union b_mode_info {
    B_PREDICTION_MODE as_mode;
    int_mv mv;
 };
 /*
  * Reference frame identifiers. MAX_REF_FRAMES is the number of identifiers
  * and sizes LOWER_RES_FRAME_INFO::low_res_ref_frames.
  */
 typedef enum {
    INTRA_FRAME = 0,
    LAST_FRAME = 1,
    GOLDEN_FRAME = 2,
    ALTREF_FRAME = 3,
    MAX_REF_FRAMES = 4
 } MV_REFERENCE_FRAME;
 /* Mode information kept per macroblock (the mbmi member of MODE_INFO). */
 typedef struct {
    uint8_t mode;
    uint8_t uv_mode;
    uint8_t ref_frame;
    uint8_t is_4x4;
    int_mv mv;
    uint8_t partitioning;
    /* 1 = this macroblock has no coefficients,
     * 0 = its tokens need to be decoded */
    uint8_t mb_skip_coeff;
    uint8_t need_to_clamp_mvs;
    /* which set of segmentation parameters is used for this macroblock */
    uint8_t segment_id;
 } MB_MODE_INFO;
 /* Macroblock mode information plus sixteen per-block entries. */
 typedef struct modeinfo {
    MB_MODE_INFO mbmi;
    union b_mode_info bmi[16];
 } MODE_INFO;
 #if CONFIG_MULTI_RES_ENCODING
 /* Macroblock-level information stored for the higher-resolution encoder. */
 typedef struct {
    MB_PREDICTION_MODE mode;
    MV_REFERENCE_FRAME ref_frame;
    int_mv mv;
    /* dissimilarity level of the macroblock */
    int dissim;
 } LOWER_RES_MB_INFO;
 /* Frame-level information stored for the higher-resolution encoder. */
 typedef struct {
    FRAME_TYPE frame_type;
    int is_frame_dropped;
    /* frame rate for the lowest resolution */
    double low_res_framerate;
    /* frame number of each reference frame */
    unsigned int low_res_ref_frames[MAX_REF_FRAMES];
    /* video frame counter value for the key frame, for lowest resolution */
    unsigned int key_frame_counter_value;
    LOWER_RES_MB_INFO *mb_info;
 } LOWER_RES_FRAME_INFO;
 #endif  /* CONFIG_MULTI_RES_ENCODING */
 /*
  * State for one block; MACROBLOCKD holds an array of 25 of these.
  * The pointer members presumably refer into the like-named arrays of the
  * owning MACROBLOCKD -- confirm where they are initialised.
  */
 typedef struct blockd {
    short *qcoeff;
    short *dqcoeff;
    unsigned char *predictor;
    short *dequant;
    int offset;
    char *eob;
    union b_mode_info bmi;
 } BLOCKD;
 typedef void (*vp8_subpix_fn_t)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
 /*
  * Per-macroblock working state: coefficient and predictor buffers,
  * dequantisation tables, the 25 BLOCKD entries, frame buffers, neighbour
  * context pointers, segmentation and loop-filter-delta settings, edge
  * distances and the sub-pixel prediction function pointers.
  *
  * Member order and the DECLARE_ALIGNED requests are part of the layout; do
  * not reorder.
  */
 typedef struct macroblockd
 {
    DECLARE_ALIGNED(16, unsigned char,  predictor[384]);
    /* 400 = 25 blocks x 16 entries (see the comment on block[] below). */
    DECLARE_ALIGNED(16, short, qcoeff[400]);
    DECLARE_ALIGNED(16, short, dqcoeff[400]);
    /* One entry per block. */
    DECLARE_ALIGNED(16, char,  eobs[25]);
    DECLARE_ALIGNED(16, short,  dequant_y1[16]);
    DECLARE_ALIGNED(16, short,  dequant_y1_dc[16]);
    DECLARE_ALIGNED(16, short,  dequant_y2[16]);
    DECLARE_ALIGNED(16, short,  dequant_uv[16]);
    /* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */
    BLOCKD block[25];
    int fullpixel_mask;
    YV12_BUFFER_CONFIG pre; /* Filtered copy of previous frame reconstruction */
    YV12_BUFFER_CONFIG dst;
    MODE_INFO *mode_info_context;
    int mode_info_stride;
    FRAME_TYPE frame_type;
    int up_available;
    int left_available;
    unsigned char *recon_above[3];
    unsigned char *recon_left[3];
    int recon_left_stride[2];
    /* Y,U,V,Y2 */
    ENTROPY_CONTEXT_PLANES *above_context;
    ENTROPY_CONTEXT_PLANES *left_context;
    /* 0 indicates segmentation at MB level is not enabled. Otherwise the individual bits indicate which features are active. */
    unsigned char segmentation_enabled;
    /* 0 (do not update) 1 (update) the macroblock segmentation map. */
    unsigned char update_mb_segmentation_map;
    /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */
    unsigned char update_mb_segmentation_data;
    /* NOTE(review): the original comment here duplicated the one above.
     * Presumably this holds SEGMENT_DELTADATA (0) or SEGMENT_ABSDATA (1),
     * i.e. whether segment_feature_data is a delta or an absolute value --
     * confirm against the users of this field. */
    unsigned char mb_segement_abs_delta;
    /* Per frame flags that define which MB level features (such as quantizer or loop filter level) */
    /* are enabled and when enabled the probabilities used to decode the per MB flags in MB_MODE_INFO */
    vp8_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS];         /* Probability Tree used to code Segment number */
    signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];            /* Segment parameters */
    /* mode_based Loop filter adjustment */
    unsigned char mode_ref_lf_delta_enabled;
    unsigned char mode_ref_lf_delta_update;
    /* Delta values have the range +/- MAX_LOOP_FILTER */
    signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS];                /* 0 = Intra, Last, GF, ARF */
    signed char ref_lf_deltas[MAX_REF_LF_DELTAS];                     /* 0 = Intra, Last, GF, ARF */
    signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];                      /* 0 = BPRED, ZERO_MV, MV, SPLIT */
    signed char mode_lf_deltas[MAX_MODE_LF_DELTAS];                           /* 0 = BPRED, ZERO_MV, MV, SPLIT */
    /* Distance of MB away from frame edges */
    int mb_to_left_edge;
    int mb_to_right_edge;
    int mb_to_top_edge;
    int mb_to_bottom_edge;
    /* Sub-pixel prediction routines; the suffix names the block size
     * (the unsuffixed one is presumably the 4x4 variant -- confirm). */
    vp8_subpix_fn_t  subpixel_predict;
    vp8_subpix_fn_t  subpixel_predict8x4;
    vp8_subpix_fn_t  subpixel_predict8x8;
    vp8_subpix_fn_t  subpixel_predict16x16;
    void *current_bc;
    int corrupted;
 #if ARCH_X86 || ARCH_X86_64
    /* This is an intermediate buffer currently used in sub-pixel motion search
     * to keep a copy of the reference area. This buffer can be used for other
     * purposes.
     */
    DECLARE_ALIGNED(32, unsigned char, y_buf[22*32]);
 #endif
 } MACROBLOCKD;
 /*
  * Block set-up helpers operating on a MACROBLOCKD; both are defined outside
  * this header. Going by their names they fill in BLOCKD::offset and the
  * BLOCKD pointer members respectively -- confirm against the definitions.
  */
 void vp8_build_block_doffsets(MACROBLOCKD *x);
 void vp8_setup_block_dptrs(MACROBLOCKD *x);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 #endif  // VP8_COMMON_BLOCKD_H_
--- a/vp8/common/coefupdateprobs.h
+++ b/vp8/common/coefupdateprobs.h
@@ -1,197 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #ifndef VP8_COMMON_COEFUPDATEPROBS_H_
 #define VP8_COMMON_COEFUPDATEPROBS_H_
 #ifdef __cplusplus
 extern "C" {
 #endif
 /*
  * Update probabilities for the nodes in the token entropy tree.
  * Generated file included by entropy.c.
  *
  * Indexed as [block type][coefficient band][previous-coefficient context]
  * [entropy node]; the initialiser below lists 4 block types, each with 8
  * bands of 3 contexts of 11 node probabilities.
  *
  * This is a definition, not a declaration: including this header from more
  * than one translation unit would define the array more than once.
  */
 const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES] =
 {
    /* block type 0 */
    {
        {
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
            {249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
            {234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255, },
            {250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255, },
            {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
    },
    /* block type 1 */
    {
        {
            {217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255, },
            {234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255, },
        },
        {
            {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
            {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
    },
    /* block type 2 */
    {
        {
            {186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255, },
            {234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255, },
            {251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255, },
        },
        {
            {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
    },
    /* block type 3 */
    {
        {
            {248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255, },
            {248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
            {246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
            {252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
            {248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
            {253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
            {252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
            {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
    },
 };
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 #endif  // VP8_COMMON_COEFUPDATEPROBS_H_
--- a/vp8/common/common.h
+++ b/vp8/common/common.h
@@ -1,48 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #ifndef VP8_COMMON_COMMON_H_
 #define VP8_COMMON_COMMON_H_
 #include <assert.h>
 /* Interface header for common constant data structures and lookup tables */
 #include "vpx_mem/vpx_mem.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 /*
  * vp8_copy(Dest, Src)
  * Copies sizeof(Src) bytes from Src to Dest and asserts that both operands
  * have the same sizeof. Only need this for fixed-size arrays (so that sizeof
  * yields the array size); for structs just assign.
  *
  * Expands to a braced block, so "vp8_copy(a, b);" leaves an empty statement
  * behind -- wrap it in braces when it is the body of an if that has an else.
  */
 #define vp8_copy( Dest, Src) { \
        assert( sizeof( Dest) == sizeof( Src)); \
        memcpy( Dest, Src, sizeof( Src)); \
    }
 /*
  * vp8_copy_array(Dest, Src, N)
  * Use this for variably-sized arrays: copies N elements of type *(Src) from
  * Src to Dest and asserts that the element sizes match.
  *
  * N is parenthesised so that an expression argument such as "a + b" is
  * multiplied as a whole by the element size.
  */
 #define vp8_copy_array( Dest, Src, N) { \
        assert( sizeof( *(Dest)) == sizeof( *(Src))); \
        memcpy( Dest, Src, (N) * sizeof( *(Src))); \
    }
 /*
  * vp8_zero(Dest)
  * Sets every byte of the object Dest to zero. Dest must be an lvalue whose
  * sizeof is the full object size (an array or struct, not a pointer to one).
  * The expansion already ends in ';'.
  */
 #define vp8_zero( Dest)  memset( &(Dest), 0, sizeof( Dest));
 /*
  * vp8_zero_array(Dest, N)
  * Sets the first N elements pointed to by Dest to zero. N is parenthesised
  * for the same reason as in vp8_copy_array. The expansion already ends in
  * ';'.
  */
 #define vp8_zero_array( Dest, N)  memset( Dest, 0, (N) * sizeof( *(Dest)));
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 #endif  // VP8_COMMON_COMMON_H_
--- a/vp8/common/context.c
+++ b/vp8/common/context.c
@@ -1,399 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "entropy.h"
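 /* *** GENERATED FILE: DO NOT EDIT ***
  * Note: the tables that follow are enclosed in "#if 0" ... "#endif" and are
  * therefore not compiled. */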
 #if 0
 int Contexts[vp8_coef_counter_dimen];
 const int default_contexts[vp8_coef_counter_dimen] =
 {
    {
        // Block Type ( 0 )
        {
            // Coeff Band ( 0 )
            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
        },
        {
            // Coeff Band ( 1 )
            {30190, 26544, 225,  24,   4,   0,   0,   0,   0,   0,   0, 4171593,},
            {26846, 25157, 1241, 130,  26,   6,   1,   0,   0,   0,   0, 149987,},
            {10484, 9538, 1006, 160,  36,  18,   0,   0,   0,   0,   0, 15104,},
        },
        {
            // Coeff Band ( 2 )
            {25842, 40456, 1126,  83,  11,   2,   0,   0,   0,   0,   0,   0,},
            {9338, 8010, 512,  73,   7,   3,   2,   0,   0,   0,   0, 43294,},
            {1047, 751, 149,  31,  13,   6,   1,   0,   0,   0,   0, 879,},
        },
        {
            // Coeff Band ( 3 )
            {26136, 9826, 252,  13,   0,   0,   0,   0,   0,   0,   0,   0,},
            {8134, 5574, 191,  14,   2,   0,   0,   0,   0,   0,   0, 35302,},
            { 605, 677, 116,   9,   1,   0,   0,   0,   0,   0,   0, 611,},
        },
        {
            // Coeff Band ( 4 )
            {10263, 15463, 283,  17,   0,   0,   0,   0,   0,   0,   0,   0,},
            {2773, 2191, 128,   9,   2,   2,   0,   0,   0,   0,   0, 10073,},
            { 134, 125,  32,   4,   0,   2,   0,   0,   0,   0,   0,  50,},
        },
        {
            // Coeff Band ( 5 )
            {10483, 2663,  23,   1,   0,   0,   0,   0,   0,   0,   0,   0,},
            {2137, 1251,  27,   1,   1,   0,   0,   0,   0,   0,   0, 14362,},
            { 116, 156,  14,   2,   1,   0,   0,   0,   0,   0,   0, 190,},
        },
        {
            // Coeff Band ( 6 )
            {40977, 27614, 412,  28,   0,   0,   0,   0,   0,   0,   0,   0,},
            {6113, 5213, 261,  22,   3,   0,   0,   0,   0,   0,   0, 26164,},
            { 382, 312,  50,  14,   2,   0,   0,   0,   0,   0,   0, 345,},
        },
        {
            // Coeff Band ( 7 )
            {   0,  26,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
            {   0,  13,   0,   0,   0,   0,   0,   0,   0,   0,   0, 319,},
            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   8,},
        },
    },
    {
        // Block Type ( 1 )
        {
            // Coeff Band ( 0 )
            {3268, 19382, 1043, 250,  93,  82,  49,  26,  17,   8,  25, 82289,},
            {8758, 32110, 5436, 1832, 827, 668, 420, 153,  24,   0,   3, 52914,},
            {9337, 23725, 8487, 3954, 2107, 1836, 1069, 399,  59,   0,   0, 18620,},
        },
        {
            // Coeff Band ( 1 )
            {12419, 8420, 452,  62,   9,   1,   0,   0,   0,   0,   0,   0,},
            {11715, 8705, 693,  92,  15,   7,   2,   0,   0,   0,   0, 53988,},
            {7603, 8585, 2306, 778, 270, 145,  39,   5,   0,   0,   0, 9136,},
        },
        {
            // Coeff Band ( 2 )
            {15938, 14335, 1207, 184,  55,  13,   4,   1,   0,   0,   0,   0,},
            {7415, 6829, 1138, 244,  71,  26,   7,   0,   0,   0,   0, 9980,},
            {1580, 1824, 655, 241,  89,  46,  10,   2,   0,   0,   0, 429,},
        },
        {
            // Coeff Band ( 3 )
            {19453, 5260, 201,  19,   0,   0,   0,   0,   0,   0,   0,   0,},
            {9173, 3758, 213,  22,   1,   1,   0,   0,   0,   0,   0, 9820,},
            {1689, 1277, 276,  51,  17,   4,   0,   0,   0,   0,   0, 679,},
        },
        {
            // Coeff Band ( 4 )
            {12076, 10667, 620,  85,  19,   9,   5,   0,   0,   0,   0,   0,},
            {4665, 3625, 423,  55,  19,   9,   0,   0,   0,   0,   0, 5127,},
            { 415, 440, 143,  34,  20,   7,   2,   0,   0,   0,   0, 101,},
        },
        {
            // Coeff Band ( 5 )
            {12183, 4846, 115,  11,   1,   0,   0,   0,   0,   0,   0,   0,},
            {4226, 3149, 177,  21,   2,   0,   0,   0,   0,   0,   0, 7157,},
            { 375, 621, 189,  51,  11,   4,   1,   0,   0,   0,   0, 198,},
        },
        {
            // Coeff Band ( 6 )
            {61658, 37743, 1203,  94,  10,   3,   0,   0,   0,   0,   0,   0,},
            {15514, 11563, 903, 111,  14,   5,   0,   0,   0,   0,   0, 25195,},
            { 929, 1077, 291,  78,  14,   7,   1,   0,   0,   0,   0, 507,},
        },
        {
            // Coeff Band ( 7 )
            {   0, 990,  15,   3,   0,   0,   0,   0,   0,   0,   0,   0,},
            {   0, 412,  13,   0,   0,   0,   0,   0,   0,   0,   0, 1641,},
            {   0,  18,   7,   1,   0,   0,   0,   0,   0,   0,   0,  30,},
        },
    },
    {
        // Block Type ( 2 )
        {
            // Coeff Band ( 0 )
            { 953, 24519, 628, 120,  28,  12,   4,   0,   0,   0,   0, 2248798,},
            {1525, 25654, 2647, 617, 239, 143,  42,   5,   0,   0,   0, 66837,},
            {1180, 11011, 3001, 1237, 532, 448, 239,  54,   5,   0,   0, 7122,},
        },
        {
            // Coeff Band ( 1 )
            {1356, 2220,  67,  10,   4,   1,   0,   0,   0,   0,   0,   0,},
            {1450, 2544, 102,  18,   4,   3,   0,   0,   0,   0,   0, 57063,},
            {1182, 2110, 470, 130,  41,  21,   0,   0,   0,   0,   0, 6047,},
        },
        {
            // Coeff Band ( 2 )
            { 370, 3378, 200,  30,   5,   4,   1,   0,   0,   0,   0,   0,},
            { 293, 1006, 131,  29,  11,   0,   0,   0,   0,   0,   0, 5404,},
            { 114, 387,  98,  23,   4,   8,   1,   0,   0,   0,   0, 236,},
        },
        {
            // Coeff Band ( 3 )
            { 579, 194,   4,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
            { 395, 213,   5,   1,   0,   0,   0,   0,   0,   0,   0, 4157,},
            { 119, 122,   4,   0,   0,   0,   0,   0,   0,   0,   0, 300,},
        },
        {
            // Coeff Band ( 4 )
            {  38, 557,  19,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
            {  21, 114,  12,   1,   0,   0,   0,   0,   0,   0,   0, 427,},
            {   0,   5,   0,   0,   0,   0,   0,   0,   0,   0,   0,   7,},
        },
        {
            // Coeff Band ( 5 )
            {  52,   7,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
            {  18,   6,   0,   0,   0,   0,   0,   0,   0,   0,   0, 652,},
            {   1,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,  30,},
        },
        {
            // Coeff Band ( 6 )
            { 640, 569,  10,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
            {  25,  77,   2,   0,   0,   0,   0,   0,   0,   0,   0, 517,},
            {   4,   7,   0,   0,   0,   0,   0,   0,   0,   0,   0,   3,},
        },
        {
            // Coeff Band ( 7 )
            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
        },
    },
    {
        // Block Type ( 3 )
        {
            // Coeff Band ( 0 )
            {2506, 20161, 2707, 767, 261, 178, 107,  30,  14,   3,   0, 100694,},
            {8806, 36478, 8817, 3268, 1280, 850, 401, 114,  42,   0,   0, 58572,},
            {11003, 27214, 11798, 5716, 2482, 2072, 1048, 175,  32,   0,   0, 19284,},
        },
        {
            // Coeff Band ( 1 )
            {9738, 11313, 959, 205,  70,  18,  11,   1,   0,   0,   0,   0,},
            {12628, 15085, 1507, 273,  52,  19,   9,   0,   0,   0,   0, 54280,},
            {10701, 15846, 5561, 1926, 813, 570, 249,  36,   0,   0,   0, 6460,},
        },
        {
            // Coeff Band ( 2 )
            {6781, 22539, 2784, 634, 182, 123,  20,   4,   0,   0,   0,   0,},
            {6263, 11544, 2649, 790, 259, 168,  27,   5,   0,   0,   0, 20539,},
            {3109, 4075, 2031, 896, 457, 386, 158,  29,   0,   0,   0, 1138,},
        },
        {
            // Coeff Band ( 3 )
            {11515, 4079, 465,  73,   5,  14,   2,   0,   0,   0,   0,   0,},
            {9361, 5834, 650,  96,  24,   8,   4,   0,   0,   0,   0, 22181,},
            {4343, 3974, 1360, 415, 132,  96,  14,   1,   0,   0,   0, 1267,},
        },
        {
            // Coeff Band ( 4 )
            {4787, 9297, 823, 168,  44,  12,   4,   0,   0,   0,   0,   0,},
            {3619, 4472, 719, 198,  60,  31,   3,   0,   0,   0,   0, 8401,},
            {1157, 1175, 483, 182,  88,  31,   8,   0,   0,   0,   0, 268,},
        },
        {
            // Coeff Band ( 5 )
            {8299, 1226,  32,   5,   1,   0,   0,   0,   0,   0,   0,   0,},
            {3502, 1568,  57,   4,   1,   1,   0,   0,   0,   0,   0, 9811,},
            {1055, 1070, 166,  29,   6,   1,   0,   0,   0,   0,   0, 527,},
        },
        {
            // Coeff Band ( 6 )
            {27414, 27927, 1989, 347,  69,  26,   0,   0,   0,   0,   0,   0,},
            {5876, 10074, 1574, 341,  91,  24,   4,   0,   0,   0,   0, 21954,},
            {1571, 2171, 778, 324, 124,  65,  16,   0,   0,   0,   0, 979,},
        },
        {
            // Coeff Band ( 7 )
            {   0,  29,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
            {   0,  23,   0,   0,   0,   0,   0,   0,   0,   0,   0, 459,},
            {   0,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,  13,},
        },
    },
 };
 /* Update probabilities for the nodes in the token entropy tree. */
 const vp8_prob tree_update_probs[vp8_coef_tree_dimen] =
 {
    {
        {
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
            {249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
            {234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255, },
            {250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255, },
            {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
    },
    {
        {
            {217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255, },
            {234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255, },
        },
        {
            {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
            {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
    },
    {
        {
            {186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255, },
            {234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255, },
            {251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255, },
        },
        {
            {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
    },
    {
        {
            {248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255, },
            {248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
            {246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
            {252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
            {248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
            {253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
            {252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
            {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
        {
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
        },
    },
 };
 #endif
--- a/vp8/common/copy_c.c
+++ b/vp8/common/copy_c.c
@@ -1,32 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include <string.h>
 #include "./vp8_rtcd.h"
 #include "vpx/vpx_integer.h"
 /*
  * Copy a strip that is 32 bytes wide and `height` rows tall (the original
  * note describes this as two macroblocks) from src_ptr to dst_ptr.
  *
  * After each row the source advances by src_stride bytes and the destination
  * by dst_stride bytes. Nothing is copied when height is zero or negative.
  */
 void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride,
                    unsigned char *dst_ptr, int dst_stride,
                    int height)
 {
    const unsigned char *src_row = src_ptr;
    unsigned char *dst_row = dst_ptr;
    int rows_left;
    for (rows_left = height; rows_left > 0; --rows_left)
    {
        /* one row: exactly 32 bytes, regardless of either stride */
        memcpy(dst_row, src_row, 32);
        dst_row += dst_stride;
        src_row += src_stride;
    }
 }
--- a/vp8/common/debugmodes.c
+++ b/vp8/common/debugmodes.c
@@ -1,155 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include <stdio.h>
 #include "blockd.h"
 /*
  * Debug helper: appends a text dump of the per-macroblock modes, reference
  * frames and motion vectors of one frame to the file "mvs.stt" (opened in
  * append mode, relative to the current working directory).
  *
  * mi    - MODE_INFO array. It is read with a row pitch of (cols + 1) entries:
  *         one extra entry is skipped after every row of `cols` macroblocks.
  * rows  - number of macroblock rows to print.
  * cols  - number of macroblock columns to print.
  * frame - frame number; used only in the section headings.
  *
  * If the file cannot be opened the function returns without doing anything.
  */
 void vp8_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, int frame)
 {
    int mb_row;
    int mb_col;
    int mb_index;
    FILE *mvs = fopen("mvs.stt", "a");
    /* fopen() returns NULL on failure; writing to or closing a NULL stream
     * would crash, so this best-effort dump is skipped instead. */
    if (mvs == NULL)
        return;
    /* print out the macroblock Y modes */
    mb_index = 0;
    fprintf(mvs, "Mb Modes for Frame %d\n", frame);
    for (mb_row = 0; mb_row < rows; mb_row++)
    {
        for (mb_col = 0; mb_col < cols; mb_col++)
        {
            fprintf(mvs, "%2d ", mi[mb_index].mbmi.mode);
            mb_index++;
        }
        fprintf(mvs, "\n");
        /* skip the extra entry at the end of each row (pitch is cols + 1) */
        mb_index++;
    }
    fprintf(mvs, "\n");
    /* print out the macroblock reference frames */
    mb_index = 0;
    fprintf(mvs, "Mb mv ref for Frame %d\n", frame);
    for (mb_row = 0; mb_row < rows; mb_row++)
    {
        for (mb_col = 0; mb_col < cols; mb_col++)
        {
            fprintf(mvs, "%2d ", mi[mb_index].mbmi.ref_frame);
            mb_index++;
        }
        fprintf(mvs, "\n");
        mb_index++;
    }
    fprintf(mvs, "\n");
    /* print out the macroblock UV modes */
    mb_index = 0;
    fprintf(mvs, "UV Modes for Frame %d\n", frame);
    for (mb_row = 0; mb_row < rows; mb_row++)
    {
        for (mb_col = 0; mb_col < cols; mb_col++)
        {
            fprintf(mvs, "%2d ", mi[mb_index].mbmi.uv_mode);
            mb_index++;
        }
        mb_index++;
        fprintf(mvs, "\n");
    }
    fprintf(mvs, "\n");
    /* print out the block modes: a 4x4 grid of blocks per macroblock; blocks
     * of macroblocks that are not B_PRED are shown as "xx" */
    fprintf(mvs, "Mbs for Frame %d\n", frame);
    {
        int b_row;
        for (b_row = 0; b_row < 4 * rows; b_row++)
        {
            int b_col;
            int bindex;
            for (b_col = 0; b_col < 4 * cols; b_col++)
            {
                /* macroblock that owns this block, then the block's index
                 * (0..15) inside that macroblock */
                mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
                bindex = (b_row & 3) * 4 + (b_col & 3);
                if (mi[mb_index].mbmi.mode == B_PRED)
                    fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].as_mode);
                else
                    fprintf(mvs, "xx ");
            }
            fprintf(mvs, "\n");
        }
    }
    fprintf(mvs, "\n");
    /* print out the macroblock mvs (row and column are printed divided by 2) */
    mb_index = 0;
    fprintf(mvs, "MVs for Frame %d\n", frame);
    for (mb_row = 0; mb_row < rows; mb_row++)
    {
        for (mb_col = 0; mb_col < cols; mb_col++)
        {
            fprintf(mvs, "%5d:%-5d", mi[mb_index].mbmi.mv.as_mv.row / 2, mi[mb_index].mbmi.mv.as_mv.col / 2);
            mb_index++;
        }
        mb_index++;
        fprintf(mvs, "\n");
    }
    fprintf(mvs, "\n");
    /* print out the per-block mvs (unscaled); the heading text is the same as
     * for the macroblock section above */
    fprintf(mvs, "MVs for Frame %d\n", frame);
    {
        int b_row;
        for (b_row = 0; b_row < 4 * rows; b_row++)
        {
            int b_col;
            int bindex;
            for (b_col = 0; b_col < 4 * cols; b_col++)
            {
                mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
                bindex = (b_row & 3) * 4 + (b_col & 3);
                fprintf(mvs, "%3d:%-3d ", mi[mb_index].bmi[bindex].mv.as_mv.row, mi[mb_index].bmi[bindex].mv.as_mv.col);
            }
            fprintf(mvs, "\n");
        }
    }
    fprintf(mvs, "\n");
    fclose(mvs);
 }
--- a/vp8/common/default_coef_probs.h
+++ b/vp8/common/default_coef_probs.h
@@ -1,200 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #ifndef VP8_COMMON_DEFAULT_COEF_PROBS_H_
 #define VP8_COMMON_DEFAULT_COEF_PROBS_H_
 #ifdef __cplusplus
 extern "C" {
 #endif
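 /* Generated file, included by entropy.c.
  * The table is indexed [block type][coefficient band][previous coefficient
  * context][entropy node], matching the dimensions in its declaration. */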
 static const vp8_prob default_coef_probs [BLOCK_TYPES]
                                         [COEF_BANDS]
                                         [PREV_COEF_CONTEXTS]
                                         [ENTROPY_NODES] =
 {
    { /* Block Type ( 0 ) */
        { /* Coeff Band ( 0 )*/
            { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
            { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
            { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
        },
        { /* Coeff Band ( 1 )*/
            { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 },
            { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 },
            { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 }
        },
        { /* Coeff Band ( 2 )*/
            {   1,  98, 248, 255, 236, 226, 255, 255, 128, 128, 128 },
            { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 },
            {  78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 }
        },
        { /* Coeff Band ( 3 )*/
            {   1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 },
            { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 },
            {  77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 }
        },
        { /* Coeff Band ( 4 )*/
            {   1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 },
            { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 },
            {  37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 }
        },
        { /* Coeff Band ( 5 )*/
            {   1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 },
            { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 },
            { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 }
        },
        { /* Coeff Band ( 6 )*/
            {   1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 },
            { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 },
            {  80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 }
        },
        { /* Coeff Band ( 7 )*/
            {   1,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
            { 246,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
            { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
        }
    },
    { /* Block Type ( 1 ) */
        { /* Coeff Band ( 0 )*/
            { 198,  35, 237, 223, 193, 187, 162, 160, 145, 155,  62 },
            { 131,  45, 198, 221, 172, 176, 220, 157, 252, 221,   1 },
            {  68,  47, 146, 208, 149, 167, 221, 162, 255, 223, 128 }
        },
        { /* Coeff Band ( 1 )*/
            {   1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 },
            { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 },
            {  81,  99, 181, 242, 176, 190, 249, 202, 255, 255, 128 }
        },
        { /* Coeff Band ( 2 )*/
            {   1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 },
            {  99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 },
            {  23,  91, 163, 242, 170, 187, 247, 210, 255, 255, 128 }
        },
        { /* Coeff Band ( 3 )*/
            {   1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 },
            { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 },
            {  44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 }
        },
        { /* Coeff Band ( 4 )*/
            {   1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 },
            {  94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 },
            {  22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 }
        },
        { /* Coeff Band ( 5 )*/
            {   1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 },
            { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 },
            {  35,  77, 181, 251, 193, 211, 255, 205, 128, 128, 128 }
        },
        { /* Coeff Band ( 6 )*/
            {   1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 },
            { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 },
            {  45,  99, 188, 251, 195, 217, 255, 224, 128, 128, 128 }
        },
        { /* Coeff Band ( 7 )*/
            {   1,   1, 251, 255, 213, 255, 128, 128, 128, 128, 128 },
            { 203,   1, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
            { 137,   1, 177, 255, 224, 255, 128, 128, 128, 128, 128 }
        }
    },
    { /* Block Type ( 2 ) */
        { /* Coeff Band ( 0 )*/
            { 253,   9, 248, 251, 207, 208, 255, 192, 128, 128, 128 },
            { 175,  13, 224, 243, 193, 185, 249, 198, 255, 255, 128 },
            {  73,  17, 171, 221, 161, 179, 236, 167, 255, 234, 128 }
        },
        { /* Coeff Band ( 1 )*/
            {   1,  95, 247, 253, 212, 183, 255, 255, 128, 128, 128 },
            { 239,  90, 244, 250, 211, 209, 255, 255, 128, 128, 128 },
            { 155,  77, 195, 248, 188, 195, 255, 255, 128, 128, 128 }
        },
        { /* Coeff Band ( 2 )*/
            {   1,  24, 239, 251, 218, 219, 255, 205, 128, 128, 128 },
            { 201,  51, 219, 255, 196, 186, 128, 128, 128, 128, 128 },
            {  69,  46, 190, 239, 201, 218, 255, 228, 128, 128, 128 }
        },
        { /* Coeff Band ( 3 )*/
            {   1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 },
            { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 },
            { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 }
        },
        { /* Coeff Band ( 4 )*/
            {   1,  16, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
            { 190,  36, 230, 255, 236, 255, 128, 128, 128, 128, 128 },
            { 149,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
        },
        { /* Coeff Band ( 5 )*/
            {   1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
            { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
            { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
        },
        { /* Coeff Band ( 6 )*/
            {   1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 },
            { 213,  62, 250, 255, 255, 128, 128, 128, 128, 128, 128 },
            {  55,  93, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
        },
        { /* Coeff Band ( 7 )*/
            { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
            { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
            { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
        }
    },
    { /* Block Type ( 3 ) */
        { /* Coeff Band ( 0 )*/
            { 202,  24, 213, 235, 186, 191, 220, 160, 240, 175, 255 },
            { 126,  38, 182, 232, 169, 184, 228, 174, 255, 187, 128 },
            {  61,  46, 138, 219, 151, 178, 240, 170, 255, 216, 128 }
        },
        { /* Coeff Band ( 1 )*/
            {   1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 },
            { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 },
            {  39,  77, 162, 232, 172, 180, 245, 178, 255, 255, 128 }
        },
        { /* Coeff Band ( 2 )*/
            {   1,  52, 220, 246, 198, 199, 249, 220, 255, 255, 128 },
            { 124,  74, 191, 243, 183, 193, 250, 221, 255, 255, 128 },
            {  24,  71, 130, 219, 154, 170, 243, 182, 255, 255, 128 }
        },
        { /* Coeff Band ( 3 )*/
            {   1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 },
            { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 },
            {  28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 }
        },
        { /* Coeff Band ( 4 )*/
            {   1,  81, 230, 252, 204, 203, 255, 192, 128, 128, 128 },
            { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 },
            {  20,  95, 153, 243, 164, 173, 255, 203, 128, 128, 128 }
        },
        { /* Coeff Band ( 5 )*/
            {   1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 },
            { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 },
            {  47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 }
        },
        { /* Coeff Band ( 6 )*/
            {   1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 },
            { 141,  84, 213, 252, 201, 202, 255, 219, 128, 128, 128 },
            {  42,  80, 160, 240, 162, 185, 255, 205, 128, 128, 128 }
        },
        { /* Coeff Band ( 7 )*/
            {   1,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
            { 244,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
            { 238,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
        }
    }
 };
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 #endif  // VP8_COMMON_DEFAULT_COEF_PROBS_H_
--- a/vp8/common/dequantize.c
+++ b/vp8/common/dequantize.c
@@ -1,43 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
 #include "vp8/common/blockd.h"
 #include "vpx_mem/vpx_mem.h"
 /* Multiply each of the block's 16 quantized coefficients (d->qcoeff) by the
  * matching entry of DQC and store the products in d->dqcoeff.
  */
 void vp8_dequantize_b_c(BLOCKD *d, short *DQC)
 {
    short *out = d->dqcoeff;
    short *quantized = d->qcoeff;
    int idx = 0;
    while (idx < 16)
    {
        out[idx] = quantized[idx] * DQC[idx];
        ++idx;
    }
 }
 /* Dequantize a 4x4 block of coefficients in place, hand it to the inverse
  * transform, then clear the coefficient buffer.
  *
  * input  - 16 coefficients; scaled in place and zeroed before returning
  * dq     - 16 dequantization factors, one per coefficient
  * dest   - passed to vp8_short_idct4x4llm_c() as both of its pointer/stride
  *          pairs (NOTE(review): presumably prediction and output -- confirm)
  * stride - pitch that accompanies dest
  */
 void vp8_dequant_idct_add_c(short *input, short *dq,
                            unsigned char *dest, int stride)
 {
    int i;
    for (i = 0; i < 16; i++)
    {
        input[i] = dq[i] * input[i];
    }
    vp8_short_idct4x4llm_c(input, dest, stride, dest, stride);
    /* Clear all 16 coefficients; sized from the element type instead of a
     * hard-coded byte count that assumed sizeof(short) == 2. */
    memset(input, 0, 16 * sizeof(input[0]));
 }
--- a/vp8/common/entropy.c
+++ b/vp8/common/entropy.c
@@ -1,188 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "entropy.h"
 #include "blockd.h"
 #include "onyxc_int.h"
 #include "vpx_mem/vpx_mem.h"
 #include "coefupdateprobs.h"
 /* 256-entry lookup table.  For i >= 1, vp8_norm[i] is the number of leading
  * zero bits of i viewed as an 8-bit value (7 for 1, 6 for 2-3, 5 for 4-7,
  * ... 0 for 128-255).  Entry 0 is stored as 0.
  * NOTE(review): presumably the renormalization shift of the boolean coder --
  * confirm against the callers.
  */
 DECLARE_ALIGNED(16, const unsigned char, vp8_norm[256]) =
 {
    0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
 /* Coefficient band (0..7) assigned to each of the 16 coefficient positions
  * of a 4x4 block.
  * NOTE(review): presumably indexed by zig-zag scan position -- confirm.
  */
 DECLARE_ALIGNED(16, const unsigned char, vp8_coef_bands[16]) =
 { 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7};
 /* Context class for each token, indexed by token value: 0 for ZERO_TOKEN
  * and DCT_EOB_TOKEN, 1 for ONE_TOKEN, 2 for every larger token.
  */
 DECLARE_ALIGNED(16, const unsigned char,
                 vp8_prev_token_class[MAX_ENTROPY_TOKENS]) =
 { 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0};
 /* Zig-zag scan order of a 4x4 block: entry i is the index, within the
  * 16-coefficient block, of the i-th coefficient in scan order.
  */
 DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]) =
 {
    0,  1,  4,  8,
    5,  2,  3,  6,
    9, 12, 13, 10,
    7, 11, 14, 15,
 };
 /* Inverse of vp8_default_zig_zag1d, offset by one: entry k holds 1 + the
  * scan position i for which vp8_default_zig_zag1d[i] == k.
  */
 DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]) =
 {
    1,  2,  6,  7,
    3,  5,  8, 13,
    4,  9, 12, 14,
   10, 11, 15, 16
 };
 /* One-bit mask per block position: the entry at index
    vp8_default_zig_zag1d[i] is 1 << i, i.e. the bit number is the scan
    position.  Bit 15 does not fit a positive short and is stored as -32768.
    vp8_default_zig_zag_mask generated with:
    void vp8_init_scan_order_mask()
    {
        int i;
        for (i = 0; i < 16; i++)
        {
            vp8_default_zig_zag_mask[vp8_default_zig_zag1d[i]] = 1 << i;
        }
    }
 */
 DECLARE_ALIGNED(16, const short, vp8_default_zig_zag_mask[16]) =
 {
     1,    2,    32,     64,
     4,   16,   128,   4096,
     8,  256,  2048,   8192,
   512, 1024, 16384, -32768
 };
 /* One entry per macroblock-level feature (MB_LVL_MAX entries).
  * NOTE(review): presumably the number of bits used to code each feature's
  * data value -- confirm against the header parsing code.
  */
 const int vp8_mb_feature_data_bits[MB_LVL_MAX] = {7, 6};
 /* Binary tree for the coefficient tokens.  Entries come in pairs, one pair
  * per node: a positive entry is the array index of the child node's pair, a
  * non-positive entry is a leaf holding the negated token value.
  * Array indices are identical to previously-existing CONTEXT_NODE indices */
 const vp8_tree_index vp8_coef_tree[ 22] =     /* corresponding _CONTEXT_NODEs */
 {
    -DCT_EOB_TOKEN, 2,                             /* 0 = EOB */
    -ZERO_TOKEN, 4,                               /* 1 = ZERO */
    -ONE_TOKEN, 6,                               /* 2 = ONE */
    8, 12,                                      /* 3 = LOW_VAL */
    -TWO_TOKEN, 10,                            /* 4 = TWO */
    -THREE_TOKEN, -FOUR_TOKEN,                /* 5 = THREE */
    14, 16,                                    /* 6 = HIGH_LOW */
    -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2,   /* 7 = CAT_ONE */
    18, 20,                                   /* 8 = CAT_THREEFOUR */
    -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4,  /* 9 = CAT_THREE */
    -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6   /* 10 = CAT_FIVE */
 };
 /* Precomputed {bit pattern, length in bits} of each token's path through
    vp8_coef_tree, indexed by token value (e.g. ZERO_TOKEN is the 2 bits
    "10", DCT_EOB_TOKEN is the single bit "0").
    vp8_coef_encodings generated with:
    vp8_tokens_from_tree(vp8_coef_encodings, vp8_coef_tree);
 */
 vp8_token vp8_coef_encodings[MAX_ENTROPY_TOKENS] =
 {
    {2, 2},
    {6, 3},
    {28, 5},
    {58, 6},
    {59, 6},
    {60, 6},
    {61, 6},
    {124, 7},
    {125, 7},
    {126, 7},
    {127, 7},
    {0, 1}
 };
 /* Trees for extra bits.  Probabilities are constant and
   do not depend on previously encoded bits.
   PcatN holds one probability per extra bit of DCT_VAL_CATEGORYN:
   1, 2, 3, 4 and 5 entries for categories 1-5, 11 entries for category 6. */
 static const vp8_prob Pcat1[] = { 159};
 static const vp8_prob Pcat2[] = { 165, 145};
 static const vp8_prob Pcat3[] = { 173, 148, 140};
 static const vp8_prob Pcat4[] = { 176, 155, 140, 135};
 static const vp8_prob Pcat5[] = { 180, 157, 141, 134, 130};
 static const vp8_prob Pcat6[] =
 { 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129};
 /* Degenerate trees for the extra bits of each category: both branches of a
    node lead to the same next node, and the last pair is {0, 0}.  catN has
    one pair per extra bit (11 pairs for cat6).
    tree index tables generated with:
    void init_bit_tree(vp8_tree_index *p, int n)
    {
        int i = 0;
        while (++i < n)
        {
            p[0] = p[1] = i << 1;
            p += 2;
        }
        p[0] = p[1] = 0;
    }
    void init_bit_trees()
    {
        init_bit_tree(cat1, 1);
        init_bit_tree(cat2, 2);
        init_bit_tree(cat3, 3);
        init_bit_tree(cat4, 4);
        init_bit_tree(cat5, 5);
        init_bit_tree(cat6, 11);
    }
 */
 static const vp8_tree_index cat1[2] = { 0, 0 };
 static const vp8_tree_index cat2[4] = { 2, 2, 0, 0 };
 static const vp8_tree_index cat3[6] = { 2, 2, 4, 4, 0, 0 };
 static const vp8_tree_index cat4[8] = { 2, 2, 4, 4, 6, 6, 0, 0 };
 static const vp8_tree_index cat5[10] = { 2, 2, 4, 4, 6, 6, 8, 8, 0, 0 };
 static const vp8_tree_index cat6[22] = { 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12,
                                        14, 14, 16, 16, 18, 18, 20, 20, 0, 0 };
 /* Extra-bit description of every token, indexed by token value.  Each entry
  * is {tree, prob, Len, base_val}.  ZERO_TOKEN..FOUR_TOKEN and DCT_EOB_TOKEN
  * have no tree (Len 0); the six DCT_VAL_CATEGORY tokens reference their
  * catN/PcatN tables, with Len extra bits and base_val equal to the smallest
  * value of the category (5, 7, 11, 19, 35, 67).
  */
 const vp8_extra_bit_struct vp8_extra_bits[12] =
 {
    { 0, 0, 0, 0},
    { 0, 0, 0, 1},
    { 0, 0, 0, 2},
    { 0, 0, 0, 3},
    { 0, 0, 0, 4},
    { cat1, Pcat1, 1, 5},
    { cat2, Pcat2, 2, 7},
    { cat3, Pcat3, 3, 11},
    { cat4, Pcat4, 4, 19},
    { cat5, Pcat5, 5, 35},
    { cat6, Pcat6, 11, 67},
    { 0, 0, 0, 0}
 };
 #include "default_coef_probs.h"
 /* Reset the coefficient probabilities of the frame context pc->fc to the
  * built-in default_coef_probs table.
  */
 void vp8_default_coef_probs(VP8_COMMON *pc)
 {
    const size_t table_bytes = sizeof default_coef_probs;
    memcpy(pc->fc.coef_probs, default_coef_probs, table_bytes);
 }
--- a/vp8/common/entropy.h
+++ b/vp8/common/entropy.h
@@ -1,109 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #ifndef VP8_COMMON_ENTROPY_H_
 #define VP8_COMMON_ENTROPY_H_
 #include "treecoder.h"
 #include "blockd.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 /* Coefficient token alphabet */
 #define ZERO_TOKEN              0       /* 0         Extra Bits 0+0 */
 #define ONE_TOKEN               1       /* 1         Extra Bits 0+1 */
 #define TWO_TOKEN               2       /* 2         Extra Bits 0+1 */
 #define THREE_TOKEN             3       /* 3         Extra Bits 0+1 */
 #define FOUR_TOKEN              4       /* 4         Extra Bits 0+1 */
 #define DCT_VAL_CATEGORY1       5       /* 5-6       Extra Bits 1+1 */
 #define DCT_VAL_CATEGORY2       6       /* 7-10      Extra Bits 2+1 */
 #define DCT_VAL_CATEGORY3       7       /* 11-18     Extra Bits 3+1 */
 #define DCT_VAL_CATEGORY4       8       /* 19-34     Extra Bits 4+1 */
 #define DCT_VAL_CATEGORY5       9       /* 35-66     Extra Bits 5+1 */
 #define DCT_VAL_CATEGORY6       10      /* 67+       Extra Bits 11+1 */
 #define DCT_EOB_TOKEN           11      /* EOB       Extra Bits 0+0 */
 #define MAX_ENTROPY_TOKENS 12
 #define ENTROPY_NODES 11
 extern const vp8_tree_index vp8_coef_tree[];
 extern const struct vp8_token_struct vp8_coef_encodings[MAX_ENTROPY_TOKENS];
 /* Describes how the extra bits that follow one token are coded; see the
  * vp8_extra_bits table, which holds one of these per token. */
 typedef struct
 {
    vp8_tree_p tree;       /* tree for the extra bits; 0 when the token has none */
    const vp8_prob *prob;  /* probabilities used with tree; 0 when there is no tree */
    int Len;               /* number of extra bits */
    int base_val;          /* smallest coefficient magnitude the token stands for */
 } vp8_extra_bit_struct;
 extern const vp8_extra_bit_struct vp8_extra_bits[12];    /* indexed by token value */
 #define PROB_UPDATE_BASELINE_COST   7
 #define MAX_PROB                255
 #define DCT_MAX_VALUE           2048
 /* Coefficients are predicted via a 3-dimensional probability table. */
 /* Outside dimension.  0 = Y no DC, 1 = Y2, 2 = UV, 3 = Y with DC */
 #define BLOCK_TYPES 4
 /* Middle dimension is a coarsening of the coefficient's
   position within the 4x4 DCT. */
 #define COEF_BANDS 8
 extern DECLARE_ALIGNED(16, const unsigned char, vp8_coef_bands[16]);
 /* Inside dimension is 3-valued measure of nearby complexity, that is,
   the extent to which nearby coefficients are nonzero.  For the first
   coefficient (DC, unless block type is 0), we look at the (already encoded)
   blocks above and to the left of the current block.  The context index is
   then the number (0,1,or 2) of these blocks having nonzero coefficients.
   After decoding a coefficient, the measure is roughly the size of the
   most recently decoded coefficient (0 for 0, 1 for 1, 2 for >1).
   Note that the intuitive meaning of this measure changes as coefficients
   are decoded, e.g., prior to the first token, a zero means that my neighbors
   are empty while, after the first token, because of the use of end-of-block,
   a zero means we just decoded a zero and hence guarantees that a non-zero
   coefficient will appear later in this block.  However, this shift
   in meaning is perfectly OK because our context depends also on the
   coefficient band (and since zigzag positions 0, 1, and 2 are in
   distinct bands). */
 /*# define DC_TOKEN_CONTEXTS        3*/ /* 00, 0!0, !0!0 */
 #   define PREV_COEF_CONTEXTS       3
 extern DECLARE_ALIGNED(16, const unsigned char, vp8_prev_token_class[MAX_ENTROPY_TOKENS]);
 extern const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
 struct VP8Common;
 void vp8_default_coef_probs(struct VP8Common *);
 extern DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]);
 extern DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]);
 extern DECLARE_ALIGNED(16, const short, vp8_default_zig_zag_mask[16]);
 extern const int vp8_mb_feature_data_bits[MB_LVL_MAX];
 void vp8_coef_tree_initialize(void);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 #endif  // VP8_COMMON_ENTROPY_H_
--- a/vp8/common/entropymode.c
+++ b/vp8/common/entropymode.c
@@ -1,171 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #define USE_PREBUILT_TABLES
 #include "entropymode.h"
 #include "entropy.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vp8_entropymodedata.h"
 /* Classify the pair of neighbouring motion vectors l (left) and a (above)
  * into one of the SUBMVREF_* contexts.  Equality of the two vectors takes
  * precedence; otherwise a zero "above" vector is tested before a zero
  * "left" vector.
  */
 int vp8_mv_cont(const int_mv *l, const int_mv *a)
 {
    if (l->as_int == a->as_int)
    {
        /* identical vectors: distinguish "both zero" from "same, non-zero" */
        return (l->as_int == 0) ? SUBMVREF_LEFT_ABOVE_ZED
                                : SUBMVREF_LEFT_ABOVE_SAME;
    }
    if (a->as_int == 0)
        return SUBMVREF_ABOVE_ZED;
    if (l->as_int == 0)
        return SUBMVREF_LEFT_ZED;
    return SUBMVREF_NORMAL;
 }
 /* Default sub-MV reference probabilities; vp8_init_mbmode_probs() copies
  * them into the frame context's sub_mv_ref_prob. */
 static const vp8_prob sub_mv_ref_prob [VP8_SUBMVREFS-1] = { 180, 162, 25};
 /* SUBMVREF_COUNT rows of VP8_SUBMVREFS-1 probabilities each.
  * NOTE(review): rows are presumably selected by the context value returned
  * from vp8_mv_cont() -- confirm against the callers.
  */
 const vp8_prob vp8_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP8_SUBMVREFS-1] =
 {
    { 147, 136, 18 },
    { 106, 145, 1  },
    { 179, 121, 1  },
    { 223, 1  , 34 },
    { 208, 1  , 1  }
 };
 /* The four macroblock split layouts.  Each layout assigns a partition label
  * to every one of the 16 entries, written here as a 4x4 grid.
  */
 const vp8_mbsplit vp8_mbsplits [VP8_NUMMBSPLITS] =
 {
    {
        /* two partitions: top half / bottom half */
        0,  0,  0,  0,
        0,  0,  0,  0,
        1,  1,  1,  1,
        1,  1,  1,  1,
    },
    {
        /* two partitions: left half / right half */
        0,  0,  1,  1,
        0,  0,  1,  1,
        0,  0,  1,  1,
        0,  0,  1,  1,
    },
    {
        /* four partitions: one per quadrant */
        0,  0,  1,  1,
        0,  0,  1,  1,
        2,  2,  3,  3,
        2,  2,  3,  3,
    },
    {
        /* sixteen partitions: every entry on its own */
        0,  1,  2,  3,
        4,  5,  6,  7,
        8,  9,  10, 11,
        12, 13, 14, 15,
    }
 };
 /* Number of distinct partition labels used by each vp8_mbsplits layout. */
 const int vp8_mbsplit_count [VP8_NUMMBSPLITS] = { 2, 2, 4, 16};
 /* NOTE(review): presumably the node probabilities used with
  * vp8_mbsplit_tree -- confirm against the callers. */
 const vp8_prob vp8_mbsplit_probs [VP8_NUMMBSPLITS-1] = { 110, 111, 150};
 /* Tree over the ten B_*_PRED sub-block intra modes (9 nodes).  Entries come
  * in pairs, one pair per node: a positive entry is the array index of the
  * next node's pair, a non-positive entry is a negated mode value (leaf).
  * Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
 const vp8_tree_index vp8_bmode_tree[18] =     /* INTRAMODECONTEXTNODE value */
 {
    -B_DC_PRED, 2,                             /* 0 = DC_NODE */
    -B_TM_PRED, 4,                            /* 1 = TM_NODE */
    -B_VE_PRED, 6,                           /* 2 = VE_NODE */
    8, 12,                                  /* 3 = COM_NODE */
    -B_HE_PRED, 10,                        /* 4 = HE_NODE */
    -B_RD_PRED, -B_VR_PRED,               /* 5 = RD_NODE */
    -B_LD_PRED, 14,                        /* 6 = LD_NODE */
    -B_VL_PRED, 16,                      /* 7 = VL_NODE */
    -B_HD_PRED, -B_HU_PRED             /* 8 = HD_NODE */
 };
 /* Again, these trees use the same probability indices as their
   explicitly-programmed predecessors. */
 /* Tree over the five luma modes: DC_PRED sits directly under the root, the
  * other four modes are two levels further down. */
 const vp8_tree_index vp8_ymode_tree[8] =
 {
    -DC_PRED, 2,
    4, 6,
    -V_PRED, -H_PRED,
    -TM_PRED, -B_PRED
 };
 /* Variant of the luma mode tree in which B_PRED, rather than DC_PRED, sits
  * directly under the root.
  * NOTE(review): the "kf" prefix suggests key frames -- confirm.
  */
 const vp8_tree_index vp8_kf_ymode_tree[8] =
 {
    -B_PRED, 2,
    4, 6,
    -DC_PRED, -V_PRED,
    -H_PRED, -TM_PRED
 };
 /* Tree over the four chroma modes, tested in the order DC_PRED, V_PRED,
  * then H_PRED versus TM_PRED. */
 const vp8_tree_index vp8_uv_mode_tree[6] =
 {
    -DC_PRED, 2,
    -V_PRED, 4,
    -H_PRED, -TM_PRED
 };
 /* Tree whose leaves are the values 3, 2, 0 and 1 (stored negated).
  * NOTE(review): presumably indices into vp8_mbsplits -- confirm.
  */
 const vp8_tree_index vp8_mbsplit_tree[6] =
 {
    -3, 2,
    -2, 4,
    -0, -1
 };
 /* Tree over the inter prediction modes, tested in the order ZEROMV,
  * NEARESTMV, NEARMV, then NEWMV versus SPLITMV. */
 const vp8_tree_index vp8_mv_ref_tree[8] =
 {
    -ZEROMV, 2,
    -NEARESTMV, 4,
    -NEARMV, 6,
    -NEWMV, -SPLITMV
 };
 /* Tree over the sub-block motion vector modes, tested in the order
  * LEFT4X4, ABOVE4X4, then ZERO4X4 versus NEW4X4. */
 const vp8_tree_index vp8_sub_mv_ref_tree[6] =
 {
    -LEFT4X4, 2,
    -ABOVE4X4, 4,
    -ZERO4X4, -NEW4X4
 };
 /* Balanced tree with 7 nodes whose leaves are the values 0..7 (stored
  * negated); every leaf is three levels below the root. */
 const vp8_tree_index vp8_small_mvtree [14] =
 {
    2, 8,
    4, 6,
    -0, -1,
    -2, -3,
    10, 12,
    -4, -5,
    -6, -7
 };
 /* Load the default sub-MV reference, chroma mode and luma mode probability
  * tables into the frame context x->fc.
  */
 void vp8_init_mbmode_probs(VP8_COMMON *x)
 {
    /* the three destinations are separate fields, so the order is irrelevant */
    memcpy(x->fc.sub_mv_ref_prob, sub_mv_ref_prob, sizeof sub_mv_ref_prob);
    memcpy(x->fc.uv_mode_prob, vp8_uv_mode_prob, sizeof vp8_uv_mode_prob);
    memcpy(x->fc.ymode_prob, vp8_ymode_prob, sizeof vp8_ymode_prob);
 }
 /* Fill p with the built-in vp8_bmode_prob table; the number of bytes copied
  * is the size of that table.
  */
 void vp8_default_bmode_probs(vp8_prob p [VP8_BINTRAMODES-1])
 {
    const size_t nbytes = sizeof vp8_bmode_prob;
    memcpy(p, vp8_bmode_prob, nbytes);
 }
--- a/vp8/common/entropymode.h
+++ b/vp8/common/entropymode.h
@@ -1,88 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #ifndef VP8_COMMON_ENTROPYMODE_H_
 #define VP8_COMMON_ENTROPYMODE_H_
 #include "onyxc_int.h"
 #include "treecoder.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 /* Context classes returned by vp8_mv_cont() for a (left, above) pair of
  * motion vectors. */
 typedef enum
 {
    SUBMVREF_NORMAL,            /* vectors differ and neither is zero */
    SUBMVREF_LEFT_ZED,          /* vectors differ, left is zero */
    SUBMVREF_ABOVE_ZED,         /* vectors differ, above is zero */
    SUBMVREF_LEFT_ABOVE_SAME,   /* vectors are equal and non-zero */
    SUBMVREF_LEFT_ABOVE_ZED     /* both vectors are zero */
 } sumvfref_t;
 typedef int vp8_mbsplit[16];
 #define VP8_NUMMBSPLITS 4
 extern const vp8_mbsplit vp8_mbsplits [VP8_NUMMBSPLITS];
 extern const int vp8_mbsplit_count [VP8_NUMMBSPLITS];    /* # of subsets */
 extern const vp8_prob vp8_mbsplit_probs [VP8_NUMMBSPLITS-1];
 extern int vp8_mv_cont(const int_mv *l, const int_mv *a);
 #define SUBMVREF_COUNT 5
 extern const vp8_prob vp8_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP8_SUBMVREFS-1];
 extern const unsigned int vp8_kf_default_bmode_counts [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES];
 extern const vp8_tree_index vp8_bmode_tree[];
 extern const vp8_tree_index  vp8_ymode_tree[];
 extern const vp8_tree_index  vp8_kf_ymode_tree[];
 extern const vp8_tree_index  vp8_uv_mode_tree[];
 extern const vp8_tree_index  vp8_mbsplit_tree[];
 extern const vp8_tree_index  vp8_mv_ref_tree[];
 extern const vp8_tree_index  vp8_sub_mv_ref_tree[];
 extern const struct vp8_token_struct vp8_bmode_encodings[VP8_BINTRAMODES];
 extern const struct vp8_token_struct vp8_ymode_encodings[VP8_YMODES];
 extern const struct vp8_token_struct vp8_kf_ymode_encodings[VP8_YMODES];
 extern const struct vp8_token_struct vp8_uv_mode_encodings[VP8_UV_MODES];
 extern const struct vp8_token_struct vp8_mbsplit_encodings[VP8_NUMMBSPLITS];
 /* Inter mode values do not start at zero */
 extern const struct vp8_token_struct vp8_mv_ref_encoding_array[VP8_MVREFS];
 extern const struct vp8_token_struct vp8_sub_mv_ref_encoding_array[VP8_SUBMVREFS];
 extern const vp8_tree_index vp8_small_mvtree[];
 extern const struct vp8_token_struct vp8_small_mvencodings[8];
 /* Key frame default mode probs */
 extern const vp8_prob vp8_kf_bmode_prob[VP8_BINTRAMODES][VP8_BINTRAMODES]
 [VP8_BINTRAMODES-1];
 extern const vp8_prob vp8_kf_uv_mode_prob[VP8_UV_MODES-1];
 extern const vp8_prob vp8_kf_ymode_prob[VP8_YMODES-1];
 void vp8_init_mbmode_probs(VP8_COMMON *x);
 void vp8_default_bmode_probs(vp8_prob dest [VP8_BINTRAMODES-1]);
 void vp8_kf_default_bmode_probs(vp8_prob dest [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES-1]);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 #endif  // VP8_COMMON_ENTROPYMODE_H_
--- a/vp8/common/entropymv.c
+++ b/vp8/common/entropymv.c
@@ -1,49 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "entropymv.h"
 /* Two MV_CONTEXT entries of MVPcount (19) probabilities each, laid out as
  * 1 is-short entry, 1 sign entry, 7 short-tree entries and 10 long-bit
  * entries.
  * NOTE(review): presumably the probability that the matching entry of the
  * MV context gets updated, with entry 0 for rows and entry 1 for columns --
  * confirm against the callers.
  */
 const MV_CONTEXT vp8_mv_update_probs[2] =
 {
    {{
        237,
        246,
        253, 253, 254, 254, 254, 254, 254,
        254, 254, 254, 254, 254, 250, 250, 252, 254, 254
    }},
    {{
        231,
        243,
        245, 253, 254, 254, 254, 254, 254,
        254, 254, 254, 254, 254, 251, 251, 254, 254, 254
    }}
 };
 /* Default motion vector probabilities: entry 0 for the row component,
  * entry 1 for the column component.  Both entries use the same layout. */
 const MV_CONTEXT vp8_default_mv_context[2] =
 {
    {{
        /* row */
        162,                                        /* is short */
        128,                                        /* sign */
        225, 146, 172, 147, 214,  39, 156,          /* short tree */
        128, 129, 132,  75, 145, 178, 206, 239, 254, 254 /* long bits */
    }},
    {{
        /* same for column */
        164,                                        /* is short */
        128,                                        /* sign */
        204, 170, 119, 235, 140, 230, 228,          /* short tree */
        128, 130, 130,  74, 148, 180, 203, 236, 254, 254 /* long bits */
    }}
 };
--- a/vp8/common/entropymv.h
+++ b/vp8/common/entropymv.h
@@ -1,52 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #ifndef VP8_COMMON_ENTROPYMV_H_
 #define VP8_COMMON_ENTROPYMV_H_
 #include "treecoder.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 enum
 {
    mv_max  = 1023,              /* max absolute value of a MV component */
    MVvals = (2 * mv_max) + 1,   /* # possible values "" */
    mvfp_max  = 255,              /* max absolute value of a full pixel MV component */
    MVfpvals = (2 * mvfp_max) +1, /* # possible full pixel MV values */
    mvlong_width = 10,       /* Large MVs have 9 bit magnitudes */
    mvnum_short = 8,         /* magnitudes 0 through 7 */
    /* probability offsets for coding each MV component */
    mvpis_short = 0,         /* short (<= 7) vs long (>= 8) */
    MVPsign,                /* sign for non-zero */
    MVPshort,               /* 8 short values = 7-position tree */
    MVPbits = MVPshort + mvnum_short - 1, /* mvlong_width long value bits */
    MVPcount = MVPbits + mvlong_width    /* (with independent probabilities) */
 };
 /* Probabilities for coding one motion vector component, indexed with the
  * mvpis_short / MVPsign / MVPshort / MVPbits offsets. */
 typedef struct mv_context
 {
    vp8_prob prob[MVPcount];  /* often come in row, col pairs */
 } MV_CONTEXT;
 extern const MV_CONTEXT vp8_mv_update_probs[2], vp8_default_mv_context[2];
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 #endif  // VP8_COMMON_ENTROPYMV_H_
--- a/vp8/common/extend.c
+++ b/vp8/common/extend.c
@@ -1,188 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "extend.h"
 #include "vpx_mem/vpx_mem.h"
 /* Copy an h x w plane from s into d and replicate its edge pixels outwards:
  * el/er bytes to the left/right of every row, then et/eb full rows
  * (including the side borders) above and below the copied area.
  */
 static void copy_and_extend_plane
 (
    unsigned char *s, /* source */
    int sp,           /* source pitch */
    unsigned char *d, /* destination */
    int dp,           /* destination pitch */
    int h,            /* height */
    int w,            /* width */
    int et,           /* extend top border */
    int el,           /* extend left border */
    int eb,           /* extend bottom border */
    int er            /* extend right border */
 )
 {
    int row;
    int remaining;
    unsigned char *src_row = s;
    unsigned char *dst_row = d - el;        /* start of the left border */
    const int padded_width = el + er + w;   /* row width including borders */
    unsigned char *first_row;
    unsigned char *last_row;
    unsigned char *fill;
    /* Pass 1: copy each row and pad it on both sides with its end pixels. */
    for (row = 0; row < h; ++row)
    {
        memset(dst_row, src_row[0], el);
        memcpy(dst_row + el, src_row, w);
        memset(dst_row + el + w, src_row[w - 1], er);
        src_row += sp;
        dst_row += dp;
    }
    /* Pass 2: replicate the first and last padded rows into the top and
     * bottom borders.
     */
    first_row = d - el;
    last_row = d + dp * (h - 1) - el;
    fill = d + dp * (-et) - el;
    for (remaining = et; remaining > 0; --remaining)
    {
        memcpy(fill, first_row, padded_width);
        fill += dp;
    }
    fill = d + dp * (h) - el;
    for (remaining = eb; remaining > 0; --remaining)
    {
        memcpy(fill, last_row, padded_width);
        fill += dp;
    }
 }
 /* Copy the Y, U and V planes of src into dst and extend each plane into
  * dst's border.  The bottom and right extensions additionally cover the
  * difference between the dst and src plane dimensions.
  */
 void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
                               YV12_BUFFER_CONFIG *dst)
 {
    int border = dst->border;
    int bottom = border + dst->y_height - src->y_height;
    int right = border + dst->y_width - src->y_width;
    /* luma: full-size border */
    copy_and_extend_plane(src->y_buffer, src->y_stride,
                          dst->y_buffer, dst->y_stride,
                          src->y_height, src->y_width,
                          border, border, bottom, right);
    /* chroma: half the border, same extents for U and V */
    border = dst->border >> 1;
    bottom = border + dst->uv_height - src->uv_height;
    right = border + dst->uv_width - src->uv_width;
    copy_and_extend_plane(src->u_buffer, src->uv_stride,
                          dst->u_buffer, dst->uv_stride,
                          src->uv_height, src->uv_width,
                          border, border, bottom, right);
    copy_and_extend_plane(src->v_buffer, src->uv_stride,
                          dst->v_buffer, dst->uv_stride,
                          src->uv_height, src->uv_width,
                          border, border, bottom, right);
 }
 /* Copy the rectangle (srcx, srcy, srcw, srch) of src into the same position
  * of dst, for all three planes.  A border is extended only on the sides
  * where the rectangle touches the corresponding edge of the source frame.
  */
 void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
                                         YV12_BUFFER_CONFIG *dst,
                                         int srcy, int srcx,
                                         int srch, int srcw)
 {
    /* If the side is not touching the boundary then don't extend. */
    int et = srcy ? 0 : dst->border;
    int el = srcx ? 0 : dst->border;
    int eb = (srcy + srch != src->y_height)
                 ? 0 : dst->border + dst->y_height - src->y_height;
    int er = (srcx + srcw != src->y_width)
                 ? 0 : dst->border + dst->y_width - src->y_width;
    int src_y_offset = srcy * src->y_stride + srcx;
    int dst_y_offset = srcy * dst->y_stride + srcx;
    int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1);
    int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1);
    /* chroma rectangle: luma size halved, rounding up */
    int uv_h = (srch + 1) >> 1;
    int uv_w = (srcw + 1) >> 1;
    copy_and_extend_plane(src->y_buffer + src_y_offset,
                          src->y_stride,
                          dst->y_buffer + dst_y_offset,
                          dst->y_stride,
                          srch, srcw,
                          et, el, eb, er);
    /* chroma borders: luma borders halved, rounding up */
    et = (et + 1) >> 1;
    el = (el + 1) >> 1;
    eb = (eb + 1) >> 1;
    er = (er + 1) >> 1;
    copy_and_extend_plane(src->u_buffer + src_uv_offset,
                          src->uv_stride,
                          dst->u_buffer + dst_uv_offset,
                          dst->uv_stride,
                          uv_h, uv_w,
                          et, el, eb, er);
    copy_and_extend_plane(src->v_buffer + src_uv_offset,
                          src->uv_stride,
                          dst->v_buffer + dst_uv_offset,
                          dst->uv_stride,
                          uv_h, uv_w,
                          et, el, eb, er);
 }
 /* For intra prediction: on rows 14 and 15 counted from YPtr, and rows 6 and
  * 7 counted from UPtr and VPtr, overwrite the 4 bytes starting at the
  * pointer with the byte that sits immediately before it.
  */
 void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf,
                       unsigned char *YPtr,
                       unsigned char *UPtr,
                       unsigned char *VPtr)
 {
    int row;
    int col;
    for (row = 0; row < 2; ++row)
    {
        unsigned char *y = YPtr + ybf->y_stride * (14 + row);
        unsigned char *u = UPtr + ybf->uv_stride * (6 + row);
        unsigned char *v = VPtr + ybf->uv_stride * (6 + row);
        for (col = 0; col < 4; ++col)
        {
            y[col] = y[-1];
            u[col] = u[-1];
            v[col] = v[-1];
        }
    }
 }
--- a/vp8/common/extend.h
+++ b/vp8/common/extend.h
@@ -1,33 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #ifndef VP8_COMMON_EXTEND_H_
 #define VP8_COMMON_EXTEND_H_
 #include "vpx_scale/yv12config.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, unsigned char *UPtr, unsigned char *VPtr);
 void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
                               YV12_BUFFER_CONFIG *dst);
 void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
                                         YV12_BUFFER_CONFIG *dst,
                                         int srcy, int srcx,
                                         int srch, int srcw);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 #endif  // VP8_COMMON_EXTEND_H_
--- a/vp8/common/filter.c
+++ b/vp8/common/filter.c
@@ -1,493 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "filter.h"
 #include "./vp8_rtcd.h"
 /* Two-tap bilinear kernels, one row per offset 0..7; the bilinear predict
 * functions in this file select a row with xoffset / yoffset.
 * Row k is { 128 - 16*k, 16*k }, so both taps always sum to 128
 * (VP8_FILTER_WEIGHT). */
 DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]) =
 {
    { 128,   0 },
    { 112,  16 },
    {  96,  32 },
    {  80,  48 },
    {  64,  64 },
    {  48,  80 },
    {  32,  96 },
    {  16, 112 }
 };
 /* Six-tap kernels, one row per offset 0..7; the sixtap predict functions in
 * this file select a row with xoffset / yoffset.  The six-tap passes apply
 * taps [0]..[5] to the samples at -2..+3 steps from the current position.
 * The taps of every row sum to 128 (VP8_FILTER_WEIGHT); row 0 passes the
 * centre sample through unchanged. */
 DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]) =
 {
    { 0,  0,  128,    0,   0,  0 },         /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */
    { 0, -6,  123,   12,  -1,  0 },
    { 2, -11, 108,   36,  -8,  1 },         /* New 1/4 pel 6 tap filter */
    { 0, -9,   93,   50,  -6,  0 },
    { 3, -16,  77,   77, -16,  3 },         /* New 1/2 pel 6 tap filter */
    { 0, -6,   50,   93,  -9,  0 },
    { 1, -8,   36,  108, -11,  2 },         /* New 1/4 pel 6 tap filter */
    { 0, -1,   12,  123,  -6,  0 },
 };
 /* First stage of the separable six-tap filter.
 * For every output sample the six taps in vp8_filter are applied to the
 * source samples located -2..+3 multiples of pixel_step away from the current
 * position.  The sum is rounded, shifted down by VP8_FILTER_SHIFT, clamped
 * to 0..255 and stored as an int in output_ptr, which is written as a dense
 * output_height x output_width array.  src_pixels_per_line is the source
 * row stride. */
 static void filter_block2d_first_pass
 (
    unsigned char *src_ptr,
    int *output_ptr,
    unsigned int src_pixels_per_line,
    unsigned int pixel_step,
    unsigned int output_height,
    unsigned int output_width,
    const short *vp8_filter
 )
 {
    const int step = (int)pixel_step;
    unsigned int row, col;
    for (row = 0; row < output_height; ++row)
    {
        for (col = 0; col < output_width; ++col)
        {
            /* Seed with the rounding term, then add the six weighted taps. */
            int acc = VP8_FILTER_WEIGHT >> 1;
            int tap;
            for (tap = 0; tap < 6; ++tap)
                acc += (int)src_ptr[(tap - 2) * step] * vp8_filter[tap];
            /* Bring the result back into the 0-255 pixel range. */
            acc >>= VP8_FILTER_SHIFT;
            if (acc < 0)
                acc = 0;
            else if (acc > 255)
                acc = 255;
            output_ptr[col] = acc;
            ++src_ptr;
        }
        /* Advance to the start of the next source / output row. */
        src_ptr    += src_pixels_per_line - output_width;
        output_ptr += output_width;
    }
 }
 /* Second stage of the separable six-tap filter.
 * Reads the int intermediate produced by the first pass (row stride
 * src_pixels_per_line), applies the six taps in vp8_filter to the samples at
 * -2..+3 multiples of pixel_step, rounds, shifts by VP8_FILTER_SHIFT, clamps
 * to 0..255 and writes bytes to output_ptr, whose rows are output_pitch
 * apart. */
 static void filter_block2d_second_pass
 (
    int *src_ptr,
    unsigned char *output_ptr,
    int output_pitch,
    unsigned int src_pixels_per_line,
    unsigned int pixel_step,
    unsigned int output_height,
    unsigned int output_width,
    const short *vp8_filter
 )
 {
    const int step = (int)pixel_step;
    unsigned int row, col;
    for (row = 0; row < output_height; ++row)
    {
        for (col = 0; col < output_width; ++col)
        {
            /* Seed with the rounding term, then add the six weighted taps. */
            int acc = VP8_FILTER_WEIGHT >> 1;
            int tap;
            for (tap = 0; tap < 6; ++tap)
                acc += (int)src_ptr[(tap - 2) * step] * vp8_filter[tap];
            /* Bring the result back into the 0-255 pixel range. */
            acc >>= VP8_FILTER_SHIFT;
            if (acc < 0)
                acc = 0;
            else if (acc > 255)
                acc = 255;
            output_ptr[col] = (unsigned char)acc;
            ++src_ptr;
        }
        /* Advance to the start of the next intermediate / output row. */
        src_ptr    += src_pixels_per_line - output_width;
        output_ptr += output_pitch;
    }
 }
 /* Six-tap prediction of one 4x4 block: a horizontal pass into an int
 * scratch buffer followed by a vertical pass into output_ptr. */
 static void filter_block2d
 (
    unsigned char  *src_ptr,
    unsigned char  *output_ptr,
    unsigned int src_pixels_per_line,
    int output_pitch,
    const short  *HFilter,
    const short  *VFilter
 )
 {
    /* 9 rows of 4 samples: the 4 output rows plus the 2 rows above and the
     * 3 rows below that the vertical taps reach. */
    int scratch[9*4];
    unsigned char *first_src_row = src_ptr - (2 * src_pixels_per_line);
    /* Horizontal pass, starting two source rows above the block. */
    filter_block2d_first_pass(first_src_row, scratch, src_pixels_per_line, 1, 9, 4, HFilter);
    /* Vertical pass; scratch + 8 skips the two leading rows (2 * 4). */
    filter_block2d_second_pass(scratch + 8, output_ptr, output_pitch, 4, 4, 4, 4, VFilter);
 }
 /* 4x4 six-tap prediction.  xoffset / yoffset pick the horizontal and
 * vertical kernels from vp8_sub_pel_filters; the filtering itself is done
 * by filter_block2d(). */
 void vp8_sixtap_predict4x4_c
 (
    unsigned char  *src_ptr,
    int   src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    unsigned char *dst_ptr,
    int dst_pitch
 )
 {
    filter_block2d(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch,
                   vp8_sub_pel_filters[xoffset],    /* horizontal 6 tap */
                   vp8_sub_pel_filters[yoffset]);   /* vertical 6 tap */
 }
 /* 8x8 six-tap prediction.  xoffset / yoffset pick the horizontal and
 * vertical kernels from vp8_sub_pel_filters. */
 void vp8_sixtap_predict8x8_c
 (
    unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    unsigned char *dst_ptr,
    int  dst_pitch
 )
 {
    /* Intermediate buffer; the horizontal pass below fills 13 rows of 8. */
    int scratch[13*16];
    const short *const h_taps = vp8_sub_pel_filters[xoffset];   /* 6 tap */
    const short *const v_taps = vp8_sub_pel_filters[yoffset];   /* 6 tap */
    unsigned char *const first_src_row = src_ptr - (2 * src_pixels_per_line);
    /* Horizontal pass over 8 + 5 rows, starting two rows above the block. */
    filter_block2d_first_pass(first_src_row, scratch, src_pixels_per_line, 1, 13, 8, h_taps);
    /* Vertical pass; scratch + 16 skips the two leading rows (2 * 8). */
    filter_block2d_second_pass(scratch + 16, dst_ptr, dst_pitch, 8, 8, 8, 8, v_taps);
 }
 /* Six-tap prediction of a block 8 samples wide and 4 rows high.
 * xoffset / yoffset pick the horizontal and vertical kernels from
 * vp8_sub_pel_filters. */
 void vp8_sixtap_predict8x4_c
 (
    unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    unsigned char *dst_ptr,
    int  dst_pitch
 )
 {
    /* Intermediate buffer; the horizontal pass below fills 9 rows of 8. */
    int scratch[13*16];
    const short *const h_taps = vp8_sub_pel_filters[xoffset];   /* 6 tap */
    const short *const v_taps = vp8_sub_pel_filters[yoffset];   /* 6 tap */
    unsigned char *const first_src_row = src_ptr - (2 * src_pixels_per_line);
    /* Horizontal pass over 4 + 5 rows, starting two rows above the block. */
    filter_block2d_first_pass(first_src_row, scratch, src_pixels_per_line, 1, 9, 8, h_taps);
    /* Vertical pass; scratch + 16 skips the two leading rows (2 * 8). */
    filter_block2d_second_pass(scratch + 16, dst_ptr, dst_pitch, 8, 8, 4, 8, v_taps);
 }
 /* 16x16 six-tap prediction.  xoffset / yoffset pick the horizontal and
 * vertical kernels from vp8_sub_pel_filters. */
 void vp8_sixtap_predict16x16_c
 (
    unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    unsigned char *dst_ptr,
    int  dst_pitch
 )
 {
    /* Intermediate buffer; the horizontal pass below fills 21 rows of 16. */
    int scratch[21*24];
    const short *const h_taps = vp8_sub_pel_filters[xoffset];   /* 6 tap */
    const short *const v_taps = vp8_sub_pel_filters[yoffset];   /* 6 tap */
    unsigned char *const first_src_row = src_ptr - (2 * src_pixels_per_line);
    /* Horizontal pass over 16 + 5 rows, starting two rows above the block. */
    filter_block2d_first_pass(first_src_row, scratch, src_pixels_per_line, 1, 21, 16, h_taps);
    /* Vertical pass; scratch + 32 skips the two leading rows (2 * 16). */
    filter_block2d_second_pass(scratch + 32, dst_ptr, dst_pitch, 16, 16, 16, 16, v_taps);
 }
 /****************************************************************************
 *
 *  ROUTINE       : filter_block2d_bil_first_pass
 *
 *  INPUTS        : unsigned char *src_ptr    : Pointer to source block.
 *                  unsigned int   src_stride : Stride of source block.
 *                  unsigned int   height     : Number of rows to produce.
 *                  unsigned int   width      : Number of samples per row.
 *                  const short   *vp8_filter : Array of 2 bi-linear taps.
 *
 *  OUTPUTS       : unsigned short *dst_ptr   : Filtered block, stored densely
 *                                              (row stride == width).
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to each sample and
 *                  its right-hand neighbour.  Used as the first pass of the
 *                  2-D separable bi-linear filter.
 *
 *  SPECIAL NOTES : src_ptr[1] is read for every sample, so one sample past
 *                  the last column of each row is accessed.
 *                  Two filter taps should sum to VP8_FILTER_WEIGHT.
 *
 ****************************************************************************/
 static void filter_block2d_bil_first_pass
 (
    unsigned char  *src_ptr,
    unsigned short *dst_ptr,
    unsigned int    src_stride,
    unsigned int    height,
    unsigned int    width,
    const short    *vp8_filter
 )
 {
    unsigned int row, col;
    for (row = 0; row < height; ++row)
    {
        for (col = 0; col < width; ++col)
        {
            /* Weighted sum of the current sample and its right neighbour,
             * plus the rounding term. */
            const int weighted = ((int)src_ptr[0] * vp8_filter[0]) +
                                 ((int)src_ptr[1] * vp8_filter[1]) +
                                 (VP8_FILTER_WEIGHT / 2);
            dst_ptr[col] = weighted >> VP8_FILTER_SHIFT;
            ++src_ptr;
        }
        /* Move both pointers to the start of their next row. */
        src_ptr += src_stride - width;
        dst_ptr += width;
    }
 }
 /****************************************************************************
 *
 *  ROUTINE       : filter_block2d_bil_second_pass
 *
 *  INPUTS        : unsigned short *src_ptr   : Dense intermediate block
 *                                              (row stride == width).
 *                  int             dst_pitch : Destination block pitch.
 *                  unsigned int    height    : Block height.
 *                  unsigned int    width     : Block width.
 *                  const short    *vp8_filter: Array of 2 bi-linear taps.
 *
 *  OUTPUTS       : unsigned char *dst_ptr    : Pointer to filtered block.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to each sample and
 *                  the sample one row below it (src_ptr[width]).  Used as the
 *                  second pass of the 2-D separable bi-linear filter.
 *
 *  SPECIAL NOTES : Reads height + 1 rows of the intermediate block.
 *                  Two filter taps should sum to VP8_FILTER_WEIGHT.
 *
 ****************************************************************************/
 static void filter_block2d_bil_second_pass
 (
    unsigned short *src_ptr,
    unsigned char  *dst_ptr,
    int             dst_pitch,
    unsigned int    height,
    unsigned int    width,
    const short    *vp8_filter
 )
 {
    unsigned int  row, col;
    for (row = 0; row < height; ++row)
    {
        for (col = 0; col < width; ++col)
        {
            /* Weighted sum of the current sample and the one directly
             * below it, plus the rounding term. */
            const int blended = ((int)src_ptr[0]     * vp8_filter[0]) +
                                ((int)src_ptr[width] * vp8_filter[1]) +
                                (VP8_FILTER_WEIGHT / 2);
            dst_ptr[col] = (unsigned int)(blended >> VP8_FILTER_SHIFT);
            ++src_ptr;
        }
        /* The intermediate block is dense, so only dst needs stepping. */
        dst_ptr += dst_pitch;
    }
 }
 /****************************************************************************
 *
 *  ROUTINE       : filter_block2d_bil
 *
 *  INPUTS        : unsigned char *src_ptr   : Pointer to source block.
 *                  unsigned int   src_pitch : Stride of source block.
 *                  unsigned int   dst_pitch : Stride of destination block.
 *                  const short   *HFilter   : Array of 2 horizontal taps.
 *                  const short   *VFilter   : Array of 2 vertical taps.
 *                  int            Width     : Block width
 *                  int            Height    : Block height
 *
 *  OUTPUTS       : unsigned char *dst_ptr   : Pointer to filtered block.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 2-D filters an input block by applying a 2-tap
 *                  bi-linear filter horizontally followed by a 2-tap
 *                  bi-linear filter vertically on the result.
 *
 *  SPECIAL NOTES : The scratch buffer holds 17 rows of 16 samples, so the
 *                  largest block that can be handled here is 16x16.
 *
 ****************************************************************************/
 static void filter_block2d_bil
 (
    unsigned char *src_ptr,
    unsigned char *dst_ptr,
    unsigned int   src_pitch,
    unsigned int   dst_pitch,
    const short   *HFilter,
    const short   *VFilter,
    int            Width,
    int            Height
 )
 {
    /* Horizontally filtered rows; one extra row feeds the vertical taps. */
    unsigned short scratch[17*16];
    /* Horizontal pass over Height + 1 rows ... */
    filter_block2d_bil_first_pass(src_ptr, scratch, src_pitch, Height + 1, Width, HFilter);
    /* ... then the vertical pass into the destination. */
    filter_block2d_bil_second_pass(scratch, dst_ptr, dst_pitch, Height, Width, VFilter);
 }
 /* 4x4 bilinear prediction.  xoffset / yoffset pick the horizontal and
 * vertical 2-tap kernels from vp8_bilinear_filters; the filtering itself is
 * done by filter_block2d_bil().
 * A disabled (#if 0) block that compared the result against an MMX routine
 * was dropped: it was never compiled and referred to a function that is not
 * declared in this file. */
 void vp8_bilinear_predict4x4_c
 (
    unsigned char  *src_ptr,
    int   src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    unsigned char *dst_ptr,
    int dst_pitch
 )
 {
    const short *HFilter;
    const short *VFilter;
    HFilter = vp8_bilinear_filters[xoffset];
    VFilter = vp8_bilinear_filters[yoffset];
    filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
 }
 /* 8x8 bilinear prediction.  xoffset / yoffset pick the horizontal and
 * vertical 2-tap kernels from vp8_bilinear_filters. */
 void vp8_bilinear_predict8x8_c
 (
    unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    unsigned char *dst_ptr,
    int  dst_pitch
 )
 {
    filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch,
                       vp8_bilinear_filters[xoffset],
                       vp8_bilinear_filters[yoffset],
                       8, 8);
 }
 /* Bilinear prediction of a block 8 samples wide and 4 rows high.
 * xoffset / yoffset pick the horizontal and vertical 2-tap kernels from
 * vp8_bilinear_filters. */
 void vp8_bilinear_predict8x4_c
 (
    unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    unsigned char *dst_ptr,
    int  dst_pitch
 )
 {
    filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch,
                       vp8_bilinear_filters[xoffset],
                       vp8_bilinear_filters[yoffset],
                       8, 4);
 }
 /* 16x16 bilinear prediction.  xoffset / yoffset pick the horizontal and
 * vertical 2-tap kernels from vp8_bilinear_filters. */
 void vp8_bilinear_predict16x16_c
 (
    unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    unsigned char *dst_ptr,
    int  dst_pitch
 )
 {
    filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch,
                       vp8_bilinear_filters[xoffset],
                       vp8_bilinear_filters[yoffset],
                       16, 16);
 }
--- a/vp8/common/filter.h
+++ b/vp8/common/filter.h
@@ -1,32 +0,0 @@
 /*
 *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #ifndef VP8_COMMON_FILTER_H_
 #define VP8_COMMON_FILTER_H_
 #include "vpx_ports/mem.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 /* NOTE(review): presumably the side length of a 4x4 sub-block; no user is
 * visible in this header - confirm before relying on it. */
 #define BLOCK_HEIGHT_WIDTH 4
 /* Sum of the taps of every prediction kernel; equals 1 << VP8_FILTER_SHIFT. */
 #define VP8_FILTER_WEIGHT 128
 /* Right shift that normalises a filtered sum back to pixel scale. */
 #define VP8_FILTER_SHIFT  7
 /* 2-tap bilinear kernels, 8 rows of 2 taps (defined in filter.c). */
 extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]);
 /* 6-tap sub-pixel kernels, 8 rows of 6 taps (defined in filter.c). */
 extern DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 #endif  // VP8_COMMON_FILTER_H_
--- a/vp8/common/findnearmv.c
+++ b/vp8/common/findnearmv.c
@@ -1,193 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "findnearmv.h"
 /* Four rows of 16 offsets.  The non-zero prefixes are {0, 8}, {0, 2},
 * {0, 2, 8, 10} and 0..15; the remaining entries are zero padding.
 * NOTE(review): presumably each row lists the index of the first 4x4
 * sub-block of every partition for one macroblock split layout - confirm
 * against the callers, which are not part of this file. */
 const unsigned char vp8_mbsplit_offset[4][16] = {
    { 0,  8,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
    { 0,  2,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
    { 0,  2,  8, 10,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
    { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15}
 };
 /* Predict motion vectors using those from already-decoded nearby blocks.
 * Only the macroblock-level MV (mbmi.mv) of each neighbour is considered.
 *
 * The neighbours examined are the macroblocks above (here minus
 * xd->mode_info_stride), to the left (here - 1) and above-left.  Intra
 * neighbours are ignored.  Each inter neighbour contributes a weight
 * (above: 2, left: 2, above-left: 1):
 *   - a zero MV adds its weight to cnt[CNT_INTRA] (index 0);
 *   - a non-zero MV is sign-corrected with mv_bias() and either appended to
 *     near_mvs[] as a new candidate or, when it equals the most recently
 *     appended candidate, merged into that candidate's counter.
 *
 * Outputs:
 *   nearest, nearby : the candidates in slots CNT_NEAREST / CNT_NEAR after
 *                     the optional swap (zero when no such candidate exists).
 *   best_mv         : near_mvs[0], which is the nearest candidate when its
 *                     weight is at least cnt[CNT_INTRA], otherwise zero.
 *   cnt[0..2]       : the accumulated weights described above.
 *   cnt[3]          : overwritten at the end with a count of SPLITMV
 *                     neighbours (above and left weigh 2, above-left 1). */
 void vp8_find_near_mvs
 (
    MACROBLOCKD *xd,
    const MODE_INFO *here,
    int_mv *nearest,
    int_mv *nearby,
    int_mv *best_mv,
    int cnt[4],
    int refframe,
    int *ref_frame_sign_bias
 )
 {
    const MODE_INFO *above = here - xd->mode_info_stride;
    const MODE_INFO *left = here - 1;
    const MODE_INFO *aboveleft = above - 1;
    int_mv            near_mvs[4];
    /* mv and cntx walk near_mvs[] / cnt[] in lock step: both are advanced
     * each time a new distinct candidate MV is appended. */
    int_mv           *mv = near_mvs;
    int             *cntx = cnt;
    enum {CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV};
    /* Zero accumulators.  near_mvs[3] is left unset; it is only read below
     * when cnt[3] is non-zero, i.e. after it has been written. */
    mv[0].as_int = mv[1].as_int = mv[2].as_int = 0;
    cnt[0] = cnt[1] = cnt[2] = cnt[3] = 0;
    /* Process above */
    if (above->mbmi.ref_frame != INTRA_FRAME)
    {
        if (above->mbmi.mv.as_int)
        {
            /* First candidate: store it in near_mvs[1], then negate it if
             * the neighbour's sign bias differs from that of refframe. */
            (++mv)->as_int = above->mbmi.mv.as_int;
            mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame], refframe, mv, ref_frame_sign_bias);
            ++cntx;
        }
        /* cntx still points at cnt[0] when the MV was zero. */
        *cntx += 2;
    }
    /* Process left */
    if (left->mbmi.ref_frame != INTRA_FRAME)
    {
        if (left->mbmi.mv.as_int)
        {
            int_mv this_mv;
            this_mv.as_int = left->mbmi.mv.as_int;
            mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame], refframe, &this_mv, ref_frame_sign_bias);
            /* Append only if it differs from the latest candidate;
             * otherwise the weight below is merged into that candidate. */
            if (this_mv.as_int != mv->as_int)
            {
                (++mv)->as_int = this_mv.as_int;
                ++cntx;
            }
            *cntx += 2;
        }
        else
            cnt[CNT_INTRA] += 2;
    }
    /* Process above left */
    if (aboveleft->mbmi.ref_frame != INTRA_FRAME)
    {
        if (aboveleft->mbmi.mv.as_int)
        {
            int_mv this_mv;
            this_mv.as_int = aboveleft->mbmi.mv.as_int;
            mv_bias(ref_frame_sign_bias[aboveleft->mbmi.ref_frame], refframe, &this_mv, ref_frame_sign_bias);
            if (this_mv.as_int != mv->as_int)
            {
                (++mv)->as_int = this_mv.as_int;
                ++cntx;
            }
            *cntx += 1;
        }
        else
            cnt[CNT_INTRA] += 1;
    }
    /* If we have three distinct MV's ... (cnt[3] is only non-zero here when
     * a third candidate was appended, so mv points at near_mvs[3]) */
    if (cnt[CNT_SPLITMV])
    {
        /* See if above-left MV can be merged with NEAREST */
        if (mv->as_int == near_mvs[CNT_NEAREST].as_int)
            cnt[CNT_NEAREST] += 1;
    }
    /* From here on cnt[3] no longer holds a candidate weight: it is replaced
     * by the weighted number of neighbours coded in SPLITMV mode. */
    cnt[CNT_SPLITMV] = ((above->mbmi.mode == SPLITMV)
                        + (left->mbmi.mode == SPLITMV)) * 2
                       + (aboveleft->mbmi.mode == SPLITMV);
    /* Swap near and nearest if necessary */
    if (cnt[CNT_NEAR] > cnt[CNT_NEAREST])
    {
        int tmp;
        tmp = cnt[CNT_NEAREST];
        cnt[CNT_NEAREST] = cnt[CNT_NEAR];
        cnt[CNT_NEAR] = tmp;
        tmp = near_mvs[CNT_NEAREST].as_int;
        near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int;
        near_mvs[CNT_NEAR].as_int = tmp;
    }
    /* Use near_mvs[0] to store the "best" MV */
    if (cnt[CNT_NEAREST] >= cnt[CNT_INTRA])
        near_mvs[CNT_INTRA] = near_mvs[CNT_NEAREST];
    /* Set up return values */
    best_mv->as_int = near_mvs[0].as_int;
    nearest->as_int = near_mvs[CNT_NEAREST].as_int;
    nearby->as_int = near_mvs[CNT_NEAR].as_int;
 }
 /* Stores the negated row/col of *src in *inv, then clamps both vectors in
 * place with vp8_clamp_mv2().  Note that *src itself is modified by the
 * clamp. */
 static void invert_and_clamp_mvs(int_mv *inv, int_mv *src, MACROBLOCKD *xd)
 {
    inv->as_mv.row = -src->as_mv.row;
    inv->as_mv.col = -src->as_mv.col;
    vp8_clamp_mv2(inv, xd);
    vp8_clamp_mv2(src, xd);
 }
 /* Runs vp8_find_near_mvs() and stores the nearest / near / best vectors in
 * the slots indexed by the sign bias of refframe.  The slots for the
 * opposite sign bias receive the negated vectors; every vector, original
 * and negated, is clamped with vp8_clamp_mv2().
 * Returns ref_frame_sign_bias[refframe]. */
 int vp8_find_near_mvs_bias
 (
    MACROBLOCKD *xd,
    const MODE_INFO *here,
    int_mv mode_mv_sb[2][MB_MODE_COUNT],
    int_mv best_mv_sb[2],
    int cnt[4],
    int refframe,
    int *ref_frame_sign_bias
 )
 {
    const int bias = ref_frame_sign_bias[refframe];
    const int opposite = !bias;
    int_mv *const same_modes = mode_mv_sb[bias];
    int_mv *const flipped_modes = mode_mv_sb[opposite];
    vp8_find_near_mvs(xd, here,
                      same_modes + NEARESTMV,
                      same_modes + NEARMV,
                      best_mv_sb + bias,
                      cnt, refframe, ref_frame_sign_bias);
    /* Mirror each result into the opposite-bias slot. */
    invert_and_clamp_mvs(flipped_modes + NEARESTMV, same_modes + NEARESTMV, xd);
    invert_and_clamp_mvs(flipped_modes + NEARMV, same_modes + NEARMV, xd);
    invert_and_clamp_mvs(best_mv_sb + opposite, best_mv_sb + bias, xd);
    return bias;
 }
 /* Fills p[0..3] from vp8_mode_contexts: entry i is looked up with the
 * neighbour count near_mv_ref_ct[i] as the row and i as the column.
 * Returns p. */
 vp8_prob *vp8_mv_ref_probs(
    vp8_prob p[VP8_MVREFS-1], const int near_mv_ref_ct[4]
 )
 {
    int i;
    for (i = 0; i < 4; ++i)
        p[i] = vp8_mode_contexts [near_mv_ref_ct[i]] [i];
    return p;
 }
--- a/vp8/common/findnearmv.h
+++ b/vp8/common/findnearmv.h
@@ -1,195 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #ifndef VP8_COMMON_FINDNEARMV_H_
 #define VP8_COMMON_FINDNEARMV_H_
 #include "./vpx_config.h"
 #include "mv.h"
 #include "blockd.h"
 #include "modecont.h"
 #include "treecoder.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 /* Negates both components of *mvp when refmb_ref_frame_sign_bias differs
 * from ref_frame_sign_bias[refframe]; leaves *mvp untouched otherwise. */
 static INLINE void mv_bias(int refmb_ref_frame_sign_bias, int refframe,
                           int_mv *mvp, const int *ref_frame_sign_bias)
 {
    if (refmb_ref_frame_sign_bias == ref_frame_sign_bias[refframe])
        return;
    mvp->as_mv.row = -mvp->as_mv.row;
    mvp->as_mv.col = -mvp->as_mv.col;
 }
 #define LEFT_TOP_MARGIN (16 << 3)
 #define RIGHT_BOTTOM_MARGIN (16 << 3)
 /* Clamps *mv in place to the macroblock edge distances stored in xd,
 * widened by LEFT_TOP_MARGIN on the left / top and RIGHT_BOTTOM_MARGIN on
 * the right / bottom. */
 static INLINE void vp8_clamp_mv2(int_mv *mv, const MACROBLOCKD *xd)
 {
    const int col_min = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
    const int col_max = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
    const int row_min = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
    const int row_max = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
    if (mv->as_mv.col < col_min)
        mv->as_mv.col = col_min;
    else if (mv->as_mv.col > col_max)
        mv->as_mv.col = col_max;
    if (mv->as_mv.row < row_min)
        mv->as_mv.row = row_min;
    else if (mv->as_mv.row > row_max)
        mv->as_mv.row = row_max;
 }
 /* Clamps *mv in place to the caller-supplied limits: the column to
 * [mb_to_left_edge, mb_to_right_edge] and the row to
 * [mb_to_top_edge, mb_to_bottom_edge].  The lower bound is applied first,
 * then the upper bound. */
 static INLINE void vp8_clamp_mv(int_mv *mv, int mb_to_left_edge,
                                int mb_to_right_edge, int mb_to_top_edge,
                                int mb_to_bottom_edge)
 {
    if (mv->as_mv.col < mb_to_left_edge)
        mv->as_mv.col = mb_to_left_edge;
    if (mv->as_mv.col > mb_to_right_edge)
        mv->as_mv.col = mb_to_right_edge;
    if (mv->as_mv.row < mb_to_top_edge)
        mv->as_mv.row = mb_to_top_edge;
    if (mv->as_mv.row > mb_to_bottom_edge)
        mv->as_mv.row = mb_to_bottom_edge;
 }
 /* Returns 1 when *mv lies outside the given limits in either component
 * (i.e. it would be altered by vp8_clamp_mv with the same limits), and 0
 * otherwise.  *mv is not modified. */
 static INLINE unsigned int vp8_check_mv_bounds(int_mv *mv, int mb_to_left_edge,
                                               int mb_to_right_edge,
                                               int mb_to_top_edge,
                                               int mb_to_bottom_edge)
 {
    return (mv->as_mv.col < mb_to_left_edge) ||
           (mv->as_mv.col > mb_to_right_edge) ||
           (mv->as_mv.row < mb_to_top_edge) ||
           (mv->as_mv.row > mb_to_bottom_edge);
 }
 /* Derives the nearest, near and best candidate motion vectors for the
 * macroblock at `here` from its above, left and above-left neighbours, and
 * fills near_mv_ref_cts[4] with the associated counts (see findnearmv.c). */
 void vp8_find_near_mvs
 (
    MACROBLOCKD *xd,
    const MODE_INFO *here,
    int_mv *nearest, int_mv *nearby, int_mv *best,
    int near_mv_ref_cts[4],
    int refframe,
    int *ref_frame_sign_bias
 );
 /* Variant of vp8_find_near_mvs() that fills the results for both sign-bias
 * values (the opposite one holds the negated, clamped vectors) and returns
 * ref_frame_sign_bias[refframe]. */
 int vp8_find_near_mvs_bias
 (
    MACROBLOCKD *xd,
    const MODE_INFO *here,
    int_mv mode_mv_sb[2][MB_MODE_COUNT],
    int_mv best_mv_sb[2],
    int cnt[4],
    int refframe,
    int *ref_frame_sign_bias
 );
 /* Fills p[0..3] from vp8_mode_contexts using the four counts and returns p. */
 vp8_prob *vp8_mv_ref_probs(
    vp8_prob p[VP8_MVREFS-1], const int near_mv_ref_ct[4]
 );
 /* Offset table defined in findnearmv.c. */
 extern const unsigned char vp8_mbsplit_offset[4][16];
 /* Returns the packed MV (as_int) of the block to the left of block b.
 * When b is in the leftmost column (b & 3 == 0) the value comes from the
 * macroblock before cur_mb: its macroblock-level MV unless that macroblock
 * is SPLITMV, in which case entry b + 3 of its bmi array is used. */
 static INLINE uint32_t left_block_mv(const MODE_INFO *cur_mb, int b)
 {
    const MODE_INFO *left_mb;
    if (b & 3)
        return cur_mb->bmi[b - 1].mv.as_int;
    /* On L edge, get from MB to left of us */
    left_mb = cur_mb - 1;
    if (left_mb->mbmi.mode != SPLITMV)
        return left_mb->mbmi.mv.as_int;
    return left_mb->bmi[b + 3].mv.as_int;
 }
 /* Returns the packed MV (as_int) of the block above block b.
 * When b is in the top row (b >> 2 == 0) the value comes from the
 * macroblock mi_stride entries before cur_mb: its macroblock-level MV
 * unless that macroblock is SPLITMV, in which case entry b + 12 of its bmi
 * array is used. */
 static INLINE uint32_t above_block_mv(const MODE_INFO *cur_mb, int b,
                                      int mi_stride)
 {
    const MODE_INFO *above_mb;
    if (b >> 2)
        return cur_mb->bmi[b - 4].mv.as_int;
    /* On top edge, get from MB above us */
    above_mb = cur_mb - mi_stride;
    if (above_mb->mbmi.mode != SPLITMV)
        return above_mb->mbmi.mv.as_int;
    return above_mb->bmi[b + 12].mv.as_int;
 }
 /* Returns the prediction mode of the block to the left of block b.
 * When b is in the leftmost column the macroblock before cur_mb is
 * consulted: for B_PRED the mode of its block b + 3 is returned, otherwise
 * its macroblock mode is mapped to the matching block mode (anything
 * unlisted maps to B_DC_PRED). */
 static INLINE B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b)
 {
    const MODE_INFO *left_mb;
    if (b & 3)
        return cur_mb->bmi[b - 1].as_mode;
    /* On L edge, get from MB to left of us */
    left_mb = cur_mb - 1;
    switch (left_mb->mbmi.mode)
    {
        case B_PRED:
            return left_mb->bmi[b + 3].as_mode;
        case V_PRED:
            return B_VE_PRED;
        case H_PRED:
            return B_HE_PRED;
        case TM_PRED:
            return B_TM_PRED;
        case DC_PRED:
        default:
            return B_DC_PRED;
    }
 }
 /* Returns the prediction mode of the block above block b.
 * When b is in the top row the macroblock mi_stride entries before cur_mb
 * is consulted: for B_PRED the mode of its block b + 12 is returned,
 * otherwise its macroblock mode is mapped to the matching block mode
 * (anything unlisted maps to B_DC_PRED). */
 static INLINE B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, int b,
                                                 int mi_stride)
 {
    const MODE_INFO *above_mb;
    if (b >> 2)
        return cur_mb->bmi[b - 4].as_mode;
    /* On top edge, get from MB above us */
    above_mb = cur_mb - mi_stride;
    switch (above_mb->mbmi.mode)
    {
        case B_PRED:
            return above_mb->bmi[b + 12].as_mode;
        case V_PRED:
            return B_VE_PRED;
        case H_PRED:
            return B_HE_PRED;
        case TM_PRED:
            return B_TM_PRED;
        case DC_PRED:
        default:
            return B_DC_PRED;
    }
 }
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 #endif  // VP8_COMMON_FINDNEARMV_H_
--- a/vp8/common/generic/systemdependent.c
+++ b/vp8/common/generic/systemdependent.c
@@ -1,104 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
 #if ARCH_ARM
 #include "vpx_ports/arm.h"
 #elif ARCH_X86 || ARCH_X86_64
 #include "vpx_ports/x86.h"
 #endif
 #include "vp8/common/onyxc_int.h"
 #include "vp8/common/systemdependent.h"
 #if CONFIG_MULTITHREAD
 #if HAVE_UNISTD_H && !defined(__OS2__)
 #include <unistd.h>
 #elif defined(_WIN32)
 #include <windows.h>
 typedef void (WINAPI *PGNSI)(LPSYSTEM_INFO);
 #elif defined(__OS2__)
 #define INCL_DOS
 #define INCL_DOSSPINLOCK
 #include <os2.h>
 #endif
 #endif
 #if CONFIG_MULTITHREAD
 /* Returns the number of processors reported by the platform, never less
 * than 1.
 *   - With <unistd.h> (and not OS/2): sysconf() with _SC_NPROCESSORS_ONLN
 *     or, failing that, _SC_NPROC_ONLN.
 *   - Windows: dwNumberOfProcessors from GetNativeSystemInfo(), falling
 *     back to GetSystemInfo() when the former cannot be resolved.
 *   - OS/2: counts the processors whose status is PROC_ONLINE.
 *   - Anything else, or when neither sysconf name is defined: the default
 *     of 16 is returned.
 * A non-positive result (e.g. a failed sysconf() call) is reported as 1.
 * The parameter list is spelled (void) so the definition is a proper
 * prototype rather than an old-style declaration. */
 static int get_cpu_count(void)
 {
    int core_count = 16;
 #if HAVE_UNISTD_H && !defined(__OS2__)
 #if defined(_SC_NPROCESSORS_ONLN)
    core_count = sysconf(_SC_NPROCESSORS_ONLN);
 #elif defined(_SC_NPROC_ONLN)
    core_count = sysconf(_SC_NPROC_ONLN);
 #endif
 #elif defined(_WIN32)
    {
 #if _WIN32_WINNT >= 0x0501
        SYSTEM_INFO sysinfo;
        GetNativeSystemInfo(&sysinfo);
 #else
        PGNSI pGNSI;
        SYSTEM_INFO sysinfo;
        /* Call GetNativeSystemInfo if supported or
         * GetSystemInfo otherwise. */
        pGNSI = (PGNSI) GetProcAddress(
                GetModuleHandle(TEXT("kernel32.dll")), "GetNativeSystemInfo");
        if (pGNSI != NULL)
            pGNSI(&sysinfo);
        else
            GetSystemInfo(&sysinfo);
 #endif
        core_count = sysinfo.dwNumberOfProcessors;
    }
 #elif defined(__OS2__)
    {
        ULONG proc_id;
        ULONG status;
        core_count = 0;
        /* Processor ids start at 1; stop at the first id the system
         * rejects. */
        for (proc_id = 1; ; proc_id++)
        {
            if (DosGetProcessorStatus(proc_id, &status))
                break;
            if (status == PROC_ONLINE)
                core_count++;
        }
    }
 #else
    /* other platforms */
 #endif
    return core_count > 0 ? core_count : 1;
 }
 #endif
 void vp8_clear_system_state_c() {};
 /* Records build- and machine-dependent properties in ctx:
 *   - processor_core_count, from get_cpu_count(), when CONFIG_MULTITHREAD
 *     is enabled;
 *   - cpu_caps, from arm_cpu_caps() on ARM builds or x86_simd_caps() on
 *     x86 / x86-64 builds.
 * On other configurations the corresponding fields are left untouched. */
 void vp8_machine_specific_config(VP8_COMMON *ctx)
 {
 #if CONFIG_MULTITHREAD
    ctx->processor_core_count = get_cpu_count();
 #endif /* CONFIG_MULTITHREAD */
 #if ARCH_ARM
    ctx->cpu_caps = arm_cpu_caps();
 #elif ARCH_X86 || ARCH_X86_64
    ctx->cpu_caps = x86_simd_caps();
 #endif
 }
--- a/vp8/common/header.h
+++ b/vp8/common/header.h
@@ -1,51 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #ifndef VP8_COMMON_HEADER_H_
 #define VP8_COMMON_HEADER_H_
 #ifdef __cplusplus
 extern "C" {
 #endif
 /* Frame header fields.  Without PACKET_TESTING the four bit-fields add up
 * to 1 + 3 + 1 + 19 = 24 bits, matching VP8_HEADER_SIZE == 3 bytes. */
 typedef struct
 {
    unsigned int type: 1;
    unsigned int version: 3;
    unsigned int show_frame: 1;
    /* 19 bits: the first partition can be at most 2^19 - 1 bytes long
     * (just under 4 megabits). */
    unsigned int first_partition_length_in_bytes: 19;
 #ifdef PACKET_TESTING
    /* Extra fields that exist only in PACKET_TESTING builds. */
    unsigned int frame_number;
    unsigned int update_gold: 1;
    unsigned int uses_gold: 1;
    unsigned int update_last: 1;
    unsigned int uses_last: 1;
 #endif
 } VP8_HEADER;
 /* Header size in bytes: 8 in PACKET_TESTING builds, otherwise 3. */
 #ifdef PACKET_TESTING
 #define VP8_HEADER_SIZE 8
 #else
 #define VP8_HEADER_SIZE 3
 #endif
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 #endif  // VP8_COMMON_HEADER_H_
--- a/vp8/common/idct_blk.c
+++ b/vp8/common/idct_blk.c
@@ -1,90 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
 #include "vpx_mem/vpx_mem.h"
 void vp8_dequant_idct_add_c(short *input, short *dq,
                            unsigned char *dest, int stride);
 void vp8_dc_only_idct_add_c(short input_dc, unsigned char * pred,
                            int pred_stride, unsigned char *dst_ptr,
                            int dst_stride);
 /* Dequantises, inverse-transforms and adds the sixteen 4x4 blocks of a
 * 4x4 grid in raster order.
 *   q     : coefficients, 16 shorts per block, advanced block by block.
 *   dq    : dequantisation factors; the same pointer is used for every block.
 *   dst   : top-left of the destination area, rows `stride` bytes apart.
 *   eobs  : one entry per block.  Blocks with eob > 1 go through
 *           vp8_dequant_idct_add_c(); all others take the DC-only path,
 *           after which the first two coefficients of the block are zeroed. */
 void vp8_dequant_idct_add_y_block_c
            (short *q, short *dq,
             unsigned char *dst, int stride, char *eobs)
 {
    int row, col;
    for (row = 0; row < 4; ++row)
    {
        unsigned char *blk = dst;
        for (col = 0; col < 4; ++col, q += 16, blk += 4)
        {
            if (*eobs++ > 1)
            {
                vp8_dequant_idct_add_c (q, dq, blk, stride);
                continue;
            }
            vp8_dc_only_idct_add_c (q[0]*dq[0], blk, stride, blk, stride);
            memset(q, 0, 2 * sizeof(q[0]));
        }
        /* Next row of blocks starts four pixel rows further down. */
        dst += 4*stride;
    }
 }
 /* Dequantises, inverse-transforms and adds the 2x2 grid of 4x4 blocks of
 * the U plane followed by the 2x2 grid of the V plane.
 *   q     : coefficients, 16 shorts per block; the four U blocks come first.
 *   dq    : dequantisation factors; the same pointer is used for every block.
 *   dstu  : top-left of the U destination, rows `stride` bytes apart.
 *   dstv  : top-left of the V destination, rows `stride` bytes apart.
 *   eobs  : one entry per block, in the same order as q.  Blocks with
 *           eob > 1 go through vp8_dequant_idct_add_c(); all others take the
 *           DC-only path, after which the first two coefficients are zeroed. */
 void vp8_dequant_idct_add_uv_block_c
            (short *q, short *dq,
             unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
 {
    unsigned char *plane[2];
    int p, row, col;
    plane[0] = dstu;
    plane[1] = dstv;
    for (p = 0; p < 2; ++p)
    {
        unsigned char *row_start = plane[p];
        for (row = 0; row < 2; ++row)
        {
            unsigned char *blk = row_start;
            for (col = 0; col < 2; ++col, q += 16, blk += 4)
            {
                if (*eobs++ > 1)
                {
                    vp8_dequant_idct_add_c (q, dq, blk, stride);
                    continue;
                }
                vp8_dc_only_idct_add_c (q[0]*dq[0], blk, stride, blk, stride);
                memset(q, 0, 2 * sizeof(q[0]));
            }
            /* Next row of blocks starts four pixel rows further down. */
            row_start += 4*stride;
        }
    }
 }
--- a/vp8/common/idctllm.c
+++ b/vp8/common/idctllm.c
@@ -1,205 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "./vp8_rtcd.h"
 /****************************************************************************
 * Notes:
 *
 * This implementation makes use of 16 bit fixed point versions of two multiply
 * constants:
 *         1.   sqrt(2) * cos (pi/8)
 *         2.   sqrt(2) * sin (pi/8)
 * Because the first constant is bigger than 1, to maintain the same 16 bit
 * fixed point precision as the second one, we use a trick of
 *         x * a = x + x*(a-1)
 * so
 *         x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
 **************************************************************************/
 /* Fixed-point multipliers for the 4x4 inverse DCT; products using them are
 * shifted right by 16. */
 static const int cospi8sqrt2minus1 = 20091;
 static const int sinpi8sqrt2      = 35468;
 /* 4x4 inverse DCT of input, added to the prediction and clamped to 0..255.
 *
 * input       : 16 coefficients in raster order (not modified)
 * pred_ptr    : 4x4 prediction, rows pred_stride apart
 * dst_ptr     : 4x4 output, rows dst_stride apart
 *
 * The transform runs on the columns first, then on the rows with a
 * (+4) >> 3 rounding step; intermediate values are stored as short.
 */
 void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr,
                            int pred_stride, unsigned char *dst_ptr,
                            int dst_stride)
 {
    short tmp[16];
    int k, row, col;
    /* pass 1: one column per iteration, results stay in column position */
    for (k = 0; k < 4; k++)
    {
        const int x0 = input[k];
        const int x1 = input[k + 4];
        const int x2 = input[k + 8];
        const int x3 = input[k + 12];
        const int even_sum  = x0 + x2;
        const int even_diff = x0 - x2;
        const int odd_c = ((x1 * sinpi8sqrt2) >> 16)
                          - (x3 + ((x3 * cospi8sqrt2minus1) >> 16));
        const int odd_d = (x1 + ((x1 * cospi8sqrt2minus1) >> 16))
                          + ((x3 * sinpi8sqrt2) >> 16);
        tmp[k]      = even_sum + odd_d;
        tmp[k + 4]  = even_diff + odd_c;
        tmp[k + 8]  = even_diff - odd_c;
        tmp[k + 12] = even_sum - odd_d;
    }
    /* pass 2: one row per iteration, then add prediction and clamp */
    for (row = 0; row < 4; row++)
    {
        const short *line = tmp + row * 4;
        const int x0 = line[0];
        const int x1 = line[1];
        const int x2 = line[2];
        const int x3 = line[3];
        const int even_sum  = x0 + x2;
        const int even_diff = x0 - x2;
        const int odd_c = ((x1 * sinpi8sqrt2) >> 16)
                          - (x3 + ((x3 * cospi8sqrt2minus1) >> 16));
        const int odd_d = (x1 + ((x1 * cospi8sqrt2minus1) >> 16))
                          + ((x3 * sinpi8sqrt2) >> 16);
        short residual[4];
        residual[0] = (even_sum + odd_d + 4) >> 3;
        residual[1] = (even_diff + odd_c + 4) >> 3;
        residual[2] = (even_diff - odd_c + 4) >> 3;
        residual[3] = (even_sum - odd_d + 4) >> 3;
        for (col = 0; col < 4; col++)
        {
            const int px = residual[col] + pred_ptr[col];
            dst_ptr[col] = (unsigned char) (px < 0 ? 0 : (px > 255 ? 255 : px));
        }
        dst_ptr += dst_stride;
        pred_ptr += pred_stride;
    }
 }
 /* DC-only reconstruction: add the rounded value (input_dc + 4) >> 3 to
 * every pixel of the 4x4 prediction and store the result, clamped to
 * 0..255, in the 4x4 destination.  Rows of pred_ptr / dst_ptr are
 * pred_stride / dst_stride apart.
 */
 void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
                            int pred_stride, unsigned char *dst_ptr,
                            int dst_stride)
 {
    const int dc = (input_dc + 4) >> 3;
    int idx;
    for (idx = 0; idx < 16; idx++)
    {
        const int row = idx >> 2;
        const int col = idx & 3;
        int px = dc + pred_ptr[row * pred_stride + col];
        if (px < 0)
            px = 0;
        else if (px > 255)
            px = 255;
        dst_ptr[row * dst_stride + col] = (unsigned char) px;
    }
 }
 /* Inverse 4x4 Walsh-Hadamard transform.
 *
 * input      : 16 coefficients in raster order (not modified)
 * mb_dqcoeff : result i (raster order) is written to mb_dqcoeff[i * 16],
 *              so only every 16th entry is touched
 *
 * Columns are transformed first, then rows with a (+3) >> 3 rounding step.
 */
 void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff)
 {
    short stage[16];
    int k;
    /* vertical pass */
    for (k = 0; k < 4; k++)
    {
        const int outer_sum  = input[k] + input[k + 12];
        const int inner_sum  = input[k + 4] + input[k + 8];
        const int inner_diff = input[k + 4] - input[k + 8];
        const int outer_diff = input[k] - input[k + 12];
        stage[k]      = outer_sum + inner_sum;
        stage[k + 4]  = inner_diff + outer_diff;
        stage[k + 8]  = outer_sum - inner_sum;
        stage[k + 12] = outer_diff - inner_diff;
    }
    /* horizontal pass, scattering each result to its block's first slot */
    for (k = 0; k < 4; k++)
    {
        const short *line = stage + 4 * k;
        short *out = mb_dqcoeff + (4 * k) * 16;
        const int outer_sum  = line[0] + line[3];
        const int inner_sum  = line[1] + line[2];
        const int inner_diff = line[1] - line[2];
        const int outer_diff = line[0] - line[3];
        out[0]  = (outer_sum + inner_sum + 3) >> 3;
        out[16] = (inner_diff + outer_diff + 3) >> 3;
        out[32] = (outer_sum - inner_sum + 3) >> 3;
        out[48] = (outer_diff - inner_diff + 3) >> 3;
    }
 }
 /* DC-only variant of the inverse Walsh transform: the single value
 * (input[0] + 3) >> 3 is written to mb_dqcoeff[i * 16] for i = 0..15.
 */
 void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff)
 {
    const short dc = (short) ((input[0] + 3) >> 3);
    int blk;
    for (blk = 0; blk < 16; ++blk)
        mb_dqcoeff[blk << 4] = dc;
 }
--- a/vp8/common/invtrans.h
+++ b/vp8/common/invtrans.h
@@ -1,70 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #ifndef VP8_COMMON_INVTRANS_H_
 #define VP8_COMMON_INVTRANS_H_
 #include "./vpx_config.h"
 #include "vp8_rtcd.h"
 #include "blockd.h"
 #include "onyxc_int.h"
 #if CONFIG_MULTITHREAD
 #include "vpx_mem/vpx_mem.h"
 #endif
 #ifdef __cplusplus
 extern "C" {
 #endif
 /* For each of 16 blocks (16 coefficients apart in diff), raise a zero eob
 * to 1 when the block's first coefficient is non-zero: the idct can only
 * be skipped if both the dc and the eob are zero.
 */
 static void eob_adjust(char *eobs, short *diff)
 {
    int blk;
    for (blk = 0; blk < 16; ++blk)
    {
        if (eobs[blk] != 0)
            continue;
        if (diff[blk * 16] != 0)
            eobs[blk] = 1;
    }
 }
 /* Inverse-transform the luma of one macroblock into xd->dst.y_buffer.
 *
 * For every mode except SPLITMV, block 24 is first run through the inverse
 * Walsh transform (full version when its eob is above 1, DC-only version
 * otherwise), which writes into xd->qcoeff; the eobs are then adjusted and
 * the dequant_y1_dc factors are used.  For SPLITMV the plain dequant_y1
 * factors are used and no second-order transform is done.
 */
 static INLINE void vp8_inverse_transform_mby(MACROBLOCKD *xd)
 {
    short *dequant;
    if (xd->mode_info_context->mbmi.mode == SPLITMV)
    {
        dequant = xd->dequant_y1;
    }
    else
    {
        /* do 2nd order transform on the dc block */
        if (xd->eobs[24] > 1)
            vp8_short_inv_walsh4x4(&xd->block[24].dqcoeff[0], xd->qcoeff);
        else
            vp8_short_inv_walsh4x4_1(&xd->block[24].dqcoeff[0], xd->qcoeff);
        eob_adjust(xd->eobs, xd->qcoeff);
        dequant = xd->dequant_y1_dc;
    }
    vp8_dequant_idct_add_y_block(xd->qcoeff, dequant,
                                 xd->dst.y_buffer,
                                 xd->dst.y_stride, xd->eobs);
 }
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 #endif  // VP8_COMMON_INVTRANS_H_
--- a/vp8/common/loopfilter.h
+++ b/vp8/common/loopfilter.h
@@ -1,113 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #ifndef VP8_COMMON_LOOPFILTER_H_
 #define VP8_COMMON_LOOPFILTER_H_
 #include "vpx_ports/mem.h"
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 #define MAX_LOOP_FILTER             63
 /* fraction of total macroblock rows to be used in fast filter level picking */
 /* has to be > 2 */
 #define PARTIAL_FRAME_FRACTION      8
 /* Identifies which of the two loop filter variants (normal or simple) is in
 * use. */
 typedef enum
 {
    NORMAL_LOOPFILTER = 0,
    SIMPLE_LOOPFILTER = 1
 } LOOPFILTERTYPE;
 /* Width, in bytes, of each row of the limit tables below and also the
 * alignment requested for them: 1 on ARM builds, 16 otherwise. */
 #if ARCH_ARM
 #define SIMD_WIDTH 1
 #else
 #define SIMD_WIDTH 16
 #endif
 /* Need to align this structure so when it is declared and
 * passed it can be loaded into vector registers.
 */
 typedef struct
 {
    /* Limit tables with MAX_LOOP_FILTER + 1 rows of SIMD_WIDTH bytes each.
     * NOTE(review): presumably one row per filter level, with the value
     * replicated across the row - confirm against the init code. */
    DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
    DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, blim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
    DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
    /* Four rows of high-edge-variance thresholds, SIMD_WIDTH bytes each. */
    DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, hev_thr[4][SIMD_WIDTH]);
    /* Lookup tables; their index meanings are not visible in this header.
     * TODO confirm against the code that fills them. */
    unsigned char lvl[4][4][4];
    unsigned char hev_thr_lut[2][MAX_LOOP_FILTER + 1];
    unsigned char mode_lf_lut[10];
 } loop_filter_info_n;
 /* Threshold pointers handed to the C loop filter entry points.  In the
 * vp8_loop_filter_mb*_c / vp8_loop_filter_b*_c wrappers, mblim is passed as
 * the blimit of the macroblock-edge filters, blim as the blimit of the
 * inner block-edge filters, lim as limit and hev_thr as thresh.
 */
 typedef struct loop_filter_info
 {
    const unsigned char * mblim;
    const unsigned char * blim;
    const unsigned char * lim;
    const unsigned char * hev_thr;
 } loop_filter_info;
 /* Function type for a filter routine that receives both a u and a v pointer
 * together with one shared pitch and the blimit / limit / thresh vectors. */
 typedef void loop_filter_uvfunction
 (
    unsigned char *u,   /* source pointer */
    int p,              /* pitch */
    const unsigned char *blimit,
    const unsigned char *limit,
    const unsigned char *thresh,
    unsigned char *v
 );
 /* assorted loopfilter functions which get used elsewhere */
 /* Forward declarations only: the prototypes below use these structs through
 * pointers, so the full definitions are not required by this header. */
 struct VP8Common;
 struct macroblockd;
 struct modeinfo;
 void vp8_loop_filter_init(struct VP8Common *cm);
 void vp8_loop_filter_frame_init(struct VP8Common *cm,
                                struct macroblockd *mbd,
                                int default_filt_lvl);
 void vp8_loop_filter_frame(struct VP8Common *cm, struct macroblockd *mbd,
                           int frame_type);
 void vp8_loop_filter_partial_frame(struct VP8Common *cm,
                                   struct macroblockd *mbd,
                                   int default_filt_lvl);
 void vp8_loop_filter_frame_yonly(struct VP8Common *cm,
                                 struct macroblockd *mbd,
                                 int default_filt_lvl);
 void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi,
                                      int sharpness_lvl);
 /* Row-level entry points, one per LOOPFILTERTYPE variant; both take the
 * same argument list. */
 void vp8_loop_filter_row_normal(struct VP8Common *cm,
                                struct modeinfo *mode_info_context,
                                int mb_row, int post_ystride, int post_uvstride,
                                unsigned char *y_ptr, unsigned char *u_ptr,
                                unsigned char *v_ptr);
 void vp8_loop_filter_row_simple(struct VP8Common *cm,
                                struct modeinfo *mode_info_context,
                                int mb_row, int post_ystride, int post_uvstride,
                                unsigned char *y_ptr, unsigned char *u_ptr,
                                unsigned char *v_ptr);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 #endif  // VP8_COMMON_LOOPFILTER_H_
--- a/vp8/common/loopfilter_filters.c
+++ b/vp8/common/loopfilter_filters.c
@@ -1,430 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include <stdlib.h>
 #include "loopfilter.h"
 #include "onyxc_int.h"
 typedef unsigned char uc;
 /* Saturate t to the range -128..127 and return it as a signed char. */
 static signed char vp8_signed_char_clamp(int t)
 {
    if (t < -128)
        return (signed char) -128;
    if (t > 127)
        return (signed char) 127;
    return (signed char) t;
 }
 /* Should the edge be filtered at all?  Returns -1 (all bits set) when every
 * adjacent-pixel difference on both sides is within limit and the
 * cross-edge measure |p0 - q0| * 2 + |p1 - q1| / 2 is within blimit;
 * returns 0 as soon as any of them is exceeded.
 */
 static signed char vp8_filter_mask(uc limit, uc blimit,
                            uc p3, uc p2, uc p1, uc p0,
                            uc q0, uc q1, uc q2, uc q3)
 {
    const int over_limit =
        (abs(p3 - p2) > limit) |
        (abs(p2 - p1) > limit) |
        (abs(p1 - p0) > limit) |
        (abs(q1 - q0) > limit) |
        (abs(q2 - q1) > limit) |
        (abs(q3 - q2) > limit) |
        (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit);
    /* over_limit is 0 or 1, so this yields -1 (filter) or 0 (skip) */
    return (signed char) (over_limit - 1);
 }
 /* High edge variance test: returns -1 (all bits set) when |p1 - p0| or
 * |q1 - q0| exceeds thresh, 0 otherwise.
 */
 static signed char vp8_hevmask(uc thresh, uc p1, uc p0, uc q0, uc q1)
 {
    const int high_variance =
        (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh);
    return (signed char) (high_variance ? -1 : 0);
 }
 /* Filter the four pixels straddling an edge in place: op1/op0 lie on one
 * side, oq0/oq1 on the other.  mask and hev are expected to be all-ones or
 * zero bit masks (the form returned by vp8_filter_mask and vp8_hevmask).
 * With mask == 0 every adjustment evaluates to zero and the pixels are
 * written back unchanged.
 */
 static void vp8_filter(signed char mask, uc hev, uc *op1,
        uc *op0, uc *oq0, uc *oq1)
 {
    signed char ps0, qs0;
    signed char ps1, qs1;
    signed char filter_value, Filter1, Filter2;
    signed char u;
    /* flip the top bit to move the unsigned pixels into signed char range */
    ps1 = (signed char) * op1 ^ 0x80;
    ps0 = (signed char) * op0 ^ 0x80;
    qs0 = (signed char) * oq0 ^ 0x80;
    qs1 = (signed char) * oq1 ^ 0x80;
    /* add outer taps if we have high edge variance */
    filter_value = vp8_signed_char_clamp(ps1 - qs1);
    filter_value &= hev;
    /* inner taps */
    filter_value = vp8_signed_char_clamp(filter_value + 3 * (qs0 - ps0));
    filter_value &= mask;
    /* save bottom 3 bits so that we round one side +4 and the other +3
     * if it equals 4 we'll set to adjust by -1 to account for the fact
     * we'd round 3 the other way
     */
    Filter1 = vp8_signed_char_clamp(filter_value + 4);
    Filter2 = vp8_signed_char_clamp(filter_value + 3);
    Filter1 >>= 3;
    Filter2 >>= 3;
    /* inner pixels: q0 moves by -Filter1, p0 by +Filter2; flip the top bit
     * back when storing */
    u = vp8_signed_char_clamp(qs0 - Filter1);
    *oq0 = u ^ 0x80;
    u = vp8_signed_char_clamp(ps0 + Filter2);
    *op0 = u ^ 0x80;
    filter_value = Filter1;
    /* outer tap adjustments */
    /* (Filter1 + 1) >> 1, applied only where hev is clear */
    filter_value += 1;
    filter_value >>= 1;
    filter_value &= ~hev;
    u = vp8_signed_char_clamp(qs1 - filter_value);
    *oq1 = u ^ 0x80;
    u = vp8_signed_char_clamp(ps1 + filter_value);
    *op1 = u ^ 0x80;
 }
 /* Run the normal edge filter along a horizontal edge.  s addresses the
 * first pixel of the row just below the edge (s[-p] is the row above it);
 * count * 8 consecutive columns are processed, and at least one column is
 * always processed.  Only the first byte of blimit, limit and thresh is
 * read.
 */
 void vp8_loop_filter_horizontal_edge_c
 (
    unsigned char *s,
    int p, /* pitch */
    const unsigned char *blimit,
    const unsigned char *limit,
    const unsigned char *thresh,
    int count
 )
 {
    int done = 0;
    const int total = count * 8;
    /* loop filter designed to work using chars so that we can make maximum use
     * of 8 bit simd instructions.
     */
    do
    {
        const signed char mask = vp8_filter_mask(limit[0], blimit[0],
                                                 s[-4*p], s[-3*p], s[-2*p], s[-1*p],
                                                 s[0*p], s[1*p], s[2*p], s[3*p]);
        /* high edge variance */
        const int hev = vp8_hevmask(thresh[0], s[-2*p], s[-1*p], s[0*p], s[1*p]);
        vp8_filter(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p);
        ++s;
    }
    while (++done < total);
 }
 /* Run the normal edge filter along a vertical edge.  s addresses the first
 * pixel to the right of the edge (s[-1] is the pixel to its left); count * 8
 * consecutive rows, p bytes apart, are processed, and at least one row is
 * always processed.  Only the first byte of blimit, limit and thresh is
 * read.
 */
 void vp8_loop_filter_vertical_edge_c
 (
    unsigned char *s,
    int p,
    const unsigned char *blimit,
    const unsigned char *limit,
    const unsigned char *thresh,
    int count
 )
 {
    int done = 0;
    const int total = count * 8;
    /* loop filter designed to work using chars so that we can make maximum use
     * of 8 bit simd instructions.
     */
    do
    {
        const signed char mask = vp8_filter_mask(limit[0], blimit[0],
                                                 s[-4], s[-3], s[-2], s[-1],
                                                 s[0], s[1], s[2], s[3]);
        /* high edge variance */
        const int hev = vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
        vp8_filter(mask, hev, s - 2, s - 1, s, s + 1);
        s += p;
    }
    while (++done < total);
 }
 /* Macroblock-edge filter: adjusts three pixels on each side of an edge in
 * place (op2/op1/op0 on one side, oq0/oq1/oq2 on the other).  mask and hev
 * are expected to be all-ones or zero bit masks (the form returned by
 * vp8_filter_mask and vp8_hevmask).  Where hev is set only the two inner
 * pixels receive the short +4/+3 rounded adjustment; where it is clear the
 * wider 27/18/9 taps are applied to all six pixels.
 */
 static void vp8_mbfilter(signed char mask, uc hev,
                           uc *op2, uc *op1, uc *op0, uc *oq0, uc *oq1, uc *oq2)
 {
    signed char s, u;
    signed char filter_value, Filter1, Filter2;
    /* flip the top bit to move the unsigned pixels into signed char range */
    signed char ps2 = (signed char) * op2 ^ 0x80;
    signed char ps1 = (signed char) * op1 ^ 0x80;
    signed char ps0 = (signed char) * op0 ^ 0x80;
    signed char qs0 = (signed char) * oq0 ^ 0x80;
    signed char qs1 = (signed char) * oq1 ^ 0x80;
    signed char qs2 = (signed char) * oq2 ^ 0x80;
    /* outer and inner taps are always combined here; hev is applied
     * afterwards to choose between the short and the wide adjustment */
    filter_value = vp8_signed_char_clamp(ps1 - qs1);
    filter_value = vp8_signed_char_clamp(filter_value + 3 * (qs0 - ps0));
    filter_value &= mask;
    Filter2 = filter_value;
    Filter2 &= hev;
    /* save bottom 3 bits so that we round one side +4 and the other +3 */
    Filter1 = vp8_signed_char_clamp(Filter2 + 4);
    Filter2 = vp8_signed_char_clamp(Filter2 + 3);
    Filter1 >>= 3;
    Filter2 >>= 3;
    qs0 = vp8_signed_char_clamp(qs0 - Filter1);
    ps0 = vp8_signed_char_clamp(ps0 + Filter2);
    /* only apply wider filter if not high edge variance */
    filter_value &= ~hev;
    Filter2 = filter_value;
    /* roughly 3/7th difference across boundary */
    u = vp8_signed_char_clamp((63 + Filter2 * 27) >> 7);
    s = vp8_signed_char_clamp(qs0 - u);
    *oq0 = s ^ 0x80;
    s = vp8_signed_char_clamp(ps0 + u);
    *op0 = s ^ 0x80;
    /* roughly 2/7th difference across boundary */
    u = vp8_signed_char_clamp((63 + Filter2 * 18) >> 7);
    s = vp8_signed_char_clamp(qs1 - u);
    *oq1 = s ^ 0x80;
    s = vp8_signed_char_clamp(ps1 + u);
    *op1 = s ^ 0x80;
    /* roughly 1/7th difference across boundary */
    u = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7);
    s = vp8_signed_char_clamp(qs2 - u);
    *oq2 = s ^ 0x80;
    s = vp8_signed_char_clamp(ps2 + u);
    *op2 = s ^ 0x80;
 }
 /* Run the macroblock-edge filter along a horizontal edge.  s addresses the
 * first pixel of the row just below the edge; count * 8 consecutive columns
 * are processed (at least one), touching three rows on each side.  Only the
 * first byte of blimit, limit and thresh is read.
 */
 void vp8_mbloop_filter_horizontal_edge_c
 (
    unsigned char *s,
    int p,
    const unsigned char *blimit,
    const unsigned char *limit,
    const unsigned char *thresh,
    int count
 )
 {
    int done = 0;
    const int total = count * 8;
    /* loop filter designed to work using chars so that we can make maximum use
     * of 8 bit simd instructions.
     */
    do
    {
        const signed char mask = vp8_filter_mask(limit[0], blimit[0],
                                                 s[-4*p], s[-3*p], s[-2*p], s[-1*p],
                                                 s[0*p], s[1*p], s[2*p], s[3*p]);
        /* high edge variance */
        const signed char hev = vp8_hevmask(thresh[0], s[-2*p], s[-1*p], s[0*p], s[1*p]);
        vp8_mbfilter(mask, hev, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p);
        ++s;
    }
    while (++done < total);
 }
 /* Run the macroblock-edge filter along a vertical edge.  s addresses the
 * first pixel to the right of the edge; count * 8 consecutive rows, p bytes
 * apart, are processed (at least one), touching three pixels on each side.
 * Only the first byte of blimit, limit and thresh is read.
 */
 void vp8_mbloop_filter_vertical_edge_c
 (
    unsigned char *s,
    int p,
    const unsigned char *blimit,
    const unsigned char *limit,
    const unsigned char *thresh,
    int count
 )
 {
    int done = 0;
    const int total = count * 8;
    do
    {
        const signed char mask = vp8_filter_mask(limit[0], blimit[0],
                                                 s[-4], s[-3], s[-2], s[-1],
                                                 s[0], s[1], s[2], s[3]);
        /* high edge variance */
        const signed char hev = vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
        vp8_mbfilter(mask, hev, s - 3, s - 2, s - 1, s, s + 1, s + 2);
        s += p;
    }
    while (++done < total);
 }
 /* Simple-filter decision: returns -1 (all bits set) when
 * |p0 - q0| * 2 + |p1 - q1| / 2 does not exceed blimit, 0 otherwise.
 */
 static signed char vp8_simple_filter_mask(uc blimit, uc p1, uc p0, uc q0, uc q1)
 {
    const int edge_measure = abs(p0 - q0) * 2 + abs(p1 - q1) / 2;
    return (signed char) (edge_measure <= blimit ? -1 : 0);
 }
 /* Simple edge filter: reads the four pixels around the edge and rewrites
 * only the two inner ones (*oq0 first, then *op0).  mask is expected to be
 * an all-ones or zero bit mask (the form returned by
 * vp8_simple_filter_mask).
 */
 static void vp8_simple_filter(signed char mask, uc *op1, uc *op0, uc *oq0, uc *oq1)
 {
    /* flip the top bit to move the unsigned pixels into signed char range */
    const signed char sp1 = (signed char) * op1 ^ 0x80;
    const signed char sp0 = (signed char) * op0 ^ 0x80;
    const signed char sq0 = (signed char) * oq0 ^ 0x80;
    const signed char sq1 = (signed char) * oq1 ^ 0x80;
    signed char delta, toward_q, toward_p;
    delta = vp8_signed_char_clamp(sp1 - sq1);
    delta = vp8_signed_char_clamp(delta + 3 * (sq0 - sp0));
    delta &= mask;
    /* one side is rounded with +4 and the other with +3 */
    toward_q = vp8_signed_char_clamp(delta + 4) >> 3;
    toward_p = vp8_signed_char_clamp(delta + 3) >> 3;
    *oq0 = vp8_signed_char_clamp(sq0 - toward_q) ^ 0x80;
    *op0 = vp8_signed_char_clamp(sp0 + toward_p) ^ 0x80;
 }
 /* Apply the simple filter to 16 consecutive columns of a horizontal edge.
 * s addresses the first pixel of the row just below the edge; p is the
 * pitch.  Only the first byte of blimit is read.
 */
 void vp8_loop_filter_simple_horizontal_edge_c
 (
    unsigned char *s,
    int p,
    const unsigned char *blimit
 )
 {
    int col;
    for (col = 0; col < 16; ++col, ++s)
    {
        const signed char mask =
            vp8_simple_filter_mask(blimit[0], s[-2*p], s[-1*p], s[0*p], s[1*p]);
        vp8_simple_filter(mask, s - 2 * p, s - 1 * p, s, s + 1 * p);
    }
 }
 /* Apply the simple filter to 16 consecutive rows of a vertical edge.
 * s addresses the first pixel to the right of the edge; rows are p bytes
 * apart.  Only the first byte of blimit is read.
 */
 void vp8_loop_filter_simple_vertical_edge_c
 (
    unsigned char *s,
    int p,
    const unsigned char *blimit
 )
 {
    int row;
    for (row = 0; row < 16; ++row, s += p)
    {
        const signed char mask =
            vp8_simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]);
        vp8_simple_filter(mask, s - 2, s - 1, s, s + 1);
    }
 }
 /* Horizontal MB filtering: macroblock-edge filter on the top edge of the
 * luma block (16 columns) and, for each chroma pointer that is non-NULL, on
 * the top edge of that chroma block (8 columns). */
 void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr,
                           unsigned char *v_ptr, int y_stride, int uv_stride,
                           loop_filter_info *lfi)
 {
    unsigned char *chroma[2];
    int plane;
    chroma[0] = u_ptr;
    chroma[1] = v_ptr;
    vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
    for (plane = 0; plane < 2; ++plane)
    {
        if (chroma[plane])
            vp8_mbloop_filter_horizontal_edge_c(chroma[plane], uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
    }
 }
 /* Vertical MB filtering: macroblock-edge filter on the left edge of the
 * luma block (16 rows) and, for each chroma pointer that is non-NULL, on
 * the left edge of that chroma block (8 rows). */
 void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr,
                           unsigned char *v_ptr, int y_stride, int uv_stride,
                           loop_filter_info *lfi)
 {
    unsigned char *chroma[2];
    int plane;
    chroma[0] = u_ptr;
    chroma[1] = v_ptr;
    vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
    for (plane = 0; plane < 2; ++plane)
    {
        if (chroma[plane])
            vp8_mbloop_filter_vertical_edge_c(chroma[plane], uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
    }
 }
 /* Horizontal B filtering: normal edge filter on the three inner horizontal
 * luma edges (rows 4, 8 and 12) and, for each chroma pointer that is
 * non-NULL, on the single inner chroma edge at row 4. */
 void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr,
                          unsigned char *v_ptr, int y_stride, int uv_stride,
                          loop_filter_info *lfi)
 {
    unsigned char *chroma[2];
    int row, plane;
    chroma[0] = u_ptr;
    chroma[1] = v_ptr;
    for (row = 4; row <= 12; row += 4)
        vp8_loop_filter_horizontal_edge_c(y_ptr + row * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
    for (plane = 0; plane < 2; ++plane)
    {
        if (chroma[plane])
            vp8_loop_filter_horizontal_edge_c(chroma[plane] + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
    }
 }
 /* Simple-filter counterpart of the horizontal B filtering: luma only,
 * inner horizontal edges at rows 4, 8 and 12. */
 void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride,
                           const unsigned char *blimit)
 {
    int row;
    for (row = 4; row <= 12; row += 4)
        vp8_loop_filter_simple_horizontal_edge_c(y_ptr + row * y_stride, y_stride, blimit);
 }
 /* Vertical B filtering: normal edge filter on the three inner vertical
 * luma edges (columns 4, 8 and 12) and, for each chroma pointer that is
 * non-NULL, on the single inner chroma edge at column 4. */
 void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr,
                          unsigned char *v_ptr, int y_stride, int uv_stride,
                          loop_filter_info *lfi)
 {
    unsigned char *chroma[2];
    int col, plane;
    chroma[0] = u_ptr;
    chroma[1] = v_ptr;
    for (col = 4; col <= 12; col += 4)
        vp8_loop_filter_vertical_edge_c(y_ptr + col, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
    for (plane = 0; plane < 2; ++plane)
    {
        if (chroma[plane])
            vp8_loop_filter_vertical_edge_c(chroma[plane] + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
    }
 }
 /* Simple-filter counterpart of the vertical B filtering: luma only, inner
 * vertical edges at columns 4, 8 and 12. */
 void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride,
                           const unsigned char *blimit)
 {
    int col;
    for (col = 4; col <= 12; col += 4)
        vp8_loop_filter_simple_vertical_edge_c(y_ptr + col, y_stride, blimit);
 }
--- a/vp8/common/mbpitch.c
+++ b/vp8/common/mbpitch.c
@@ -1,68 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "blockd.h"
 /* Point every block descriptor of the macroblock at its slice of the shared
 * buffers:
 *  - blocks 0..15  : 4x4 grid inside a 16-wide predictor area at offset 0
 *  - blocks 16..19 : 2x2 grid inside an 8-wide predictor area at offset 256
 *  - blocks 20..23 : 2x2 grid inside an 8-wide predictor area at offset 320
 *  - blocks 0..24  : 16 qcoeff / dqcoeff entries each and one eobs entry
 */
 void vp8_setup_block_dptrs(MACROBLOCKD *x)
 {
    int b;
    for (b = 0; b < 16; b++)
    {
        x->block[b].predictor = x->predictor + (b >> 2) * 4 * 16 + (b & 3) * 4;
    }
    for (b = 0; b < 4; b++)
    {
        /* same position inside both 8-wide areas */
        const int offset = (b >> 1) * 4 * 8 + (b & 1) * 4;
        x->block[16 + b].predictor = x->predictor + 256 + offset;
        x->block[20 + b].predictor = x->predictor + 320 + offset;
    }
    for (b = 0; b < 25; b++)
    {
        x->block[b].qcoeff  = x->qcoeff  + b * 16;
        x->block[b].dqcoeff = x->dqcoeff + b * 16;
        x->block[b].eob     = x->eobs + b;
    }
 }
 /* Compute, for blocks 0..23, the offset of the block's top-left pixel
 * inside the destination plane: a 4x4 grid using dst.y_stride for blocks
 * 0..15, and a 2x2 grid using dst.uv_stride for blocks 16..19, whose
 * offsets are shared with blocks 20..23.
 */
 void vp8_build_block_doffsets(MACROBLOCKD *x)
 {
    int b;
    /* y blocks */
    for (b = 0; b < 16; b++)
    {
        x->block[b].offset = (b >> 2) * 4 * x->dst.y_stride + (b & 3) * 4;
    }
    /* U and V blocks */
    for (b = 0; b < 4; b++)
    {
        x->block[20 + b].offset =
        x->block[16 + b].offset =
            (b >> 1) * 4 * x->dst.uv_stride + (b & 1) * 4;
    }
 }
--- a/vp8/common/mfqe.c
+++ b/vp8/common/mfqe.c
@@ -1,386 +0,0 @@
 /*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 /* MFQE: Multiframe Quality Enhancement
 * In rate limited situations keyframes may cause significant visual artifacts
 * commonly referred to as "popping." This file implements a postprocessing
 * algorithm which blends data from the preceding frame when there is no
 * motion and the q from the previous frame is lower which indicates that it is
 * higher quality.
 */
 #include "./vp8_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vp8/common/postproc.h"
 #include "vpx_dsp/variance.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_scale/yv12config.h"
 #include <limits.h>
 #include <stdlib.h>
 /* Blend a block_size x block_size block of src into dst in place:
 *   dst = (src * src_weight + dst * dst_weight + rounding) >> MFQE_PRECISION
 * where dst_weight = (1 << MFQE_PRECISION) - src_weight.
 */
 static void filter_by_weight(unsigned char *src, int src_stride,
                             unsigned char *dst, int dst_stride,
                             int block_size, int src_weight)
 {
    const int dst_weight = (1 << MFQE_PRECISION) - src_weight;
    const int rounding = 1 << (MFQE_PRECISION - 1);
    int row, col;
    for (row = 0; row < block_size; ++row)
    {
        const unsigned char *const in = src + row * src_stride;
        unsigned char *const out = dst + row * dst_stride;
        for (col = 0; col < block_size; ++col)
        {
            const int blended = in[col] * src_weight
                                + out[col] * dst_weight
                                + rounding;
            out[col] = blended >> MFQE_PRECISION;
        }
    }
 }
 /* 16x16 weighted blend of src into dst; forwards to filter_by_weight with a
 * block size of 16. */
 void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride,
                                 unsigned char *dst, int dst_stride,
                                 int src_weight)
 {
    filter_by_weight(src, src_stride, dst, dst_stride, 16, src_weight);
 }
 /* 8x8 weighted blend of src into dst; forwards to filter_by_weight with a
 * block size of 8. */
 void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride,
                               unsigned char *dst, int dst_stride,
                               int src_weight)
 {
    filter_by_weight(src, src_stride, dst, dst_stride, 8, src_weight);
 }
 /* 4x4 weighted blend of src into dst; forwards to filter_by_weight with a
 * block size of 4. */
 void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride,
                               unsigned char *dst, int dst_stride,
                               int src_weight)
 {
    filter_by_weight(src, src_stride, dst, dst_stride, 4, src_weight);
 }
 /* Blend the three source planes into the destination planes with
 * src_weight.  A block_size of 16 blends 16x16 luma and 8x8 chroma; any
 * other value is treated as 8 and blends 8x8 luma and 4x4 chroma.
 */
 static void apply_ifactor(unsigned char *y_src,
                          int y_src_stride,
                          unsigned char *y_dst,
                          int y_dst_stride,
                          unsigned char *u_src,
                          unsigned char *v_src,
                          int uv_src_stride,
                          unsigned char *u_dst,
                          unsigned char *v_dst,
                          int uv_dst_stride,
                          int block_size,
                          int src_weight)
 {
    if (block_size != 16)
    {
        /* block_size == 8 */
        vp8_filter_by_weight8x8(y_src, y_src_stride, y_dst, y_dst_stride, src_weight);
        vp8_filter_by_weight4x4(u_src, uv_src_stride, u_dst, uv_dst_stride, src_weight);
        vp8_filter_by_weight4x4(v_src, uv_src_stride, v_dst, uv_dst_stride, src_weight);
        return;
    }
    vp8_filter_by_weight16x16(y_src, y_src_stride, y_dst, y_dst_stride, src_weight);
    vp8_filter_by_weight8x8(u_src, uv_src_stride, u_dst, uv_dst_stride, src_weight);
    vp8_filter_by_weight8x8(v_src, uv_src_stride, v_dst, uv_dst_stride, src_weight);
 }
 /* Integer square root of x, rounded to the nearest integer.
 *
 * The floor of the root is built one bit at a time, from the highest
 * candidate bit downwards; it is then rounded up when x is at least
 * guess^2 + guess + 1, i.e. above (guess + 0.5)^2.
 *
 * The highest candidate bit is capped at half the width of unsigned int
 * minus one.  The root of any representable x fits in that many bits, and
 * without the cap the first trial value for x >= 2^(width - 1) would be
 * 2^(width / 2), whose square wraps around to 0 and corrupts the result.
 */
 static unsigned int int_sqrt(unsigned int x)
 {
    const int max_bit = (int)(sizeof(x) * CHAR_BIT / 2) - 1;
    unsigned int y = x;
    unsigned int guess = 0;
    int p = 1;
    /* p = number of significant bits in x (1 when x is 0) */
    while (y>>=1) p++;
    /* the root needs at most half as many bits */
    p>>=1;
    if (p > max_bit)
        p = max_bit;
    for (; p >= 0; p--)
    {
        /* keep the bit only if the square still fits under x */
        const unsigned int candidate = guess | (1u << p);
        if (candidate * candidate <= x)
            guess = candidate;
    }
    /* choose between guess or guess+1 */
    return guess+(guess*guess+guess+1<=x);
 }
 /* When defined, block differences are measured from the sum of squared
 * errors reported by the variance functions instead of from SAD. */
 #define USE_SSD
 /* Decide, for one block, whether to blend y/u/v into yd/ud/vd or to copy
 * y/u/v over yd/ud/vd.
 *
 * Per-pixel difference measures between the two sets of planes (sad, usad,
 * vsad) are compared with a threshold derived from the quantizer difference
 * (qcurr - qprev), the activity of the yd block and qprev.  When all
 * differences are below the threshold and the yd block is not much more
 * active than the y block, the planes are blended with a weight ifactor
 * (skipped when ifactor is 0); otherwise y/u/v are copied over yd/ud/vd.
 */
 static void multiframe_quality_enhance_block
 (
    int blksize, /* Currently only values supported are 16, 8 */
    int qcurr,
    int qprev,
    unsigned char *y,
    unsigned char *u,
    unsigned char *v,
    int y_stride,
    int uv_stride,
    unsigned char *yd,
    unsigned char *ud,
    unsigned char *vd,
    int yd_stride,
    int uvd_stride
 )
 {
    /* all-zero reference row, passed with stride 0 to the variance calls
     * that compute act / actd below */
    static const unsigned char VP8_ZEROS[16]=
    {
         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    };
    int uvblksize = blksize >> 1;
    int qdiff = qcurr - qprev;
    int i;
    unsigned char *up;
    unsigned char *udp;
    unsigned char *vp;
    unsigned char *vdp;
    unsigned int act, actd, sad, usad, vsad, sse, thr, thrsq, actrisk;
    if (blksize == 16)
    {
        /* 256 luma pixels: round and divide by 256; 64 chroma pixels:
         * round and divide by 64 */
        actd = (vpx_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
        act = (vpx_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8;
 #ifdef USE_SSD
        vpx_variance16x16(y, y_stride, yd, yd_stride, &sse);
        sad = (sse + 128)>>8;
        vpx_variance8x8(u, uv_stride, ud, uvd_stride, &sse);
        usad = (sse + 32)>>6;
        vpx_variance8x8(v, uv_stride, vd, uvd_stride, &sse);
        vsad = (sse + 32)>>6;
 #else
        sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
        usad = (vpx_sad8x8(u, uv_stride, ud, uvd_stride) + 32) >> 6;
        vsad = (vpx_sad8x8(v, uv_stride, vd, uvd_stride)+ 32) >> 6;
 #endif
    }
    else /* if (blksize == 8) */
    {
        /* 64 luma pixels: round and divide by 64; 16 chroma pixels:
         * round and divide by 16 */
        actd = (vpx_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
        act = (vpx_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6;
 #ifdef USE_SSD
        vpx_variance8x8(y, y_stride, yd, yd_stride, &sse);
        sad = (sse + 32)>>6;
        vpx_variance4x4(u, uv_stride, ud, uvd_stride, &sse);
        usad = (sse + 8)>>4;
        vpx_variance4x4(v, uv_stride, vd, uvd_stride, &sse);
        vsad = (sse + 8)>>4;
 #else
        sad = (vpx_sad8x8(y, y_stride, yd, yd_stride) + 32) >> 6;
        usad = (vpx_sad4x4(u, uv_stride, ud, uvd_stride) + 8) >> 4;
        vsad = (vpx_sad4x4(v, uv_stride, vd, uvd_stride) + 8) >> 4;
 #endif
    }
    /* flag blocks where yd is more than five times as active as y */
    actrisk = (actd > act * 5);
    /* thr = qdiff/16 + log2(actd) + log4(qprev) */
    /* note: both loops below consume actd and qprev, which are not used
     * again afterwards */
    thr = (qdiff >> 4);
    while (actd >>= 1) thr++;
    while (qprev >>= 2) thr++;
 #ifdef USE_SSD
    /* sad holds a mean squared error here, so compare with thr squared */
    thrsq = thr * thr;
    if (sad < thrsq &&
        /* additional checks for color mismatch and excessive addition of
         * high-frequencies */
        4 * usad < thrsq && 4 * vsad < thrsq && !actrisk)
 #else
    if (sad < thr &&
        /* additional checks for color mismatch and excessive addition of
         * high-frequencies */
        2 * usad < thr && 2 * vsad < thr && !actrisk)
 #endif
    {
        /* thr cannot be 0 in this branch: the unsigned comparison
         * sad < thr (or sad < thr * thr) above would have failed */
        int ifactor;
 #ifdef USE_SSD
        /* TODO: optimize this later to not need sqr root */
        sad = int_sqrt(sad);
 #endif
        ifactor = (sad << MFQE_PRECISION) / thr;
        ifactor >>= (qdiff >> 5);
        if (ifactor)
        {
            apply_ifactor(y, y_stride, yd, yd_stride,
                          u, v, uv_stride,
                          ud, vd, uvd_stride,
                          blksize, ifactor);
        }
    }
    else  /* else implicitly copy from previous frame */
    {
        if (blksize == 16)
        {
            vp8_copy_mem16x16(y, y_stride, yd, yd_stride);
            vp8_copy_mem8x8(u, uv_stride, ud, uvd_stride);
            vp8_copy_mem8x8(v, uv_stride, vd, uvd_stride);
        }
        else  /* if (blksize == 8) */
        {
            vp8_copy_mem8x8(y, y_stride, yd, yd_stride);
            /* chroma is uvblksize wide here: copy u -> ud and v -> vd
             * row by row */
            for (up = u, udp = ud, i = 0; i < uvblksize; ++i, up += uv_stride, udp += uvd_stride)
                memcpy(udp, up, uvblksize);
            for (vp = v, vdp = vd, i = 0; i < uvblksize; ++i, vp += uv_stride, vdp += uvd_stride)
                memcpy(vdp, vp, uvblksize);
        }
    }
 }
 /*
  * Decide which 8x8 luma quadrants of an inter-coded macroblock move little
  * enough to be worth enhancing.
  *
  * mode_info_context: mode/motion information of the macroblock.
  * map:               output, four flags (one per quadrant, raster order);
  *                    1 = quadrant qualifies, 0 = it does not.
  *
  * Returns the number of qualifying quadrants (0..4).
  */
 static int qualify_inter_mb(const MODE_INFO *mode_info_context, int *map)
 {
     if (mode_info_context->mbmi.mb_skip_coeff)
         map[0] = map[1] = map[2] = map[3] = 1;
     else if (mode_info_context->mbmi.mode==SPLITMV)
     {
         /* bmi[] indices of the four 4x4 sub-blocks inside each quadrant */
         static const int ndx[4][4] =
         {
             {0, 1, 4, 5},
             {2, 3, 6, 7},
             {8, 9, 12, 13},
             {10, 11, 14, 15}
         };
         int i, j;
         for (i=0; i<4; ++i)
         {
             map[i] = 1;
             /* Stop at the first sub-block that fails.  The guard must test
              * map[i] (the quadrant being built): testing map[j] read slots
              * the caller never initialised and could skip or keep checks
              * arbitrarily. */
             for (j=0; j<4 && map[i]; ++j)
                 /* Compare magnitudes, as the whole-MB branch below does, so
                  * large negative vectors are rejected too. */
                 map[i] &= (abs(mode_info_context->bmi[ndx[i][j]].mv.as_mv.row) <= 2 &&
                            abs(mode_info_context->bmi[ndx[i][j]].mv.as_mv.col) <= 2);
         }
     }
     else
     {
         /* One vector for the whole macroblock: all quadrants share the
          * verdict.  Modes up to and including B_PRED never qualify. */
         map[0] = map[1] = map[2] = map[3] =
             (mode_info_context->mbmi.mode > B_PRED &&
              abs(mode_info_context->mbmi.mv.as_mv.row) <= 2 &&
              abs(mode_info_context->mbmi.mv.as_mv.col) <= 2);
     }
     return (map[0]+map[1]+map[2]+map[3]);
 }
 /*
  * Multiframe quality enhancement driver.
  *
  * Walks every macroblock of cm->frame_to_show and, depending on how many of
  * its 8x8 quadrants qualify, either enhances it as one 16x16 block, handles
  * the quadrants individually (enhance or plain copy), or copies the whole
  * macroblock into cm->post_proc_buffer.
  */
 void vp8_multiframe_quality_enhance
 (
     VP8_COMMON *cm
 )
 {
     YV12_BUFFER_CONFIG *const cur = cm->frame_to_show;
     YV12_BUFFER_CONFIG *const out = &cm->post_proc_buffer;
     const FRAME_TYPE type = cm->frame_type;
     const int q_cur = cm->base_qindex;
     const int q_prev = cm->postproc_state.last_base_qindex;
     /* Cursor over the per-macroblock MODE_INFO list (motion vectors etc.) */
     const MODE_INFO *mi = cm->show_frame_mi;
     /* Plane cursors: current frame (source) and post-proc buffer (target) */
     unsigned char *src_y = cur->y_buffer;
     unsigned char *src_u = cur->u_buffer;
     unsigned char *src_v = cur->v_buffer;
     unsigned char *dst_y = out->y_buffer;
     unsigned char *dst_u = out->u_buffer;
     unsigned char *dst_v = out->v_buffer;
     int quad_ok[4];
     int row, col;

     for (row = 0; row < cm->mb_rows; row++)
     {
         for (col = 0; col < cm->mb_cols; col++)
         {
             int n_ok;

             /* With high motion there is likely no benefit, so inter frames
              * are screened per quadrant; key frames always qualify fully. */
             if (type == INTER_FRAME)
                 n_ok = qualify_inter_mb(mi, quad_ok);
             else if (type == KEY_FRAME)
                 n_ok = 4;
             else
                 n_ok = 0;

             if (!n_ok)
             {
                 /* Nothing qualifies: straight copy of the macroblock. */
                 vp8_copy_mem16x16(src_y, cur->y_stride, dst_y, out->y_stride);
                 vp8_copy_mem8x8(src_u, cur->uv_stride, dst_u, out->uv_stride);
                 vp8_copy_mem8x8(src_v, cur->uv_stride, dst_v, out->uv_stride);
             }
             else if (n_ok < 4)
             {
                 /* Mixed verdict: treat the four quadrants one at a time,
                  * in raster order. */
                 int quad;

                 for (quad = 0; quad < 4; ++quad)
                 {
                     const int qr = quad >> 1;
                     const int qc = quad & 1;
                     const int src_y_off = 8 * (qr * cur->y_stride + qc);
                     const int src_uv_off = 4 * (qr * cur->uv_stride + qc);
                     const int dst_y_off = 8 * (qr * out->y_stride + qc);
                     const int dst_uv_off = 4 * (qr * out->uv_stride + qc);

                     if (quad_ok[quad])
                     {
                         multiframe_quality_enhance_block(8, q_cur, q_prev,
                                                          src_y + src_y_off,
                                                          src_u + src_uv_off,
                                                          src_v + src_uv_off,
                                                          cur->y_stride,
                                                          cur->uv_stride,
                                                          dst_y + dst_y_off,
                                                          dst_u + dst_uv_off,
                                                          dst_v + dst_uv_off,
                                                          out->y_stride,
                                                          out->uv_stride);
                     }
                     else
                     {
                         /* Copy the 8x8 luma quadrant and its 4x4 chroma. */
                         unsigned char *u_from = src_u + src_uv_off;
                         unsigned char *u_to = dst_u + dst_uv_off;
                         unsigned char *v_from = src_v + src_uv_off;
                         unsigned char *v_to = dst_v + dst_uv_off;
                         int line;

                         vp8_copy_mem8x8(src_y + src_y_off, cur->y_stride,
                                         dst_y + dst_y_off, out->y_stride);
                         for (line = 0; line < 4; ++line)
                         {
                             memcpy(u_to, u_from, 4);
                             memcpy(v_to, v_from, 4);
                             u_from += cur->uv_stride;
                             u_to += out->uv_stride;
                             v_from += cur->uv_stride;
                             v_to += out->uv_stride;
                         }
                     }
                 }
             }
             else
             {
                 /* All four quadrants qualify: enhance as a single 16x16. */
                 multiframe_quality_enhance_block(16, q_cur, q_prev, src_y,
                                                  src_u, src_v,
                                                  cur->y_stride,
                                                  cur->uv_stride,
                                                  dst_y, dst_u, dst_v,
                                                  out->y_stride,
                                                  out->uv_stride);
             }

             /* Step to the next macroblock in this row. */
             src_y += 16;
             src_u += 8;
             src_v += 8;
             dst_y += 16;
             dst_u += 8;
             dst_v += 8;
             ++mi;
         }

         /* Rewind to column 0 and drop down one macroblock row. */
         src_y += 16 * (cur->y_stride - cm->mb_cols);
         src_u += 8 * (cur->uv_stride - cm->mb_cols);
         src_v += 8 * (cur->uv_stride - cm->mb_cols);
         dst_y += 16 * (out->y_stride - cm->mb_cols);
         dst_u += 8 * (out->uv_stride - cm->mb_cols);
         dst_v += 8 * (out->uv_stride - cm->mb_cols);
         ++mi;                        /* skip the border entry */
     }
 }
--- a/vp8/common/mips/dspr2/dequantize_dspr2.c
+++ b/vp8/common/mips/dspr2/dequantize_dspr2.c
@@ -1,33 +0,0 @@
 /*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
 #include "vpx_mem/vpx_mem.h"
 #if HAVE_DSPR2
 /*
  * Dequantize one 4x4 block of coefficients in place, run the inverse DCT
  * with dest serving as both prediction and destination, then clear the
  * coefficient buffer.
  *
  * input:  16 quantized coefficients; overwritten, then zeroed.
  * dq:     16 dequantization factors.
  * dest:   4x4 pixel block that is read as prediction and written back.
  * stride: distance in bytes between rows of dest.
  */
 void vp8_dequant_idct_add_dspr2(short *input, short *dq,
                                 unsigned char *dest, int stride)
 {
     short *coeff = input;
     short *factor = dq;
     short *const coeff_end = input + 16;

     while (coeff != coeff_end)
     {
         *coeff = *factor * *coeff;
         ++coeff;
         ++factor;
     }

     vp8_short_idct4x4llm_dspr2(input, dest, stride, dest, stride);

     /* 32 bytes, i.e. the 16 coefficients when short is two bytes wide */
     memset(input, 0, 32);
 }
 #endif
--- a/vp8/common/mips/dspr2/filter_dspr2.c
+++ b/vp8/common/mips/dspr2/filter_dspr2.c
--- a/vp8/common/mips/dspr2/idct_blk_dspr2.c
+++ b/vp8/common/mips/dspr2/idct_blk_dspr2.c
@@ -1,88 +0,0 @@
 /*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
 #if HAVE_DSPR2
 /*
  * Dequantize and inverse-transform the 16 4x4 blocks of a 16x16 luma area,
  * adding the result onto dst in place.
  *
  * q:      coefficients, 16 per 4x4 block, blocks in raster order.
  * dq:     dequantization factors shared by all blocks.
  * dst:    top-left pixel of the 16x16 area.
  * stride: distance in bytes between rows of dst.
  * eobs:   one end-of-block count per 4x4 block; a count above 1 selects the
  *         full transform, otherwise only the DC term is applied.
  */
 void vp8_dequant_idct_add_y_block_dspr2
 (short *q, short *dq,
  unsigned char *dst, int stride, char *eobs)
 {
     int i, j;
     for (i = 0; i < 4; i++)
     {
         for (j = 0; j < 4; j++)
         {
             if (*eobs++ > 1)
                 vp8_dequant_idct_add_dspr2(q, dq, dst, stride);
             else
             {
                 vp8_dc_only_idct_add_dspr2(q[0]*dq[0], dst, stride, dst, stride);
                 /* Clear the first two coefficients.  This used to be a
                  * single store through an int pointer, which accesses a
                  * short array via an incompatible lvalue type. */
                 q[0] = 0;
                 q[1] = 0;
             }
             q   += 16;
             dst += 4;
         }
         /* back to the left edge, four pixel rows further down */
         dst += 4 * stride - 16;
     }
 }
 /*
  * Dequantize and inverse-transform the chroma of one macroblock: four 4x4
  * blocks of the 8x8 U area followed by four of the 8x8 V area, each added
  * onto its destination in place.
  *
  * q:      coefficients, 16 per 4x4 block, all U blocks first, then V.
  * dq:     dequantization factors shared by all blocks.
  * dstu:   top-left pixel of the 8x8 U area.
  * dstv:   top-left pixel of the 8x8 V area.
  * stride: distance in bytes between rows of either plane.
  * eobs:   one end-of-block count per 4x4 block, same order as q; a count
  *         above 1 selects the full transform, otherwise DC only.
  */
 void vp8_dequant_idct_add_uv_block_dspr2
 (short *q, short *dq,
  unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
 {
     unsigned char *planes[2];
     int p, i, j;

     planes[0] = dstu;
     planes[1] = dstv;

     /* q and eobs keep advancing across both planes */
     for (p = 0; p < 2; p++)
     {
         unsigned char *dst = planes[p];

         for (i = 0; i < 2; i++)
         {
             for (j = 0; j < 2; j++)
             {
                 if (*eobs++ > 1)
                     vp8_dequant_idct_add_dspr2(q, dq, dst, stride);
                 else
                 {
                     vp8_dc_only_idct_add_dspr2(q[0]*dq[0], dst, stride, dst, stride);
                     /* Clear the first two coefficients.  This used to be a
                      * single store through an int pointer, which accesses
                      * a short array via an incompatible lvalue type. */
                     q[0] = 0;
                     q[1] = 0;
                 }
                 q   += 16;
                 dst += 4;
             }
             /* back to the left edge, four pixel rows further down */
             dst += 4 * stride - 8;
         }
     }
 }
 #endif
--- a/vp8/common/mips/dspr2/idctllm_dspr2.c
+++ b/vp8/common/mips/dspr2/idctllm_dspr2.c
@@ -1,369 +0,0 @@
 /*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "vp8_rtcd.h"
 #if HAVE_DSPR2
 #define CROP_WIDTH 256
 /******************************************************************************
 * Notes:
 *
 * This implementation makes use of 16 bit fixed point version of two multiply
 * constants:
 *         1.   sqrt(2) * cos (pi/8)
 *         2.   sqrt(2) * sin (pi/8)
 * Since the first constant is bigger than 1, to maintain the same 16 bit
 * fixed point precision as the second one, we use a trick of
 *         x * a = x + x*(a-1)
 * so
 *         x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
 ****************************************************************************/
 extern unsigned char ff_cropTbl[256 + 2 * CROP_WIDTH];
 static const int cospi8sqrt2minus1 = 20091;
 static const int sinpi8sqrt2      = 35468;
 /*
  * Issue a MIPS "pref" instruction with hint 0 on the address held in src.
  * The asm has no outputs; it only takes src as a register input.
  * NOTE(review): declared "inline" without "static"; under C99 inline rules
  * that does not emit an external definition -- confirm the build's C dialect
  * or whether this helper is meant to be file-local.
  */
 inline void prefetch_load_short(short *src)
 {
     __asm__ __volatile__ (
         "pref   0,  0(%[src])   \n\t"
         :
         : [src] "r" (src)
     );
 }
 /*
  * 4x4 inverse DCT, fully unrolled, followed by adding the prediction and
  * writing the result through the crop table.
  *
  * input:       16 coefficients, row-major 4x4.
  * pred_ptr:    prediction pixels, pred_stride bytes between rows.
  * dst_ptr:     destination pixels, dst_stride bytes between rows.
  *
  * The two multiply constants are the 16-bit fixed-point values described in
  * the notes at the top of this file; the "x + (x * c >> 16)" form is the
  * "x * a = x + x*(a-1)" trick explained there.
  */
 void vp8_short_idct4x4llm_dspr2(short *input, unsigned char *pred_ptr,
                                 int pred_stride, unsigned char *dst_ptr,
                                 int dst_stride)
 {
     int r, c;
     int a1, b1, c1, d1;
     short output[16];
     short *ip = input;
     short *op = output;
     int temp1, temp2;
     int shortpitch = 4;
     int c2, d2;
     int temp3, temp4;
     /* crop table biased by CROP_WIDTH so it can be indexed with values
      * below zero (presumably clamping to the 0..255 pixel range) */
     unsigned char *cm = ff_cropTbl + CROP_WIDTH;
     /* prepare data for load */
     prefetch_load_short(ip + 8);
     /* first loop is unrolled: vertical pass, elements 4 apart (one column
      * at a time), input -> output */
     /* column 0: even part (a1, b1) and odd part (c1, d1) */
     a1 = ip[0] + ip[8];
     b1 = ip[0] - ip[8];
     temp1 = (ip[4] * sinpi8sqrt2) >> 16;
     temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
     c1 = temp1 - temp2;
     temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16);
     temp2 = (ip[12] * sinpi8sqrt2) >> 16;
     d1 = temp1 + temp2;
     /* odd part of column 1 (c2, d2), computed ahead of its even part */
     temp3 = (ip[5] * sinpi8sqrt2) >> 16;
     temp4 = ip[13] + ((ip[13] * cospi8sqrt2minus1) >> 16);
     c2 = temp3 - temp4;
     temp3 = ip[5] + ((ip[5] * cospi8sqrt2minus1) >> 16);
     temp4 = (ip[13] * sinpi8sqrt2) >> 16;
     d2 = temp3 + temp4;
     op[0] = a1 + d1;
     op[12] = a1 - d1;
     op[4] = b1 + c1;
     op[8] = b1 - c1;
     /* column 1 */
     a1 = ip[1] + ip[9];
     b1 = ip[1] - ip[9];
     op[1] = a1 + d2;
     op[13] = a1 - d2;
     op[5] = b1 + c2;
     op[9] = b1 - c2;
     /* column 2, plus the odd part of column 3 */
     a1 = ip[2] + ip[10];
     b1 = ip[2] - ip[10];
     temp1 = (ip[6] * sinpi8sqrt2) >> 16;
     temp2 = ip[14] + ((ip[14] * cospi8sqrt2minus1) >> 16);
     c1 = temp1 - temp2;
     temp1 = ip[6] + ((ip[6] * cospi8sqrt2minus1) >> 16);
     temp2 = (ip[14] * sinpi8sqrt2) >> 16;
     d1 = temp1 + temp2;
     temp3 = (ip[7] * sinpi8sqrt2) >> 16;
     temp4 = ip[15] + ((ip[15] * cospi8sqrt2minus1) >> 16);
     c2 = temp3 - temp4;
     temp3 = ip[7] + ((ip[7] * cospi8sqrt2minus1) >> 16);
     temp4 = (ip[15] * sinpi8sqrt2) >> 16;
     d2 = temp3 + temp4;
     op[2] = a1 + d1;
     op[14] = a1 - d1;
     op[6] = b1 + c1;
     op[10] = b1 - c1;
     /* column 3 */
     a1 = ip[3] + ip[11];
     b1 = ip[3] - ip[11];
     op[3] = a1 + d2;
     op[15] = a1 - d2;
     op[7] = b1 + c2;
     op[11] = b1 - c2;
     /* the second pass reads and writes the intermediate buffer in place */
     ip = output;
     /* prepare data for load */
     prefetch_load_short(ip + shortpitch);
     /* second loop is unrolled: horizontal pass over adjacent elements (one
      * row at a time), with the final rounding (+4) >> 3 */
     /* row 0, plus the odd part of row 1 */
     a1 = ip[0] + ip[2];
     b1 = ip[0] - ip[2];
     temp1 = (ip[1] * sinpi8sqrt2) >> 16;
     temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16);
     c1 = temp1 - temp2;
     temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16);
     temp2 = (ip[3] * sinpi8sqrt2) >> 16;
     d1 = temp1 + temp2;
     temp3 = (ip[5] * sinpi8sqrt2) >> 16;
     temp4 = ip[7] + ((ip[7] * cospi8sqrt2minus1) >> 16);
     c2 = temp3 - temp4;
     temp3 = ip[5] + ((ip[5] * cospi8sqrt2minus1) >> 16);
     temp4 = (ip[7] * sinpi8sqrt2) >> 16;
     d2 = temp3 + temp4;
     op[0] = (a1 + d1 + 4) >> 3;
     op[3] = (a1 - d1 + 4) >> 3;
     op[1] = (b1 + c1 + 4) >> 3;
     op[2] = (b1 - c1 + 4) >> 3;
     /* row 1 */
     a1 = ip[4] + ip[6];
     b1 = ip[4] - ip[6];
     op[4] = (a1 + d2 + 4) >> 3;
     op[7] = (a1 - d2 + 4) >> 3;
     op[5] = (b1 + c2 + 4) >> 3;
     op[6] = (b1 - c2 + 4) >> 3;
     /* row 2, plus the odd part of row 3 */
     a1 = ip[8] + ip[10];
     b1 = ip[8] - ip[10];
     temp1 = (ip[9] * sinpi8sqrt2) >> 16;
     temp2 = ip[11] + ((ip[11] * cospi8sqrt2minus1) >> 16);
     c1 = temp1 - temp2;
     temp1 = ip[9] + ((ip[9] * cospi8sqrt2minus1) >> 16);
     temp2 = (ip[11] * sinpi8sqrt2) >> 16;
     d1 = temp1 + temp2;
     temp3 = (ip[13] * sinpi8sqrt2) >> 16;
     temp4 = ip[15] + ((ip[15] * cospi8sqrt2minus1) >> 16);
     c2 = temp3 - temp4;
     temp3 = ip[13] + ((ip[13] * cospi8sqrt2minus1) >> 16);
     temp4 = (ip[15] * sinpi8sqrt2) >> 16;
     d2 = temp3 + temp4;
     op[8] = (a1 + d1 + 4) >> 3;
     op[11] = (a1 - d1 + 4) >> 3;
     op[9] = (b1 + c1 + 4) >> 3;
     op[10] = (b1 - c1 + 4) >> 3;
     /* row 3 */
     a1 = ip[12] + ip[14];
     b1 = ip[12] - ip[14];
     op[12] = (a1 + d2 + 4) >> 3;
     op[15] = (a1 - d2 + 4) >> 3;
     op[13] = (b1 + c2 + 4) >> 3;
     op[14] = (b1 - c2 + 4) >> 3;
     /* reconstruction: residual + prediction, mapped through the crop table */
     ip = output;
     for (r = 0; r < 4; r++)
     {
         for (c = 0; c < 4; c++)
         {
             short a = ip[c] + pred_ptr[c] ;
             dst_ptr[c] = cm[a] ;
         }
         ip += 4;
         dst_ptr += dst_stride;
         pred_ptr += pred_stride;
     }
 }
 /*
  * DC-only inverse transform: add the rounded DC value (input_dc + 4) >> 3
  * to every pixel of a 4x4 prediction block and store the result.
  *
  * input_dc:    dequantized DC coefficient.
  * pred_ptr:    prediction pixels, pred_stride bytes between rows.
  * dst_ptr:     destination pixels, dst_stride bytes between rows.
  *
  * Each row is handled as one 32-bit word with quad-byte DSP instructions,
  * so both pred_ptr and dst_ptr rows must be four-byte aligned (lw/sw).
  */
 void vp8_dc_only_idct_add_dspr2(short input_dc, unsigned char *pred_ptr,
                                 int pred_stride, unsigned char *dst_ptr,
                                 int dst_stride)
 {
     int a1;
     int i, absa1;
     int t2, vector_a1, vector_a;
     /* a1 = ((input_dc + 4) >> 3); */
     __asm__ __volatile__ (
         "addi  %[a1], %[input_dc], 4   \n\t"
         "sra   %[a1], %[a1],       3   \n\t"
         : [a1] "=r" (a1)
         : [input_dc] "r" (input_dc)
     );
     if (a1 < 0)
     {
         /* use quad-byte
          * input and output memory are four byte aligned
          */
         __asm__ __volatile__ (
             "abs        %[absa1],     %[a1]         \n\t"
             "replv.qb   %[vector_a1], %[absa1]      \n\t"
             : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
             : [a1] "r" (a1)
         );
         /* negative DC: compute pred - |a1| with the quad-byte subtract
          * instead of pred + a1 */
         for (i = 4; i--;)
         {
             /* The "memory" clobber tells the compiler that this asm reads
              * *pred_ptr and writes *dst_ptr; without it those accesses are
              * invisible and may be reordered against surrounding C code. */
             __asm__ __volatile__ (
                 "lw             %[t2],       0(%[pred_ptr])                     \n\t"
                 "add            %[pred_ptr], %[pred_ptr],    %[pred_stride]     \n\t"
                 "subu_s.qb      %[vector_a], %[t2],          %[vector_a1]       \n\t"
                 "sw             %[vector_a], 0(%[dst_ptr])                      \n\t"
                 "add            %[dst_ptr],  %[dst_ptr],     %[dst_stride]      \n\t"
                 : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
                   [dst_ptr] "+&r" (dst_ptr), [pred_ptr] "+&r" (pred_ptr)
                 : [dst_stride] "r" (dst_stride), [pred_stride] "r" (pred_stride), [vector_a1] "r" (vector_a1)
                 : "memory"
             );
         }
     }
     else
     {
         /* use quad-byte
          * input and output memory are four byte aligned
          */
         __asm__ __volatile__ (
             "replv.qb       %[vector_a1], %[a1]     \n\t"
             : [vector_a1] "=r" (vector_a1)
             : [a1] "r" (a1)
         );
         for (i = 4; i--;)
         {
             /* "memory" clobber: see the note in the branch above. */
             __asm__ __volatile__ (
                 "lw             %[t2],       0(%[pred_ptr])                 \n\t"
                 "add            %[pred_ptr], %[pred_ptr],    %[pred_stride] \n\t"
                 "addu_s.qb      %[vector_a], %[vector_a1],   %[t2]          \n\t"
                 "sw             %[vector_a], 0(%[dst_ptr])                  \n\t"
                 "add            %[dst_ptr],  %[dst_ptr],     %[dst_stride]  \n\t"
                 : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
                   [dst_ptr] "+&r" (dst_ptr), [pred_ptr] "+&r" (pred_ptr)
                 : [dst_stride] "r" (dst_stride), [pred_stride] "r" (pred_stride), [vector_a1] "r" (vector_a1)
                 : "memory"
             );
         }
     }
 }
 /*
  * Inverse 4x4 Walsh-Hadamard transform.
  *
  * input:      16 coefficients, row-major 4x4.
  * mb_dqcoeff: receives result k at index k * 16, i.e. one value at the
  *             start of each successive group of 16 shorts.
  */
 void vp8_short_inv_walsh4x4_dspr2(short *input, short *mb_dqcoeff)
 {
     short output[16];
     int col, row, k;

     prefetch_load_short(input);

     /* vertical pass: combine the four entries of each column */
     for (col = 0; col < 4; col++)
     {
         const int outer_sum  = input[col]     + input[col + 12];
         const int inner_sum  = input[col + 4] + input[col + 8];
         const int inner_diff = input[col + 4] - input[col + 8];
         const int outer_diff = input[col]     - input[col + 12];

         output[col]      = outer_sum  + inner_sum;
         output[col + 4]  = inner_diff + outer_diff;
         output[col + 8]  = outer_sum  - inner_sum;
         output[col + 12] = outer_diff - inner_diff;
     }

     prefetch_load_short(output);

     /* horizontal pass, in place; the +3 is the rounding term for >> 3 */
     for (row = 0; row < 4; row++)
     {
         short *line = output + 4 * row;
         const int outer_sum  = line[0] + line[3] + 3;
         const int inner_sum  = line[1] + line[2];
         const int inner_diff = line[1] - line[2];
         const int outer_diff = line[0] - line[3] + 3;

         line[0] = (outer_sum  + inner_sum)  >> 3;
         line[1] = (outer_diff + inner_diff) >> 3;
         line[2] = (outer_sum  - inner_sum)  >> 3;
         line[3] = (outer_diff - inner_diff) >> 3;
     }

     /* scatter: one result into the first slot of each 16-short group */
     for (k = 0; k < 16; k++)
         mb_dqcoeff[k * 16] = output[k];
 }
 /*
  * DC-only inverse Walsh-Hadamard transform: every one of the 16 outputs
  * equals (input[0] + 3) >> 3.
  *
  * input:      coefficient block; only input[0] is read.
  * mb_dqcoeff: receives the value at indices 0, 16, 32, ..., 240 (byte
  *             offsets 0, 32, ..., 480).
  *
  * Written as plain C stores: the previous inline asm issued the same 16
  * halfword stores but declared no memory output or "memory" clobber, so the
  * compiler was never told that mb_dqcoeff is modified.
  */
 void vp8_short_inv_walsh4x4_1_dspr2(short *input, short *mb_dqcoeff)
 {
     const int a1 = ((input[0] + 3) >> 3);
     int i;

     for (i = 0; i < 16; i++)
     {
         mb_dqcoeff[i * 16] = (short)a1;
     }
 }
 #endif
--- a/vp8/common/mips/dspr2/reconinter_dspr2.c
+++ b/vp8/common/mips/dspr2/reconinter_dspr2.c
@@ -1,121 +0,0 @@
 /*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
 #include "vpx/vpx_integer.h"
 #if HAVE_DSPR2
 /*
  * Issue a MIPS "pref" instruction with hint 0 on the address held in src.
  * The asm has no outputs; it only takes src as a register input.
  * NOTE(review): declared "inline" without "static"; under C99 inline rules
  * that does not emit an external definition -- confirm the build's C dialect
  * or whether this helper is meant to be file-local.
  */
 inline void prefetch_load_int(unsigned char *src)
 {
     __asm__ __volatile__ (
         "pref   0,  0(%[src])   \n\t"
         :
         : [src] "r" (src)
     );
 }
 /*
  * Copy a 16x16 block of bytes.
  *
  * src / src_stride: source block and distance in bytes between its rows.
  * dst / dst_stride: destination block and distance between its rows.
  *
  * Source rows are read with unaligned word loads (ulw); destination rows
  * are written with plain word stores (sw), so dst rows are assumed to be
  * four-byte aligned.
  */
 __inline void vp8_copy_mem16x16_dspr2(
     unsigned char *RESTRICT src,
     int src_stride,
     unsigned char *RESTRICT dst,
     int dst_stride)
 {
     int r;
     unsigned int a0, a1, a2, a3;
     for (r = 16; r--;)
     {
         /* load src data in cache memory */
         prefetch_load_int(src + src_stride);
         /* Unaligned loads, aligned stores.  The "memory" clobber tells the
          * compiler that the asm reads *src and writes *dst; without it the
          * accesses are invisible and may be reordered against C code. */
         __asm__ __volatile__ (
             "ulw    %[a0], 0(%[src])            \n\t"
             "ulw    %[a1], 4(%[src])            \n\t"
             "ulw    %[a2], 8(%[src])            \n\t"
             "ulw    %[a3], 12(%[src])           \n\t"
             "sw     %[a0], 0(%[dst])            \n\t"
             "sw     %[a1], 4(%[dst])            \n\t"
             "sw     %[a2], 8(%[dst])            \n\t"
             "sw     %[a3], 12(%[dst])           \n\t"
             : [a0] "=&r" (a0), [a1] "=&r" (a1),
               [a2] "=&r" (a2), [a3] "=&r" (a3)
             : [src] "r" (src), [dst] "r" (dst)
             : "memory"
         );
         src += src_stride;
         dst += dst_stride;
     }
 }
 /*
  * Copy an 8x8 block of bytes.
  *
  * src / src_stride: source block and distance in bytes between its rows.
  * dst / dst_stride: destination block and distance between its rows.
  *
  * Source rows are read with unaligned word loads (ulw); destination rows
  * are written with plain word stores (sw), so dst rows are assumed to be
  * four-byte aligned.
  */
 __inline void vp8_copy_mem8x8_dspr2(
     unsigned char *RESTRICT src,
     int src_stride,
     unsigned char *RESTRICT dst,
     int dst_stride)
 {
     int r;
     unsigned int a0, a1;
     /* load src data in cache memory (second row only, issued once) */
     prefetch_load_int(src + src_stride);
     for (r = 8; r--;)
     {
         /* Unaligned loads, aligned stores.  The "memory" clobber tells the
          * compiler that the asm reads *src and writes *dst; without it the
          * accesses are invisible and may be reordered against C code. */
         __asm__ __volatile__ (
             "ulw    %[a0], 0(%[src])            \n\t"
             "ulw    %[a1], 4(%[src])            \n\t"
             "sw     %[a0], 0(%[dst])            \n\t"
             "sw     %[a1], 4(%[dst])            \n\t"
             : [a0] "=&r" (a0), [a1] "=&r" (a1)
             : [src] "r" (src), [dst] "r" (dst)
             : "memory"
         );
         src += src_stride;
         dst += dst_stride;
     }
 }
 /*
  * Copy a block of bytes 8 wide and 4 rows high.
  *
  * src / src_stride: source block and distance in bytes between its rows.
  * dst / dst_stride: destination block and distance between its rows.
  *
  * Source rows are read with unaligned word loads (ulw); destination rows
  * are written with plain word stores (sw), so dst rows are assumed to be
  * four-byte aligned.
  */
 __inline void vp8_copy_mem8x4_dspr2(
     unsigned char *RESTRICT src,
     int src_stride,
     unsigned char *RESTRICT dst,
     int dst_stride)
 {
     int r;
     unsigned int a0, a1;
     /* load src data in cache memory (second row only, issued once) */
     prefetch_load_int(src + src_stride);
     for (r = 4; r--;)
     {
         /* Unaligned loads, aligned stores.  The "memory" clobber tells the
          * compiler that the asm reads *src and writes *dst; without it the
          * accesses are invisible and may be reordered against C code. */
         __asm__ __volatile__ (
             "ulw    %[a0], 0(%[src])            \n\t"
             "ulw    %[a1], 4(%[src])            \n\t"
             "sw     %[a0], 0(%[dst])            \n\t"
             "sw     %[a1], 4(%[dst])            \n\t"
             : [a0] "=&r" (a0), [a1] "=&r" (a1)
             : [src] "r" (src), [dst] "r" (dst)
             : "memory"
         );
         src += src_stride;
         dst += dst_stride;
     }
 }
 #endif
--- a/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c
+++ b/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c
--- a/vp8/common/mips/msa/bilinear_filter_msa.c
+++ b/vp8/common/mips/msa/bilinear_filter_msa.c
@@ -1,911 +0,0 @@
 /*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "./vp8_rtcd.h"
 #include "vpx_ports/mem.h"
 #include "vp8/common/filter.h"
 #include "vp8/common/mips/msa/vp8_macros_msa.h"
 /*
  * Two-tap bilinear filter weights, 16-byte aligned.  Entry k holds the pair
  * { 128 - 16 * (k + 1), 16 * (k + 1) }, so every pair sums to 128.
  * NOTE(review): presumably indexed by (fractional offset - 1), with the
  * zero-offset case handled without filtering -- confirm against callers.
  */
 DECLARE_ALIGNED(16, static const int8_t, vp8_bilinear_filters_msa[7][2]) =
 {
     { 112, 16 },
     { 96, 32 },
     { 80, 48 },
     { 64, 64 },
     { 48, 80 },
     { 32, 96 },
     { 16, 112 }
 };
 /*
  * Byte-shuffle index tables, three rows of 16.  Each row lists overlapping
  * (i, i + 1) index pairs, which pairs up neighbouring source bytes for a
  * two-tap dot product.
  * NOTE(review): indices of 16 and above presumably select bytes from the
  * second source vector of the shuffle -- confirm against the VSHF macros.
  */
 static const uint8_t vp8_mc_filt_mask_arr[16 * 3] =
 {
     /* 8 width cases: pairs (0,1)..(7,8) */
     0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
     /* 4 width cases: pairs starting at bytes 0..3 and 16..19 */
     0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
     /* 4 width cases: pairs starting at bytes 8..11 and 24..27 */
     8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
 };
 /*
  * Horizontal two-tap filter for a block 4 pixels wide and 4 rows high.
  * Loads four source rows, filters them with the taps at filter, rounds by
  * VP8_FILTER_SHIFT and stores a 4x4 result to dst.
  */
 static void common_hz_2t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                  uint8_t *RESTRICT dst, int32_t dst_stride,
                                  const int8_t *filter)
 {
     v16i8 row0, row1, row2, row3;
     v16i8 shuffle;
     v16u8 taps;
     v16u8 pairs01, pairs23;
     v16u8 packed01, packed23;
     v8u16 sum01, sum23;
     v8u16 raw_taps;

     /* second table row: the 4-wide index pattern */
     shuffle = LD_SB(&vp8_mc_filt_mask_arr[16]);

     /* broadcast the first halfword (both tap bytes) across the vector */
     raw_taps = LD_UH(filter);
     taps = (v16u8)__msa_splati_h((v8i16)raw_taps, 0);

     LD_SB4(src, src_stride, row0, row1, row2, row3);
     VSHF_B2_UB(row0, row1, row2, row3, shuffle, shuffle, pairs01, pairs23);
     DOTP_UB2_UH(pairs01, pairs23, taps, taps, sum01, sum23);
     SRARI_H2_UH(sum01, sum23, VP8_FILTER_SHIFT);
     PCKEV_B2_UB(sum01, sum01, sum23, sum23, packed01, packed23);
     ST4x4_UB(packed01, packed23, 0, 1, 0, 1, dst, dst_stride);
 }
 /*
  * Horizontal two-tap filter for a block 4 pixels wide and 8 rows high.
  * Loads eight source rows, filters them with the taps at filter, rounds by
  * VP8_FILTER_SHIFT and stores the result as two 4x4 groups.
  */
 static void common_hz_2t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                  uint8_t *RESTRICT dst, int32_t dst_stride,
                                  const int8_t *filter)
 {
     v16i8 row0, row1, row2, row3, row4, row5, row6, row7;
     v16i8 shuffle;
     v16u8 taps;
     v16u8 pairs01, pairs23, pairs45, pairs67;
     v16i8 packed01, packed23, packed45, packed67;
     v8u16 sum01, sum23, sum45, sum67;
     v8u16 raw_taps;

     /* second table row: the 4-wide index pattern */
     shuffle = LD_SB(&vp8_mc_filt_mask_arr[16]);

     /* broadcast the first halfword (both tap bytes) across the vector */
     raw_taps = LD_UH(filter);
     taps = (v16u8)__msa_splati_h((v8i16)raw_taps, 0);

     LD_SB8(src, src_stride, row0, row1, row2, row3, row4, row5, row6, row7);
     VSHF_B2_UB(row0, row1, row2, row3, shuffle, shuffle, pairs01, pairs23);
     VSHF_B2_UB(row4, row5, row6, row7, shuffle, shuffle, pairs45, pairs67);
     DOTP_UB4_UH(pairs01, pairs23, pairs45, pairs67, taps, taps, taps, taps,
                 sum01, sum23, sum45, sum67);
     SRARI_H4_UH(sum01, sum23, sum45, sum67, VP8_FILTER_SHIFT);
     PCKEV_B4_SB(sum01, sum01, sum23, sum23, sum45, sum45, sum67, sum67,
                 packed01, packed23, packed45, packed67);

     /* rows 0-3, then rows 4-7 */
     ST4x4_UB(packed01, packed23, 0, 1, 0, 1, dst, dst_stride);
     dst += (4 * dst_stride);
     ST4x4_UB(packed45, packed67, 0, 1, 0, 1, dst, dst_stride);
 }
 /*
  * Horizontal two-tap filter for blocks 4 pixels wide: dispatches on height
  * to the 4-row or 8-row kernel.  Any other height does nothing.
  */
 static void common_hz_2t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
 {
     switch (height)
     {
         case 4:
             common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
             break;
         case 8:
             common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
             break;
         default:
             /* unsupported height: leave dst untouched */
             break;
     }
 }
 /*
  * Horizontal two-tap filter for a block 8 pixels wide and 4 rows high.
  * Loads four source rows, filters them with the taps at filter, rounds by
  * VP8_FILTER_SHIFT and stores an 8x4 result to dst.
  */
 static void common_hz_2t_8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                  uint8_t *RESTRICT dst, int32_t dst_stride,
                                  const int8_t *filter)
 {
     v16i8 row0, row1, row2, row3;
     v16i8 shuffle;
     v16i8 packed01, packed23;
     v16u8 taps;
     v8u16 acc0, acc1, acc2, acc3;
     v8u16 raw_taps;

     /* first table row: the 8-wide index pattern */
     shuffle = LD_SB(&vp8_mc_filt_mask_arr[0]);

     /* broadcast the first halfword (both tap bytes) across the vector */
     raw_taps = LD_UH(filter);
     taps = (v16u8)__msa_splati_h((v8i16)raw_taps, 0);

     LD_SB4(src, src_stride, row0, row1, row2, row3);
     VSHF_B2_UH(row0, row0, row1, row1, shuffle, shuffle, acc0, acc1);
     VSHF_B2_UH(row2, row2, row3, row3, shuffle, shuffle, acc2, acc3);

     /* dot product and rounding are done in place on the accumulators */
     DOTP_UB4_UH(acc0, acc1, acc2, acc3, taps, taps, taps, taps,
                 acc0, acc1, acc2, acc3);
     SRARI_H4_UH(acc0, acc1, acc2, acc3, VP8_FILTER_SHIFT);

     PCKEV_B2_SB(acc1, acc0, acc3, acc2, packed01, packed23);
     ST8x4_UB(packed01, packed23, dst, dst_stride);
 }
 /*
  * Horizontal two-tap filter for blocks 8 pixels wide whose height is 8 or
  * 16.  Rows are processed in groups of four: two groups unconditionally,
  * and two more when height is 16.  Loads for the next group are issued
  * before the current group is packed and stored.
  */
 static void common_hz_2t_8x8mult_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter, int32_t height)
 {
     v16u8 filt0;
     v16i8 src0, src1, src2, src3, mask, out0, out1;
     v8u16 vec0, vec1, vec2, vec3, filt;
     /* first table row: the 8-wide index pattern */
     mask = LD_SB(&vp8_mc_filt_mask_arr[0]);
     /* broadcast the first halfword (both tap bytes) across the vector */
     filt = LD_UH(filter);
     filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
     /* rows 0-3 */
     LD_SB4(src, src_stride, src0, src1, src2, src3);
     src += (4 * src_stride);
     VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
     VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                 vec0, vec1, vec2, vec3);
     SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
     /* load rows 4-7 before storing rows 0-3 */
     LD_SB4(src, src_stride, src0, src1, src2, src3);
     src += (4 * src_stride);
     PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
     ST8x4_UB(out0, out1, dst, dst_stride);
     dst += (4 * dst_stride);
     /* rows 4-7 */
     VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
     VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                 vec0, vec1, vec2, vec3);
     SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
     PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
     ST8x4_UB(out0, out1, dst, dst_stride);
     dst += (4 * dst_stride);
     if (16 == height)
     {
         /* rows 8-11 */
         LD_SB4(src, src_stride, src0, src1, src2, src3);
         src += (4 * src_stride);
         VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
         VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
         DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                     vec0, vec1, vec2, vec3);
         SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
         /* load rows 12-15 before storing rows 8-11 */
         LD_SB4(src, src_stride, src0, src1, src2, src3);
         src += (4 * src_stride);
         PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
         /* dst is not advanced here; the final store below offsets it */
         ST8x4_UB(out0, out1, dst, dst_stride);
         /* rows 12-15 */
         VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
         VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
         DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                     vec0, vec1, vec2, vec3);
         SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
         PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
         ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
     }
 }
 /*
  * Horizontal two-tap filter for blocks 8 pixels wide: a height of 4 uses
  * the dedicated 8x4 kernel, every other height goes to the 8x8-multiple
  * kernel.
  */
 static void common_hz_2t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
 {
     if (height != 4)
     {
         common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
         return;
     }

     common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
 }
 /* Horizontal filter pass for the "16w" block family, four rows per step.
  *
  * Each row is read as two vectors, one at src and one at src + 8.  Both are
  * shuffled with the mask loaded from vp8_mc_filt_mask_arr[0], multiplied
  * against the splatted filter halfword, rounded by VP8_FILTER_SHIFT and
  * written out with one PCKEV_ST_SB per row.
  *
  * The first four rows are processed ahead of the loop; the loop then runs
  * (height >> 2) - 1 more times.  loop_cnt is unsigned, so a height below 4
  * would wrap the counter around -- callers must pass height >= 4.  Rows
  * beyond the last full group of four are not processed.
  *
  * NOTE(review): the helper macros are defined outside this chunk; the
  * description of their effect follows their names -- confirm against
  * vp8_macros_msa.h. */
 static void common_hz_2t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                  uint8_t *RESTRICT dst, int32_t dst_stride,
                                  const int8_t *filter, int32_t height)
 {
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
    v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
    mask = LD_SB(&vp8_mc_filt_mask_arr[0]);
    /* one group of four rows is peeled off below, hence the "- 1" */
    loop_cnt = (height >> 2) - 1;
    filt = LD_UH(filter);
    /* replicate halfword element 0 of the loaded filter into every lane */
    filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
    /* even-numbered srcN come from src, odd-numbered ones from src + 8 */
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                out0, out1, out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
                out4, out5, out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
    SRARI_H4_UH(out4, out5, out6, out7, VP8_FILTER_SHIFT);
    /* one output row per (outN, outN+1) pair */
    PCKEV_ST_SB(out0, out1, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out2, out3, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out4, out5, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out6, out7, dst);
    dst += dst_stride;
    /* remaining groups of four rows: same sequence as above */
    for (; loop_cnt--;)
    {
        LD_SB4(src, src_stride, src0, src2, src4, src6);
        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
        src += (4 * src_stride);
        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
                    out4, out5, out6, out7);
        SRARI_H4_UH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
        SRARI_H4_UH(out4, out5, out6, out7, VP8_FILTER_SHIFT);
        PCKEV_ST_SB(out0, out1, dst);
        dst += dst_stride;
        PCKEV_ST_SB(out2, out3, dst);
        dst += dst_stride;
        PCKEV_ST_SB(out4, out5, dst);
        dst += dst_stride;
        PCKEV_ST_SB(out6, out7, dst);
        dst += dst_stride;
    }
 }
 /* Vertical filter pass for a 4x4 block (size per the function name).
  *
  * Five consecutive source rows are loaded.  Each adjacent pair of rows is
  * byte-interleaved, the four interleaved pairs are combined into two
  * vectors, and those are multiplied against the splatted filter halfword,
  * rounded by VP8_FILTER_SHIFT, packed back to bytes and stored with
  * ST4x4_UB.
  *
  * NOTE(review): the helper macros are defined outside this chunk; their
  * effect is described from their names -- confirm against
  * vp8_macros_msa.h. */
 static void common_vt_2t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                  uint8_t *RESTRICT dst, int32_t dst_stride,
                                  const int8_t *filter)
 {
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
    v16u8 filt0;
    v8i16 filt;
    v8u16 tmp0, tmp1;
    filt = LD_SH(filter);
    /* replicate halfword element 0 of the loaded filter into every lane */
    filt0 = (v16u8)__msa_splati_h(filt, 0);
    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    /* src is not read again after this advance */
    src += (5 * src_stride);
    /* srcBA_r pairs row B with row A (rows 1/0, 2/1, 3/2, 4/3) */
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
               src10_r, src21_r, src32_r, src43_r);
    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT);
    /* src2110 is reused here as the packed byte result */
    src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
 }
 /* Vertical filter pass for a 4x8 block (size per the function name).
  *
  * Nine consecutive source rows are loaded (eight with LD_SB8 plus one
  * more).  Each adjacent pair of rows is byte-interleaved, the eight pairs
  * are combined into four vectors, and those are multiplied against the
  * splatted filter halfword, rounded by VP8_FILTER_SHIFT, packed to bytes
  * and stored with two ST4x4_UB calls, the second one 4 rows further down
  * in dst.
  *
  * NOTE(review): the helper macros are defined outside this chunk; their
  * effect is described from their names -- confirm against
  * vp8_macros_msa.h. */
 static void common_vt_2t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                  uint8_t *RESTRICT dst, int32_t dst_stride,
                                  const int8_t *filter)
 {
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
    v8u16 tmp0, tmp1, tmp2, tmp3;
    v16u8 filt0;
    v8i16 filt;
    filt = LD_SH(filter);
    /* replicate halfword element 0 of the loaded filter into every lane */
    filt0 = (v16u8)__msa_splati_h(filt, 0);
    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    /* ninth row: needed as the lower neighbour of row 7 */
    src8 = LD_SB(src);
    /* src is not read again after this advance */
    src += src_stride;
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src87_r, src76_r, src2110, src4332, src6554, src8776);
    DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
                tmp0, tmp1, tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
    /* src2110 / src4332 are reused here as the packed byte results */
    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
    ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
    ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
 }
 /* Vertical filter pass dispatcher for the "4w" block family.
  * Only heights 4 and 8 have a kernel; for any other height the function
  * returns without touching dst. */
 static void common_vt_2t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
 {
    switch (height)
    {
        case 4:
            common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
            break;
        case 8:
            common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
            break;
        default:
            /* no kernel for this height: nothing is written */
            break;
    }
 }
 /* Vertical filter pass for an 8x4 block (size per the function name).
  *
  * Five consecutive source rows are loaded; each adjacent pair of rows is
  * byte-interleaved, multiplied against the splatted filter halfword,
  * rounded by VP8_FILTER_SHIFT, packed to bytes and stored with ST8x4_UB.
  *
  * NOTE(review): the helper macros are defined outside this chunk; their
  * effect is described from their names -- confirm against
  * vp8_macros_msa.h. */
 static void common_vt_2t_8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                  uint8_t *RESTRICT dst, int32_t dst_stride,
                                  const int8_t *filter)
 {
    v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
    v16i8 out0, out1;
    v8u16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt;
    filt = LD_SH(filter);
    /* replicate halfword element 0 of the loaded filter into every lane */
    filt0 = (v16u8)__msa_splati_h(filt, 0);
    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
    /* vecN pairs row N+1 with row N */
    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
    ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                tmp0, tmp1, tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);
 }
 /* Vertical filter pass for the "8w" block family, eight rows per loop
  * iteration.
  *
  * The loop runs height >> 3 times, so any rows beyond the last full group
  * of eight are not processed.  One source row is loaded ahead of the loop
  * and the last row of each iteration (src8) is carried over as src0 of the
  * next, so every iteration loads only eight new rows.
  *
  * NOTE(review): the helper macros are defined outside this chunk; their
  * effect is described from their names -- confirm against
  * vp8_macros_msa.h. */
 static void common_vt_2t_8x8mult_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter, int32_t height)
 {
    uint32_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
    v16i8 out0, out1;
    v8u16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt;
    filt = LD_SH(filter);
    /* replicate halfword element 0 of the loaded filter into every lane */
    filt0 = (v16u8)__msa_splati_h(filt, 0);
    /* first row, used as the upper neighbour of the first new row */
    src0 = LD_UB(src);
    src += src_stride;
    for (loop_cnt = (height >> 3); loop_cnt--;)
    {
        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
        src += (8 * src_stride);
        /* vecN pairs row N+1 with row N */
        ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
                   vec0, vec1, vec2, vec3);
        ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
                   vec4, vec5, vec6, vec7);
        /* first four output rows */
        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    tmp0, tmp1, tmp2, tmp3);
        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
        PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST8x4_UB(out0, out1, dst, dst_stride);
        dst += (4 * dst_stride);
        /* second four output rows */
        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
                    tmp0, tmp1, tmp2, tmp3);
        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
        PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST8x4_UB(out0, out1, dst, dst_stride);
        dst += (4 * dst_stride);
        /* carry the last loaded row into the next iteration */
        src0 = src8;
    }
 }
 /* Vertical filter pass dispatcher for the "8w" block family.
  * A height of exactly 4 is served by the dedicated 8x4 kernel; every other
  * height is forwarded, together with the height value, to the 8x8mult
  * kernel. */
 static void common_vt_2t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
 {
    if (height != 4)
    {
        /* general case: the callee receives the height */
        common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
                                 height);
        return;
    }
    common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
 }
 /* Vertical filter pass for the "16w" block family, four rows per loop
  * iteration.
  *
  * The loop runs height >> 2 times, so rows beyond the last full group of
  * four are not processed.  One source row is loaded ahead of the loop and
  * the last row of each iteration (src4) is carried over as src0 of the
  * next.  For every pair of adjacent rows both the ILVR and the ILVL
  * interleave are taken, and the two results feed one PCKEV_ST_SB store
  * per output row.
  *
  * NOTE(review): the helper macros are defined outside this chunk; their
  * effect is described from their names -- confirm against
  * vp8_macros_msa.h. */
 static void common_vt_2t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                  uint8_t *RESTRICT dst, int32_t dst_stride,
                                  const int8_t *filter, int32_t height)
 {
    uint32_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4;
    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
    v8u16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt;
    filt = LD_SH(filter);
    /* replicate halfword element 0 of the loaded filter into every lane */
    filt0 = (v16u8)__msa_splati_h(filt, 0);
    /* first row, used as the upper neighbour of the first new row */
    src0 = LD_UB(src);
    src += src_stride;
    for (loop_cnt = (height >> 2); loop_cnt--;)
    {
        LD_UB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);
        /* rows 1/0 -> vec0 (R), vec1 (L); rows 2/1 -> vec2 (R), vec3 (L) */
        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
        SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT);
        PCKEV_ST_SB(tmp0, tmp1, dst);
        dst += dst_stride;
        /* rows 3/2 -> vec4 (R), vec5 (L); rows 4/3 -> vec6 (R), vec7 (L) */
        ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
        ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
        SRARI_H2_UH(tmp2, tmp3, VP8_FILTER_SHIFT);
        PCKEV_ST_SB(tmp2, tmp3, dst);
        dst += dst_stride;
        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
        SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT);
        PCKEV_ST_SB(tmp0, tmp1, dst);
        dst += dst_stride;
        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
        SRARI_H2_UH(tmp2, tmp3, VP8_FILTER_SHIFT);
        PCKEV_ST_SB(tmp2, tmp3, dst);
        dst += dst_stride;
        /* carry the last loaded row into the next iteration */
        src0 = src4;
    }
 }
 /* Combined horizontal + vertical filter pass for a 4x4 block (size per the
  * function name).
  *
  * Five source rows are loaded.  HORIZ_2TAP_FILT_UH is applied to the row
  * pairs (0,1), (2,3) and (4,4) using the mask at vp8_mc_filt_mask_arr[16]
  * and the horizontal filter; hz_out1 and hz_out3 are then derived from
  * those results with sldi.b / pckod.d instead of being filtered
  * separately.  The interleaved horizontal results are multiplied against
  * the vertical filter, rounded by VP8_FILTER_SHIFT, packed to bytes and
  * stored with ST4x4_UB.
  *
  * NOTE(review): the helper macros are defined outside this chunk; their
  * effect is described from their names -- confirm against
  * vp8_macros_msa.h. */
 static void common_hv_2ht_2vt_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                       uint8_t *RESTRICT dst, int32_t dst_stride,
                                       const int8_t *filter_horiz,
                                       const int8_t *filter_vert)
 {
    v16i8 src0, src1, src2, src3, src4, mask;
    v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
    mask = LD_SB(&vp8_mc_filt_mask_arr[16]);
    /* replicate halfword element 0 of each loaded filter into every lane */
    filt = LD_UH(filter_horiz);
    filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
    filt = LD_UH(filter_vert);
    filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    /* horizontal pass, two rows per call */
    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, VP8_FILTER_SHIFT);
    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, VP8_FILTER_SHIFT);
    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT);
    /* build the in-between vectors from halves of the neighbouring results */
    hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
    hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
    /* vertical pass */
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT);
    PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
 }
 /* Combined horizontal + vertical filter pass for a 4x8 block (size per the
  * function name).
  *
  * Nine source rows are loaded (eight with LD_SB8 plus one more).
  * HORIZ_2TAP_FILT_UH is applied to the row pairs (0,1), (2,3), (4,5),
  * (6,7) and (8,8); the odd-numbered hz_out vectors are derived from those
  * results with SLDI_B3_UH / pckod.d.  The interleaved horizontal results
  * are multiplied against the vertical filter, rounded by VP8_FILTER_SHIFT,
  * packed to bytes and stored with two ST4x4_UB calls.
  *
  * NOTE(review): the helper macros are defined outside this chunk; their
  * effect is described from their names -- confirm against
  * vp8_macros_msa.h. */
 static void common_hv_2ht_2vt_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                       uint8_t *RESTRICT dst, int32_t dst_stride,
                                       const int8_t *filter_horiz,
                                       const int8_t *filter_vert)
 {
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
    v16i8 res0, res1, res2, res3;
    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
    mask = LD_SB(&vp8_mc_filt_mask_arr[16]);
    /* replicate halfword element 0 of each loaded filter into every lane */
    filt = LD_UH(filter_horiz);
    filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
    filt = LD_UH(filter_vert);
    filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    /* ninth row */
    src8 = LD_SB(src);
    /* horizontal pass, two rows per call */
    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, VP8_FILTER_SHIFT);
    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, VP8_FILTER_SHIFT);
    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, VP8_FILTER_SHIFT);
    hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, VP8_FILTER_SHIFT);
    hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, VP8_FILTER_SHIFT);
    /* build the odd-numbered vectors from the neighbouring even ones */
    SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
               hz_out3, hz_out5, 8);
    hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
    /* vertical pass */
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
                vec4, vec5, vec6, vec7);
    SRARI_H4_UH(vec4, vec5, vec6, vec7, VP8_FILTER_SHIFT);
    PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
                res0, res1, res2, res3);
    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
    dst += (4 * dst_stride);
    ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
 }
 /* Combined horizontal + vertical filter dispatcher for the "4w" block
  * family.  Only heights 4 and 8 have a kernel; for any other height the
  * function returns without touching dst. */
 static void common_hv_2ht_2vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height)
 {
    switch (height)
    {
        case 4:
            common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride,
                                      filter_horiz, filter_vert);
            break;
        case 8:
            common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride,
                                      filter_horiz, filter_vert);
            break;
        default:
            /* no kernel for this height: nothing is written */
            break;
    }
 }
 /* Combined horizontal + vertical filter pass for an 8x4 block (size per
  * the function name).
  *
  * Five source rows are loaded and each is filtered horizontally on its
  * own.  hz_out0 and hz_out1 alternate roles: after every new horizontal
  * result the two most recent rows are interleaved and multiplied against
  * the vertical filter, so each horizontally filtered row is computed once
  * and used for two consecutive output rows.  The statement order is
  * therefore significant.
  *
  * NOTE(review): the helper macros are defined outside this chunk; their
  * effect is described from their names -- confirm against
  * vp8_macros_msa.h. */
 static void common_hv_2ht_2vt_8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                       uint8_t *RESTRICT dst, int32_t dst_stride,
                                       const int8_t *filter_horiz,
                                       const int8_t *filter_vert)
 {
    v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
    v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
    v8i16 filt;
    mask = LD_SB(&vp8_mc_filt_mask_arr[0]);
    /* replicate halfword element 0 of each loaded filter into every lane */
    filt = LD_SH(filter_horiz);
    filt_hz = (v16u8)__msa_splati_h(filt, 0);
    filt = LD_SH(filter_vert);
    filt_vt = (v16u8)__msa_splati_h(filt, 0);
    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    /* output row 0: rows 0 and 1 */
    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, VP8_FILTER_SHIFT);
    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
    /* output row 1: rows 1 and 2 (hz_out0 now holds row 2) */
    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, VP8_FILTER_SHIFT);
    vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp1 = __msa_dotp_u_h(vec1, filt_vt);
    /* output row 2: rows 2 and 3 (hz_out1 now holds row 3) */
    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, VP8_FILTER_SHIFT);
    vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp2 = __msa_dotp_u_h(vec2, filt_vt);
    /* output row 3: rows 3 and 4 (hz_out0 now holds row 4) */
    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT);
    vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp3 = __msa_dotp_u_h(vec3, filt_vt);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);
 }
 /* Combined horizontal + vertical filter pass for the "8w" block family,
  * eight rows per loop iteration.
  *
  * The loop runs height >> 3 times, so rows beyond the last full group of
  * eight are not processed.  The first source row is filtered horizontally
  * ahead of the loop; inside the loop hz_out0 and hz_out1 alternate roles
  * so that each horizontally filtered row is used for two consecutive
  * output rows, and hz_out0 carries the last row into the next iteration.
  * Each iteration loads two groups of four rows; the second load reuses
  * src1..src4 and is issued only after the first group has been consumed
  * by the horizontal filter, so the statement order is significant.
  *
  * NOTE(review): the helper macros are defined outside this chunk; their
  * effect is described from their names -- confirm against
  * vp8_macros_msa.h. */
 static void common_hv_2ht_2vt_8x8mult_msa(uint8_t *RESTRICT src,
                                           int32_t src_stride,
                                           uint8_t *RESTRICT dst,
                                           int32_t dst_stride,
                                           const int8_t *filter_horiz,
                                           const int8_t *filter_vert,
                                           int32_t height)
 {
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
    v16u8 filt_hz, filt_vt, vec0;
    v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
    v8i16 filt;
    mask = LD_SB(&vp8_mc_filt_mask_arr[0]);
    /* replicate halfword element 0 of each loaded filter into every lane */
    filt = LD_SH(filter_horiz);
    filt_hz = (v16u8)__msa_splati_h(filt, 0);
    filt = LD_SH(filter_vert);
    filt_vt = (v16u8)__msa_splati_h(filt, 0);
    /* first row: its horizontal result seeds the vertical pass */
    src0 = LD_SB(src);
    src += src_stride;
    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, VP8_FILTER_SHIFT);
    for (loop_cnt = (height >> 3); loop_cnt--;)
    {
        /* first group of four rows */
        LD_SB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);
        hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz,
                                     VP8_FILTER_SHIFT);
        vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
        tmp1 = __msa_dotp_u_h(vec0, filt_vt);
        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz,
                                     VP8_FILTER_SHIFT);
        vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
        tmp2 = __msa_dotp_u_h(vec0, filt_vt);
        SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
        hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz,
                                     VP8_FILTER_SHIFT);
        vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
        tmp3 = __msa_dotp_u_h(vec0, filt_vt);
        hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz,
                                     VP8_FILTER_SHIFT);
        /* second group of four rows; src1..src4 are free to be overwritten */
        LD_SB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);
        vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
        tmp4 = __msa_dotp_u_h(vec0, filt_vt);
        SRARI_H2_UH(tmp3, tmp4, VP8_FILTER_SHIFT);
        PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
        ST8x4_UB(out0, out1, dst, dst_stride);
        dst += (4 * dst_stride);
        hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz,
                                     VP8_FILTER_SHIFT);
        vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
        tmp5 = __msa_dotp_u_h(vec0, filt_vt);
        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz,
                                     VP8_FILTER_SHIFT);
        vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
        tmp6 = __msa_dotp_u_h(vec0, filt_vt);
        hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz,
                                     VP8_FILTER_SHIFT);
        vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
        tmp7 = __msa_dotp_u_h(vec0, filt_vt);
        /* hz_out0 ends the iteration holding the last row's result */
        hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz,
                                     VP8_FILTER_SHIFT);
        vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
        tmp8 = __msa_dotp_u_h(vec0, filt_vt);
        SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, VP8_FILTER_SHIFT);
        PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
        ST8x4_UB(out0, out1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
 }
 /* Combined horizontal + vertical filter dispatcher for the "8w" block
  * family.  A height of exactly 4 is served by the dedicated 8x4 kernel;
  * every other height is forwarded, together with the height value, to the
  * 8x8mult kernel. */
 static void common_hv_2ht_2vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height)
 {
    if (height != 4)
    {
        /* general case: the callee receives the height */
        common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
                                      filter_horiz, filter_vert, height);
        return;
    }
    common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride,
                              filter_horiz, filter_vert);
 }
 /* Combined horizontal + vertical filter pass for the "16w" block family,
  * four rows per loop iteration.
  *
  * Every row is handled as two vectors, one loaded at src and one at
  * src + 8.  The first row is filtered horizontally ahead of the loop into
  * hz_out0 / hz_out2.  Inside the loop the pairs (hz_out0, hz_out1) and
  * (hz_out2, hz_out3) alternate roles so each horizontally filtered row is
  * used for two consecutive output rows; hz_out0 / hz_out2 carry the last
  * row into the next iteration.  The loop runs height >> 2 times, so rows
  * beyond the last full group of four are not processed.
  *
  * NOTE(review): the helper macros are defined outside this chunk; their
  * effect is described from their names -- confirm against
  * vp8_macros_msa.h. */
 static void common_hv_2ht_2vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                       uint8_t *RESTRICT dst, int32_t dst_stride,
                                       const int8_t *filter_horiz,
                                       const int8_t *filter_vert,
                                       int32_t height)
 {
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
    v16u8 filt_hz, filt_vt, vec0, vec1;
    v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
    v8i16 filt;
    mask = LD_SB(&vp8_mc_filt_mask_arr[0]);
    /* rearranging filter */
    filt = LD_SH(filter_horiz);
    filt_hz = (v16u8)__msa_splati_h(filt, 0);
    filt = LD_SH(filter_vert);
    filt_vt = (v16u8)__msa_splati_h(filt, 0);
    /* first row: the second argument (8) is passed where the other loads
     * pass a stride -- presumably src0 <- src, src1 <- src + 8; confirm */
    LD_SB2(src, 8, src0, src1);
    src += src_stride;
    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, VP8_FILTER_SHIFT);
    hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT);
    for (loop_cnt = (height >> 2); loop_cnt--;)
    {
        /* even-numbered srcN come from src, odd-numbered ones from src + 8 */
        LD_SB4(src, src_stride, src0, src2, src4, src6);
        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
        src += (4 * src_stride);
        /* output row 0 of this group */
        hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz,
                                     VP8_FILTER_SHIFT);
        hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz,
                                     VP8_FILTER_SHIFT);
        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
        SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
        PCKEV_ST_SB(tmp1, tmp2, dst);
        dst += dst_stride;
        /* output row 1 */
        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz,
                                     VP8_FILTER_SHIFT);
        hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz,
                                     VP8_FILTER_SHIFT);
        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
        SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
        PCKEV_ST_SB(tmp1, tmp2, dst);
        dst += dst_stride;
        /* output row 2 */
        hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz,
                                     VP8_FILTER_SHIFT);
        hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz,
                                     VP8_FILTER_SHIFT);
        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
        SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
        PCKEV_ST_SB(tmp1, tmp2, dst);
        dst += dst_stride;
        /* output row 3; hz_out0 / hz_out2 carry over to the next iteration */
        hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz,
                                     VP8_FILTER_SHIFT);
        hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz,
                                     VP8_FILTER_SHIFT);
        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
        SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
        PCKEV_ST_SB(tmp1, tmp2, dst);
        dst += dst_stride;
    }
 }
 /* Bilinear prediction of a 4x4 block.
  *
  * src / src_stride   source pixels and row pitch
  * xoffset, yoffset   filter selectors; 0 means "no filtering in that
  *                    direction", a non-zero value N selects entry N - 1 of
  *                    vp8_bilinear_filters_msa
  * dst / dst_stride   destination pixels and row pitch
  *
  * Each filter-table lookup is performed only inside the branch that
  * actually uses it, so an offset of 0 never evaluates
  * vp8_bilinear_filters_msa[-1] (an out-of-bounds index).
  * With both offsets 0 the block is copied as four 32-bit words. */
 void vp8_bilinear_predict4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                  int32_t xoffset, int32_t yoffset,
                                  uint8_t *RESTRICT dst, int32_t dst_stride)
 {
    if (yoffset)
    {
        const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - 1];
        if (xoffset)
        {
            const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
            common_hv_2ht_2vt_4w_msa(src, src_stride, dst, dst_stride,
                                     h_filter, v_filter, 4);
        }
        else
        {
            common_vt_2t_4w_msa(src, src_stride, dst, dst_stride, v_filter, 4);
        }
    }
    else
    {
        if (xoffset)
        {
            const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
            common_hz_2t_4w_msa(src, src_stride, dst, dst_stride, h_filter, 4);
        }
        else
        {
            /* no filtering at all: plain 4x4 copy */
            uint32_t tp0, tp1, tp2, tp3;
            LW4(src, src_stride, tp0, tp1, tp2, tp3);
            SW4(tp0, tp1, tp2, tp3, dst, dst_stride);
        }
    }
 }
 /* Bilinear prediction of an 8x4 block.
  *
  * src / src_stride   source pixels and row pitch
  * xoffset, yoffset   filter selectors; 0 means "no filtering in that
  *                    direction", a non-zero value N selects entry N - 1 of
  *                    vp8_bilinear_filters_msa
  * dst / dst_stride   destination pixels and row pitch
  *
  * Each filter-table lookup is performed only inside the branch that
  * actually uses it, so an offset of 0 never evaluates
  * vp8_bilinear_filters_msa[-1] (an out-of-bounds index).
  * With both offsets 0 the block is copied via vp8_copy_mem8x4. */
 void vp8_bilinear_predict8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                  int32_t xoffset, int32_t yoffset,
                                  uint8_t *RESTRICT dst, int32_t dst_stride)
 {
    if (yoffset)
    {
        const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - 1];
        if (xoffset)
        {
            const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
            common_hv_2ht_2vt_8w_msa(src, src_stride, dst, dst_stride,
                                     h_filter, v_filter, 4);
        }
        else
        {
            common_vt_2t_8w_msa(src, src_stride, dst, dst_stride, v_filter, 4);
        }
    }
    else
    {
        if (xoffset)
        {
            const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
            common_hz_2t_8w_msa(src, src_stride, dst, dst_stride, h_filter, 4);
        }
        else
        {
            /* no filtering at all: plain copy */
            vp8_copy_mem8x4(src, src_stride, dst, dst_stride);
        }
    }
 }
 /* Bilinear prediction of an 8x8 block.
  *
  * src / src_stride   source pixels and row pitch
  * xoffset, yoffset   filter selectors; 0 means "no filtering in that
  *                    direction", a non-zero value N selects entry N - 1 of
  *                    vp8_bilinear_filters_msa
  * dst / dst_stride   destination pixels and row pitch
  *
  * Each filter-table lookup is performed only inside the branch that
  * actually uses it, so an offset of 0 never evaluates
  * vp8_bilinear_filters_msa[-1] (an out-of-bounds index).
  * With both offsets 0 the block is copied via vp8_copy_mem8x8. */
 void vp8_bilinear_predict8x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                  int32_t xoffset, int32_t yoffset,
                                  uint8_t *RESTRICT dst, int32_t dst_stride)
 {
    if (yoffset)
    {
        const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - 1];
        if (xoffset)
        {
            const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
            common_hv_2ht_2vt_8w_msa(src, src_stride, dst, dst_stride,
                                     h_filter, v_filter, 8);
        }
        else
        {
            common_vt_2t_8w_msa(src, src_stride, dst, dst_stride, v_filter, 8);
        }
    }
    else
    {
        if (xoffset)
        {
            const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
            common_hz_2t_8w_msa(src, src_stride, dst, dst_stride, h_filter, 8);
        }
        else
        {
            /* no filtering at all: plain copy */
            vp8_copy_mem8x8(src, src_stride, dst, dst_stride);
        }
    }
 }
 /* Bilinear prediction of a 16x16 block.
  *
  * src / src_stride   source pixels and row pitch
  * xoffset, yoffset   filter selectors; 0 means "no filtering in that
  *                    direction", a non-zero value N selects entry N - 1 of
  *                    vp8_bilinear_filters_msa
  * dst / dst_stride   destination pixels and row pitch
  *
  * Each filter-table lookup is performed only inside the branch that
  * actually uses it, so an offset of 0 never evaluates
  * vp8_bilinear_filters_msa[-1] (an out-of-bounds index).
  * With both offsets 0 the block is copied via vp8_copy_mem16x16. */
 void vp8_bilinear_predict16x16_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                    int32_t xoffset, int32_t yoffset,
                                    uint8_t *RESTRICT dst, int32_t dst_stride)
 {
    if (yoffset)
    {
        const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - 1];
        if (xoffset)
        {
            const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
            common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride,
                                      h_filter, v_filter, 16);
        }
        else
        {
            common_vt_2t_16w_msa(src, src_stride, dst, dst_stride, v_filter,
                                 16);
        }
    }
    else
    {
        if (xoffset)
        {
            const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
            common_hz_2t_16w_msa(src, src_stride, dst, dst_stride, h_filter,
                                 16);
        }
        else
        {
            /* no filtering at all: plain copy */
            vp8_copy_mem16x16(src, src_stride, dst, dst_stride);
        }
    }
 }
--- a/vp8/common/mips/msa/copymem_msa.c
+++ b/vp8/common/mips/msa/copymem_msa.c
@@ -1,70 +0,0 @@
 /*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "./vp8_rtcd.h"
 #include "vp8/common/mips/msa/vp8_macros_msa.h"
 /* Copies four rows of one 64-bit word each from src to dst.
  * All four rows are loaded before any of them is stored. */
 static void copy_8x4_msa(uint8_t *src, int32_t src_stride,
                          uint8_t *dst, int32_t dst_stride)
 {
    uint64_t row0, row1, row2, row3;
    LD4(src, src_stride, row0, row1, row2, row3);
    SD4(row0, row1, row2, row3, dst, dst_stride);
 }
 /* Copies eight rows of one 64-bit word each from src to dst, as two
  * load-4 / store-4 groups. */
 static void copy_8x8_msa(uint8_t *src, int32_t src_stride,
                          uint8_t *dst, int32_t dst_stride)
 {
    uint64_t row0, row1, row2, row3;
    /* rows 0..3 */
    LD4(src, src_stride, row0, row1, row2, row3);
    SD4(row0, row1, row2, row3, dst, dst_stride);
    /* step both pointers down to row 4 */
    src += (4 * src_stride);
    dst += (4 * dst_stride);
    /* rows 4..7 */
    LD4(src, src_stride, row0, row1, row2, row3);
    SD4(row0, row1, row2, row3, dst, dst_stride);
 }
 /* Copies sixteen rows of one v16u8 vector each from src to dst.
  * All sixteen rows are loaded (two groups of eight) before the first
  * store is issued. */
 static void copy_16x16_msa(uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride)
 {
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
    v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
    /* load phase */
    LD_UB8(src, src_stride, row0, row1, row2, row3, row4, row5, row6, row7);
    src += (8 * src_stride);
    LD_UB8(src, src_stride, row8, row9, row10, row11, row12, row13, row14,
           row15);
    /* store phase */
    ST_UB8(row0, row1, row2, row3, row4, row5, row6, row7, dst, dst_stride);
    dst += (8 * dst_stride);
    ST_UB8(row8, row9, row10, row11, row12, row13, row14, row15, dst,
           dst_stride);
 }
 /* Public entry point: forwards all arguments unchanged to the file-local
  * copy_16x16_msa(). */
 void vp8_copy_mem16x16_msa(uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride)
 {
    copy_16x16_msa(src, src_stride, dst, dst_stride);
 }
 /* Public entry point: forwards all arguments unchanged to the file-local
  * copy_8x8_msa(). */
 void vp8_copy_mem8x8_msa(uint8_t *src, int32_t src_stride,
                          uint8_t *dst, int32_t dst_stride)
 {
    copy_8x8_msa(src, src_stride, dst, dst_stride);
 }
 /* Public entry point: forwards all arguments unchanged to the file-local
  * copy_8x4_msa(). */
 void vp8_copy_mem8x4_msa(uint8_t *src, int32_t src_stride,
                          uint8_t *dst, int32_t dst_stride)
 {
    copy_8x4_msa(src, src_stride, dst, dst_stride);
 }
--- a/vp8/common/mips/msa/idct_msa.c
+++ b/vp8/common/mips/msa/idct_msa.c
@@ -1,457 +0,0 @@
 /*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "./vp8_rtcd.h"
 #include "vp8/common/blockd.h"
 #include "vp8/common/mips/msa/vp8_macros_msa.h"
 /*
  * Fixed-point multipliers used by the IDCT macros below (VP8_IDCT_1D_W
  * shifts the products right by 16 bits).  Numerically,
  * 20091 ~= (sqrt(2) * cos(pi / 8) - 1) * 65536 and
  * 35468 ~= sqrt(2) * sin(pi / 8) * 65536, which matches the names.
  */
 static const int32_t cospi8sqrt2minus1 = 20091;
 static const int32_t sinpi8sqrt2 = 35468;
 /*
  * Runs in0..in3 through TRANSPOSE8X4_SH_SH and then regroups the 64-bit
  * halves of the four intermediates: out0/out2 take the right (low) halves
  * via ILVR_D2_SH, out1/out3 the left (high) halves via __msa_ilvl_d.
  * The outputs may alias the inputs (callers pass the same variables).
  * NOTE(review): going by the name, the four v8i16 rows are treated as two
  * side-by-side 4x4 halfword blocks that are transposed independently --
  * confirm against TRANSPOSE8X4_SH_SH in vp8_macros_msa.h.
  */
 #define TRANSPOSE_TWO_4x4_H(in0, in1, in2, in3, out0, out1, out2, out3)  \
 {                                                                        \
     v8i16 s4_m, s5_m, s6_m, s7_m;                                        \
                                                                          \
     TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, s4_m, s5_m, s6_m, s7_m);      \
     ILVR_D2_SH(s6_m, s4_m, s7_m, s5_m, out0, out2);                      \
     out1 = (v8i16)__msa_ilvl_d((v2i64)s6_m, (v2i64)s4_m);                \
     out3 = (v8i16)__msa_ilvl_d((v2i64)s7_m, (v2i64)s5_m);                \
 }
 /*
  * GCC statement expression that evaluates to a v8i16.
  *
  * The halfwords of `in` are interleaved with zero into two vectors of
  * 32-bit lanes (ILVRL_H2_SW), arithmetic-shifted right by 16, multiplied by
  * sinpi8sqrt2, shifted right by 16 again, and the even halfwords of the two
  * word vectors are packed back into one v8i16 with __msa_pckev_h.
  * NOTE(review): the first >> 16 presumably recovers the sign-extended
  * halfword (the interleave would place it in the upper half of each word);
  * the 32-bit multiply is needed because 35468 does not fit a signed
  * halfword -- confirm against ILVRL_H2_SW.
  */
 #define EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in)     \
 ({                                                        \
     v8i16 out_m;                                          \
     v8i16 zero_m = { 0 };                                 \
     v4i32 tmp1_m, tmp2_m;                                 \
     v4i32 sinpi8_sqrt2_m = __msa_fill_w(sinpi8sqrt2);     \
                                                           \
     ILVRL_H2_SW(in, zero_m, tmp1_m, tmp2_m);              \
     tmp1_m >>= 16;                                        \
     tmp2_m >>= 16;                                        \
     tmp1_m = (tmp1_m * sinpi8_sqrt2_m) >> 16;             \
     tmp2_m = (tmp2_m * sinpi8_sqrt2_m) >> 16;             \
     out_m = __msa_pckev_h((v8i16)tmp2_m, (v8i16)tmp1_m);  \
                                                           \
     out_m;                                                \
 })
 /*
  * One 1-D IDCT pass on 16-bit lanes (v8i16 inputs and outputs).
  *
  *   a1 = in0 + in2
  *   b1 = in0 - in2
  *   c1 = sin_term(in1) - (in3 + cos_term(in3))
  *   d1 = (in1 + cos_term(in1)) + sin_term(in3)
  *
  * sin_term() is EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W; cos_term() is
  * __msa_mul_q_h by cospi8sqrt2minus1 followed by >> 1.  The four terms are
  * then combined into out0..out3 by BUTTERFLY_4.
  * NOTE(review): mul_q_h is a Q15 fixed-point multiply, so the extra >> 1
  * presumably makes it equivalent to the (x * 20091) >> 16 used by
  * VP8_IDCT_1D_W -- confirm.
  */
 #define VP8_IDCT_1D_H(in0, in1, in2, in3, out0, out1, out2, out3)  \
 {                                                                  \
     v8i16 a1_m, b1_m, c1_m, d1_m;                                  \
     v8i16 c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m;                  \
     v8i16 const_cospi8sqrt2minus1_m;                               \
                                                                    \
     const_cospi8sqrt2minus1_m = __msa_fill_h(cospi8sqrt2minus1);   \
     a1_m = in0 + in2;                                              \
     b1_m = in0 - in2;                                              \
     c_tmp1_m = EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in1);     \
     c_tmp2_m = __msa_mul_q_h(in3, const_cospi8sqrt2minus1_m);      \
     c_tmp2_m = c_tmp2_m >> 1;                                      \
     c_tmp2_m = in3 + c_tmp2_m;                                     \
     c1_m = c_tmp1_m - c_tmp2_m;                                    \
     d_tmp1_m = __msa_mul_q_h(in1, const_cospi8sqrt2minus1_m);      \
     d_tmp1_m = d_tmp1_m >> 1;                                      \
     d_tmp1_m = in1 + d_tmp1_m;                                     \
     d_tmp2_m = EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in3);     \
     d1_m = d_tmp1_m + d_tmp2_m;                                    \
     BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3);   \
 }
 /*
  * One 1-D IDCT pass on 32-bit lanes (v4i32 inputs and outputs).
  *
  *   a1 = in0 + in2
  *   b1 = in0 - in2
  *   c1 = ((in1 * sinpi8sqrt2) >> 16)
  *        - (in3 + ((in3 * cospi8sqrt2minus1) >> 16))
  *   d1 = (in1 + ((in1 * cospi8sqrt2minus1) >> 16))
  *        + ((in3 * sinpi8sqrt2) >> 16)
  *
  * The four terms are then combined into out0..out3 by BUTTERFLY_4.
  */
 #define VP8_IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3)  \
 {                                                                  \
     v4i32 a1_m, b1_m, c1_m, d1_m;                                  \
     v4i32 c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m;                  \
     v4i32 const_cospi8sqrt2minus1_m, sinpi8_sqrt2_m;               \
                                                                    \
     const_cospi8sqrt2minus1_m = __msa_fill_w(cospi8sqrt2minus1);   \
     sinpi8_sqrt2_m = __msa_fill_w(sinpi8sqrt2);                    \
     a1_m = in0 + in2;                                              \
     b1_m = in0 - in2;                                              \
     c_tmp1_m = (in1 * sinpi8_sqrt2_m) >> 16;                       \
     c_tmp2_m = in3 + ((in3 * const_cospi8sqrt2minus1_m) >> 16);    \
     c1_m = c_tmp1_m - c_tmp2_m;                                    \
     d_tmp1_m = in1 + ((in1 * const_cospi8sqrt2minus1_m) >> 16);    \
     d_tmp2_m = (in3 * sinpi8_sqrt2_m) >> 16;                       \
     d1_m = d_tmp1_m + d_tmp2_m;                                    \
     BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3);   \
 }
 /*
  * 4x4 inverse DCT of the 16 coefficients at `input`, added to a predictor
  * block and written to `dest`.
  *
  * The coefficients are loaded as two 8-halfword vectors, widened to 32-bit
  * lanes, run through VP8_IDCT_1D_W, transposed, run through it again,
  * shifted right by 3 with SRARI_W4_SW (presumably a rounding shift) and
  * transposed back.  Four rows read from `pred` (pred_stride apart) are
  * widened to 32 bits by interleaving with zero, summed with the residual and
  * clipped with CLIP_SW_0_255.
  *
  * NOTE(review): rows are loaded from and stored to `dest` as whole v16i8
  * vectors.  `mask` appears to select bytes 0, 4, 8, 12 of the clipped words
  * plus bytes 4..15 of the existing dest row (indices 20..31), so only four
  * pixels per row should change -- confirm against VSHF_B2_SB's operand
  * order in vp8_macros_msa.h.
  */
 static void idct4x4_addblk_msa(int16_t *input, uint8_t *pred,
                                int32_t pred_stride,
                                uint8_t *dest, int32_t dest_stride)
 {
     v8i16 input0, input1;
     v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
     v4i32 res0, res1, res2, res3;
     v16i8 zero = { 0 };
     v16i8 pred0, pred1, pred2, pred3, dest0, dest1, dest2, dest3;
     v16i8 mask = { 0, 4, 8, 12, 20, 21, 22, 23, 24,
                    25, 26, 27, 28, 29, 30, 31 };
     /* load 16 coefficients and widen them to 32-bit lanes */
     LD_SH2(input, 8, input0, input1);
     UNPCK_SH_SW(input0, in0, in1);
     UNPCK_SH_SW(input1, in2, in3);
     /* first 1-D pass, transpose, second 1-D pass */
     VP8_IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3);
     TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
     VP8_IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3);
     SRARI_W4_SW(vt0, vt1, vt2, vt3, 3);
     TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
     /* widen predictor bytes to words, add the residual, clip */
     LD_SB4(pred, pred_stride, pred0, pred1, pred2, pred3);
     ILVR_B4_SW(zero, pred0, zero, pred1, zero, pred2, zero, pred3, res0, res1,
                res2, res3);
     ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3, res0, res1,
                res2, res3);
     ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
     res0 = CLIP_SW_0_255(res0);
     res1 = CLIP_SW_0_255(res1);
     res2 = CLIP_SW_0_255(res2);
     res3 = CLIP_SW_0_255(res3);
     /* merge the result bytes into the existing dest rows and store */
     LD_SB4(dest, dest_stride, dest0, dest1, dest2, dest3);
     VSHF_B2_SB(res0, dest0, res1, dest1, mask, mask, dest0, dest1);
     VSHF_B2_SB(res2, dest2, res3, dest3, mask, mask, dest2, dest3);
     ST_SB4(dest0, dest1, dest2, dest3, dest, dest_stride);
 }
 /*
  * DC-only reconstruction: adds one constant to four predictor rows.
  *
  * `in_dc` is replicated into every halfword lane and shifted right by 3
  * with __msa_srari_h (a rounding shift).  Four rows read from `pred` are
  * widened to halfwords by interleaving with zero, the constant is added,
  * and the sums are clipped with CLIP_SH4_0_255.
  *
  * NOTE(review): rows are loaded from and stored to `dest` as whole v16i8
  * vectors.  `mask` appears to select bytes 0, 2, 4, 6 of the clipped
  * halfwords plus bytes 4..15 of the existing dest row (indices 20..31), so
  * only four pixels per row should change -- confirm against VSHF_B2_SB's
  * operand order in vp8_macros_msa.h.
  */
 static void idct4x4_addconst_msa(int16_t in_dc, uint8_t *pred,
                                  int32_t pred_stride,
                                  uint8_t *dest, int32_t dest_stride)
 {
     v8i16 vec;
     v8i16 res0, res1, res2, res3;
     v16i8 zero = { 0 };
     v16i8 pred0, pred1, pred2, pred3, dest0, dest1, dest2, dest3;
     v16i8 mask = { 0, 2, 4, 6, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 };
     /* splat the DC value and apply the final >> 3 up front */
     vec = __msa_fill_h(in_dc);
     vec = __msa_srari_h(vec, 3);
     LD_SB4(pred, pred_stride, pred0, pred1, pred2, pred3);
     ILVR_B4_SH(zero, pred0, zero, pred1, zero, pred2, zero, pred3, res0, res1,
                res2, res3);
     ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
     CLIP_SH4_0_255(res0, res1, res2, res3);
     /* merge the result bytes into the existing dest rows and store */
     LD_SB4(dest, dest_stride, dest0, dest1, dest2, dest3);
     VSHF_B2_SB(res0, dest0, res1, dest1, mask, mask, dest0, dest1);
     VSHF_B2_SB(res2, dest2, res3, dest3, mask, mask, dest2, dest3);
     ST_SB4(dest0, dest1, dest2, dest3, dest, dest_stride);
 }
 /*
  * 4x4 inverse Walsh transform (per the function name) of the 16 values at
  * `input`; each result is written to the first element of a separate group
  * of 16 in `mb_dq_coeff` (indices 0, 16, 32, ..., 240).
  *
  * The input is widened to 32-bit lanes, put through two BUTTERFLY_4 stages,
  * transposed, and put through two more BUTTERFLY_4 stages.  Each result then
  * has 3 added and is shifted right by 3 (SRA_4V).
  *
  * The vectors are reinterpreted as v8i16 and halfword lanes 0, 2, 4, 6 are
  * extracted, i.e. one halfword out of each 32-bit lane.  For each of those
  * lanes the values from vt0, vt1, vt2, vt3 go to four consecutive output
  * slots.
  * NOTE(review): the even halfword is the low half of each word only on a
  * little-endian target -- confirm that is the only configuration built.
  */
 void vp8_short_inv_walsh4x4_msa(int16_t *input, int16_t *mb_dq_coeff)
 {
     v8i16 input0, input1;
     v4i32 in0, in1, in2, in3, a1, b1, c1, d1;
     v4i32 hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
     LD_SH2(input, 8, input0, input1);
     UNPCK_SH_SW(input0, in0, in1);
     UNPCK_SH_SW(input1, in2, in3);
     /* first pass */
     BUTTERFLY_4(in0, in1, in2, in3, a1, b1, c1, d1);
     BUTTERFLY_4(a1, d1, c1, b1, hz0, hz1, hz3, hz2);
     TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
     /* second pass */
     BUTTERFLY_4(hz0, hz1, hz2, hz3, a1, b1, c1, d1);
     BUTTERFLY_4(a1, d1, c1, b1, vt0, vt1, vt3, vt2);
     /* (x + 3) >> 3 */
     ADD4(vt0, 3, vt1, 3, vt2, 3, vt3, 3, vt0, vt1, vt2, vt3);
     SRA_4V(vt0, vt1, vt2, vt3, 3);
     /* scatter: one value into slot 0 of each 16-element group */
     mb_dq_coeff[0] = __msa_copy_s_h((v8i16)vt0, 0);
     mb_dq_coeff[16] = __msa_copy_s_h((v8i16)vt1, 0);
     mb_dq_coeff[32] = __msa_copy_s_h((v8i16)vt2, 0);
     mb_dq_coeff[48] = __msa_copy_s_h((v8i16)vt3, 0);
     mb_dq_coeff[64] = __msa_copy_s_h((v8i16)vt0, 2);
     mb_dq_coeff[80] = __msa_copy_s_h((v8i16)vt1, 2);
     mb_dq_coeff[96] = __msa_copy_s_h((v8i16)vt2, 2);
     mb_dq_coeff[112] = __msa_copy_s_h((v8i16)vt3, 2);
     mb_dq_coeff[128] = __msa_copy_s_h((v8i16)vt0, 4);
     mb_dq_coeff[144] = __msa_copy_s_h((v8i16)vt1, 4);
     mb_dq_coeff[160] = __msa_copy_s_h((v8i16)vt2, 4);
     mb_dq_coeff[176] = __msa_copy_s_h((v8i16)vt3, 4);
     mb_dq_coeff[192] = __msa_copy_s_h((v8i16)vt0, 6);
     mb_dq_coeff[208] = __msa_copy_s_h((v8i16)vt1, 6);
     mb_dq_coeff[224] = __msa_copy_s_h((v8i16)vt2, 6);
     mb_dq_coeff[240] = __msa_copy_s_h((v8i16)vt3, 6);
 }
 /*
  * Dequantises one block of 16 coefficients, inverse-transforms it and adds
  * the result to the pixels already in `dest`.
  *
  * `input` and `dequant_input` are each loaded as two 8-halfword vectors and
  * multiplied lane by lane.  The products are split into four vectors using
  * 64-bit pack-even/pack-odd with zero, run through the 16-bit pass
  * VP8_IDCT_1D_H, widened to 32 bits, transposed, run through the 32-bit
  * pass VP8_IDCT_1D_W, shifted right by 3 with SRARI_W4_SW (presumably a
  * rounding shift) and transposed back.  Four rows of `dest` act as the
  * predictor: they are widened to words, summed with the residual, clipped
  * with CLIP_SW_0_255, merged back into the loaded rows and stored.
  *
  * This function does not clear `input`.
  *
  * NOTE(review): rows are loaded from and stored to `dest` as whole v16i8
  * vectors.  `mask` appears to select bytes 0, 4, 8, 12 of the clipped words
  * plus bytes 4..15 of the existing dest row -- confirm against VSHF_B2_SB's
  * operand order in vp8_macros_msa.h.
  */
 static void dequant_idct4x4_addblk_msa(int16_t *input, int16_t *dequant_input,
                                        uint8_t *dest, int32_t dest_stride)
 {
     v8i16 input0, input1, dequant_in0, dequant_in1, mul0, mul1;
     v8i16 in0, in1, in2, in3;
     v8i16 hz0_h, hz1_h, hz2_h, hz3_h;
     v16i8 dest0, dest1, dest2, dest3;
     v4i32 hz0_w, hz1_w, hz2_w, hz3_w;
     v4i32 vt0, vt1, vt2, vt3, res0, res1, res2, res3;
     v2i64 zero = { 0 };
     v16i8 mask = { 0, 4, 8, 12, 20, 21, 22, 23, 24,
                    25, 26, 27, 28, 29, 30, 31 };
     /* dequantise: coefficient * factor, lane by lane */
     LD_SH2(input, 8, input0, input1);
     LD_SH2(dequant_input, 8, dequant_in0, dequant_in1);
     MUL2(input0, dequant_in0, input1, dequant_in1, mul0, mul1);
     /* split the two product vectors into four 64-bit halves */
     PCKEV_D2_SH(zero, mul0, zero, mul1, in0, in2);
     PCKOD_D2_SH(zero, mul0, zero, mul1, in1, in3);
     /* first pass in 16 bits, second pass in 32 bits */
     VP8_IDCT_1D_H(in0, in1, in2, in3, hz0_h, hz1_h, hz2_h, hz3_h);
     PCKEV_D2_SH(hz1_h, hz0_h, hz3_h, hz2_h, mul0, mul1);
     UNPCK_SH_SW(mul0, hz0_w, hz1_w);
     UNPCK_SH_SW(mul1, hz2_w, hz3_w);
     TRANSPOSE4x4_SW_SW(hz0_w, hz1_w, hz2_w, hz3_w, hz0_w, hz1_w, hz2_w, hz3_w);
     VP8_IDCT_1D_W(hz0_w, hz1_w, hz2_w, hz3_w, vt0, vt1, vt2, vt3);
     SRARI_W4_SW(vt0, vt1, vt2, vt3, 3);
     TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
     /* dest doubles as the predictor */
     LD_SB4(dest, dest_stride, dest0, dest1, dest2, dest3);
     ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3, res0, res1,
                res2, res3);
     ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3, res0, res1,
                res2, res3);
     ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
     res0 = CLIP_SW_0_255(res0);
     res1 = CLIP_SW_0_255(res1);
     res2 = CLIP_SW_0_255(res2);
     res3 = CLIP_SW_0_255(res3);
     VSHF_B2_SB(res0, dest0, res1, dest1, mask, mask, dest0, dest1);
     VSHF_B2_SB(res2, dest2, res3, dest3, mask, mask, dest2, dest3);
     ST_SB4(dest0, dest1, dest2, dest3, dest, dest_stride);
 }
 /*
  * Dequantises and inverse-transforms two consecutive blocks of 16
  * coefficients (32 int16_t values at `input`), adds the residual to the
  * pixels already in `dest`, then zeroes all 64 bytes of `input`.
  *
  * The same 16 dequantisation factors (`dequant_input`) are applied to both
  * blocks.  The products are regrouped with 64-bit pack-even/pack-odd so that
  * each vector carries data from both blocks, run through the 16-bit pass
  * VP8_IDCT_1D_H, transposed, widened to 32 bits, run through the 32-bit pass
  * VP8_IDCT_1D_W (once per half), shifted right by 3 with SRARI_W4_SW
  * (presumably a rounding shift), packed back to halfwords and transposed
  * again.  Four rows of `dest` are widened to halfwords, summed with the
  * residual, clipped with CLIP_SH4_0_255, packed to bytes and stored.
  *
  * NOTE(review): rows are loaded from and stored to `dest` as whole v16u8
  * vectors; PCKOD_D2_UB appears to keep the upper 8 bytes of each existing
  * row and replace the lower 8 -- confirm operand order in
  * vp8_macros_msa.h.
  */
 static void dequant_idct4x4_addblk_2x_msa(int16_t *input,
                                           int16_t *dequant_input,
                                           uint8_t *dest, int32_t dest_stride)
 {
     v16u8 dest0, dest1, dest2, dest3;
     v8i16 in0, in1, in2, in3;
     v8i16 mul0, mul1, mul2, mul3, dequant_in0, dequant_in1;
     v8i16 hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
     v8i16 res0, res1, res2, res3;
     v4i32 hz0l, hz1l, hz2l, hz3l, hz0r, hz1r, hz2r, hz3r;
     v4i32 vt0l, vt1l, vt2l, vt3l, vt0r, vt1r, vt2r, vt3r;
     v16i8 zero = { 0 };
     /* dequantise both blocks with the same factor table */
     LD_SH4(input, 8, in0, in1, in2, in3);
     LD_SH2(dequant_input, 8, dequant_in0, dequant_in1);
     MUL4(in0, dequant_in0, in1, dequant_in1, in2, dequant_in0, in3, dequant_in1,
          mul0, mul1, mul2, mul3);
     /* pair up matching 64-bit halves of the two blocks */
     PCKEV_D2_SH(mul2, mul0, mul3, mul1, in0, in2);
     PCKOD_D2_SH(mul2, mul0, mul3, mul1, in1, in3);
     /* first pass in 16 bits */
     VP8_IDCT_1D_H(in0, in1, in2, in3, hz0, hz1, hz2, hz3);
     TRANSPOSE_TWO_4x4_H(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
     /* second pass in 32 bits, one vector half at a time */
     UNPCK_SH_SW(hz0, hz0r, hz0l);
     UNPCK_SH_SW(hz1, hz1r, hz1l);
     UNPCK_SH_SW(hz2, hz2r, hz2l);
     UNPCK_SH_SW(hz3, hz3r, hz3l);
     VP8_IDCT_1D_W(hz0l, hz1l, hz2l, hz3l, vt0l, vt1l, vt2l, vt3l);
     SRARI_W4_SW(vt0l, vt1l, vt2l, vt3l, 3);
     VP8_IDCT_1D_W(hz0r, hz1r, hz2r, hz3r, vt0r, vt1r, vt2r, vt3r);
     SRARI_W4_SW(vt0r, vt1r, vt2r, vt3r, 3);
     PCKEV_H4_SH(vt0l, vt0r, vt1l, vt1r, vt2l, vt2r, vt3l, vt3r, vt0, vt1, vt2,
                 vt3);
     TRANSPOSE_TWO_4x4_H(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
     /* dest doubles as the predictor */
     LD_UB4(dest, dest_stride, dest0, dest1, dest2, dest3);
     ILVR_B4_SH(zero, dest0, zero, dest1, zero, dest2, zero, dest3, res0, res1,
                res2, res3);
     ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
     CLIP_SH4_0_255(res0, res1, res2, res3);
     PCKEV_B4_SH(res0, res0, res1, res1, res2, res2, res3, res3, res0, res1,
                 res2, res3);
     PCKOD_D2_UB(dest0, res0, dest1, res1, dest0, dest1);
     PCKOD_D2_UB(dest2, res2, dest3, res3, dest2, dest3);
     ST_UB4(dest0, dest1, dest2, dest3, dest, dest_stride);
     /*
      * Zero the 32 coefficients just consumed.  The "memory" clobber tells
      * the compiler that this asm writes memory it cannot see through the
      * operands; without it the loads from `input` above could legally be
      * scheduled after these stores, and later reads of the buffer could be
      * satisfied from stale values.
      */
     __asm__ __volatile__(
         "sw   $zero,    0(%[input])  \n\t"
         "sw   $zero,    4(%[input])  \n\t"
         "sw   $zero,    8(%[input])  \n\t"
         "sw   $zero,   12(%[input])  \n\t"
         "sw   $zero,   16(%[input])  \n\t"
         "sw   $zero,   20(%[input])  \n\t"
         "sw   $zero,   24(%[input])  \n\t"
         "sw   $zero,   28(%[input])  \n\t"
         "sw   $zero,   32(%[input])  \n\t"
         "sw   $zero,   36(%[input])  \n\t"
         "sw   $zero,   40(%[input])  \n\t"
         "sw   $zero,   44(%[input])  \n\t"
         "sw   $zero,   48(%[input])  \n\t"
         "sw   $zero,   52(%[input])  \n\t"
         "sw   $zero,   56(%[input])  \n\t"
         "sw   $zero,   60(%[input])  \n\t"
         :
         : [input] "r"(input)
         : "memory"
     );
 }
 /*
  * DC-only reconstruction for two consecutive blocks.
  *
  * Only input[0] and input[16] are read; both are multiplied by
  * dequant_input[0], replicated across a vector and shifted right by 3 with
  * SRARI_H2_SH (presumably a rounding shift).  The two constants are combined
  * into one vector with __msa_pckev_d so each 64-bit half carries one
  * block's value.  input[0] and input[16] are then cleared.
  *
  * Four rows of `dest` are widened to halfwords, the constant vector is
  * added, and the sums are clipped with CLIP_SH4_0_255, packed to bytes and
  * stored.
  *
  * NOTE(review): rows are loaded from and stored to `dest` as whole v16u8
  * vectors; PCKOD_D2_UB appears to keep the upper 8 bytes of each existing
  * row and replace the lower 8 -- confirm operand order in
  * vp8_macros_msa.h.
  */
 static void dequant_idct_addconst_2x_msa(int16_t *input, int16_t *dequant_input,
                                          uint8_t *dest, int32_t dest_stride)
 {
     v8i16 input_dc0, input_dc1, vec;
     v16u8 dest0, dest1, dest2, dest3;
     v16i8 zero = { 0 };
     v8i16 res0, res1, res2, res3;
     /* dequantised DC of each block, splatted */
     input_dc0 = __msa_fill_h(input[0] * dequant_input[0]);
     input_dc1 = __msa_fill_h(input[16] * dequant_input[0]);
     SRARI_H2_SH(input_dc0, input_dc1, 3);
     vec = (v8i16)__msa_pckev_d((v2i64)input_dc1, (v2i64)input_dc0);
     /* only the DC coefficients are cleared here */
     input[0] = 0;
     input[16] = 0;
     LD_UB4(dest, dest_stride, dest0, dest1, dest2, dest3);
     ILVR_B4_SH(zero, dest0, zero, dest1, zero, dest2, zero, dest3, res0,
                res1, res2, res3);
     ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
     CLIP_SH4_0_255(res0, res1, res2, res3);
     PCKEV_B4_SH(res0, res0, res1, res1, res2, res2, res3, res3, res0, res1,
                 res2, res3);
     PCKOD_D2_UB(dest0, res0, dest1, res1, dest0, dest1);
     PCKOD_D2_UB(dest2, res2, dest3, res3, dest2, dest3);
     ST_UB4(dest0, dest1, dest2, dest3, dest, dest_stride);
 }
 /* Externally visible entry point; forwards unchanged to idct4x4_addblk_msa(). */
 void vp8_short_idct4x4llm_msa(int16_t *input, uint8_t *pred_ptr,
                               int32_t pred_stride, uint8_t *dst_ptr,
                               int32_t dst_stride)
 {
     idct4x4_addblk_msa(input, pred_ptr, pred_stride, dst_ptr, dst_stride);
 }
 /* Externally visible entry point; forwards unchanged to idct4x4_addconst_msa(). */
 void vp8_dc_only_idct_add_msa(int16_t input_dc, uint8_t *pred_ptr,
                               int32_t pred_stride, uint8_t *dst_ptr,
                               int32_t dst_stride)
 {
     idct4x4_addconst_msa(input_dc, pred_ptr, pred_stride, dst_ptr, dst_stride);
 }
 /*
  * Dequantises one block: multiplies the values at d->qcoeff lane by lane
  * with the factors at DQC and stores the products to d->dqcoeff.  Both
  * inputs are read, and the output written, as two v8i16 vectors.
  */
 void vp8_dequantize_b_msa(BLOCKD *d, int16_t *DQC)
 {
     v8i16 factor_lo, factor_hi;
     v8i16 coeff_lo, coeff_hi;
     v8i16 prod_lo, prod_hi;

     LD_SH2(DQC, 8, factor_lo, factor_hi);
     LD_SH2(d->qcoeff, 8, coeff_lo, coeff_hi);

     MUL2(factor_lo, coeff_lo, factor_hi, coeff_hi, prod_lo, prod_hi);

     ST_SH2(prod_lo, prod_hi, d->dqcoeff, 8);
 }
 /*
  * Dequantises and inverse-transforms one block of 16 coefficients via
  * dequant_idct4x4_addblk_msa(), adding the result to `dest`, then zeroes
  * the 32 bytes of `input` that were consumed.
  */
 void vp8_dequant_idct_add_msa(int16_t *input, int16_t *dq,
                               uint8_t *dest, int32_t stride)
 {
     dequant_idct4x4_addblk_msa(input, dq, dest, stride);
     /*
      * The "memory" clobber tells the compiler that this asm writes memory
      * it cannot see through the operands.  Without it, once the static
      * helper above is inlined, its loads from `input` could legally be
      * scheduled after these stores, and later reads of the buffer could be
      * satisfied from stale values.
      */
     __asm__ __volatile__ (
         "sw     $zero,    0(%[input])     \n\t"
         "sw     $zero,    4(%[input])     \n\t"
         "sw     $zero,    8(%[input])     \n\t"
         "sw     $zero,   12(%[input])     \n\t"
         "sw     $zero,   16(%[input])     \n\t"
         "sw     $zero,   20(%[input])     \n\t"
         "sw     $zero,   24(%[input])     \n\t"
         "sw     $zero,   28(%[input])     \n\t"
         :
         : [input] "r" (input)
         : "memory"
     );
 }
 /*
  * Walks a 4-row by 2-column grid of block pairs.  Each pair consumes 32
  * coefficients from `q`; the right-hand pair of a row is written 8 bytes to
  * the right of the left-hand one, and successive rows are 4 * stride apart
  * in `dst`.
  *
  * `eobs` is read two bytes at a time through an int16_t pointer, so one
  * 16-bit value carries the end-of-block counts of both blocks in a pair:
  *   - zero                         : nothing to do for this pair;
  *   - any bit set under 0xfefe     : at least one count exceeds 1, so the
  *                                    full transform is run;
  *   - otherwise (counts are 0 or 1): DC-only reconstruction.
  */
 void vp8_dequant_idct_add_y_block_msa(int16_t *q, int16_t *dq,
                                       uint8_t *dst, int32_t stride,
                                       char *eobs)
 {
     int16_t *eob_pairs = (int16_t *)eobs;
     int32_t row, half;

     for (row = 0; row < 4; ++row)
     {
         for (half = 0; half < 2; ++half)
         {
             const int16_t pair = eob_pairs[half];
             uint8_t *out = dst + (8 * half);

             if (pair & 0xfefe)
             {
                 dequant_idct4x4_addblk_2x_msa(q, dq, out, stride);
             }
             else if (pair)
             {
                 dequant_idct_addconst_2x_msa(q, dq, out, stride);
             }

             q += 32;
         }

         dst += (4 * stride);
         eob_pairs += 2;
     }
 }
 /*
  * Processes four block pairs of 32 coefficients each from `q`: the first
  * two go to `dstu`, the last two to `dstv`.  Within each destination the
  * second pair is written 4 * stride below the first.
  *
  * `eobs` is read two bytes at a time through an int16_t pointer, so one
  * 16-bit value carries the end-of-block counts of both blocks in a pair:
  *   - zero                         : nothing to do for this pair;
  *   - any bit set under 0xfefe     : at least one count exceeds 1, so the
  *                                    full transform is run;
  *   - otherwise (counts are 0 or 1): DC-only reconstruction.
  */
 void vp8_dequant_idct_add_uv_block_msa(int16_t *q, int16_t *dq,
                                        uint8_t *dstu, uint8_t *dstv,
                                        int32_t stride, char *eobs)
 {
     int16_t *eob_pairs = (int16_t *)eobs;
     int32_t idx;

     for (idx = 0; idx < 4; ++idx)
     {
         const int16_t pair = eob_pairs[idx];
         uint8_t *plane = (idx < 2) ? dstu : dstv;
         /* odd-numbered pairs are the lower half of their plane */
         uint8_t *out = (idx & 1) ? plane + (stride * 4) : plane;

         if (pair & 0xfefe)
         {
             dequant_idct4x4_addblk_2x_msa(q, dq, out, stride);
         }
         else if (pair)
         {
             dequant_idct_addconst_2x_msa(q, dq, out, stride);
         }

         q += 32;
     }
 }
--- a/vp8/common/mips/msa/loopfilter_filters_msa.c
+++ b/vp8/common/mips/msa/loopfilter_filters_msa.c
@@ -1,826 +0,0 @@
 /*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "./vp8_rtcd.h"
 #include "vp8/common/loopfilter.h"
 #include "vp8/common/mips/msa/vp8_macros_msa.h"
 /*
  * Per-byte filter mask for the simple loop filter.
  *
  * Computes, with unsigned saturating adds,
  *     2 * |p0 - q0| + (|p1 - q1| >> 1)
  * and sets `mask` to the result of the vector comparison `<= b_limit`.
  * p1, p0, q0, q1 and b_limit are v16u8 operands; `mask` is written.
  */
 #define VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask)         \
 {                                                              \
     v16u8 p1_a_sub_q1, p0_a_sub_q0;                            \
                                                                \
     p0_a_sub_q0 = __msa_asub_u_b(p0, q0);                      \
     p1_a_sub_q1 = __msa_asub_u_b(p1, q1);                      \
     p1_a_sub_q1 = (v16u8)__msa_srli_b((v16i8)p1_a_sub_q1, 1);  \
     p0_a_sub_q0 = __msa_adds_u_b(p0_a_sub_q0, p0_a_sub_q0);    \
     mask = __msa_adds_u_b(p0_a_sub_q0, p1_a_sub_q1);           \
     mask = ((v16u8)mask <= b_limit);                           \
 }
 /*
  * Loop filter that updates four pixels (p1, p0, q0, q1) per byte lane, in
  * place.
  *
  * Steps, all per byte lane:
  *   1. The four pixels are converted to signed form by XOR with 0x80.
  *   2. filt = sat(p1 - q1) & hev_in, then 3 * (q0 - p0) is added in 16-bit
  *      precision and the sum is saturated back to a signed byte
  *      (__msa_sat_s_h(.., 7)); the result is ANDed with mask_in.
  *   3. filt1 = sat(filt + 4) >> 3 is subtracted from q0;
  *      filt2 = sat(filt + 3) >> 3 is added to p0.
  *   4. (filt1 + 1) >> 1 (__msa_srari_b), ANDed with the complement of
  *      hev_in, is subtracted from q1 and added to p1.
  *   5. Each pixel is converted back by XOR with 0x80.
  *
  * Side effect: hev_in is overwritten with its bitwise complement.
  *
  * NOTE(review): the 3 * (q0 - p0) term is formed by duplicating each byte
  * (ilvr_b/ilvl_b of the value with itself) and taking a signed dot product
  * with cnst3h reinterpreted as bytes; presumably only the low byte of each
  * halfword of cnst3h (the 3) contributes -- confirm on the target.
  */
 #define VP8_LPF_FILTER4_4W(p1_in_out, p0_in_out, q0_in_out, q1_in_out,  \
                            mask_in, hev_in)                             \
 {                                                                       \
     v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign;                 \
     v16i8 filt, filt1, filt2, cnst4b, cnst3b;                           \
     v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h;             \
                                                                         \
     p1_m = (v16i8)__msa_xori_b(p1_in_out, 0x80);                        \
     p0_m = (v16i8)__msa_xori_b(p0_in_out, 0x80);                        \
     q0_m = (v16i8)__msa_xori_b(q0_in_out, 0x80);                        \
     q1_m = (v16i8)__msa_xori_b(q1_in_out, 0x80);                        \
                                                                         \
     filt = __msa_subs_s_b(p1_m, q1_m);                                  \
                                                                         \
     filt = filt & (v16i8)hev_in;                                        \
                                                                         \
     q0_sub_p0 = q0_m - p0_m;                                            \
     filt_sign = __msa_clti_s_b(filt, 0);                                \
                                                                         \
     cnst3h = __msa_ldi_h(3);                                            \
     q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0);            \
     q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h);    \
     filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt);                      \
     filt_r += q0_sub_p0_r;                                              \
     filt_r = __msa_sat_s_h(filt_r, 7);                                  \
                                                                         \
     q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0);            \
     q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h);    \
     filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt);                      \
     filt_l += q0_sub_p0_l;                                              \
     filt_l = __msa_sat_s_h(filt_l, 7);                                  \
                                                                         \
     filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r);                 \
     filt = filt & (v16i8)mask_in;                                       \
                                                                         \
     cnst4b = __msa_ldi_b(4);                                            \
     filt1 = __msa_adds_s_b(filt, cnst4b);                               \
     filt1 >>= 3;                                                        \
                                                                         \
     cnst3b = __msa_ldi_b(3);                                            \
     filt2 = __msa_adds_s_b(filt, cnst3b);                               \
     filt2 >>= 3;                                                        \
                                                                         \
     q0_m = __msa_subs_s_b(q0_m, filt1);                                 \
     q0_in_out = __msa_xori_b((v16u8)q0_m, 0x80);                        \
     p0_m = __msa_adds_s_b(p0_m, filt2);                                 \
     p0_in_out = __msa_xori_b((v16u8)p0_m, 0x80);                        \
                                                                         \
     filt = __msa_srari_b(filt1, 1);                                     \
     hev_in = __msa_xori_b((v16u8)hev_in, 0xff);                         \
     filt = filt & (v16i8)hev_in;                                        \
                                                                         \
     q1_m = __msa_subs_s_b(q1_m, filt);                                  \
     q1_in_out = __msa_xori_b((v16u8)q1_m, 0x80);                        \
     p1_m = __msa_adds_s_b(p1_m, filt);                                  \
     p1_in_out = __msa_xori_b((v16u8)p1_m, 0x80);                        \
 }
 /*
  * Simple loop filter: updates p0_in and q0_in in place; p1_in and q1_in are
  * only read.
  *
  * Steps, all per byte lane:
  *   1. The four pixels are converted to signed form by XOR with 0x80.
  *   2. filt = sat(p1 - q1); it and (q0 - p0) are sign-extended to 16 bits
  *      (interleaving with a "less than zero" mask), 3 * (q0 - p0) is added,
  *      and the sum is saturated back to a signed byte
  *      (__msa_sat_s_h(.., 7)); the result is ANDed with `mask`.
  *   3. filt1 = sat(filt + 4) >> 3 is subtracted from q0;
  *      filt2 = sat(filt + 3) >> 3 is added to p0.
  *   4. p0 and q0 are converted back by XOR with 0x80.
  */
 #define VP8_SIMPLE_FILT(p1_in, p0_in, q0_in, q1_in, mask)          \
 {                                                                  \
     v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, q0_sub_p0_sign;       \
     v16i8 filt, filt1, filt2, cnst4b, cnst3b, filt_sign;           \
     v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h;        \
                                                                    \
     p1_m = (v16i8)__msa_xori_b(p1_in, 0x80);                       \
     p0_m = (v16i8)__msa_xori_b(p0_in, 0x80);                       \
     q0_m = (v16i8)__msa_xori_b(q0_in, 0x80);                       \
     q1_m = (v16i8)__msa_xori_b(q1_in, 0x80);                       \
                                                                    \
     filt = __msa_subs_s_b(p1_m, q1_m);                             \
                                                                    \
     q0_sub_p0 = q0_m - p0_m;                                       \
     filt_sign = __msa_clti_s_b(filt, 0);                           \
                                                                    \
     cnst3h = __msa_ldi_h(3);                                       \
     q0_sub_p0_sign = __msa_clti_s_b(q0_sub_p0, 0);                 \
     q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0_sign, q0_sub_p0);  \
     q0_sub_p0_r *= cnst3h;                                         \
     filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt);                 \
     filt_r += q0_sub_p0_r;                                         \
     filt_r = __msa_sat_s_h(filt_r, 7);                             \
                                                                    \
     q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0_sign, q0_sub_p0);  \
     q0_sub_p0_l *= cnst3h;                                         \
     filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt);                 \
     filt_l += q0_sub_p0_l;                                         \
     filt_l = __msa_sat_s_h(filt_l, 7);                             \
                                                                    \
     filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r);            \
     filt = filt & (v16i8)(mask);                                   \
                                                                    \
     cnst4b = __msa_ldi_b(4);                                       \
     filt1 = __msa_adds_s_b(filt, cnst4b);                          \
     filt1 >>= 3;                                                   \
                                                                    \
     cnst3b = __msa_ldi_b(3);                                       \
     filt2 = __msa_adds_s_b(filt, cnst3b);                          \
     filt2 >>= 3;                                                   \
                                                                    \
     q0_m = __msa_subs_s_b(q0_m, filt1);                            \
     p0_m = __msa_adds_s_b(p0_m, filt2);                            \
     q0_in = __msa_xori_b((v16u8)q0_m, 0x80);                       \
     p0_in = __msa_xori_b((v16u8)p0_m, 0x80);                       \
 }
 /*
  * Loop filter that updates six pixels (p2, p1, p0, q0, q1, q2) per byte
  * lane, in place.
  *
  * Steps, all per byte lane:
  *   1. The six pixels are converted to signed form by XOR with 0x80.
  *   2. filt = sat8(sat(p1 - q1) + 3 * (q0 - p0)) & mask, with the sum formed
  *      in 16-bit precision and saturated by __msa_sat_s_h(.., 7).
  *   3. Lanes selected by hev: with f = filt & hev,
  *      q0 -= sat(f + 4) >> 3 and p0 += sat(f + 3) >> 3.
  *   4. Lanes not selected by hev: with w = filt & ~hev (sign-extended to 16
  *      bits), three taps are applied, each computed as
  *      sat8((w * k + 63) >> 7):
  *          k = 27 -> subtracted from q0, added to p0
  *          k = 18 -> subtracted from q1, added to p1
  *          k =  9 -> subtracted from q2, added to p2   ((w << 3) + w)
  *   5. Each pixel is converted back by XOR with 0x80.
  *
  * Side effect: hev is overwritten with its bitwise complement.
  */
 #define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev)            \
 {                                                                  \
     v16i8 p2_m, p1_m, p0_m, q2_m, q1_m, q0_m;                      \
     v16i8 filt, q0_sub_p0, cnst4b, cnst3b;                         \
     v16i8 u, filt1, filt2, filt_sign, q0_sub_p0_sign;              \
     v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_r, u_r, u_l, filt_l;      \
     v8i16 cnst3h, cnst27h, cnst18h, cnst63h;                       \
                                                                    \
     cnst3h = __msa_ldi_h(3);                                       \
                                                                    \
     p2_m = (v16i8)__msa_xori_b(p2, 0x80);                          \
     p1_m = (v16i8)__msa_xori_b(p1, 0x80);                          \
     p0_m = (v16i8)__msa_xori_b(p0, 0x80);                          \
     q0_m = (v16i8)__msa_xori_b(q0, 0x80);                          \
     q1_m = (v16i8)__msa_xori_b(q1, 0x80);                          \
     q2_m = (v16i8)__msa_xori_b(q2, 0x80);                          \
                                                                    \
     filt = __msa_subs_s_b(p1_m, q1_m);                             \
     q0_sub_p0 = q0_m - p0_m;                                       \
     q0_sub_p0_sign = __msa_clti_s_b(q0_sub_p0, 0);                 \
     filt_sign = __msa_clti_s_b(filt, 0);                           \
                                                                    \
     q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0_sign, q0_sub_p0);  \
     q0_sub_p0_r *= cnst3h;                                         \
     filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt);                 \
     filt_r = filt_r + q0_sub_p0_r;                                 \
     filt_r = __msa_sat_s_h(filt_r, 7);                             \
                                                                    \
     q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0_sign, q0_sub_p0);  \
     q0_sub_p0_l *= cnst3h;                                         \
     filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt);                 \
     filt_l = filt_l + q0_sub_p0_l;                                 \
     filt_l = __msa_sat_s_h(filt_l, 7);                             \
                                                                    \
     filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r);            \
     filt = filt & (v16i8)mask;                                     \
     filt2 = filt & (v16i8)hev;                                     \
                                                                    \
     hev = __msa_xori_b(hev, 0xff);                                 \
     filt = filt & (v16i8)hev;                                      \
     cnst4b = __msa_ldi_b(4);                                       \
     filt1 = __msa_adds_s_b(filt2, cnst4b);                         \
     filt1 >>= 3;                                                   \
     cnst3b = __msa_ldi_b(3);                                       \
     filt2 = __msa_adds_s_b(filt2, cnst3b);                         \
     filt2 >>= 3;                                                   \
     q0_m = __msa_subs_s_b(q0_m, filt1);                            \
     p0_m = __msa_adds_s_b(p0_m, filt2);                            \
                                                                    \
     filt_sign = __msa_clti_s_b(filt, 0);                           \
     ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l);                  \
                                                                    \
     cnst27h = __msa_ldi_h(27);                                     \
     cnst63h = __msa_ldi_h(63);                                     \
                                                                    \
     u_r = filt_r * cnst27h;                                        \
     u_r += cnst63h;                                                \
     u_r >>= 7;                                                     \
     u_r = __msa_sat_s_h(u_r, 7);                                   \
     u_l = filt_l * cnst27h;                                        \
     u_l += cnst63h;                                                \
     u_l >>= 7;                                                     \
     u_l = __msa_sat_s_h(u_l, 7);                                   \
     u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r);                     \
     q0_m = __msa_subs_s_b(q0_m, u);                                \
     q0 = __msa_xori_b((v16u8)q0_m, 0x80);                          \
     p0_m = __msa_adds_s_b(p0_m, u);                                \
     p0 = __msa_xori_b((v16u8)p0_m, 0x80);                          \
     cnst18h = __msa_ldi_h(18);                                     \
     u_r = filt_r * cnst18h;                                        \
     u_r += cnst63h;                                                \
     u_r >>= 7;                                                     \
     u_r = __msa_sat_s_h(u_r, 7);                                   \
                                                                    \
     u_l = filt_l * cnst18h;                                        \
     u_l += cnst63h;                                                \
     u_l >>= 7;                                                     \
     u_l = __msa_sat_s_h(u_l, 7);                                   \
     u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r);                     \
     q1_m = __msa_subs_s_b(q1_m, u);                                \
     q1 = __msa_xori_b((v16u8)q1_m, 0x80);                          \
     p1_m = __msa_adds_s_b(p1_m, u);                                \
     p1 = __msa_xori_b((v16u8)p1_m, 0x80);                          \
     u_r = filt_r << 3;                                             \
     u_r += filt_r + cnst63h;                                       \
     u_r >>= 7;                                                     \
     u_r = __msa_sat_s_h(u_r, 7);                                   \
                                                                    \
     u_l = filt_l << 3;                                             \
     u_l += filt_l + cnst63h;                                       \
     u_l >>= 7;                                                     \
     u_l = __msa_sat_s_h(u_l, 7);                                   \
     u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r);                     \
     q2_m = __msa_subs_s_b(q2_m, u);                                \
     q2 = __msa_xori_b((v16u8)q2_m, 0x80);                          \
     p2_m = __msa_adds_s_b(p2_m, u);                                \
     p2 = __msa_xori_b((v16u8)p2_m, 0x80);                          \
 }
 /* LPF_MASK_HEV: derive the per-lane loop-filter control vectors from the
  * eight pixel vectors p3..p0 / q0..q3 on either side of an edge.
  *
  *   flat_out - receives max(|p1 - p0|, |q1 - q0|); despite the name it is
  *              only an intermediate value here, not a boolean flag.
  *   hev_out  - lane-wise result of (thresh_in < flat_out).
  *   mask_out - built in three steps:
  *                1. compare b_limit_in against 2 * |p0 - q0| + (|p1 - q1| >> 1),
  *                   where both additions use the unsigned adds_u_b intrinsic
  *                   (presumably saturating);
  *                2. take the running unsigned maximum of that comparison
  *                   result, flat_out and the neighbour differences
  *                   |p3 - p2|, |p2 - p1|, |q2 - q1|, |q3 - q2|;
  *                3. compare limit_in against that maximum and invert the
  *                   result with xor 0xff.
  *
  * All *_in arguments are only read.  The three *_out arguments must be
  * assignable v16u8 lvalues.
  */
 #define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in,                   \
                      q0_in, q1_in, q2_in, q3_in,                   \
                      limit_in, b_limit_in, thresh_in,              \
                      hev_out, mask_out, flat_out)                  \
 {                                                                  \
    v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;  \
    v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;  \
                                                                    \
    p3_asub_p2_m = __msa_asub_u_b((p3_in), (p2_in));               \
    p2_asub_p1_m = __msa_asub_u_b((p2_in), (p1_in));               \
    p1_asub_p0_m = __msa_asub_u_b((p1_in), (p0_in));               \
    q1_asub_q0_m = __msa_asub_u_b((q1_in), (q0_in));               \
    q2_asub_q1_m = __msa_asub_u_b((q2_in), (q1_in));               \
    q3_asub_q2_m = __msa_asub_u_b((q3_in), (q2_in));               \
    p0_asub_q0_m = __msa_asub_u_b((p0_in), (q0_in));               \
    p1_asub_q1_m = __msa_asub_u_b((p1_in), (q1_in));               \
    flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m);          \
    hev_out = (thresh_in) < (v16u8)flat_out;                       \
    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m);     \
    p1_asub_q1_m >>= 1;                                            \
    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m);     \
    mask_out = (b_limit_in) < p0_asub_q0_m;                        \
    mask_out = __msa_max_u_b(flat_out, mask_out);                  \
    p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m);      \
    mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out);              \
    q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m);      \
    mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out);              \
    mask_out = (limit_in) < (v16u8)mask_out;                       \
    mask_out = __msa_xori_b(mask_out, 0xff);                       \
 }
 /* VP8_ST6x1_UB: store one 32-bit element of in0 and one 16-bit element of
  * in1 to memory.
  *
  *   in0, in0_idx - vector (viewed as v4i32) and the word lane to extract.
  *   in1, in1_idx - vector (viewed as v8i16) and the halfword lane to extract.
  *   pdst         - destination of the 4-byte store (via SW).
  *   stride       - byte offset from pdst for the 2-byte store (via SH).
  *
  * Every caller in this file passes stride == 4, so the two stores form six
  * consecutive bytes.
  * NOTE(review): pdst and stride are expanded without parentheses; callers
  * here pass plain identifiers or literals.
  */
 #define VP8_ST6x1_UB(in0, in0_idx, in1, in1_idx, pdst, stride)  \
 {                                                               \
    uint16_t tmp0_h;                                            \
    uint32_t tmp0_w;                                            \
                                                                 \
    tmp0_w = __msa_copy_u_w((v4i32)in0, in0_idx);               \
    tmp0_h = __msa_copy_u_h((v8i16)in1, in1_idx);               \
    SW(tmp0_w, pdst);                                           \
    SH(tmp0_h, pdst + stride);                                  \
 }
 /* Filter one horizontal edge with the 4-tap filter using two independent
  * sets of limits.  Eight rows are read starting four rows above src; the
  * four rows p1, p0, q0, q1 around the edge are written back.
  * Limit set 0 and limit set 1 are merged into a single vector each via
  * ilvr_d before the mask is computed.
  */
 static void loop_filter_horizontal_4_dual_msa(uint8_t *src, int32_t pitch,
                                               const uint8_t *b_limit0_ptr,
                                               const uint8_t *limit0_ptr,
                                               const uint8_t *thresh0_ptr,
                                               const uint8_t *b_limit1_ptr,
                                               const uint8_t *limit1_ptr,
                                               const uint8_t *thresh1_ptr)
 {
    uint8_t *const first_row = src - 4 * pitch;
    v16u8 p3, p2, p1, p0, q0, q1, q2, q3;
    v16u8 hev, mask, flat;
    v16u8 set0_fill, set1_fill;
    v16u8 b_limit, limit, thresh;
    /* edge limit: set 0 and set 1 combined into one vector */
    set0_fill = (v16u8)__msa_fill_b(*b_limit0_ptr);
    set1_fill = (v16u8)__msa_fill_b(*b_limit1_ptr);
    b_limit = (v16u8)__msa_ilvr_d((v2i64)set1_fill, (v2i64)set0_fill);
    /* interior limit */
    set0_fill = (v16u8)__msa_fill_b(*limit0_ptr);
    set1_fill = (v16u8)__msa_fill_b(*limit1_ptr);
    limit = (v16u8)__msa_ilvr_d((v2i64)set1_fill, (v2i64)set0_fill);
    /* high edge variance threshold */
    set0_fill = (v16u8)__msa_fill_b(*thresh0_ptr);
    set1_fill = (v16u8)__msa_fill_b(*thresh1_ptr);
    thresh = (v16u8)__msa_ilvr_d((v2i64)set1_fill, (v2i64)set0_fill);
    LD_UB8(first_row, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
    /* p1 lives two rows below the first loaded row, i.e. at src - 2 * pitch */
    ST_UB4(p1, p0, q0, q1, (first_row + 2 * pitch), pitch);
 }
 /* Filter one vertical edge with the 4-tap filter using two independent sets
  * of limits.
  *
  *   src   - points at the first pixel to the right of the edge (q0 column)
  *           of the first row; rows are read starting at src - 4.
  *   pitch - byte distance between consecutive rows.
  *   *0_ptr / *1_ptr - the two sets of edge limit, interior limit and
  *           threshold; each is read as a single byte.
  *
  * Sixteen rows are loaded, transposed so that each of p3..q3 holds one pixel
  * column, filtered, and the four modified columns are transposed back and
  * stored starting at src - 2.
  */
 static void loop_filter_vertical_4_dual_msa(uint8_t *src, int32_t pitch,
                                             const uint8_t *b_limit0_ptr,
                                             const uint8_t *limit0_ptr,
                                             const uint8_t *thresh0_ptr,
                                             const uint8_t *b_limit1_ptr,
                                             const uint8_t *limit1_ptr,
                                             const uint8_t *thresh1_ptr)
 {
    v16u8 mask, hev, flat;
    v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
    v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    /* rows 0..7 and 8..15, each starting four pixels left of the edge */
    LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
    LD_UB8(src - 4 + (8 * pitch), pitch,
           row8, row9, row10, row11, row12, row13, row14, row15);
    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                        row8, row9, row10, row11, row12, row13, row14, row15,
                        p3, p2, p1, p0, q0, q1, q2, q3);
    /* Broadcast each limit byte, then merge set 0 and set 1 into one vector
       (presumably set 0 in the low doubleword and set 1 in the high one). */
    thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
    thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
    thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);
    b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
    b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
    b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);
    limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
    limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
    limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
                 hev, mask, flat);
    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
    /* Re-interleave p1, p0, q0, q1 bytes and halfwords: tmp2/tmp3 come from
       the right halves and tmp4/tmp5 from the left halves of the vectors. */
    ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
    ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
    ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
    ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
    /* the p1 column sits two pixels left of the edge */
    src -= 2;
    ST4x8_UB(tmp2, tmp3, src, pitch);
    src += (8 * pitch);
    ST4x8_UB(tmp4, tmp5, src, pitch);
 }
 static void mbloop_filter_horizontal_edge_y_msa(uint8_t *src, int32_t pitch,
                                                const uint8_t b_limit_in,
                                                const uint8_t limit_in,
                                                const uint8_t thresh_in)
 {
    uint8_t *temp_src;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 mask, hev, flat, thresh, limit, b_limit;
    b_limit = (v16u8)__msa_fill_b(b_limit_in);
    limit = (v16u8)__msa_fill_b(limit_in);
    thresh = (v16u8)__msa_fill_b(thresh_in);
    temp_src = src - (pitch << 2);
    LD_UB8(temp_src, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
    temp_src = src - 3 * pitch;
    ST_UB4(p2, p1, p0, q0, temp_src, pitch);
    temp_src += (4 * pitch);
    ST_UB2(q1, q2, temp_src, pitch);
 }
 static void mbloop_filter_horizontal_edge_uv_msa(uint8_t *src_u, uint8_t *src_v,
                                                 int32_t pitch,
                                                 const uint8_t b_limit_in,
                                                 const uint8_t limit_in,
                                                 const uint8_t thresh_in)
 {
    uint8_t *temp_src;
    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 mask, hev, flat, thresh, limit, b_limit;
    v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
    v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
    b_limit = (v16u8)__msa_fill_b(b_limit_in);
    limit = (v16u8)__msa_fill_b(limit_in);
    thresh = (v16u8)__msa_fill_b(thresh_in);
    temp_src = src_u - (pitch << 2);
    LD_UB8(temp_src, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
    temp_src = src_v - (pitch << 2);
    LD_UB8(temp_src, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
    ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
    ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
    p2_d = __msa_copy_u_d((v2i64)p2, 0);
    p1_d = __msa_copy_u_d((v2i64)p1, 0);
    p0_d = __msa_copy_u_d((v2i64)p0, 0);
    q0_d = __msa_copy_u_d((v2i64)q0, 0);
    q1_d = __msa_copy_u_d((v2i64)q1, 0);
    q2_d = __msa_copy_u_d((v2i64)q2, 0);
    src_u -= (pitch * 3);
    SD4(p2_d, p1_d, p0_d, q0_d, src_u, pitch);
    src_u += 4 * pitch;
    SD(q1_d, src_u);
    src_u += pitch;
    SD(q2_d, src_u);
    p2_d = __msa_copy_u_d((v2i64)p2, 1);
    p1_d = __msa_copy_u_d((v2i64)p1, 1);
    p0_d = __msa_copy_u_d((v2i64)p0, 1);
    q0_d = __msa_copy_u_d((v2i64)q0, 1);
    q1_d = __msa_copy_u_d((v2i64)q1, 1);
    q2_d = __msa_copy_u_d((v2i64)q2, 1);
    src_v -= (pitch * 3);
    SD4(p2_d, p1_d, p0_d, q0_d, src_v, pitch);
    src_v += 4 * pitch;
    SD(q1_d, src_v);
    src_v += pitch;
    SD(q2_d, src_v);
 }
 /* Apply the macroblock filter (VP8_MBFILTER) across one vertical luma edge.
  *
  *   src       - points at the first pixel right of the edge in the first row.
  *   pitch     - byte distance between consecutive rows.
  *   b_limit_in, limit_in, thresh_in - scalar limits, broadcast to all lanes.
  *
  * Sixteen rows are loaded starting at src - 4 and transposed so that each of
  * p3..q3 holds one pixel column.  After filtering, columns p2..q2 are
  * interleaved back into row order and written with one VP8_ST6x1_UB per
  * row, starting at src - 3.
  */
 static void mbloop_filter_vertical_edge_y_msa(uint8_t *src, int32_t pitch,
                                               const uint8_t b_limit_in,
                                               const uint8_t limit_in,
                                               const uint8_t thresh_in)
 {
    uint8_t *temp_src;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 mask, hev, flat, thresh, limit, b_limit;
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
    v16u8 row9, row10, row11, row12, row13, row14, row15;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    b_limit = (v16u8)__msa_fill_b(b_limit_in);
    limit = (v16u8)__msa_fill_b(limit_in);
    thresh = (v16u8)__msa_fill_b(thresh_in);
    /* rows 0..7, then rows 8..15, each starting four pixels left of the edge */
    temp_src = src - 4;
    LD_UB8(temp_src, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
    temp_src += (8 * pitch);
    LD_UB8(temp_src, pitch,
           row8, row9, row10, row11, row12, row13, row14, row15);
    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                        row8, row9, row10, row11, row12, row13, row14, row15,
                        p3, p2, p1, p0, q0, q1, q2, q3);
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
    /* tmp3, tmp4, tmp6, tmp7 hold the interleaved p2/p1/p0/q0 bytes (one
       32-bit lane per row); tmp2, tmp5 hold the q1/q2 pairs (one 16-bit lane
       per row). */
    ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
    ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
    ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
    ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
    ILVRL_B2_SH(q2, q1, tmp2, tmp5);
    /* One store per row: a 4-byte word at temp_src and a 2-byte halfword at
       temp_src + 4.  The lane indices are written out literally for each row.
       NOTE(review): presumably because the copy intrinsics need constant
       indices - confirm before turning this into a loop. */
    temp_src = src - 3;
    VP8_ST6x1_UB(tmp3, 0, tmp2, 0, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp3, 1, tmp2, 1, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp3, 2, tmp2, 2, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp3, 3, tmp2, 3, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp4, 0, tmp2, 4, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp4, 1, tmp2, 5, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp4, 2, tmp2, 6, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp4, 3, tmp2, 7, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp6, 0, tmp5, 0, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp6, 1, tmp5, 1, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp6, 2, tmp5, 2, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp6, 3, tmp5, 3, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp7, 0, tmp5, 4, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp7, 1, tmp5, 5, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp7, 2, tmp5, 6, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp7, 3, tmp5, 7, temp_src, 4);
 }
 /* Apply the macroblock filter (VP8_MBFILTER) across one vertical edge of
  * both chroma planes at once.
  *
  *   src_u, src_v - point at the first pixel right of the edge in the first
  *                  row of the U and V plane respectively.
  *   pitch        - byte distance between consecutive rows (shared by both
  *                  planes).
  *   b_limit_in, limit_in, thresh_in - scalar limits, broadcast to all lanes.
  *
  * Eight rows of U and eight rows of V are loaded (each from column -4),
  * transposed together, filtered in one pass and written back with one
  * VP8_ST6x1_UB per row starting at column -3.
  */
 static void mbloop_filter_vertical_edge_uv_msa(uint8_t *src_u, uint8_t *src_v,
                                                int32_t pitch,
                                                const uint8_t b_limit_in,
                                                const uint8_t limit_in,
                                                const uint8_t thresh_in)
 {
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 mask, hev, flat, thresh, limit, b_limit;
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
    v16u8 row9, row10, row11, row12, row13, row14, row15;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    b_limit = (v16u8)__msa_fill_b(b_limit_in);
    limit = (v16u8)__msa_fill_b(limit_in);
    thresh = (v16u8)__msa_fill_b(thresh_in);
    /* row0..row7 come from the U plane, row8..row15 from the V plane */
    LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
    LD_UB8(src_v - 4, pitch,
           row8, row9, row10, row11, row12, row13, row14, row15);
    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                        row8, row9, row10, row11, row12, row13, row14, row15,
                        p3, p2, p1, p0, q0, q1, q2, q3);
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
    /* tmp3/tmp4 (U rows) and tmp6/tmp7 (V rows) hold the interleaved
       p2/p1/p0/q0 bytes; tmp2 (U rows) and tmp5 (V rows) hold the q1/q2
       pairs. */
    ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
    ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
    ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
    ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
    ILVRL_B2_SH(q2, q1, tmp2, tmp5);
    /* U plane: eight rows, six bytes each, starting three pixels left of the
       edge */
    src_u -= 3;
    VP8_ST6x1_UB(tmp3, 0, tmp2, 0, src_u, 4);
    src_u += pitch;
    VP8_ST6x1_UB(tmp3, 1, tmp2, 1, src_u, 4);
    src_u += pitch;
    VP8_ST6x1_UB(tmp3, 2, tmp2, 2, src_u, 4);
    src_u += pitch;
    VP8_ST6x1_UB(tmp3, 3, tmp2, 3, src_u, 4);
    src_u += pitch;
    VP8_ST6x1_UB(tmp4, 0, tmp2, 4, src_u, 4);
    src_u += pitch;
    VP8_ST6x1_UB(tmp4, 1, tmp2, 5, src_u, 4);
    src_u += pitch;
    VP8_ST6x1_UB(tmp4, 2, tmp2, 6, src_u, 4);
    src_u += pitch;
    VP8_ST6x1_UB(tmp4, 3, tmp2, 7, src_u, 4);
    /* V plane: same layout, taken from the other interleave halves */
    src_v -= 3;
    VP8_ST6x1_UB(tmp6, 0, tmp5, 0, src_v, 4);
    src_v += pitch;
    VP8_ST6x1_UB(tmp6, 1, tmp5, 1, src_v, 4);
    src_v += pitch;
    VP8_ST6x1_UB(tmp6, 2, tmp5, 2, src_v, 4);
    src_v += pitch;
    VP8_ST6x1_UB(tmp6, 3, tmp5, 3, src_v, 4);
    src_v += pitch;
    VP8_ST6x1_UB(tmp7, 0, tmp5, 4, src_v, 4);
    src_v += pitch;
    VP8_ST6x1_UB(tmp7, 1, tmp5, 5, src_v, 4);
    src_v += pitch;
    VP8_ST6x1_UB(tmp7, 2, tmp5, 6, src_v, 4);
    src_v += pitch;
    VP8_ST6x1_UB(tmp7, 3, tmp5, 7, src_v, 4);
 }
 void vp8_loop_filter_simple_horizontal_edge_msa(uint8_t *src, int32_t pitch,
                                                const uint8_t *b_limit_ptr)
 {
    v16u8 p1, p0, q1, q0;
    v16u8 mask, b_limit;
    b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
    LD_UB4(src - (pitch << 1), pitch, p1, p0, q0, q1);
    VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
    VP8_SIMPLE_FILT(p1, p0, q0, q1, mask);
    ST_UB2(p0, q0, (src - pitch), pitch);
 }
 /* Simple loop filter for one vertical edge spanning 16 rows.
  *
  *   src         - points at the first pixel right of the edge (q0) in the
  *                 first row.
  *   pitch       - byte distance between consecutive rows.
  *   b_limit_ptr - points at the single edge limit, broadcast to all lanes.
  *
  * Sixteen rows are loaded starting at src - 2 and transposed into the four
  * columns p1, p0, q0, q1.  After filtering, the p0/q0 pairs are written back
  * two bytes per row, starting at src - 1.
  */
 void vp8_loop_filter_simple_vertical_edge_msa(uint8_t *src, int32_t pitch,
                                               const uint8_t *b_limit_ptr)
 {
    uint8_t *temp_src;
    uint8_t *dst;
    v16u8 p1, p0, q1, q0;
    v16u8 mask, b_limit;
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
    v16u8 row9, row10, row11, row12, row13, row14, row15;
    v8i16 tmp0, tmp1;
    b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
    /* rows 0..7, then rows 8..15, each starting two pixels left of the edge */
    temp_src = src - 2;
    LD_UB8(temp_src, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
    temp_src += (8 * pitch);
    LD_UB8(temp_src, pitch,
           row8, row9, row10, row11, row12, row13, row14, row15);
    TRANSPOSE16x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                        row8, row9, row10, row11, row12, row13, row14, row15,
                        p1, p0, q0, q1);
    VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
    VP8_SIMPLE_FILT(p1, p0, q0, q1, mask);
    /* tmp1 holds the p0/q0 pairs from the right halves, tmp0 from the left
       halves of the vectors */
    ILVRL_B2_SH(q0, p0, tmp1, tmp0);
    /* four groups of four rows, two bytes per row */
    dst = src - 1;
    ST2x4_UB(tmp1, 0, dst, pitch);
    dst += 4 * pitch;
    ST2x4_UB(tmp1, 4, dst, pitch);
    dst += 4 * pitch;
    ST2x4_UB(tmp0, 0, dst, pitch);
    dst += 4 * pitch;
    ST2x4_UB(tmp0, 4, dst, pitch);
 }
 static void loop_filter_horizontal_edge_uv_msa(uint8_t *src_u, uint8_t *src_v,
                                               int32_t pitch,
                                               const uint8_t b_limit_in,
                                               const uint8_t limit_in,
                                               const uint8_t thresh_in)
 {
    uint64_t p1_d, p0_d, q0_d, q1_d;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 mask, hev, flat, thresh, limit, b_limit;
    v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
    v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
    thresh = (v16u8)__msa_fill_b(thresh_in);
    limit = (v16u8)__msa_fill_b(limit_in);
    b_limit = (v16u8)__msa_fill_b(b_limit_in);
    src_u = src_u - (pitch << 2);
    LD_UB8(src_u, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
    src_u += (5 * pitch);
    src_v = src_v - (pitch << 2);
    LD_UB8(src_v, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
    src_v += (5 * pitch);
    /* right 8 element of p3 are u pixel and
       left 8 element of p3 are v pixel */
    ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
    ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
    p1_d = __msa_copy_u_d((v2i64)p1, 0);
    p0_d = __msa_copy_u_d((v2i64)p0, 0);
    q0_d = __msa_copy_u_d((v2i64)q0, 0);
    q1_d = __msa_copy_u_d((v2i64)q1, 0);
    SD4(q1_d, q0_d, p0_d, p1_d, src_u, (- pitch));
    p1_d = __msa_copy_u_d((v2i64)p1, 1);
    p0_d = __msa_copy_u_d((v2i64)p0, 1);
    q0_d = __msa_copy_u_d((v2i64)q0, 1);
    q1_d = __msa_copy_u_d((v2i64)q1, 1);
    SD4(q1_d, q0_d, p0_d, p1_d, src_v, (- pitch));
 }
 /* Apply the 4-tap filter across one vertical edge of both chroma planes at
  * once.
  *
  *   src_u, src_v - point at the first pixel right of the edge in the first
  *                  row of the U and V plane respectively.
  *   pitch        - byte distance between consecutive rows (shared by both
  *                  planes).
  *   b_limit_in, limit_in, thresh_in - scalar limits, broadcast to all lanes.
  *
  * Eight rows of each plane are loaded from column -4, transposed together,
  * filtered, and the four columns p1, p0, q0, q1 are written back starting at
  * column -2.
  */
 static void loop_filter_vertical_edge_uv_msa(uint8_t *src_u, uint8_t *src_v,
                                              int32_t pitch,
                                              const uint8_t b_limit_in,
                                              const uint8_t limit_in,
                                              const uint8_t thresh_in)
 {
    uint8_t *temp_src_u, *temp_src_v;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 mask, hev, flat, thresh, limit, b_limit;
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
    v16u8 row9, row10, row11, row12, row13, row14, row15;
    v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    thresh = (v16u8)__msa_fill_b(thresh_in);
    limit = (v16u8)__msa_fill_b(limit_in);
    b_limit = (v16u8)__msa_fill_b(b_limit_in);
    /* row0..row7 come from the U plane, row8..row15 from the V plane */
    LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
    LD_UB8(src_v - 4, pitch,
           row8, row9, row10, row11, row12, row13, row14, row15);
    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                        row8, row9, row10, row11, row12, row13, row14, row15,
                        p3, p2, p1, p0, q0, q1, q2, q3);
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
    /* Re-interleave p1, p0, q0, q1: tmp2/tmp3 are built from the right
       halves (stored to U below), tmp4/tmp5 from the left halves (stored
       to V below). */
    ILVR_B2_SW(p0, p1, q1, q0, tmp0, tmp1);
    ILVRL_H2_SW(tmp1, tmp0, tmp2, tmp3);
    tmp0 = (v4i32)__msa_ilvl_b((v16i8)p0, (v16i8)p1);
    tmp1 = (v4i32)__msa_ilvl_b((v16i8)q1, (v16i8)q0);
    ILVRL_H2_SW(tmp1, tmp0, tmp4, tmp5);
    /* U plane: two groups of four rows, starting two pixels left of the edge */
    temp_src_u = src_u - 2;
    ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, temp_src_u, pitch);
    temp_src_u += 4 * pitch;
    ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, temp_src_u, pitch);
    /* V plane: same layout */
    temp_src_v = src_v - 2;
    ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, temp_src_v, pitch);
    temp_src_v += 4 * pitch;
    ST4x4_UB(tmp5, tmp5, 0, 1, 2, 3, temp_src_v, pitch);
 }
 /* Macroblock-edge filter, horizontal edge: always filters the luma plane
  * and, when src_u is non-NULL, both chroma planes.  The limits are taken
  * from the mblim, lim and hev_thr entries of lpf_info_ptr.
  */
 void vp8_loop_filter_mbh_msa(uint8_t *src_y, uint8_t *src_u,
                              uint8_t *src_v, int32_t pitch_y,
                              int32_t pitch_u_v,
                              loop_filter_info *lpf_info_ptr)
 {
    const uint8_t *const mb_limit_ptr = lpf_info_ptr->mblim;
    const uint8_t *const limit_ptr = lpf_info_ptr->lim;
    const uint8_t *const hev_thresh_ptr = lpf_info_ptr->hev_thr;
    mbloop_filter_horizontal_edge_y_msa(src_y, pitch_y, *mb_limit_ptr,
                                        *limit_ptr, *hev_thresh_ptr);
    /* no chroma planes supplied: nothing more to do */
    if (!src_u)
    {
        return;
    }
    mbloop_filter_horizontal_edge_uv_msa(src_u, src_v, pitch_u_v,
                                         *mb_limit_ptr, *limit_ptr,
                                         *hev_thresh_ptr);
 }
 /* Macroblock-edge filter, vertical edge: always filters the luma plane
  * and, when src_u is non-NULL, both chroma planes.  The limits are taken
  * from the mblim, lim and hev_thr entries of lpf_info_ptr.
  */
 void vp8_loop_filter_mbv_msa(uint8_t *src_y, uint8_t *src_u,
                              uint8_t *src_v, int32_t pitch_y,
                              int32_t pitch_u_v,
                              loop_filter_info *lpf_info_ptr)
 {
    const uint8_t *const mb_limit_ptr = lpf_info_ptr->mblim;
    const uint8_t *const limit_ptr = lpf_info_ptr->lim;
    const uint8_t *const hev_thresh_ptr = lpf_info_ptr->hev_thr;
    mbloop_filter_vertical_edge_y_msa(src_y, pitch_y, *mb_limit_ptr,
                                      *limit_ptr, *hev_thresh_ptr);
    /* no chroma planes supplied: nothing more to do */
    if (!src_u)
    {
        return;
    }
    mbloop_filter_vertical_edge_uv_msa(src_u, src_v, pitch_u_v,
                                       *mb_limit_ptr, *limit_ptr,
                                       *hev_thresh_ptr);
 }
 /* Inner-block filter, horizontal edges: filters the three luma edges at
  * rows 4, 8 and 12 and, when src_u is non-NULL, the chroma edge at row 4 of
  * both chroma planes.  The limits are taken from the blim, lim and hev_thr
  * entries of lpf_info_ptr; the same set is passed for both halves of each
  * luma edge.
  */
 void vp8_loop_filter_bh_msa(uint8_t *src_y, uint8_t *src_u,
                             uint8_t *src_v, int32_t pitch_y,
                             int32_t pitch_u_v,
                             loop_filter_info *lpf_info_ptr)
 {
    const uint8_t *const b_limit_ptr = lpf_info_ptr->blim;
    const uint8_t *const limit_ptr = lpf_info_ptr->lim;
    const uint8_t *const hev_thresh_ptr = lpf_info_ptr->hev_thr;
    int32_t row;
    for (row = 4; row <= 12; row += 4)
    {
        loop_filter_horizontal_4_dual_msa(src_y + row * pitch_y, pitch_y,
                                          b_limit_ptr, limit_ptr,
                                          hev_thresh_ptr,
                                          b_limit_ptr, limit_ptr,
                                          hev_thresh_ptr);
    }
    if (src_u)
    {
        loop_filter_horizontal_edge_uv_msa(src_u + (4 * pitch_u_v),
                                           src_v + (4 * pitch_u_v),
                                           pitch_u_v,
                                           *b_limit_ptr, *limit_ptr,
                                           *hev_thresh_ptr);
    }
 }
 /* Inner-block filter, vertical edges: filters the three luma edges at
  * columns 4, 8 and 12 and, when src_u is non-NULL, the chroma edge at
  * column 4 of both chroma planes.  The limits are taken from the blim, lim
  * and hev_thr entries of lpf_info_ptr; the same set is passed for both
  * halves of each luma edge.
  */
 void vp8_loop_filter_bv_msa(uint8_t *src_y, uint8_t *src_u,
                             uint8_t *src_v, int32_t pitch_y,
                             int32_t pitch_u_v,
                             loop_filter_info *lpf_info_ptr)
 {
    const uint8_t *const b_limit_ptr = lpf_info_ptr->blim;
    const uint8_t *const limit_ptr = lpf_info_ptr->lim;
    const uint8_t *const hev_thresh_ptr = lpf_info_ptr->hev_thr;
    int32_t col;
    for (col = 4; col <= 12; col += 4)
    {
        loop_filter_vertical_4_dual_msa(src_y + col, pitch_y,
                                        b_limit_ptr, limit_ptr,
                                        hev_thresh_ptr,
                                        b_limit_ptr, limit_ptr,
                                        hev_thresh_ptr);
    }
    if (src_u)
    {
        loop_filter_vertical_edge_uv_msa(src_u + 4, src_v + 4, pitch_u_v,
                                         *b_limit_ptr, *limit_ptr,
                                         *hev_thresh_ptr);
    }
 }
 /* Simple inner-block filter, horizontal edges: runs the simple horizontal
  * edge filter on the luma edges at rows 4, 8 and 12.
  */
 void vp8_loop_filter_bhs_msa(uint8_t *src_y, int32_t pitch_y,
                              const uint8_t *b_limit_ptr)
 {
    int32_t row;
    for (row = 4; row <= 12; row += 4)
    {
        vp8_loop_filter_simple_horizontal_edge_msa(src_y + (row * pitch_y),
                                                   pitch_y, b_limit_ptr);
    }
 }
 /* Simple inner-block filter, vertical edges: runs the simple vertical edge
  * filter on the luma edges at columns 4, 8 and 12.
  */
 void vp8_loop_filter_bvs_msa(uint8_t *src_y, int32_t pitch_y,
                              const uint8_t *b_limit_ptr)
 {
    int32_t col;
    for (col = 4; col <= 12; col += 4)
    {
        vp8_loop_filter_simple_vertical_edge_msa(src_y + col, pitch_y,
                                                 b_limit_ptr);
    }
 }
--- a/vp8/common/mips/msa/mfqe_msa.c
+++ b/vp8/common/mips/msa/mfqe_msa.c
@@ -1,146 +0,0 @@
 /*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "./vp8_rtcd.h"
 #include "vp8/common/postproc.h"
 #include "vp8/common/mips/msa/vp8_macros_msa.h"
 /*
  * Weighted blend of src into dst, written back to dst, for the 8x8 case.
  *
  * For every element the code computes
  *     src * src_weight + dst * dst_weight
  * with dst_weight = (1 << MFQE_PRECISION) - src_weight, and then passes the
  * sum through SRARI_H2_SH with a shift of MFQE_PRECISION before packing it
  * back to bytes.
  *
  * src_ptr / src_stride - source rows and the step between them (read only here)
  * dst_ptr / dst_stride - rows that are read as blend input and then overwritten
  * src_weight           - weight applied to src
  *                        NOTE(review): presumably expected to lie in
  *                        [0, 1 << MFQE_PRECISION]; nothing here checks it.
  */
 static void filter_by_weight8x8_msa(uint8_t *src_ptr, int32_t src_stride,
                                     uint8_t *dst_ptr, int32_t dst_stride,
                                     int32_t src_weight)
 {
    /* The two weights always sum to (1 << MFQE_PRECISION). */
    int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
    int32_t row;
    uint64_t src0_d, src1_d, dst0_d, dst1_d;
    /* Zero-initialised because INSERT_D2_SB is handed these vectors before
     * anything else has written them (presumably it inserts two doublewords
     * into the existing value -- confirm in vp8_macros_msa.h). */
    v16i8 src0 = { 0 };
    v16i8 src1 = { 0 };
    v16i8 dst0 = { 0 };
    v16i8 dst1 = { 0 };
    v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;
    /* Broadcast each scalar weight across a halfword vector. */
    src_wt = __msa_fill_h(src_weight);
    dst_wt = __msa_fill_h(dst_weight);
    /* Two iterations; each one handles four rows (two row pairs). */
    for (row = 2; row--;)
    {
        /* First row pair: two 64-bit loads from src and from dst. */
        LD2(src_ptr, src_stride, src0_d, src1_d);
        src_ptr += (2 * src_stride);
        LD2(dst_ptr, dst_stride, dst0_d, dst1_d);
        INSERT_D2_SB(src0_d, src1_d, src0);
        INSERT_D2_SB(dst0_d, dst1_d, dst0);
        /* Second row pair. dst_ptr has not been advanced yet, so the dst
         * rows are addressed with an explicit two-row offset. */
        LD2(src_ptr, src_stride, src0_d, src1_d);
        src_ptr += (2 * src_stride);
        LD2((dst_ptr + 2 * dst_stride), dst_stride, dst0_d, dst1_d);
        INSERT_D2_SB(src0_d, src1_d, src1);
        INSERT_D2_SB(dst0_d, dst1_d, dst1);
        /* Blend the first row pair in 16-bit lanes (UNPCK_UB_SH presumably
         * widens unsigned bytes to halfwords -- confirm in the macro header). */
        UNPCK_UB_SH(src0, src_r, src_l);
        UNPCK_UB_SH(dst0, dst_r, dst_l);
        res_h_r = (src_r * src_wt);
        res_h_r += (dst_r * dst_wt);
        res_h_l = (src_l * src_wt);
        res_h_l += (dst_l * dst_wt);
        SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
        /* Pack both halfword results back into one byte vector and store it
         * over the first row pair of dst; dst0 is reused as the output. */
        dst0 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
        ST8x2_UB(dst0, dst_ptr, dst_stride);
        dst_ptr += (2 * dst_stride);
        /* Same blend for the second row pair, using the dst values that were
         * loaded before the store above. */
        UNPCK_UB_SH(src1, src_r, src_l);
        UNPCK_UB_SH(dst1, dst_r, dst_l);
        res_h_r = (src_r * src_wt);
        res_h_r += (dst_r * dst_wt);
        res_h_l = (src_l * src_wt);
        res_h_l += (dst_l * dst_wt);
        SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
        dst1 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
        ST8x2_UB(dst1, dst_ptr, dst_stride);
        dst_ptr += (2 * dst_stride);
    }
 }
 /*
  * Weighted blend of src into dst, written back to dst, for the 16x16 case.
  *
  * Rows are handled in four groups of four. Within a group all four src rows
  * and all four dst rows are loaded first; each row is then blended as
  *     src * src_weight + dst * dst_weight
  * with dst_weight = (1 << MFQE_PRECISION) - src_weight, passed through
  * SRARI_H2_SH with a shift of MFQE_PRECISION, and stored over the dst row.
  *
  * src_ptr / src_stride - source rows and the step between them
  * dst_ptr / dst_stride - rows that are read as blend input and then overwritten
  * src_weight           - weight applied to src
  */
 static void filter_by_weight16x16_msa(uint8_t *src_ptr, int32_t src_stride,
                                       uint8_t *dst_ptr, int32_t dst_stride,
                                       int32_t src_weight)
 {
    const int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
    const v8i16 wt_src = __msa_fill_h(src_weight);
    const v8i16 wt_dst = __msa_fill_h(dst_weight);
    int32_t group, line;

    for (group = 0; group < 4; ++group)
    {
        v16i8 src_rows[4];
        v16i8 dst_rows[4];

        /* Load the whole four-row group before any row of it is stored. */
        LD_SB4(src_ptr, src_stride,
               src_rows[0], src_rows[1], src_rows[2], src_rows[3]);
        src_ptr += (4 * src_stride);
        LD_SB4(dst_ptr, dst_stride,
               dst_rows[0], dst_rows[1], dst_rows[2], dst_rows[3]);

        for (line = 0; line < 4; ++line)
        {
            v8i16 s_r, s_l, d_r, d_l;
            v8i16 sum_r, sum_l;

            UNPCK_UB_SH(src_rows[line], s_r, s_l);
            UNPCK_UB_SH(dst_rows[line], d_r, d_l);

            sum_r = (s_r * wt_src);
            sum_r += (d_r * wt_dst);
            sum_l = (s_l * wt_src);
            sum_l += (d_l * wt_dst);
            SRARI_H2_SH(sum_r, sum_l, MFQE_PRECISION);

            /* Pack back to bytes and overwrite the current dst row. */
            PCKEV_ST_SB(sum_r, sum_l, dst_ptr);
            dst_ptr += dst_stride;
        }
    }
 }
 /*
  * Externally visible entry point for the 16x16 weighted blend; forwards every
  * argument unchanged to the file-local filter_by_weight16x16_msa().
  */
 void vp8_filter_by_weight16x16_msa(uint8_t *src_ptr, int32_t src_stride,
                                    uint8_t *dst_ptr, int32_t dst_stride,
                                    int32_t src_weight)
 {
    filter_by_weight16x16_msa(src_ptr,
                              src_stride,
                              dst_ptr,
                              dst_stride,
                              src_weight);
 }
 /*
  * Externally visible entry point for the 8x8 weighted blend; forwards every
  * argument unchanged to the file-local filter_by_weight8x8_msa().
  */
 void vp8_filter_by_weight8x8_msa(uint8_t *src_ptr, int32_t src_stride,
                                  uint8_t *dst_ptr, int32_t dst_stride,
                                  int32_t src_weight)
 {
    filter_by_weight8x8_msa(src_ptr,
                            src_stride,
                            dst_ptr,
                            dst_stride,
                            src_weight);
 }
--- a/vp8/common/mips/msa/sixtap_filter_msa.c
+++ b/vp8/common/mips/msa/sixtap_filter_msa.c
--- a/Show More
+++ b/Show More