Merge branch 'master' into experimental

Pick up VP8 encryption, quantization changes, and some fixes to vpxenc.

Conflicts:
    test/decode_test_driver.cc
    test/decode_test_driver.h
    test/encode_test_driver.cc
    vp8/vp8cx.mk
    vpxdec.c
    vpxenc.c

Change-Id: I9fbcc64808ead47e22f1f22501965cc7f0c4791c
2013-03-27 10:41:29 -07:00 · 2013-03-27 10:41:29 -07:00 · 771fc832f3
commit 771fc832f3
parent 513157e093 8015a9aedc
46 changed files with 558 additions and 492 deletions
--- a/build/make/Android.mk
+++ b/build/make/Android.mk
@ -112,12 +112,12 @@ endef
 # Use ads2gas script to convert from RVCT format to GAS format.  This pass
 #  puts the processed file under $(ASM_CNV_PATH).  A local clean rule
 #  handles removing these.
-ASM_CNV_OFFSETS_DEPEND = $(ASM_CNV_PATH)/asm_com_offsets.asm
+ASM_CNV_OFFSETS_DEPEND = $(ASM_CNV_PATH)/vp8_asm_com_offsets.asm
 ifeq ($(CONFIG_VP8_DECODER), yes)
-  ASM_CNV_OFFSETS_DEPEND += $(ASM_CNV_PATH)/asm_dec_offsets.asm
+  ASM_CNV_OFFSETS_DEPEND += $(ASM_CNV_PATH)/vp8_asm_dec_offsets.asm
 endif
 ifeq ($(CONFIG_VP8_ENCODER), yes)
-  ASM_CNV_OFFSETS_DEPEND += $(ASM_CNV_PATH)/asm_enc_offsets.asm
+  ASM_CNV_OFFSETS_DEPEND += $(ASM_CNV_PATH)/vp8_asm_enc_offsets.asm
 endif

 .PRECIOUS: %.asm.s
@ -190,19 +190,19 @@ clean:
 include $(BUILD_SHARED_LIBRARY)

 $(eval $(call asm_offsets_template,\
-    $(ASM_CNV_PATH)/asm_com_offsets.asm, \
-    $(LIBVPX_PATH)/vp8/common/asm_com_offsets.c))
+    $(ASM_CNV_PATH)/vp8_asm_com_offsets.asm, \
+    $(LIBVPX_PATH)/vp8/common/vp8_asm_com_offsets.c))

 ifeq ($(CONFIG_VP8_DECODER), yes)
  $(eval $(call asm_offsets_template,\
-    $(ASM_CNV_PATH)/asm_dec_offsets.asm, \
-    $(LIBVPX_PATH)/vp8/decoder/asm_dec_offsets.c))
+    $(ASM_CNV_PATH)/vp8_asm_dec_offsets.asm, \
+    $(LIBVPX_PATH)/vp8/decoder/vp8_asm_dec_offsets.c))
 endif

 ifeq ($(CONFIG_VP8_ENCODER), yes)
  $(eval $(call asm_offsets_template,\
-    $(ASM_CNV_PATH)/asm_enc_offsets.asm, \
-    $(LIBVPX_PATH)/vp8/encoder/asm_enc_offsets.c))
+    $(ASM_CNV_PATH)/vp8_asm_enc_offsets.asm, \
+    $(LIBVPX_PATH)/vp8/encoder/vp8_asm_enc_offsets.c))
 endif

 ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes)
--- a/build/make/Makefile
+++ b/build/make/Makefile
@ -377,7 +377,7 @@ ifneq ($(call enabled,DIST-SRCS),)
    DIST-SRCS-$(CONFIG_MSVS)  += build/x86-msvs/yasm.rules
    DIST-SRCS-$(CONFIG_MSVS)  += build/x86-msvs/obj_int_extract.bat
    DIST-SRCS-$(CONFIG_RVCT) += build/make/armlink_adapter.sh
-    # Include obj_int_extract if we use offsets from asm_*_offsets
+    # Include obj_int_extract if we use offsets from *_asm_*_offsets
    DIST-SRCS-$(ARCH_ARM)$(ARCH_X86)$(ARCH_X86_64)    += build/make/obj_int_extract.c
    DIST-SRCS-$(ARCH_ARM)    += build/make/ads2gas.pl
    DIST-SRCS-$(ARCH_ARM)    += build/make/ads2gas_apple.pl
--- a/build/make/obj_int_extract.c
+++ b/build/make/obj_int_extract.c
@ -144,7 +144,7 @@ int parse_macho(uint8_t *base_buf, size_t sz) {
          /* Location of string is calculated each time from the
           * start of the string buffer.  On darwin the symbols
           * are prefixed by "_", so we bump the pointer by 1.
-           * The target value is defined as an int in asm_*_offsets.c,
+           * The target value is defined as an int in *_asm_*_offsets.c,
           * which is 4 bytes on all targets we currently use.
           */
          if (bits == 32) {
@ -446,7 +446,7 @@ int parse_elf(uint8_t *buf, size_t sz, output_fmt_t mode) {
            if (strcmp(section_name, ".bss")) {
              if (sizeof(val) != sym.st_size) {
                /* The target value is declared as an int in
-                 * asm_*_offsets.c, which is 4 bytes on all
+                 * *_asm_*_offsets.c, which is 4 bytes on all
                 * targets we currently use. Complain loudly if
                 * this is not true.
                 */
@ -528,7 +528,7 @@ int parse_elf(uint8_t *buf, size_t sz, output_fmt_t mode) {
            if ((strcmp(section_name, ".bss"))) {
              if (sizeof(val) != sym.st_size) {
                /* The target value is declared as an int in
-                 * asm_*_offsets.c, which is 4 bytes on all
+                 * *_asm_*_offsets.c, which is 4 bytes on all
                 * targets we currently use. Complain loudly if
                 * this is not true.
                 */
--- a/build/x86-msvs/obj_int_extract.bat
+++ b/build/x86-msvs/obj_int_extract.bat
@ -14,10 +14,10 @@ obj_int_extract.exe rvds "vp9_asm_com_offsets.obj" > "vp9_asm_com_offsets.asm"
 obj_int_extract.exe rvds "vp9_asm_dec_offsets.obj" > "vp9_asm_dec_offsets.asm"
 obj_int_extract.exe rvds "vp9_asm_enc_offsets.obj" > "vp9_asm_enc_offsets.asm"

-cl /I "./" /I "%1" /nologo /c "%1/vp8/common/asm_com_offsets.c"
-cl /I "./" /I "%1" /nologo /c "%1/vp8/decoder/asm_dec_offsets.c"
-cl /I "./" /I "%1" /nologo /c "%1/vp8/encoder/asm_enc_offsets.c"
-obj_int_extract.exe rvds "asm_com_offsets.obj" > "vp8_asm_com_offsets.asm"
-obj_int_extract.exe rvds "asm_dec_offsets.obj" > "vp8_asm_dec_offsets.asm"
-obj_int_extract.exe rvds "asm_enc_offsets.obj" > "vp8_asm_enc_offsets.asm"
+cl /I "./" /I "%1" /nologo /c "%1/vp8/common/vp8_asm_com_offsets.c"
+cl /I "./" /I "%1" /nologo /c "%1/vp8/decoder/vp8_asm_dec_offsets.c"
+cl /I "./" /I "%1" /nologo /c "%1/vp8/encoder/vp8_asm_enc_offsets.c"
+obj_int_extract.exe rvds "vp8_asm_com_offsets.obj" > "vp8_asm_com_offsets.asm"
+obj_int_extract.exe rvds "vp8_asm_dec_offsets.obj" > "vp8_asm_dec_offsets.asm"
+obj_int_extract.exe rvds "vp8_asm_enc_offsets.obj" > "vp8_asm_enc_offsets.asm"

--- a/configure
+++ b/configure
@ -299,6 +299,7 @@ CONFIG_LIST="
    multi_res_encoding
    temporal_denoising
    experimental
+    decrypt
    ${EXPERIMENT_LIST}
 "
 CMDLINE_SELECT="
@ -348,6 +349,7 @@ CMDLINE_SELECT="
    multi_res_encoding
    temporal_denoising
    experimental
+    decrypt
 "

 process_cmdline() {
--- a/test/decode_test_driver.cc
+++ b/test/decode_test_driver.cc
@ -14,18 +14,13 @@
 #include "test/video_source.h"

 namespace libvpx_test {
-void Decoder::DecodeFrame(const uint8_t *cxdata, int size) {
-  if (!decoder_.priv) {
-    const vpx_codec_err_t res_init = vpx_codec_dec_init(&decoder_,
-                                                        CodecInterface(),
-                                                        &cfg_, 0);
-    ASSERT_EQ(VPX_CODEC_OK, res_init) << DecodeError();
-  }

+vpx_codec_err_t Decoder::DecodeFrame(const uint8_t *cxdata, int size) {
  vpx_codec_err_t res_dec;
+  InitOnce();
  REGISTER_STATE_CHECK(res_dec = vpx_codec_decode(&decoder_,
                                                  cxdata, size, NULL, 0));
-  ASSERT_EQ(VPX_CODEC_OK, res_dec) << DecodeError();
+  return res_dec;
 }

 void DecoderTest::RunLoop(CompressedVideoSource *video) {
@ -35,7 +30,9 @@ void DecoderTest::RunLoop(CompressedVideoSource *video) {

  // Decode frames.
  for (video->Begin(); video->cxdata(); video->Next()) {
-    decoder->DecodeFrame(video->cxdata(), video->frame_size());
+    vpx_codec_err_t res_dec = decoder->DecodeFrame(video->cxdata(),
+                                                   video->frame_size());
+    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();

    DxDataIterator dec_iter = decoder->GetDxData();
    const vpx_image_t *img = NULL;
--- a/test/decode_test_driver.h
+++ b/test/decode_test_driver.h
@ -42,7 +42,7 @@ class DxDataIterator {
 class Decoder {
 public:
  Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)
-      : cfg_(cfg), deadline_(deadline) {
+      : cfg_(cfg), deadline_(deadline), init_done_(false) {
    memset(&decoder_, 0, sizeof(decoder_));
  }

@ -50,7 +50,7 @@ class Decoder {
    vpx_codec_destroy(&decoder_);
  }

-  void DecodeFrame(const uint8_t *cxdata, int size);
+  vpx_codec_err_t DecodeFrame(const uint8_t *cxdata, int size);

  DxDataIterator GetDxData() {
    return DxDataIterator(&decoder_);
@ -61,21 +61,39 @@ class Decoder {
  }

  void Control(int ctrl_id, int arg) {
+    InitOnce();
    const vpx_codec_err_t res = vpx_codec_control_(&decoder_, ctrl_id, arg);
    ASSERT_EQ(VPX_CODEC_OK, res) << DecodeError();
  }

- protected:
-  virtual const vpx_codec_iface_t* CodecInterface() const = 0;
+  void Control(int ctrl_id, const void *arg) {
+    InitOnce();
+    const vpx_codec_err_t res = vpx_codec_control_(&decoder_, ctrl_id, arg);
+    ASSERT_EQ(VPX_CODEC_OK, res) << DecodeError();
+  }

  const char* DecodeError() {
    const char *detail = vpx_codec_error_detail(&decoder_);
    return detail ? detail : vpx_codec_error(&decoder_);
  }

+ protected:
+  virtual const vpx_codec_iface_t* CodecInterface() const = 0;
+
+  void InitOnce() {
+    if (!init_done_) {
+      const vpx_codec_err_t res = vpx_codec_dec_init(&decoder_,
+                                                     CodecInterface(),
+                                                     &cfg_, 0);
+      ASSERT_EQ(VPX_CODEC_OK, res) << DecodeError();
+      init_done_ = true;
+    }
+  }
+
  vpx_codec_ctx_t     decoder_;
  vpx_codec_dec_cfg_t cfg_;
  unsigned int        deadline_;
+  bool                init_done_;
 };

 // Common test functionality for all Decoder tests.
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc
@ -175,8 +175,9 @@ void EncoderTest::RunLoop(VideoSource *video) {
          case VPX_CODEC_CX_FRAME_PKT:
            has_cxdata = true;
            if (decoder && DoDecode()) {
-              decoder->DecodeFrame((const uint8_t*)pkt->data.frame.buf,
-                                   pkt->data.frame.sz);
+              vpx_codec_err_t res_dec = decoder->DecodeFrame(
+                  (const uint8_t*)pkt->data.frame.buf, pkt->data.frame.sz);
+              ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
              has_dxdata = true;
            }
            ASSERT_GE(pkt->data.frame.pts, last_pts_);
--- a/test/test.mk
+++ b/test/test.mk
@ -31,6 +31,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ivf_video_source.h


 LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += test_vector_test.cc
+
 ##
 ## WHITE BOX TESTS
 ##
@ -55,6 +56,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += set_roi.cc
 LIBVPX_TEST_SRCS-yes                   += sixtap_predict_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += subtract_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += variance_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += vp8_decrypt_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc

 endif # VP8
--- a/test/vp8_boolcoder_test.cc
+++ b/test/vp8_boolcoder_test.cc
@ -26,6 +26,20 @@ extern "C" {

 namespace {
 const int num_tests = 10;
+
+void encrypt_buffer(uint8_t *buffer, int size, const uint8_t *key) {
+  for (int i = 0; i < size; ++i) {
+    buffer[i] ^= key[i % 32];
+  }
+}
+
+const uint8_t secret_key[32] = {
+  234,  32,   2,  3,  4, 230,   6,  11,
+    0, 132,  22, 23, 45,  21, 124, 255,
+    0,  43,  52,  3, 23,  63,  99,   7,
+  120,   8, 252, 84,  4,  83,   6,  13
+};
+
 }  // namespace

 using libvpx_test::ACMRandom;
@ -71,7 +85,12 @@ TEST(VP8, TestBitIO) {
        vp8_stop_encode(&bw);

        BOOL_DECODER br;
-        vp8dx_start_decode(&br, bw_buffer, buffer_size);
+
+#if CONFIG_DECRYPT
+        encrypt_buffer(bw_buffer, buffer_size, secret_key);
+#endif
+
+        vp8dx_start_decode(&br, bw_buffer, buffer_size, bw_buffer, secret_key);
        bit_rnd.Reset(random_seed);
        for (int i = 0; i < bits_to_test; ++i) {
          if (bit_method == 2) {
--- a/test/vp8_decrypt_test.cc
+++ b/test/vp8_decrypt_test.cc
@ -0,0 +1,65 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+
+#if CONFIG_DECRYPT
+
+namespace {
+
+const uint8_t decrypt_key[32] = {
+  255, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+}  // namespace
+
+namespace libvpx_test {
+
+TEST(TestDecrypt, NullKey) {
+  vpx_codec_dec_cfg_t cfg = {0};
+  vpx_codec_ctx_t decoder = {0};
+  vpx_codec_err_t res = vpx_codec_dec_init(&decoder, &vpx_codec_vp8_dx_algo,
+                                           &cfg, 0);
+  ASSERT_EQ(VPX_CODEC_OK, res);
+
+  res = vpx_codec_control(&decoder, VP8_SET_DECRYPT_KEY, NULL);
+  ASSERT_EQ(VPX_CODEC_INVALID_PARAM, res);
+}
+
+TEST(TestDecrypt, DecryptWorks) {
+  libvpx_test::IVFVideoSource video("vp80-00-comprehensive-001.ivf");
+  video.Init();
+
+  vpx_codec_dec_cfg_t dec_cfg = {0};
+  Decoder decoder(dec_cfg, 0);
+
+  // Zero decrypt key (by default)
+  video.Begin();
+  vpx_codec_err_t res = decoder.DecodeFrame(video.cxdata(), video.frame_size());
+  ASSERT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
+
+  // Non-zero decrypt key
+  video.Next();
+  decoder.Control(VP8_SET_DECRYPT_KEY, decrypt_key);
+  res = decoder.DecodeFrame(video.cxdata(), video.frame_size());
+  ASSERT_NE(VPX_CODEC_OK, res) << decoder.DecodeError();
+}
+
+}  // namespace libvpx_test
+
+#endif  // CONFIG_DECRYPT
--- a/vp8/common/loopfilter.c
+++ b/vp8/common/loopfilter.c
@ -156,39 +156,38 @@ void vp8_loop_filter_frame_init(VP8_COMMON *cm,
            continue;
        }

-        lvl_ref = lvl_seg;
-
        /* INTRA_FRAME */
        ref = INTRA_FRAME;

        /* Apply delta for reference frame */
-        lvl_ref += mbd->ref_lf_deltas[ref];
+        lvl_ref = lvl_seg + mbd->ref_lf_deltas[ref];

        /* Apply delta for Intra modes */
        mode = 0; /* B_PRED */
        /* Only the split mode BPRED has a further special case */
-        lvl_mode = lvl_ref +  mbd->mode_lf_deltas[mode];
-        lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */
+        lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode];
+        /* clamp */
+        lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0;

        lfi->lvl[seg][ref][mode] = lvl_mode;

        mode = 1; /* all the rest of Intra modes */
-        lvl_mode = (lvl_ref > 0) ? (lvl_ref > 63 ? 63 : lvl_ref)  : 0; /* clamp */
+        /* clamp */
+        lvl_mode = (lvl_ref > 0) ? (lvl_ref > 63 ? 63 : lvl_ref) : 0;
        lfi->lvl[seg][ref][mode] = lvl_mode;

        /* LAST, GOLDEN, ALT */
        for(ref = 1; ref < MAX_REF_FRAMES; ref++)
        {
-            int lvl_ref = lvl_seg;
-
            /* Apply delta for reference frame */
-            lvl_ref += mbd->ref_lf_deltas[ref];
+            lvl_ref = lvl_seg + mbd->ref_lf_deltas[ref];

            /* Apply delta for Inter modes */
            for (mode = 1; mode < 4; mode++)
            {
                lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode];
-                lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */
+                /* clamp */
+                lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0;

                lfi->lvl[seg][ref][mode] = lvl_mode;
            }
--- a/vp8/common/loopfilter_filters.c
+++ b/vp8/common/loopfilter_filters.c
@ -54,7 +54,7 @@ static void vp8_filter(signed char mask, uc hev, uc *op1,
 {
    signed char ps0, qs0;
    signed char ps1, qs1;
-    signed char vp8_filter, Filter1, Filter2;
+    signed char filter_value, Filter1, Filter2;
    signed char u;

    ps1 = (signed char) * op1 ^ 0x80;
@ -63,35 +63,35 @@ static void vp8_filter(signed char mask, uc hev, uc *op1,
    qs1 = (signed char) * oq1 ^ 0x80;

    /* add outer taps if we have high edge variance */
-    vp8_filter = vp8_signed_char_clamp(ps1 - qs1);
-    vp8_filter &= hev;
+    filter_value = vp8_signed_char_clamp(ps1 - qs1);
+    filter_value &= hev;

    /* inner taps */
-    vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0));
-    vp8_filter &= mask;
+    filter_value = vp8_signed_char_clamp(filter_value + 3 * (qs0 - ps0));
+    filter_value &= mask;

    /* save bottom 3 bits so that we round one side +4 and the other +3
     * if it equals 4 we'll set to adjust by -1 to account for the fact
     * we'd round 3 the other way
     */
-    Filter1 = vp8_signed_char_clamp(vp8_filter + 4);
-    Filter2 = vp8_signed_char_clamp(vp8_filter + 3);
+    Filter1 = vp8_signed_char_clamp(filter_value + 4);
+    Filter2 = vp8_signed_char_clamp(filter_value + 3);
    Filter1 >>= 3;
    Filter2 >>= 3;
    u = vp8_signed_char_clamp(qs0 - Filter1);
    *oq0 = u ^ 0x80;
    u = vp8_signed_char_clamp(ps0 + Filter2);
    *op0 = u ^ 0x80;
-    vp8_filter = Filter1;
+    filter_value = Filter1;

    /* outer tap adjustments */
-    vp8_filter += 1;
-    vp8_filter >>= 1;
-    vp8_filter &= ~hev;
+    filter_value += 1;
+    filter_value >>= 1;
+    filter_value &= ~hev;

-    u = vp8_signed_char_clamp(qs1 - vp8_filter);
+    u = vp8_signed_char_clamp(qs1 - filter_value);
    *oq1 = u ^ 0x80;
-    u = vp8_signed_char_clamp(ps1 + vp8_filter);
+    u = vp8_signed_char_clamp(ps1 + filter_value);
    *op1 = u ^ 0x80;

 }
@ -162,7 +162,7 @@ static void vp8_mbfilter(signed char mask, uc hev,
                           uc *op2, uc *op1, uc *op0, uc *oq0, uc *oq1, uc *oq2)
 {
    signed char s, u;
-    signed char vp8_filter, Filter1, Filter2;
+    signed char filter_value, Filter1, Filter2;
    signed char ps2 = (signed char) * op2 ^ 0x80;
    signed char ps1 = (signed char) * op1 ^ 0x80;
    signed char ps0 = (signed char) * op0 ^ 0x80;
@ -171,11 +171,11 @@ static void vp8_mbfilter(signed char mask, uc hev,
    signed char qs2 = (signed char) * oq2 ^ 0x80;

    /* add outer taps if we have high edge variance */
-    vp8_filter = vp8_signed_char_clamp(ps1 - qs1);
-    vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0));
-    vp8_filter &= mask;
+    filter_value = vp8_signed_char_clamp(ps1 - qs1);
+    filter_value = vp8_signed_char_clamp(filter_value + 3 * (qs0 - ps0));
+    filter_value &= mask;

-    Filter2 = vp8_filter;
+    Filter2 = filter_value;
    Filter2 &= hev;

    /* save bottom 3 bits so that we round one side +4 and the other +3 */
@ -188,8 +188,8 @@ static void vp8_mbfilter(signed char mask, uc hev,


    /* only apply wider filter if not high edge variance */
-    vp8_filter &= ~hev;
-    Filter2 = vp8_filter;
+    filter_value &= ~hev;
+    Filter2 = filter_value;

    /* roughly 3/7th difference across boundary */
    u = vp8_signed_char_clamp((63 + Filter2 * 27) >> 7);
@ -291,24 +291,24 @@ static signed char vp8_simple_filter_mask(uc blimit, uc p1, uc p0, uc q0, uc q1)

 static void vp8_simple_filter(signed char mask, uc *op1, uc *op0, uc *oq0, uc *oq1)
 {
-    signed char vp8_filter, Filter1, Filter2;
+    signed char filter_value, Filter1, Filter2;
    signed char p1 = (signed char) * op1 ^ 0x80;
    signed char p0 = (signed char) * op0 ^ 0x80;
    signed char q0 = (signed char) * oq0 ^ 0x80;
    signed char q1 = (signed char) * oq1 ^ 0x80;
    signed char u;

-    vp8_filter = vp8_signed_char_clamp(p1 - q1);
-    vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (q0 - p0));
-    vp8_filter &= mask;
+    filter_value = vp8_signed_char_clamp(p1 - q1);
+    filter_value = vp8_signed_char_clamp(filter_value + 3 * (q0 - p0));
+    filter_value &= mask;

    /* save bottom 3 bits so that we round one side +4 and the other +3 */
-    Filter1 = vp8_signed_char_clamp(vp8_filter + 4);
+    Filter1 = vp8_signed_char_clamp(filter_value + 4);
    Filter1 >>= 3;
    u = vp8_signed_char_clamp(q0 - Filter1);
    *oq0  = u ^ 0x80;

-    Filter2 = vp8_signed_char_clamp(vp8_filter + 3);
+    Filter2 = vp8_signed_char_clamp(filter_value + 3);
    Filter2 >>= 3;
    u = vp8_signed_char_clamp(p0 + Filter2);
    *op0 = u ^ 0x80;
--- a/vp8/common/reconintra.c
+++ b/vp8/common/reconintra.c
@ -36,7 +36,6 @@ void vp8_build_intra_predictors_mby_s_c(MACROBLOCKD *x,
    case DC_PRED:
    {
        int expected_dc;
-        int i;
        int shift;
        int average = 0;

@ -168,7 +167,6 @@ void vp8_build_intra_predictors_mbuv_s_c(MACROBLOCKD *x,
    {
        int expected_udc;
        int expected_vdc;
-        int i;
        int shift;
        int Uaverage = 0;
        int Vaverage = 0;
@ -217,8 +215,6 @@ void vp8_build_intra_predictors_mbuv_s_c(MACROBLOCKD *x,
    break;
    case V_PRED:
    {
-        int i;
-
        for (i = 0; i < 8; i++)
        {
            vpx_memcpy(upred_ptr, uabove_row, 8);
@ -231,8 +227,6 @@ void vp8_build_intra_predictors_mbuv_s_c(MACROBLOCKD *x,
    break;
    case H_PRED:
    {
-        int i;
-
        for (i = 0; i < 8; i++)
        {
            vpx_memset(upred_ptr, uleft_col[i], 8);
@ -245,8 +239,6 @@ void vp8_build_intra_predictors_mbuv_s_c(MACROBLOCKD *x,
    break;
    case TM_PRED:
    {
-        int i;
-
        for (i = 0; i < 8; i++)
        {
            for (j = 0; j < 8; j++)
--- a/vp8/common/rtcd_defs.sh
+++ b/vp8/common/rtcd_defs.sh
@ -444,8 +444,9 @@ vp8_short_walsh4x4_media=vp8_short_walsh4x4_armv6
 # Quantizer
 #
 prototype void vp8_regular_quantize_b "struct block *, struct blockd *"
-specialize vp8_regular_quantize_b sse2 sse4_1
-vp8_regular_quantize_b_sse4_1=vp8_regular_quantize_b_sse4
+specialize vp8_regular_quantize_b sse2 #sse4_1
+# TODO(johann) Update sse4 implementation and re-enable
+#vp8_regular_quantize_b_sse4_1=vp8_regular_quantize_b_sse4

 prototype void vp8_fast_quantize_b "struct block *, struct blockd *"
 specialize vp8_fast_quantize_b sse2 ssse3 media neon
--- a/vp8/common/vp8_asm_com_offsets.c
+++ b/vp8/common/vp8_asm_com_offsets.c
--- a/vp8/common/x86/postproc_mmx.asm
+++ b/vp8/common/x86/postproc_mmx.asm
@ -61,7 +61,7 @@ sym(vp8_mbpost_proc_down_mmx):
            mov         rcx,        8
 .init_borderd                                                    ; initialize borders
            lea         rdi,        [rdi + rax]
-            movq        [rdi],      xmm1
+            movq        [rdi],      mm1

            dec         rcx
            jne         .init_borderd
@ -193,7 +193,6 @@ sym(vp8_mbpost_proc_down_mmx):
            movq        mm4,        [sym(vp8_rv) + rcx*2]
 %endif
            paddw       mm1,        mm4
-            ;paddw     xmm1,       eight8s
            psraw       mm1,        4

            packuswb    mm1,        mm0
--- a/vp8/decoder/dboolhuff.c
+++ b/vp8/decoder/dboolhuff.c
@ -10,18 +10,20 @@


 #include "dboolhuff.h"
-#include "vpx_ports/mem.h"
-#include "vpx_mem/vpx_mem.h"

 int vp8dx_start_decode(BOOL_DECODER *br,
                       const unsigned char *source,
-                       unsigned int source_sz)
+                       unsigned int source_sz,
+                       const unsigned char *origin,
+                       const unsigned char *key)
 {
    br->user_buffer_end = source+source_sz;
    br->user_buffer     = source;
    br->value    = 0;
    br->count    = -8;
    br->range    = 255;
+    br->origin = origin;
+    br->key = key;

    if (source_sz && !source)
        return 1;
@ -32,19 +34,34 @@ int vp8dx_start_decode(BOOL_DECODER *br,
    return 0;
 }

-
 void vp8dx_bool_decoder_fill(BOOL_DECODER *br)
 {
-    const unsigned char *bufptr;
-    const unsigned char *bufend;
-    VP8_BD_VALUE         value;
-    int                  count;
-    bufend = br->user_buffer_end;
-    bufptr = br->user_buffer;
-    value = br->value;
-    count = br->count;
+    const unsigned char *bufptr = br->user_buffer;
+    const unsigned char *bufend = br->user_buffer_end;
+    VP8_BD_VALUE value = br->value;
+    int count = br->count;
+    int shift = VP8_BD_VALUE_SIZE - 8 - (count + 8);
+    size_t bits_left = (bufend - bufptr)*CHAR_BIT;
+    int x = (int)(shift + CHAR_BIT - bits_left);
+    int loop_end = 0;

-    VP8DX_BOOL_DECODER_FILL(count, value, bufptr, bufend);
+    if(x >= 0)
+    {
+        count += VP8_LOTS_OF_BITS;
+        loop_end = x;
+    }
+
+    if (x < 0 || bits_left)
+    {
+        while(shift >= loop_end)
+        {
+            count += CHAR_BIT;
+            value |= ((VP8_BD_VALUE)decrypt_byte(bufptr, br->origin,
+                                                 br->key)) << shift;
+            ++bufptr;
+            shift -= CHAR_BIT;
+        }
+    }

    br->user_buffer = bufptr;
    br->value = value;
--- a/vp8/decoder/dboolhuff.h
+++ b/vp8/decoder/dboolhuff.h
@ -9,21 +9,36 @@
 */


-#ifndef DBOOLHUFF_H
-#define DBOOLHUFF_H
+#ifndef DBOOLHUFF_H_
+#define DBOOLHUFF_H_
+
 #include <stddef.h>
 #include <limits.h>
+
 #include "vpx_config.h"
 #include "vpx_ports/mem.h"
 #include "vpx/vpx_integer.h"

 typedef size_t VP8_BD_VALUE;

-# define VP8_BD_VALUE_SIZE ((int)sizeof(VP8_BD_VALUE)*CHAR_BIT)
+#define VP8_BD_VALUE_SIZE ((int)sizeof(VP8_BD_VALUE)*CHAR_BIT)
+
 /*This is meant to be a large, positive constant that can still be efficiently
   loaded as an immediate (on platforms like ARM, for example).
  Even relatively modest values like 100 would work fine.*/
-# define VP8_LOTS_OF_BITS (0x40000000)
+#define VP8_LOTS_OF_BITS (0x40000000)
+
+static unsigned char decrypt_byte(const unsigned char *ch,
+                                  const unsigned char *origin,
+                                  const unsigned char *key)
+{
+#if CONFIG_DECRYPT
+    const int offset = (int)(ch - origin);
+    return *ch ^ key[offset % 32];  // VP8_DECRYPT_KEY_SIZE
+#else
+    return *ch;
+#endif
+}

 typedef struct
 {
@ -32,46 +47,20 @@ typedef struct
    VP8_BD_VALUE         value;
    int                  count;
    unsigned int         range;
+    const unsigned char *origin;
+    const unsigned char *key;
 } BOOL_DECODER;

 DECLARE_ALIGNED(16, extern const unsigned char, vp8_norm[256]);

 int vp8dx_start_decode(BOOL_DECODER *br,
                       const unsigned char *source,
-                       unsigned int source_sz);
+                       unsigned int source_sz,
+                       const unsigned char *origin,
+                       const unsigned char *key);

 void vp8dx_bool_decoder_fill(BOOL_DECODER *br);

-/*The refill loop is used in several places, so define it in a macro to make
-   sure they're all consistent.
-  An inline function would be cleaner, but has a significant penalty, because
-   multiple BOOL_DECODER fields must be modified, and the compiler is not smart
-   enough to eliminate the stores to those fields and the subsequent reloads
-   from them when inlining the function.*/
-#define VP8DX_BOOL_DECODER_FILL(_count,_value,_bufptr,_bufend) \
-    do \
-    { \
-        int shift = VP8_BD_VALUE_SIZE - 8 - ((_count) + 8); \
-        int loop_end, x; \
-        size_t bits_left = ((_bufend)-(_bufptr))*CHAR_BIT; \
-        \
-        x = (int)(shift + CHAR_BIT - bits_left); \
-        loop_end = 0; \
-        if(x >= 0) \
-        { \
-            (_count) += VP8_LOTS_OF_BITS; \
-            loop_end = x; \
-            if(!bits_left) break; \
-        } \
-        while(shift >= loop_end) \
-        { \
-            (_count) += CHAR_BIT; \
-            (_value) |= (VP8_BD_VALUE)*(_bufptr)++ << shift; \
-            shift -= CHAR_BIT; \
-        } \
-    } \
-    while(0) \
-

 static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) {
    unsigned int bit = 0;
@ -151,4 +140,5 @@ static int vp8dx_bool_error(BOOL_DECODER *br)
    /* No error. */
    return 0;
 }
-#endif
+
+#endif  // DBOOLHUFF_H_
--- a/vp8/decoder/decodemv.h
+++ b/vp8/decoder/decodemv.h
@ -8,7 +8,11 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

+#ifndef DECODEMV_H_
+#define DECODEMV_H_

 #include "onyxd_int.h"

 void vp8_decode_mode_mvs(VP8D_COMP *);
+
+#endif  // DECODEMV_H_
--- a/vp8/decoder/decoderthreading.h
+++ b/vp8/decoder/decoderthreading.h
@ -8,19 +8,15 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-
-
-
-
-#ifndef _DECODER_THREADING_H
-#define _DECODER_THREADING_H
+#ifndef DECODERTHREADING_H_
+#define DECODERTHREADING_H_

 #if CONFIG_MULTITHREAD
-extern void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd);
-extern void vp8_decoder_remove_threads(VP8D_COMP *pbi);
-extern void vp8_decoder_create_threads(VP8D_COMP *pbi);
-extern void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows);
-extern void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows);
+void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd);
+void vp8_decoder_remove_threads(VP8D_COMP *pbi);
+void vp8_decoder_create_threads(VP8D_COMP *pbi);
+void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows);
+void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows);
 #endif

-#endif
+#endif  // DECODERTHREADING_H_
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@ -893,7 +893,9 @@ static void setup_token_decoder(VP8D_COMP *pbi,
    {
        if (vp8dx_start_decode(bool_decoder,
                               pbi->fragments.ptrs[partition_idx],
-                               pbi->fragments.sizes[partition_idx]))
+                               pbi->fragments.sizes[partition_idx],
+                               pbi->fragments.ptrs[0],
+                               pbi->decrypt_key))
            vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,
                               "Failed to allocate bool decoder %d",
                               partition_idx);
@ -980,10 +982,11 @@ static void init_frame(VP8D_COMP *pbi)

 int vp8_decode_frame(VP8D_COMP *pbi)
 {
-    vp8_reader *const bc = & pbi->mbc[8];
-    VP8_COMMON *const pc = & pbi->common;
-    MACROBLOCKD *const xd  = & pbi->mb;
+    vp8_reader *const bc = &pbi->mbc[8];
+    VP8_COMMON *const pc = &pbi->common;
+    MACROBLOCKD *const xd  = &pbi->mb;
    const unsigned char *data = pbi->fragments.ptrs[0];
+    const unsigned char *const origin = data;
    const unsigned char *data_end =  data + pbi->fragments.sizes[0];
    ptrdiff_t first_partition_length_in_bytes;

@ -1016,13 +1019,21 @@ int vp8_decode_frame(VP8D_COMP *pbi)
    }
    else
    {
-        pc->frame_type = (FRAME_TYPE)(data[0] & 1);
-        pc->version = (data[0] >> 1) & 7;
-        pc->show_frame = (data[0] >> 4) & 1;
-        first_partition_length_in_bytes =
-            (data[0] | (data[1] << 8) | (data[2] << 16)) >> 5;
+        const unsigned char data0 = decrypt_byte(data + 0, origin,
+                                                 pbi->decrypt_key);
+        const unsigned char data1 = decrypt_byte(data + 1, origin,
+                                                 pbi->decrypt_key);
+        const unsigned char data2 = decrypt_byte(data + 2, origin,
+                                                 pbi->decrypt_key);

-        if (!pbi->ec_active && (data + first_partition_length_in_bytes > data_end
+        pc->frame_type = (FRAME_TYPE)(data0 & 1);
+        pc->version = (data0 >> 1) & 7;
+        pc->show_frame = (data0 >> 4) & 1;
+        first_partition_length_in_bytes =
+            (data0 | (data1 << 8) | (data2 << 16)) >> 5;
+
+        if (!pbi->ec_active &&
+            (data + first_partition_length_in_bytes > data_end
            || data + first_partition_length_in_bytes < data))
            vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
                               "Truncated packet or corrupt partition 0 length");
@ -1040,7 +1051,13 @@ int vp8_decode_frame(VP8D_COMP *pbi)
             */
            if (!pbi->ec_active || data + 3 < data_end)
            {
-                if (data[0] != 0x9d || data[1] != 0x01 || data[2] != 0x2a)
+                const unsigned char data0 = decrypt_byte(data + 0, origin,
+                                                         pbi->decrypt_key);
+                const unsigned char data1 = decrypt_byte(data + 1, origin,
+                                                         pbi->decrypt_key);
+                const unsigned char data2 = decrypt_byte(data + 2, origin,
+                                                         pbi->decrypt_key);
+                if (data0 != 0x9d || data1 != 0x01 || data2 != 0x2a)
                    vpx_internal_error(&pc->error, VPX_CODEC_UNSUP_BITSTREAM,
                                   "Invalid frame sync code");
            }
@ -1051,10 +1068,19 @@ int vp8_decode_frame(VP8D_COMP *pbi)
             */
            if (!pbi->ec_active || data + 6 < data_end)
            {
-                pc->Width = (data[3] | (data[4] << 8)) & 0x3fff;
-                pc->horiz_scale = data[4] >> 6;
-                pc->Height = (data[5] | (data[6] << 8)) & 0x3fff;
-                pc->vert_scale = data[6] >> 6;
+                const unsigned char data3 = decrypt_byte(data + 3, origin,
+                                                         pbi->decrypt_key);
+                const unsigned char data4 = decrypt_byte(data + 4, origin,
+                                                         pbi->decrypt_key);
+                const unsigned char data5 = decrypt_byte(data + 5, origin,
+                                                         pbi->decrypt_key);
+                const unsigned char data6 = decrypt_byte(data + 6, origin,
+                                                         pbi->decrypt_key);
+
+                pc->Width = (data3 | (data4 << 8)) & 0x3fff;
+                pc->horiz_scale = data4 >> 6;
+                pc->Height = (data5 | (data6 << 8)) & 0x3fff;
+                pc->vert_scale = data6 >> 6;
            }
            data += 7;

@ -1072,7 +1098,11 @@ int vp8_decode_frame(VP8D_COMP *pbi)

    init_frame(pbi);

-    if (vp8dx_start_decode(bc, data, (unsigned int)(data_end - data)))
+    if (vp8dx_start_decode(bc,
+                           data,
+                           (unsigned int)(data_end - data),
+                           pbi->fragments.ptrs[0],
+                           pbi->decrypt_key))
        vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
                           "Failed to allocate bool decoder 0");
    if (pc->frame_type == KEY_FRAME) {
--- a/vp8/decoder/detokenize.h
+++ b/vp8/decoder/detokenize.h
@ -8,13 +8,12 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-
-#ifndef DETOKENIZE_H
-#define DETOKENIZE_H
+#ifndef DETOKENIZE_H_
+#define DETOKENIZE_H_

 #include "onyxd_int.h"

 void vp8_reset_mb_tokens_context(MACROBLOCKD *x);
 int vp8_decode_mb_tokens(VP8D_COMP *, MACROBLOCKD *);

-#endif /* DETOKENIZE_H */
+#endif  // DETOKENIZE_H_
--- a/vp8/decoder/ec_types.h
+++ b/vp8/decoder/ec_types.h
@ -14,7 +14,6 @@
 #define MAX_OVERLAPS 16


-
 /* The area (pixel area in Q6) the block pointed to by bmi overlaps
 * another block with.
 */
@ -48,4 +47,4 @@ typedef struct
    MV_REFERENCE_FRAME ref_frame;
 } EC_BLOCK;

-#endif /* VP8_DEC_EC_TYPES_H */
+#endif  // VP8_DEC_EC_TYPES_H
--- a/vp8/decoder/error_concealment.c
+++ b/vp8/decoder/error_concealment.c
@ -8,14 +8,14 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

+#include <assert.h>
+
 #include "error_concealment.h"
 #include "onyxd_int.h"
 #include "decodemv.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vp8/common/findnearmv.h"

-#include <assert.h>
-
 #define MIN(x,y) (((x)<(y))?(x):(y))
 #define MAX(x,y) (((x)>(y))?(x):(y))

--- a/vp8/decoder/error_concealment.h
+++ b/vp8/decoder/error_concealment.h
@ -9,8 +9,8 @@
 */


-#ifndef ERROR_CONCEALMENT_H
-#define ERROR_CONCEALMENT_H
+#ifndef ERROR_CONCEALMENT_H_
+#define ERROR_CONCEALMENT_H_

 #include "onyxd_int.h"
 #include "ec_types.h"
@ -38,4 +38,4 @@ void vp8_interpolate_motion(MACROBLOCKD *mb,
 */
 void vp8_conceal_corrupt_mb(MACROBLOCKD *xd);

-#endif
+#endif  // ERROR_CONCEALMENT_H_
--- a/vp8/decoder/onyxd_int.h
+++ b/vp8/decoder/onyxd_int.h
@ -9,8 +9,9 @@
 */


-#ifndef __INC_VP8D_INT_H
-#define __INC_VP8D_INT_H
+#ifndef ONYXD_INT_H_
+#define ONYXD_INT_H_
+
 #include "vpx_config.h"
 #include "vp8/common/onyxd.h"
 #include "treereader.h"
@ -121,6 +122,7 @@ typedef struct VP8D_COMP
    int independent_partitions;
    int frame_corrupt_residual;

+    const unsigned char *decrypt_key;
 } VP8D_COMP;

 int vp8_decode_frame(VP8D_COMP *cpi);
@ -145,4 +147,4 @@ int vp8_remove_decoder_instances(struct frame_buffers *fb);
    } while(0)
 #endif

-#endif
+#endif  // ONYXD_INT_H_
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@ -36,7 +36,7 @@
 } while (0)


-extern void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd);
+void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd);

 static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count)
 {
--- a/vp8/decoder/treereader.h
+++ b/vp8/decoder/treereader.h
@ -9,18 +9,17 @@
 */


-#ifndef tree_reader_h
-#define tree_reader_h 1
+#ifndef TREEREADER_H_
+#define TREEREADER_H_

 #include "vp8/common/treecoder.h"
-
 #include "dboolhuff.h"

 typedef BOOL_DECODER vp8_reader;

 #define vp8_read vp8dx_decode_bool
 #define vp8_read_literal vp8_decode_value
-#define vp8_read_bit( R) vp8_read( R, vp8_prob_half)
+#define vp8_read_bit(R) vp8_read(R, vp8_prob_half)


 /* Intent of tree data structure is to make decoding trivial. */
@ -38,4 +37,4 @@ static int vp8_treed_read(
    return -i;
 }

-#endif /* tree_reader_h */
+#endif  // TREEREADER_H_
--- a/vp8/decoder/vp8_asm_dec_offsets.c
+++ b/vp8/decoder/vp8_asm_dec_offsets.c
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@ -980,6 +980,12 @@ void vp8_calc_ref_frame_costs(int *ref_frame_cost,
                              int prob_garf
                             )
 {
+    assert(prob_intra >= 0);
+    assert(prob_intra <= 255);
+    assert(prob_last >= 0);
+    assert(prob_last <= 255);
+    assert(prob_garf >= 0);
+    assert(prob_garf <= 255);
    ref_frame_cost[INTRA_FRAME]   = vp8_cost_zero(prob_intra);
    ref_frame_cost[LAST_FRAME]    = vp8_cost_one(prob_intra)
                                    + vp8_cost_zero(prob_last);
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@ -37,7 +37,7 @@ typedef struct block
    /* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */
    short *quant;
    short *quant_fast;
-    unsigned char *quant_shift;
+    short *quant_shift;
    short *zbin;
    short *zrun_zbin_boost;
    short *round;
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@ -641,7 +641,6 @@ void vp8_set_speed_features(VP8_COMP *cpi)
    for (i = 0; i < MAX_MODES; i ++)
    {
        cpi->mode_check_freq[i] = 0;
-        cpi->mode_chosen_counts[i] = 0;
    }

    cpi->mb.mbs_tested_so_far = 0;
@ -2816,6 +2815,8 @@ static void update_rd_ref_frame_probs(VP8_COMP *cpi)
        if (cpi->common.refresh_alt_ref_frame)
        {
            cpi->prob_intra_coded += 40;
+            if (cpi->prob_intra_coded > 255)
+                cpi->prob_intra_coded = 255;
            cpi->prob_last_coded = 200;
            cpi->prob_gf_coded = 1;
        }
@ -4598,9 +4599,6 @@ static void encode_frame_to_data_rate
                        cm->frame_type, cm->refresh_golden_frame,
                        cm->refresh_alt_ref_frame);

-            for (i = 0; i < MAX_MODES; i++)
-                fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]);
-
            fprintf(fmodes, "\n");

            fclose(fmodes);
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@ -282,17 +282,17 @@ typedef struct VP8_COMP
 {

    DECLARE_ALIGNED(16, short, Y1quant[QINDEX_RANGE][16]);
-    DECLARE_ALIGNED(16, unsigned char, Y1quant_shift[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, Y1quant_shift[QINDEX_RANGE][16]);
    DECLARE_ALIGNED(16, short, Y1zbin[QINDEX_RANGE][16]);
    DECLARE_ALIGNED(16, short, Y1round[QINDEX_RANGE][16]);

    DECLARE_ALIGNED(16, short, Y2quant[QINDEX_RANGE][16]);
-    DECLARE_ALIGNED(16, unsigned char, Y2quant_shift[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, Y2quant_shift[QINDEX_RANGE][16]);
    DECLARE_ALIGNED(16, short, Y2zbin[QINDEX_RANGE][16]);
    DECLARE_ALIGNED(16, short, Y2round[QINDEX_RANGE][16]);

    DECLARE_ALIGNED(16, short, UVquant[QINDEX_RANGE][16]);
-    DECLARE_ALIGNED(16, unsigned char, UVquant_shift[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, UVquant_shift[QINDEX_RANGE][16]);
    DECLARE_ALIGNED(16, short, UVzbin[QINDEX_RANGE][16]);
    DECLARE_ALIGNED(16, short, UVround[QINDEX_RANGE][16]);

@ -349,7 +349,6 @@ typedef struct VP8_COMP
    int ambient_err;

    unsigned int mode_check_freq[MAX_MODES];
-    unsigned int mode_chosen_counts[MAX_MODES];

    int rd_baseline_thresh[MAX_MODES];

--- a/vp8/encoder/quantize.c
+++ b/vp8/encoder/quantize.c
@ -50,8 +50,8 @@ void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
        if (x >= zbin)
        {
            x += round_ptr[rc];
-            y  = (((x * quant_ptr[rc]) >> 16) + x)
-                 >> quant_shift_ptr[rc];             /* quantize (x) */
+            y  = ((((x * quant_ptr[rc]) >> 16) + x)
+                 * quant_shift_ptr[rc]) >> 16;       /* quantize (x) */
            x  = (y ^ sz) - sz;                      /* get the sign back */
            qcoeff_ptr[rc] = x;                      /* write to destination */
            dqcoeff_ptr[rc] = x * dequant_ptr[rc];   /* dequantized value */
@ -113,7 +113,7 @@ void vp8_regular_quantize_b_c(BLOCK *b, BLOCKD *d)
    short *zbin_ptr        = b->zbin;
    short *round_ptr       = b->round;
    short *quant_ptr       = b->quant;
-    unsigned char *quant_shift_ptr = b->quant_shift;
+    short *quant_shift_ptr = b->quant_shift;
    short *qcoeff_ptr      = d->qcoeff;
    short *dqcoeff_ptr     = d->dqcoeff;
    short *dequant_ptr     = d->dequant;
@ -138,8 +138,8 @@ void vp8_regular_quantize_b_c(BLOCK *b, BLOCKD *d)
        if (x >= zbin)
        {
            x += round_ptr[rc];
-            y  = (((x * quant_ptr[rc]) >> 16) + x)
-                 >> quant_shift_ptr[rc];             /* quantize (x) */
+            y  = ((((x * quant_ptr[rc]) >> 16) + x)
+                 * quant_shift_ptr[rc]) >> 16;       /* quantize (x) */
            x  = (y ^ sz) - sz;                      /* get the sign back */
            qcoeff_ptr[rc]  = x;                     /* write to destination */
            dqcoeff_ptr[rc] = x * dequant_ptr[rc];   /* dequantized value */
@ -167,7 +167,7 @@ void vp8_strict_quantize_b_c(BLOCK *b, BLOCKD *d)
    int sz;
    short *coeff_ptr;
    short *quant_ptr;
-    unsigned char *quant_shift_ptr;
+    short *quant_shift_ptr;
    short *qcoeff_ptr;
    short *dqcoeff_ptr;
    short *dequant_ptr;
@ -198,7 +198,7 @@ void vp8_strict_quantize_b_c(BLOCK *b, BLOCKD *d)
        if (x >= dq)
        {
            /* Quantize x. */
-            y  = (((x * quant_ptr[rc]) >> 16) + x) >> quant_shift_ptr[rc];
+            y  = ((((x * quant_ptr[rc]) >> 16) + x) * quant_shift_ptr[rc]) >> 16;
            /* Put the sign back. */
            x = (y + sz) ^ sz;
            /* Save the coefficient and its dequantized value. */
@ -406,7 +406,7 @@ static const int qzbin_factors_y2[129] =
 #define EXACT_QUANT
 #ifdef EXACT_QUANT
 static void invert_quant(int improved_quant, short *quant,
-                               unsigned char *shift, short d)
+                         short *shift, short d)
 {
    if(improved_quant)
    {
@ -418,11 +418,15 @@ static void invert_quant(int improved_quant, short *quant,
        t = 1 + (1<<(16+l))/d;
        *quant = (short)(t - (1<<16));
        *shift = l;
+        /* use multiplication and constant shift by 16 */
+        *shift = 1 << (16 - *shift);
    }
    else
    {
        *quant = (1 << 16) / d;
        *shift = 0;
+        /* use multiplication and constant shift by 16 */
+        *shift = 1 << (16 - *shift);
    }
 }

--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@ -2512,9 +2512,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
                x->rd_thresh_mult[best_mode_index];
    }

-    /* Note how often each mode chosen as best */
-    cpi->mode_chosen_counts[best_mode_index] ++;
-
 #if CONFIG_TEMPORAL_DENOISING
    if (cpi->oxcf.noise_sensitivity)
    {
--- a/vp8/encoder/vp8_asm_enc_offsets.c
+++ b/vp8/encoder/vp8_asm_enc_offsets.c
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ b/vp8/encoder/x86/quantize_sse2.asm
@ -1,245 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-%include "vp8_asm_enc_offsets.asm"
-
-
-; void vp8_regular_quantize_b_sse2 | arg
-;  (BLOCK  *b,                     |  0
-;   BLOCKD *d)                     |  1
-
-global sym(vp8_regular_quantize_b_sse2) PRIVATE
-sym(vp8_regular_quantize_b_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SAVE_XMM 7
-    GET_GOT     rbx
-
-%if ABI_IS_32BIT
-    push        rdi
-    push        rsi
-%else
-  %if LIBVPX_YASM_WIN64
-    push        rdi
-    push        rsi
-  %endif
-%endif
-
-    ALIGN_STACK 16, rax
-    %define zrun_zbin_boost   0  ;  8
-    %define abs_minus_zbin    8  ; 32
-    %define temp_qcoeff       40 ; 32
-    %define qcoeff            72 ; 32
-    %define stack_size        104
-    sub         rsp, stack_size
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %if LIBVPX_YASM_WIN64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov         rdi, rdi                    ; BLOCK *b
-    ;mov         rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rdx, [rdi + vp8_block_coeff] ; coeff_ptr
-    mov         rcx, [rdi + vp8_block_zbin] ; zbin_ptr
-    movd        xmm7, [rdi + vp8_block_zbin_extra] ; zbin_oq_value
-
-    ; z
-    movdqa      xmm0, [rdx]
-    movdqa      xmm4, [rdx + 16]
-    mov         rdx, [rdi + vp8_block_round] ; round_ptr
-
-    pshuflw     xmm7, xmm7, 0
-    punpcklwd   xmm7, xmm7                  ; duplicated zbin_oq_value
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm5, xmm4
-
-    ; sz
-    psraw       xmm0, 15
-    psraw       xmm4, 15
-
-    ; (z ^ sz)
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-
-    ; x = abs(z)
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    movdqa      xmm2, [rcx]
-    movdqa      xmm3, [rcx + 16]
-    mov         rcx, [rdi + vp8_block_quant] ; quant_ptr
-
-    ; *zbin_ptr + zbin_oq_value
-    paddw       xmm2, xmm7
-    paddw       xmm3, xmm7
-
-    ; x - (*zbin_ptr + zbin_oq_value)
-    psubw       xmm1, xmm2
-    psubw       xmm5, xmm3
-    movdqa      [rsp + abs_minus_zbin], xmm1
-    movdqa      [rsp + abs_minus_zbin + 16], xmm5
-
-    ; add (zbin_ptr + zbin_oq_value) back
-    paddw       xmm1, xmm2
-    paddw       xmm5, xmm3
-
-    movdqa      xmm2, [rdx]
-    movdqa      xmm6, [rdx + 16]
-
-    movdqa      xmm3, [rcx]
-    movdqa      xmm7, [rcx + 16]
-
-    ; x + round
-    paddw       xmm1, xmm2
-    paddw       xmm5, xmm6
-
-    ; y = x * quant_ptr >> 16
-    pmulhw      xmm3, xmm1
-    pmulhw      xmm7, xmm5
-
-    ; y += x
-    paddw       xmm1, xmm3
-    paddw       xmm5, xmm7
-
-    movdqa      [rsp + temp_qcoeff], xmm1
-    movdqa      [rsp + temp_qcoeff + 16], xmm5
-
-    pxor        xmm6, xmm6
-    ; zero qcoeff
-    movdqa      [rsp + qcoeff], xmm6
-    movdqa      [rsp + qcoeff + 16], xmm6
-
-    mov         rdx, [rdi + vp8_block_zrun_zbin_boost] ; zbin_boost_ptr
-    mov         rax, [rdi + vp8_block_quant_shift] ; quant_shift_ptr
-    mov         [rsp + zrun_zbin_boost], rdx
-
-%macro ZIGZAG_LOOP 1
-    ; x
-    movsx       ecx, WORD PTR[rsp + abs_minus_zbin + %1 * 2]
-
-    ; if (x >= zbin)
-    sub         cx, WORD PTR[rdx]           ; x - zbin
-    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
-    jl          .rq_zigzag_loop_%1           ; x < zbin
-
-    movsx       edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]
-
-    ; downshift by quant_shift[rc]
-    movsx       cx, BYTE PTR[rax + %1]      ; quant_shift_ptr[rc]
-    sar         edi, cl                     ; also sets Z bit
-    je          .rq_zigzag_loop_%1           ; !y
-    mov         WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]
-    mov         rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost
-.rq_zigzag_loop_%1:
-%endmacro
-; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
-ZIGZAG_LOOP  0
-ZIGZAG_LOOP  1
-ZIGZAG_LOOP  4
-ZIGZAG_LOOP  8
-ZIGZAG_LOOP  5
-ZIGZAG_LOOP  2
-ZIGZAG_LOOP  3
-ZIGZAG_LOOP  6
-ZIGZAG_LOOP  9
-ZIGZAG_LOOP 12
-ZIGZAG_LOOP 13
-ZIGZAG_LOOP 10
-ZIGZAG_LOOP  7
-ZIGZAG_LOOP 11
-ZIGZAG_LOOP 14
-ZIGZAG_LOOP 15
-
-    movdqa      xmm2, [rsp + qcoeff]
-    movdqa      xmm3, [rsp + qcoeff + 16]
-
-    mov         rcx, [rsi + vp8_blockd_dequant] ; dequant_ptr
-    mov         rdi, [rsi + vp8_blockd_dqcoeff] ; dqcoeff_ptr
-
-    ; y ^ sz
-    pxor        xmm2, xmm0
-    pxor        xmm3, xmm4
-    ; x = (y ^ sz) - sz
-    psubw       xmm2, xmm0
-    psubw       xmm3, xmm4
-
-    ; dequant
-    movdqa      xmm0, [rcx]
-    movdqa      xmm1, [rcx + 16]
-
-    mov         rcx, [rsi + vp8_blockd_qcoeff] ; qcoeff_ptr
-
-    pmullw      xmm0, xmm2
-    pmullw      xmm1, xmm3
-
-    movdqa      [rcx], xmm2        ; store qcoeff
-    movdqa      [rcx + 16], xmm3
-    movdqa      [rdi], xmm0        ; store dqcoeff
-    movdqa      [rdi + 16], xmm1
-
-    mov         rcx, [rsi + vp8_blockd_eob]
-
-    ; select the last value (in zig_zag order) for EOB
-    pcmpeqw     xmm2, xmm6
-    pcmpeqw     xmm3, xmm6
-    ; !
-    pcmpeqw     xmm6, xmm6
-    pxor        xmm2, xmm6
-    pxor        xmm3, xmm6
-    ; mask inv_zig_zag
-    pand        xmm2, [GLOBAL(inv_zig_zag)]
-    pand        xmm3, [GLOBAL(inv_zig_zag + 16)]
-    ; select the max value
-    pmaxsw      xmm2, xmm3
-    pshufd      xmm3, xmm2, 00001110b
-    pmaxsw      xmm2, xmm3
-    pshuflw     xmm3, xmm2, 00001110b
-    pmaxsw      xmm2, xmm3
-    pshuflw     xmm3, xmm2, 00000001b
-    pmaxsw      xmm2, xmm3
-    movd        eax, xmm2
-    and         eax, 0xff
-
-    mov         BYTE PTR [rcx], al          ; store eob
-
-    ; begin epilog
-    add         rsp, stack_size
-    pop         rsp
-%if ABI_IS_32BIT
-    pop         rsi
-    pop         rdi
-%else
-  %if LIBVPX_YASM_WIN64
-    pop         rsi
-    pop         rdi
-  %endif
-%endif
-    RESTORE_GOT
-    RESTORE_XMM
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-inv_zig_zag:
-  dw 0x0001, 0x0002, 0x0006, 0x0007
-  dw 0x0003, 0x0005, 0x0008, 0x000d
-  dw 0x0004, 0x0009, 0x000c, 0x000e
-  dw 0x000a, 0x000b, 0x000f, 0x0010
--- a/vp8/encoder/x86/quantize_sse2_intrinsics.c
+++ b/vp8/encoder/x86/quantize_sse2_intrinsics.c
@ -9,13 +9,139 @@
 */


-#include "vp8/common/blockd.h"
-#include "vp8/common/entropy.h"
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vpx_ports/x86.h"
+#include "vpx_mem/vpx_mem.h"
 #include "vp8/encoder/block.h"
+#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */

-#include <mmintrin.h> //MMX
-#include <xmmintrin.h> //SSE
-#include <emmintrin.h> //SSE2
+#include <mmintrin.h> /* MMX */
+#include <xmmintrin.h> /* SSE */
+#include <emmintrin.h> /* SSE2 */
+/* SELECT_EOB(i, z): decide whether the coefficient at array index z is kept.
+ *
+ * i is the value stored to eob when the coefficient is kept; the caller in
+ * this file passes the 1-based position of z in its scan sequence.
+ *
+ * Expects these names in the enclosing scope: x[] and y[] (per-coefficient
+ * values prepared by the caller), qcoeff_ptr, zbin_boost_ptr, eob and b.
+ *
+ * The current boost value is read and zbin_boost_ptr is advanced. If
+ * x[z] < boost or y[z] == 0 the coefficient is skipped and qcoeff_ptr[z]
+ * is left untouched. Otherwise y[z] is stored to qcoeff_ptr[z], eob becomes
+ * i, and zbin_boost_ptr is rewound to b->zrun_zbin_boost.
+ *
+ * The two tests are combined with a bitwise OR, so both are always
+ * evaluated. The skip label is built from i by token pasting, so each
+ * value of i can be used only once per function.
+ */
+#define SELECT_EOB(i, z) \
+    do { \
+        short boost = *zbin_boost_ptr; \
+        int cmp = (x[z] < boost) | (y[z] == 0); \
+        zbin_boost_ptr++; \
+        if (cmp) \
+            goto select_eob_end_##i; \
+        qcoeff_ptr[z] = y[z]; \
+        eob = i; \
+        zbin_boost_ptr = b->zrun_zbin_boost; \
+        select_eob_end_##i:; \
+    } while (0)
+/* Quantize the 16 coefficients of one block using SSE2 intrinsics.
+ *
+ * Reads b->coeff, b->zbin, b->zbin_extra, b->round, b->quant,
+ * b->quant_shift, b->zrun_zbin_boost and d->dequant; writes d->qcoeff,
+ * d->dqcoeff and *d->eob.
+ *
+ * The vector section computes, for all 16 coefficients at once,
+ * abs(coeff) - (zbin[] + zbin_extra) into x[] and the signed quantized
+ * candidate into y[]. The SELECT_EOB sequence then visits the coefficients
+ * one at a time and decides which candidates are copied to d->qcoeff.
+ *
+ * b->coeff, b->zbin, b->round, b->quant, b->quant_shift, d->dequant,
+ * d->qcoeff and d->dqcoeff are accessed with _mm_load_si128 or
+ * _mm_store_si128, which require 16-byte aligned addresses.
+ */
+void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d)
+{
+    char eob = 0; /* 1-based scan position of the last kept coefficient */
+    short *zbin_boost_ptr  = b->zrun_zbin_boost;
+    short *qcoeff_ptr      = d->qcoeff;
+    DECLARE_ALIGNED_ARRAY(16, short, x, 16); /* abs(z) - zbin[] - extra */
+    DECLARE_ALIGNED_ARRAY(16, short, y, 16); /* signed candidates */
+
+    __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1;
+    __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));
+    __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));
+    __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
+    __m128i z1 = _mm_load_si128((__m128i *)(b->coeff+8));
+    __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra);
+    __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin));
+    __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8));
+    __m128i round0 = _mm_load_si128((__m128i *)(b->round));
+    __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
+    __m128i quant0 = _mm_load_si128((__m128i *)(b->quant));
+    __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8));
+    __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
+    __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
+
+    vpx_memset(qcoeff_ptr, 0, 32); /* 32 bytes: the 16 short outputs */
+
+    /* Duplicate to all lanes. */
+    zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);
+    zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra);
+
+    /* Sign of z: z >> 15 */
+    sz0 = _mm_srai_epi16(z0, 15);
+    sz1 = _mm_srai_epi16(z1, 15);
+
+    /* x = abs(z): (z ^ sz) - sz */
+    x0 = _mm_xor_si128(z0, sz0);
+    x1 = _mm_xor_si128(z1, sz1);
+    x0 = _mm_sub_epi16(x0, sz0);
+    x1 = _mm_sub_epi16(x1, sz1);
+
+    /* zbin[] + zbin_extra */
+    zbin0 = _mm_add_epi16(zbin0, zbin_extra);
+    zbin1 = _mm_add_epi16(zbin1, zbin_extra);
+
+    /* In C x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance
+     * the equation because boost is the only value which can change:
+     * x - (zbin[] + extra) >= boost */
+    x_minus_zbin0 = _mm_sub_epi16(x0, zbin0);
+    x_minus_zbin1 = _mm_sub_epi16(x1, zbin1);
+
+    _mm_store_si128((__m128i *)(x), x_minus_zbin0);
+    _mm_store_si128((__m128i *)(x + 8), x_minus_zbin1);
+
+    /* All the remaining calculations are valid whether they are done now with
+     * simd or later inside the loop one at a time. */
+    x0 = _mm_add_epi16(x0, round0);
+    x1 = _mm_add_epi16(x1, round1);
+
+    y0 = _mm_mulhi_epi16(x0, quant0);
+    y1 = _mm_mulhi_epi16(x1, quant1);
+
+    y0 = _mm_add_epi16(y0, x0);
+    y1 = _mm_add_epi16(y1, x1);
+
+    /* Instead of shifting each value independently we convert the scaling
+     * factor with 1 << (16 - shift) so we can use multiply/return high half. */
+    y0 = _mm_mulhi_epi16(y0, quant_shift0);
+    y1 = _mm_mulhi_epi16(y1, quant_shift1);
+
+    /* Return the sign: (y ^ sz) - sz */
+    y0 = _mm_xor_si128(y0, sz0);
+    y1 = _mm_xor_si128(y1, sz1);
+    y0 = _mm_sub_epi16(y0, sz0);
+    y1 = _mm_sub_epi16(y1, sz1);
+
+    _mm_store_si128((__m128i *)(y), y0);
+    _mm_store_si128((__m128i *)(y + 8), y1);
+
+    zbin_boost_ptr = b->zrun_zbin_boost; /* same value as its initializer */
+
+    /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup.
+     * Arguments are (1-based scan position, index into x[]/y[]/qcoeff). */
+    SELECT_EOB(1, 0);
+    SELECT_EOB(2, 1);
+    SELECT_EOB(3, 4);
+    SELECT_EOB(4, 8);
+    SELECT_EOB(5, 5);
+    SELECT_EOB(6, 2);
+    SELECT_EOB(7, 3);
+    SELECT_EOB(8, 6);
+    SELECT_EOB(9, 9);
+    SELECT_EOB(10, 12);
+    SELECT_EOB(11, 13);
+    SELECT_EOB(12, 10);
+    SELECT_EOB(13, 7);
+    SELECT_EOB(14, 11);
+    SELECT_EOB(15, 14);
+    SELECT_EOB(16, 15);
+
+    y0 = _mm_load_si128((__m128i *)(d->qcoeff));
+    y1 = _mm_load_si128((__m128i *)(d->qcoeff + 8));
+
+    /* dqcoeff = qcoeff * dequant
+     * y0/y1 were just re-read from d->qcoeff, so positions skipped by
+     * SELECT_EOB are still zero from the memset above. */
+    y0 = _mm_mullo_epi16(y0, dequant0);
+    y1 = _mm_mullo_epi16(y1, dequant1);
+
+    _mm_store_si128((__m128i *)(d->dqcoeff), y0);
+    _mm_store_si128((__m128i *)(d->dqcoeff + 8), y1);
+
+    *d->eob = eob;
+}

 void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
 {
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@ -14,7 +14,6 @@ VP8_COMMON_SRCS-yes += common/ppflags.h
 VP8_COMMON_SRCS-yes += common/onyx.h
 VP8_COMMON_SRCS-yes += common/onyxd.h
 VP8_COMMON_SRCS-yes += common/alloccommon.c
-VP8_COMMON_SRCS-yes += common/asm_com_offsets.c
 VP8_COMMON_SRCS-yes += common/blockd.c
 VP8_COMMON_SRCS-yes += common/coefupdateprobs.h
 VP8_COMMON_SRCS-yes += common/debugmodes.c
@ -67,6 +66,7 @@ VP8_COMMON_SRCS-yes += common/setupintrarecon.c
 VP8_COMMON_SRCS-yes += common/swapyv12buffer.c
 VP8_COMMON_SRCS-yes += common/variance_c.c
 VP8_COMMON_SRCS-yes += common/variance.h
+VP8_COMMON_SRCS-yes += common/vp8_asm_com_offsets.c
 VP8_COMMON_SRCS-yes += common/vp8_entropymodedata.h


@ -193,6 +193,6 @@ VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/vp8_subpixelvariance16x16_neon$
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/vp8_subpixelvariance16x16s_neon$(ASM)

 $(eval $(call asm_offsets_template,\
-         vp8_asm_com_offsets.asm, $(VP8_PREFIX)common/asm_com_offsets.c))
+         vp8_asm_com_offsets.asm, $(VP8_PREFIX)common/vp8_asm_com_offsets.c))

 $(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.sh))
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@ -29,6 +29,8 @@
 #define VP8_CAP_ERROR_CONCEALMENT (CONFIG_ERROR_CONCEALMENT ? \
                                    VPX_CODEC_CAP_ERROR_CONCEALMENT : 0)

+#define VP8_DECRYPT_KEY_SIZE 32
+
 typedef vpx_codec_stream_info_t  vp8_stream_info_t;

 /* Structures for handling memory allocations */
@ -73,6 +75,7 @@ struct vpx_codec_alg_priv
    int                     dbg_color_b_modes_flag;
    int                     dbg_display_mv_flag;
 #endif
+    unsigned char           decrypt_key[VP8_DECRYPT_KEY_SIZE];
    vpx_image_t             img;
    int                     img_setup;
    struct frame_buffers    yv12_frame_buffers;
@ -150,6 +153,8 @@ static vpx_codec_err_t vp8_validate_mmaps(const vp8_stream_info_t *si,
    return res;
 }

+static const unsigned char fake_decrypt_key[VP8_DECRYPT_KEY_SIZE] = { 0 };
+
 static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap)
 {
    int i;
@ -164,6 +169,8 @@ static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap)

    ctx->priv->alg_priv->mmaps[0] = *mmap;
    ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si);
+    memcpy(ctx->priv->alg_priv->decrypt_key, fake_decrypt_key,
+           VP8_DECRYPT_KEY_SIZE);
    ctx->priv->init_flags = ctx->init_flags;

    if (ctx->config.dec)
@ -211,21 +218,19 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx,
        mmap.flags = vp8_mem_req_segs[0].flags;

        res = vp8_mmap_alloc(&mmap);
+        if (res != VPX_CODEC_OK) return res;

-        if (!res)
-        {
-            vp8_init_ctx(ctx, &mmap);
+        vp8_init_ctx(ctx, &mmap);

-            /* initialize number of fragments to zero */
-            ctx->priv->alg_priv->fragments.count = 0;
-            /* is input fragments enabled? */
-            ctx->priv->alg_priv->fragments.enabled =
-                    (ctx->priv->alg_priv->base.init_flags &
-                        VPX_CODEC_USE_INPUT_FRAGMENTS);
+        /* initialize number of fragments to zero */
+        ctx->priv->alg_priv->fragments.count = 0;
+        /* is input fragments enabled? */
+        ctx->priv->alg_priv->fragments.enabled =
+                (ctx->priv->alg_priv->base.init_flags &
+                    VPX_CODEC_USE_INPUT_FRAGMENTS);

-            ctx->priv->alg_priv->defer_alloc = 1;
-            /*post processing level initialized to do nothing */
-        }
+        ctx->priv->alg_priv->defer_alloc = 1;
+        /*post processing level initialized to do nothing */
    }

    ctx->priv->alg_priv->yv12_frame_buffers.use_frame_threads =
@ -264,14 +269,17 @@ static vpx_codec_err_t vp8_destroy(vpx_codec_alg_priv_t *ctx)
    return VPX_CODEC_OK;
 }

-static vpx_codec_err_t vp8_peek_si(const uint8_t         *data,
-                                   unsigned int           data_sz,
-                                   vpx_codec_stream_info_t *si)
+static vpx_codec_err_t vp8_peek_si_external(const uint8_t         *data,
+                                            unsigned int           data_sz,
+                                            vpx_codec_stream_info_t *si,
+                                            const unsigned char *decrypt_key)
 {
    vpx_codec_err_t res = VPX_CODEC_OK;

    if(data + data_sz <= data)
+    {
        res = VPX_CODEC_INVALID_PARAM;
+    }
    else
    {
        /* Parse uncompresssed part of key frame header.
@ -280,30 +288,45 @@ static vpx_codec_err_t vp8_peek_si(const uint8_t         *data,
         * 4 bytes:- including image width and height in the lowest 14 bits
         *           of each 2-byte value.
         */
-        si->is_kf = 0;

-        if (data_sz >= 10 && !(data[0] & 0x01))  /* I-Frame */
+        const uint8_t data0 = decrypt_byte(data, data, decrypt_key);
+        si->is_kf = 0;
+        if (data_sz >= 10 && !(data0 & 0x01))  /* I-Frame */
        {
-            const uint8_t *c = data + 3;
+            const uint8_t data3 = decrypt_byte(data + 3, data, decrypt_key);
+            const uint8_t data4 = decrypt_byte(data + 4, data, decrypt_key);
+            const uint8_t data5 = decrypt_byte(data + 5, data, decrypt_key);
+            const uint8_t data6 = decrypt_byte(data + 6, data, decrypt_key);
+            const uint8_t data7 = decrypt_byte(data + 7, data, decrypt_key);
+            const uint8_t data8 = decrypt_byte(data + 8, data, decrypt_key);
+            const uint8_t data9 = decrypt_byte(data + 9, data, decrypt_key);
+
            si->is_kf = 1;

            /* vet via sync code */
-            if (c[0] != 0x9d || c[1] != 0x01 || c[2] != 0x2a)
+            if (data3 != 0x9d || data4 != 0x01 || data5 != 0x2a)
                res = VPX_CODEC_UNSUP_BITSTREAM;

-            si->w = (c[3] | (c[4] << 8)) & 0x3fff;
-            si->h = (c[5] | (c[6] << 8)) & 0x3fff;
+            si->w = (data6 | (data7 << 8)) & 0x3fff;
+            si->h = (data8 | (data9 << 8)) & 0x3fff;

            /*printf("w=%d, h=%d\n", si->w, si->h);*/
            if (!(si->h | si->w))
                res = VPX_CODEC_UNSUP_BITSTREAM;
        }
        else
+        {
            res = VPX_CODEC_UNSUP_BITSTREAM;
+        }
    }

    return res;
+}

+static vpx_codec_err_t vp8_peek_si(const uint8_t *data,
+                                   unsigned int data_sz,
+                                   vpx_codec_stream_info_t *si) {
+    /* Forwards to vp8_peek_si_external() with fake_decrypt_key, the
+     * zero-filled key defined in this file, and returns its result
+     * unchanged.
+     * NOTE(review): presumably a zero key makes decrypt_byte() a no-op;
+     * confirm against its definition. */
+    return vp8_peek_si_external(data, data_sz, si, fake_decrypt_key);
 }

 static vpx_codec_err_t vp8_get_si(vpx_codec_alg_priv_t    *ctx,
@ -432,8 +455,10 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t  *ctx,
    w = ctx->si.w;
    h = ctx->si.h;

-    res = ctx->base.iface->dec.peek_si(ctx->fragments.ptrs[0],
-                                       ctx->fragments.sizes[0], &ctx->si);
+    res = vp8_peek_si_external(ctx->fragments.ptrs[0],
+                               ctx->fragments.sizes[0],
+                               &ctx->si,
+                               ctx->decrypt_key);

    if((res == VPX_CODEC_UNSUP_BITSTREAM) && !ctx->si.is_kf)
    {
@ -507,6 +532,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t  *ctx,
            }

            res = vp8_create_decoder_instances(&ctx->yv12_frame_buffers, &oxcf);
+            ctx->yv12_frame_buffers.pbi[0]->decrypt_key = ctx->decrypt_key;
        }

        ctx->decoder_init = 1;
@ -928,6 +954,20 @@ static vpx_codec_err_t vp8_get_frame_corrupted(vpx_codec_alg_priv_t *ctx,

 }

+/* Control handler registered for VP8_SET_DECRYPT_KEY.
+ *
+ * Takes one argument from args: a const unsigned char * pointing at the
+ * key. A NULL pointer is rejected with VPX_CODEC_INVALID_PARAM. Otherwise
+ * exactly VP8_DECRYPT_KEY_SIZE bytes are copied into ctx->decrypt_key and
+ * VPX_CODEC_OK is returned, so the caller's buffer must hold at least that
+ * many readable bytes. ctr_id is not used.
+ */
+static vpx_codec_err_t vp8_set_decrypt_key(vpx_codec_alg_priv_t *ctx,
+                                           int ctr_id,
+                                           va_list args)
+{
+    const unsigned char *data = va_arg(args, const unsigned char *);
+    if (data == NULL) {
+        return VPX_CODEC_INVALID_PARAM;
+    }
+
+    memcpy(ctx->decrypt_key, data, VP8_DECRYPT_KEY_SIZE);
+    return VPX_CODEC_OK;
+}
+
 vpx_codec_ctrl_fn_map_t vp8_ctf_maps[] =
 {
    {VP8_SET_REFERENCE,             vp8_set_reference},
@ -940,6 +980,7 @@ vpx_codec_ctrl_fn_map_t vp8_ctf_maps[] =
    {VP8D_GET_LAST_REF_UPDATES,     vp8_get_last_ref_updates},
    {VP8D_GET_FRAME_CORRUPTED,      vp8_get_frame_corrupted},
    {VP8D_GET_LAST_REF_USED,        vp8_get_last_ref_frame},
+    {VP8_SET_DECRYPT_KEY,           vp8_set_decrypt_key},
    { -1, NULL},
 };

--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@ -24,7 +24,6 @@ VP8_CX_SRCS-yes += vp8cx.mk

 VP8_CX_SRCS-yes += vp8_cx_iface.c

-VP8_CX_SRCS-yes += encoder/asm_enc_offsets.c
 VP8_CX_SRCS-yes += encoder/defaultcoefcounts.h
 VP8_CX_SRCS-yes += encoder/bitstream.c
 VP8_CX_SRCS-yes += encoder/boolhuff.c
@ -78,6 +77,7 @@ VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.c
 VP8_CX_SRCS-yes += encoder/temporal_filter.c
 VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.c
 VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.h
+VP8_CX_SRCS-yes += encoder/vp8_asm_enc_offsets.c

 ifeq ($(CONFIG_REALTIME_ONLY),yes)
 VP8_CX_SRCS_REMOVE-yes += encoder/firstpass.c
@ -90,7 +90,6 @@ VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2_intrinsics.c
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm

 # TODO(johann) make this generic
 ifeq ($(HAVE_SSE2),yes)
@ -122,4 +121,4 @@ endif
 VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes))

 $(eval $(call asm_offsets_template,\
-         vp8_asm_enc_offsets.asm, $(VP8_PREFIX)encoder/asm_enc_offsets.c))
+         vp8_asm_enc_offsets.asm, $(VP8_PREFIX)encoder/vp8_asm_enc_offsets.c))
--- a/vp8/vp8dx.mk
+++ b/vp8/vp8dx.mk
@ -20,7 +20,6 @@ VP8_DX_SRCS-yes += vp8dx.mk

 VP8_DX_SRCS-yes += vp8_dx_iface.c

-VP8_DX_SRCS-yes += decoder/asm_dec_offsets.c
 VP8_DX_SRCS-yes += decoder/dboolhuff.c
 VP8_DX_SRCS-yes += decoder/decodemv.c
 VP8_DX_SRCS-yes += decoder/decodframe.c
@ -36,8 +35,9 @@ VP8_DX_SRCS-yes += decoder/onyxd_int.h
 VP8_DX_SRCS-yes += decoder/treereader.h
 VP8_DX_SRCS-yes += decoder/onyxd_if.c
 VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/threading.c
+VP8_DX_SRCS-yes += decoder/vp8_asm_dec_offsets.c

 VP8_DX_SRCS-yes := $(filter-out $(VP8_DX_SRCS_REMOVE-yes),$(VP8_DX_SRCS-yes))

 $(eval $(call asm_offsets_template,\
-         vp8_asm_dec_offsets.asm, $(VP8_PREFIX)decoder/asm_dec_offsets.c))
+         vp8_asm_dec_offsets.asm, $(VP8_PREFIX)decoder/vp8_asm_dec_offsets.c))
--- a/vpx/vp8dx.h
+++ b/vpx/vp8dx.h
@ -63,6 +63,12 @@ enum vp8_dec_control_id {
   */
  VP8D_GET_LAST_REF_USED,

+  /** decryption key used to decrypt the encoded data buffer before
+   *  decoding; pointer to a 32-byte array, which is copied, so the
+   *  array passed does not need to be preserved
+   */
+  VP8_SET_DECRYPT_KEY,
+
  VP8_DECODER_CTRL_ID_MAX
 };

@ -78,6 +84,7 @@ enum vp8_dec_control_id {
 VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_UPDATES,   int *)
 VPX_CTRL_USE_TYPE(VP8D_GET_FRAME_CORRUPTED,    int *)
 VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_USED,      int *)
+VPX_CTRL_USE_TYPE(VP8_SET_DECRYPT_KEY,         const unsigned char *)

 /*! @} - end defgroup vp8_decoder */

--- a/vpxdec.c
+++ b/vpxdec.c
@ -1024,7 +1024,7 @@ int main(int argc, const char **argv_) {

    if (!noblit) {
      if (do_scale) {
-        if (frame_out == 1) {
+        if (img && frame_out == 1) {
          stream_w = img->d_w;
          stream_h = img->d_h;
          scaled_img = vpx_img_alloc(NULL, VPX_IMG_FMT_I420,
--- a/vpxenc.c
+++ b/vpxenc.c
@ -89,8 +89,8 @@ static size_t wrap_fwrite(const void *ptr, size_t size, size_t nmemb,

 static const char *exec_name;

-#define VP8_FOURCC (0x00385056)
-#define VP9_FOURCC (0x00395056)
+#define VP8_FOURCC (0x30385056)
+#define VP9_FOURCC (0x30395056)
 static const struct codec_item {
  char const              *name;
  const vpx_codec_iface_t *(*iface)(void);
@ -2560,7 +2560,7 @@ int main(int argc, const char **argv_) {
    usage_exit();

  for (pass = global.pass ? global.pass - 1 : 0; pass < global.passes; pass++) {
-    int frames_in = 0;
+    int frames_in = 0, seen_frames = 0;
    int64_t estimated_time_left = -1;
    int64_t average_rate = -1;
    off_t lagged_count = 0;
@ -2640,9 +2640,11 @@ int main(int argc, const char **argv_) {

        if (frame_avail)
          frames_in++;
+        seen_frames = frames_in > global.skip_frames ?
+                          frames_in - global.skip_frames : 0;

        if (!global.quiet) {
-          float fps = usec_to_fps(cx_time, frames_in);
+          float fps = usec_to_fps(cx_time, seen_frames);
          fprintf(stderr, "\rPass %d/%d ", pass + 1, global.passes);

          if (stream_cnt == 1)
@ -2678,16 +2680,17 @@ int main(int argc, const char **argv_) {
        FOREACH_STREAM(get_cx_data(stream, &global, &got_data));

        if (!got_data && input.length && !streams->frames_out) {
-          lagged_count = global.limit ? frames_in : ftello(input.file);
+          lagged_count = global.limit ? seen_frames : ftello(input.file);
        } else if (input.length) {
          int64_t remaining;
          int64_t rate;

          if (global.limit) {
-            int frame_in_lagged = (frames_in - lagged_count) * 1000;
+            int frame_in_lagged = (seen_frames - lagged_count) * 1000;

            rate = cx_time ? frame_in_lagged * (int64_t)1000000 / cx_time : 0;
-            remaining = 1000 * (global.limit - frames_in + lagged_count);
+            remaining = 1000 * (global.limit - global.skip_frames
+                                - seen_frames + lagged_count);
          } else {
            off_t input_pos = ftello(input.file);
            off_t input_pos_lagged = input_pos - lagged_count;
@ -2719,14 +2722,14 @@ int main(int argc, const char **argv_) {
                       "\rPass %d/%d frame %4d/%-4d %7"PRId64"B %7lub/f %7"PRId64"b/s"
                       " %7"PRId64" %s (%.2f fps)\033[K\n", pass + 1,
                       global.passes, frames_in, stream->frames_out, (int64_t)stream->nbytes,
-                       frames_in ? (unsigned long)(stream->nbytes * 8 / frames_in) : 0,
-                       frames_in ? (int64_t)stream->nbytes * 8
+                       seen_frames ? (unsigned long)(stream->nbytes * 8 / seen_frames) : 0,
+                       seen_frames ? (int64_t)stream->nbytes * 8
                       * (int64_t)global.framerate.num / global.framerate.den
-                       / frames_in
+                       / seen_frames
                       : 0,
                       stream->cx_time > 9999999 ? stream->cx_time / 1000 : stream->cx_time,
                       stream->cx_time > 9999999 ? "ms" : "us",
-                       usec_to_fps(stream->cx_time, frames_in));
+                       usec_to_fps(stream->cx_time, seen_frames));
                    );

    if (global.show_psnr)