WIP: reuse splitmv segmentation (partial)

Still not reusing the split MVs, but now reusing the partition size (8x8, 8x16, etc.). Change-Id: I4655b06fcdcbc71a97bff07def78297ae8e5104c
WIP: force keyframe based on frame refresh flags
2012-03-22 15:06:53 -07:00 · 2012-03-22 15:05:35 -07:00 · 2012-03-21 17:23:25 -07:00 · 2012-03-21 17:11:19 -07:00 · 2012-03-21 16:08:12 -07:00 · 2012-03-09 18:00:12 -08:00
149 changed files with 4863 additions and 5992 deletions
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -391,6 +391,8 @@ LDFLAGS = ${LDFLAGS}
 ASFLAGS = ${ASFLAGS}
 extralibs = ${extralibs}
 AS_SFX    = ${AS_SFX:-.asm}
+EXE_SFX   = ${EXE_SFX}
+RTCD_OPTIONS = ${RTCD_OPTIONS}
 EOF

    if enabled rvct; then cat >> $1 << EOF
@@ -454,9 +456,22 @@ process_common_cmdline() {
        ;;
        --enable-?*|--disable-?*)
        eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
-        echo "${CMDLINE_SELECT} ${ARCH_EXT_LIST}" | grep "^ *$option\$" >/dev/null || die_unknown $opt
+        if echo "${ARCH_EXT_LIST}" | grep "^ *$option\$" >/dev/null; then
+            [ $action = "disable" ] && RTCD_OPTIONS="${RTCD_OPTIONS}${opt} "
+        else
+            echo "${CMDLINE_SELECT}" | grep "^ *$option\$" >/dev/null ||
+                die_unknown $opt
+        fi
        $action $option
        ;;
+        --require-?*)
+        eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
+        if echo "${ARCH_EXT_LIST}" none | grep "^ *$option\$" >/dev/null; then
+            RTCD_OPTIONS="${RTCD_OPTIONS}${opt} "
+        else
+            die_unknown $opt
+        fi
+        ;;
        --force-enable-?*|--force-disable-?*)
        eval `echo "$opt" | sed 's/--force-/action=/;s/-/ option=/;s/-/_/g'`
        $action $option
@@ -526,6 +541,7 @@ setup_gnu_toolchain() {
    STRIP=${STRIP:-${CROSS}strip}
    NM=${NM:-${CROSS}nm}
        AS_SFX=.s
+        EXE_SFX=
 }

 process_common_toolchain() {
@@ -579,6 +595,9 @@ process_common_toolchain() {
            *solaris2.10)
                tgt_os=solaris
                ;;
+            *os2*)
+                tgt_os=os2
+                ;;
        esac

        if [ -n "$tgt_isa" ] && [ -n "$tgt_os" ]; then
@@ -671,10 +690,22 @@ process_common_toolchain() {
    case ${toolchain} in
    arm*)
        # on arm, isa versions are supersets
-        enabled armv7a && soft_enable armv7 ### DEBUG
-        enabled armv7 && soft_enable armv6
-        enabled armv7 || enabled armv6 && soft_enable armv5te
-        enabled armv7 || enabled armv6 && soft_enable fast_unaligned
+        case ${tgt_isa} in
+        armv7)
+            soft_enable neon
+            soft_enable media
+            soft_enable edsp
+            soft_enable fast_unaligned
+            ;;
+        armv6)
+            soft_enable media
+            soft_enable edsp
+            soft_enable fast_unaligned
+            ;;
+        armv5te)
+            soft_enable edsp
+            ;;
+        esac

        asm_conversion_cmd="cat"

@@ -687,10 +718,14 @@ process_common_toolchain() {
            arch_int=${arch_int%%te}
            check_add_asflags --defsym ARCHITECTURE=${arch_int}
            tune_cflags="-mtune="
-            if enabled armv7
-            then
-                check_add_cflags -march=armv7-a -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp  #-ftree-vectorize
-                check_add_asflags -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp  #-march=armv7-a
+            if [ "${tgt_isa}" = "armv7" ]; then  # POSIX test compares strings with a single =; quoting keeps an empty tgt_isa from breaking the test
+                if enabled neon
+                then
+                    check_add_cflags -mfpu=neon #-ftree-vectorize
+                    check_add_asflags -mfpu=neon
+                fi
+                check_add_cflags -march=armv7-a -mcpu=cortex-a8 -mfloat-abi=softfp
+                check_add_asflags -mcpu=cortex-a8 -mfloat-abi=softfp  #-march=armv7-a
            else
                check_add_cflags -march=${tgt_isa}
                check_add_asflags -march=${tgt_isa}
@@ -708,10 +743,14 @@ process_common_toolchain() {
            tune_cflags="--cpu="
            tune_asflags="--cpu="
            if [ -z "${tune_cpu}" ]; then
-            if enabled armv7
-                then
-                    check_add_cflags --cpu=Cortex-A8 --fpu=softvfp+vfpv3
-                    check_add_asflags --cpu=Cortex-A8 --fpu=softvfp+vfpv3
+                if [ "${tgt_isa}" = "armv7" ]; then  # POSIX test compares strings with a single =; quoting keeps an empty tgt_isa from breaking the test
+                    if enabled neon
+                    then
+                        check_add_cflags --fpu=softvfp+vfpv3
+                        check_add_asflags --fpu=softvfp+vfpv3
+                    fi
+                    check_add_cflags --cpu=Cortex-A8
+                    check_add_asflags --cpu=Cortex-A8
                else
                    check_add_cflags --cpu=${tgt_isa##armv}
                    check_add_asflags --cpu=${tgt_isa##armv}
@@ -759,8 +798,7 @@ process_common_toolchain() {

            enable pic
            soft_enable realtime_only
-            if enabled armv7
-            then
+            if [ "${tgt_isa}" = "armv7" ]; then  # POSIX test compares strings with a single =; quoting keeps an empty tgt_isa from breaking the test
                enable runtime_cpu_detect
            fi
          ;;
@@ -886,6 +924,9 @@ process_common_toolchain() {
                LD=${LD:-${CROSS}gcc}
                CROSS=${CROSS:-g}
                ;;
+            os2)
+                AS=${AS:-nasm}
+                ;;
        esac

        AS="${alt_as:-${AS:-auto}}"
@@ -956,6 +997,11 @@ process_common_toolchain() {
                # enabled icc && ! enabled pic && add_cflags -fno-pic -mdynamic-no-pic
                enabled icc && ! enabled pic && add_cflags -fno-pic
            ;;
+            os2)
+                add_asflags -f aout
+                enabled debug && add_asflags -g
+                EXE_SFX=.exe
+            ;;
            *) log "Warning: Unknown os $tgt_os while setting up $AS flags"
            ;;
        esac
--- a/build/make/rtcd.sh
+++ b/build/make/rtcd.sh
@@ -0,0 +1,330 @@
+#!/bin/sh
+self=$0
+
+usage() {  # Print the option summary below to stderr, then exit with status 1.
+  cat <<EOF >&2
+Usage: $self [options] FILE
+
+Reads the Run Time CPU Detections definitions from FILE and generates a
+C header file on stdout.
+
+Options:
+  --arch=ARCH   Architecture to generate defs for (required)
+  --disable-EXT Disable support for EXT extensions
+  --require-EXT Require support for EXT extensions
+  --sym=SYMBOL  Unique symbol to use for RTCD initialization function
+  --config=FILE File with CONFIG_FOO=yes lines to parse
+EOF
+  exit 1  # also reached through -h/--help, so asking for help exits non-zero as well
+}
+
+die() {  # Echo all arguments to stderr, then exit with status 1.
+  echo "$@" >&2
+  exit 1
+}
+
+die_argument_required() {  # Abort, naming the option held in the global opt that the option loop is currently examining.
+  die "Option $opt requires argument"
+}
+
+for opt; do  # walk the positional parameters of the script
+  optval="${opt#*=}"  # text after the first = sign; the whole word when there is none
+  case "$opt" in
+    --arch) die_argument_required;;
+    --arch=*) arch=${optval};;
+    --disable-*) eval "disable_${opt#--disable-}=true";;  # sets disable_<EXT>=true; filter() reads these variables
+    --require-*) REQUIRES="${REQUIRES}${opt#--require-} ";;  # builds a space-separated list of <EXT> names
+    --sym) die_argument_required;;
+    --sym=*) symbol=${optval};;
+    --config=*) config_file=${optval};;
+    -h|--help)
+      usage
+      ;;
+    -*)
+      die "Unrecognized option: ${opt%%=*}"
+      ;;
+    *)
+      defs_file="$defs_file $opt"  # every non-option word is collected as a definitions file
+      ;;
+  esac
+  shift
+done
+for f in $defs_file; do [ -f "$f" ] || usage; done  # each named file must exist; NOTE(review): an empty list passes this check
+[ -n "$arch" ] || usage  # --arch is mandatory
+
+# Import the configuration
+[ -f "$config_file" ] && eval $(grep CONFIG_ "$config_file")  # hands every config line containing CONFIG_ to eval
+
+#
+# Routines for the RTCD DSL to call
+#
+prototype() {  # DSL routine: prototype [unsigned] RTYPE NAME ARGS -- registers function NAME with the generator
+  local rtyp
+  case "$1" in
+    unsigned) rtyp="$1 "; shift;;  # fold a leading unsigned into the return type and drop it from the parameters
+  esac
+  rtyp="${rtyp}$1"
+  local fn="$2"
+  local args="$3"
+
+  eval "${2}_rtyp='$rtyp'"  # <NAME>_rtyp holds the return type text
+  eval "${2}_args='$3'"  # <NAME>_args holds the argument list text
+  ALL_FUNCS="$ALL_FUNCS $fn"  # global list walked by the generator functions
+  specialize $fn c  # every registered function gets a <NAME>_c variant
+}
+
+specialize() {  # DSL routine: specialize NAME EXT... -- records that NAME has a variant NAME_EXT for each EXT given
+  local fn="$1"
+  shift
+  for opt in "$@"; do
+    eval "${fn}_${opt}=${fn}_${opt}"  # the variable NAME_EXT is set to the string NAME_EXT, the symbol name of the variant
+  done
+}
+
+require() {  # require EXT... -- for every function in ALL_FUNCS, make its EXT variant (when one exists) the default
+  for fn in $ALL_FUNCS; do
+    for opt in "$@"; do
+      local ofn=$(eval "echo \$${fn}_${opt}")
+      [ -z "$ofn" ] && continue  # this function has no variant for this extension
+
+      # if we already have a default, then we can disable it, as we know
+      # we can do better.
+      local best=$(eval "echo \$${fn}_default")  # name of the variable describing the current default, empty if none yet
+      local best_ofn=$(eval "echo \$${best}")  # symbol name stored in that variable
+      [ -n "$best" ] && [ "$best_ofn" != "$ofn" ] && eval "${best}_link=false"
+      eval "${fn}_default=${fn}_${opt}"  # later extensions in the argument list override earlier ones
+      eval "${fn}_${opt}_link=true"
+    done
+  done
+}
+
+forward_decls() {  # DSL routine: append $1 to ALL_FORWARD_DECLS; process_forward_decls later evals each entry
+  ALL_FORWARD_DECLS="$ALL_FORWARD_DECLS $1"
+}
+
+#
+# Include the user's directives
+#
+for f in $defs_file; do
+  . $f  # sourced in this shell, so the file can call the DSL routines defined above
+done
+
+#
+# Process the directives according to the command line
+#
+process_forward_decls() {  # Evaluate each word collected in ALL_FORWARD_DECLS; whatever it prints goes to stdout
+  for fn in $ALL_FORWARD_DECLS; do
+    eval $fn
+  done
+}
+
+determine_indirection() {  # Set <fn>_indirect to true or false for every function, counting candidates among the EXT names passed in
+  [ "$CONFIG_RUNTIME_CPU_DETECT" = "yes" ] || require $ALL_ARCHS  # without runtime detection every extension in ALL_ARCHS is required
+  for fn in $ALL_FUNCS; do
+    local n=""  # gains one x per candidate variant
+    local rtyp="$(eval "echo \$${fn}_rtyp")"  # NOTE(review): rtyp, args and dfn are not read again in this function
+    local args="$(eval "echo \"\$${fn}_args\"")"
+    local dfn="$(eval "echo \$${fn}_default")"
+    dfn=$(eval "echo \$${dfn}")
+    for opt in "$@"; do
+      local ofn=$(eval "echo \$${fn}_${opt}")
+      [ -z "$ofn" ] && continue  # no variant for this extension
+      local link=$(eval "echo \$${fn}_${opt}_link")
+      [ "$link" = "false" ] && continue  # require() marked this variant as superseded
+      n="${n}x"
+    done
+    if [ "$n" = "x" ]; then  # exactly one candidate: the name can be bound directly
+      eval "${fn}_indirect=false"
+    else
+      eval "${fn}_indirect=true"
+    fi
+  done
+}
+
+declare_function_pointers() {  # Print C declarations for every function: one prototype per variant, then a #define or a pointer declaration
+  for fn in $ALL_FUNCS; do
+    local rtyp="$(eval "echo \$${fn}_rtyp")"
+    local args="$(eval "echo \"\$${fn}_args\"")"
+    local dfn="$(eval "echo \$${fn}_default")"  # name of the variable describing the default variant
+    dfn=$(eval "echo \$${dfn}")  # resolved to the symbol name of the default variant
+    for opt in "$@"; do
+      local ofn=$(eval "echo \$${fn}_${opt}")
+      [ -z "$ofn" ] && continue
+      echo "$rtyp ${ofn}($args);"  # a prototype is printed for every existing variant, whatever its _link flag says
+    done
+    if [ "$(eval "echo \$${fn}_indirect")" = "false" ]; then
+      echo "#define ${fn} ${dfn}"  # direct binding to the default variant
+    else
+      echo "RTCD_EXTERN $rtyp (*${fn})($args);"  # function pointer, assigned by the code set_function_pointers prints
+    fi
+    echo
+  done
+}
+
+set_function_pointers() {  # Print the C statements that assign each indirect function pointer
+  for fn in $ALL_FUNCS; do
+    local n=""  # NOTE(review): n, rtyp and args are not read again in this function
+    local rtyp="$(eval "echo \$${fn}_rtyp")"
+    local args="$(eval "echo \"\$${fn}_args\"")"
+    local dfn="$(eval "echo \$${fn}_default")"
+    dfn=$(eval "echo \$${dfn}")  # symbol name of the default variant
+    if $(eval "echo \$${fn}_indirect"); then  # runs the stored word, true or false, as a command
+      echo "    $fn = $dfn;"
+      for opt in "$@"; do
+        local ofn=$(eval "echo \$${fn}_${opt}")
+        [ -z "$ofn" ] && continue
+        [ "$ofn" = "$dfn" ] && continue;  # the default was already assigned above
+        local link=$(eval "echo \$${fn}_${opt}_link")
+        [ "$link" = "false" ] && continue
+        local cond="$(eval "echo \$have_${opt}")"  # C condition text prepared by the arch function
+        echo "    if (${cond}) $fn = $ofn;"
+      done
+    fi
+    echo
+  done
+}
+
+filter() {  # Echo the arguments whose disable_<arg> variable is empty or unset, separated by spaces
+  local filtered
+  for opt in "$@"; do
+    [ -z $(eval "echo \$disable_${opt}") ] && filtered="$filtered $opt"  # unquoted on purpose or not, an empty expansion leaves a one-argument test, which is true
+  done
+  echo $filtered  # unquoted, so the leading space is dropped
+}
+
+#
+# Helper functions for generating the arch specific RTCD files
+#
+common_top() {  # Print the include guard, the RTCD_EXTERN macro, the forward declarations and the function declarations
+  local outfile_basename=$(basename ${outfile:-rtcd.h})
+  local include_guard=$(printf '%s' "$outfile_basename" | tr '[a-z]' '[A-Z]' | tr -c '[A-Z]' _)  # printf instead of echo -n, whose handling of -n is not portable across sh implementations
+  cat <<EOF
+#ifndef ${include_guard}
+#define ${include_guard}
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+$(process_forward_decls)
+
+$(declare_function_pointers c $ALL_ARCHS)
+EOF
+}
+
+common_bottom() {  # Print the #endif that closes the include guard opened by common_top
+  cat <<EOF
+#endif
+EOF
+}
+
+x86() {  # Emit the complete header for x86 targets, using the extensions listed in ALL_ARCHS
+  determine_indirection c $ALL_ARCHS
+
+  # Assign the helper variable for each enabled extension
+  for opt in $ALL_ARCHS; do
+    local uc=$(printf '%s' "$opt" | tr '[a-z]' '[A-Z]')  # printf instead of echo -n, which some sh implementations print literally, corrupting the HAS_ macro name
+    eval "have_${opt}=\"flags & HAS_${uc}\""
+  done
+
+  cat <<EOF
+$(common_top)
+void ${symbol:-rtcd}(void);
+
+#ifdef RTCD_C
+#include "vpx_ports/x86.h"
+void ${symbol:-rtcd}(void)
+{
+    int flags = x86_simd_caps();
+
+    (void)flags;
+
+$(set_function_pointers c $ALL_ARCHS)
+}
+#endif
+$(common_bottom)
+EOF
+}
+
+arm() {  # Emit the complete header for ARM targets, using the extensions listed in ALL_ARCHS
+  determine_indirection c $ALL_ARCHS
+
+  # Assign the helper variable for each enabled extension
+  for opt in $ALL_ARCHS; do
+    local uc=$(printf '%s' "$opt" | tr '[a-z]' '[A-Z]')  # printf instead of echo -n, which some sh implementations print literally, corrupting the HAS_ macro name
+    eval "have_${opt}=\"flags & HAS_${uc}\""
+  done
+
+  cat <<EOF
+$(common_top)
+#include "vpx_config.h"
+
+void ${symbol:-rtcd}(void);
+
+#ifdef RTCD_C
+#include "vpx_ports/arm.h"
+void ${symbol:-rtcd}(void)
+{
+    int flags = arm_cpu_caps();
+
+    (void)flags;
+
+$(set_function_pointers c $ALL_ARCHS)
+}
+#endif
+$(common_bottom)
+EOF
+}
+
+
+unoptimized() {  # Emit the header for an arch with no extension list: only the c variants are considered
+  determine_indirection c
+  cat <<EOF
+$(common_top)
+#include "vpx_config.h"
+
+void ${symbol:-rtcd}(void);
+
+#ifdef RTCD_C
+void ${symbol:-rtcd}(void)
+{
+$(set_function_pointers c)
+}
+#endif
+$(common_bottom)
+EOF
+
+}
+#
+# Main Driver
+#
+require c  # start with the c variant as the default of every registered function
+case $arch in
+  x86)
+    ALL_ARCHS=$(filter mmx sse sse2 sse3 ssse3 sse4_1)  # the listed extensions minus those turned off with --disable-*
+    x86
+    ;;
+  x86_64)
+    ALL_ARCHS=$(filter mmx sse sse2 sse3 ssse3 sse4_1)
+    REQUIRES=${REQUIRES:-mmx sse sse2}  # default requirement when no --require-* option was given
+    require $(filter $REQUIRES)  # NOTE(review): REQUIRES is only consulted in this arm of the case
+    x86
+    ;;
+  armv5te)
+    ALL_ARCHS=$(filter edsp)
+    arm
+    ;;
+  armv6)
+    ALL_ARCHS=$(filter edsp media)
+    arm
+    ;;
+  armv7)
+    ALL_ARCHS=$(filter edsp media neon)
+    arm
+    ;;
+  *)
+    unoptimized  # any other --arch value gets the c-only header
+    ;;
+esac
--- a/7
+++ b/7
@@ -109,6 +109,7 @@ all_platforms="${all_platforms} x86-darwin9-icc"
 all_platforms="${all_platforms} x86-darwin10-gcc"
 all_platforms="${all_platforms} x86-linux-gcc"
 all_platforms="${all_platforms} x86-linux-icc"
+all_platforms="${all_platforms} x86-os2-gcc"
 all_platforms="${all_platforms} x86-solaris-gcc"
 all_platforms="${all_platforms} x86-win32-gcc"
 all_platforms="${all_platforms} x86-win32-vs7"
@@ -192,9 +193,9 @@ ARCH_LIST="
    ppc64
 "
 ARCH_EXT_LIST="
-    armv5te
-    armv6
-    armv7
+    edsp
+    media
+    neon

    mips32

--- a/examples.mk
+++ b/examples.mk
@@ -8,6 +8,20 @@
 ##  be found in the AUTHORS file in the root of the source tree.
 ##

+LIBYUV_SRCS +=  third_party/libyuv/include/libyuv/basic_types.h  \
+                third_party/libyuv/include/libyuv/cpu_id.h  \
+                third_party/libyuv/include/libyuv/scale.h  \
+                third_party/libyuv/source/row.h \
+                third_party/libyuv/source/scale.c  \
+                third_party/libyuv/source/cpu_id.c
+
+NESTEGG_SRCS += nestegg/halloc/halloc.h \
+                nestegg/halloc/src/align.h \
+                nestegg/halloc/src/halloc.c \
+                nestegg/halloc/src/hlist.h \
+                nestegg/halloc/src/macros.h \
+                nestegg/include/nestegg/nestegg.h \
+                nestegg/src/nestegg.c

 # List of examples to build. UTILS are files that are taken from the source
 # tree directly, and GEN_EXAMPLES are files that are created from the
@@ -18,13 +32,7 @@ vpxdec.SRCS                 += vpx_ports/vpx_timer.h
 vpxdec.SRCS                 += vpx/vpx_integer.h
 vpxdec.SRCS                 += args.c args.h
 vpxdec.SRCS                 += tools_common.c tools_common.h
-vpxdec.SRCS                 += nestegg/halloc/halloc.h
-vpxdec.SRCS                 += nestegg/halloc/src/align.h
-vpxdec.SRCS                 += nestegg/halloc/src/halloc.c
-vpxdec.SRCS                 += nestegg/halloc/src/hlist.h
-vpxdec.SRCS                 += nestegg/halloc/src/macros.h
-vpxdec.SRCS                 += nestegg/include/nestegg/nestegg.h
-vpxdec.SRCS                 += nestegg/src/nestegg.c
+vpxdec.SRCS                 += $(NESTEGG_SRCS)
 vpxdec.GUID                  = BA5FE66F-38DD-E034-F542-B1578C5FB950
 vpxdec.DESCRIPTION           = Full featured decoder
 UTILS-$(CONFIG_ENCODERS)    += vpxenc.c
@@ -35,6 +43,8 @@ vpxenc.SRCS                 += vpx_ports/mem_ops_aligned.h
 vpxenc.SRCS                 += libmkv/EbmlIDs.h
 vpxenc.SRCS                 += libmkv/EbmlWriter.c
 vpxenc.SRCS                 += libmkv/EbmlWriter.h
+vpxenc.SRCS                 += $(LIBYUV_SRCS)
+vpxenc.SRCS                 += $(NESTEGG_SRCS)
 vpxenc.GUID                  = 548DEC74-7A15-4B2B-AFC3-AA102E7C25C1
 vpxenc.DESCRIPTION           = Full featured encoder
 UTILS-$(CONFIG_ENCODERS)    += vp8_scalable_patterns.c
@@ -98,13 +108,7 @@ vp8cx_set_ref.DESCRIPTION           = VP8 set encoder reference frame

 # C file is provided, not generated automatically.
 GEN_EXAMPLES-$(CONFIG_MULTI_RES_ENCODING) += vp8_multi_resolution_encoder.c
-vp8_multi_resolution_encoder.SRCS  \
-                         += third_party/libyuv/include/libyuv/basic_types.h  \
-                            third_party/libyuv/include/libyuv/cpu_id.h  \
-                            third_party/libyuv/include/libyuv/scale.h  \
-                            third_party/libyuv/source/row.h \
-                            third_party/libyuv/source/scale.c  \
-                            third_party/libyuv/source/cpu_id.c
+vp8_multi_resolution_encoder.SRCS         += $(LIBYUV_SRCS)
 vp8_multi_resolution_encoder.GUID         = 04f8738e-63c8-423b-90fa-7c2703a374de
 vp8_multi_resolution_encoder.DESCRIPTION  = VP8 Multiple-resolution Encoding

@@ -168,12 +172,12 @@ $(eval $(if $(filter universal%,$(TOOLCHAIN)),LIPO_OBJS,BUILD_OBJS):=yes)
 # Create build/install dependencies for all examples. The common case
 # is handled here. The MSVS case is handled below.
 NOT_MSVS = $(if $(CONFIG_MSVS),,yes)
-DIST-BINS-$(NOT_MSVS)      += $(addprefix bin/,$(ALL_EXAMPLES:.c=))
-INSTALL-BINS-$(NOT_MSVS)   += $(addprefix bin/,$(UTILS:.c=))
+DIST-BINS-$(NOT_MSVS)      += $(addprefix bin/,$(ALL_EXAMPLES:.c=$(EXE_SFX)))
+INSTALL-BINS-$(NOT_MSVS)   += $(addprefix bin/,$(UTILS:.c=$(EXE_SFX)))
 DIST-SRCS-yes              += $(ALL_SRCS)
 INSTALL-SRCS-yes           += $(UTIL_SRCS)
 OBJS-$(NOT_MSVS)           += $(if $(BUILD_OBJS),$(call objs,$(ALL_SRCS)))
-BINS-$(NOT_MSVS)           += $(addprefix $(BUILD_PFX),$(ALL_EXAMPLES:.c=))
+BINS-$(NOT_MSVS)           += $(addprefix $(BUILD_PFX),$(ALL_EXAMPLES:.c=$(EXE_SFX)))


 # Instantiate linker template for all examples.
@@ -183,7 +187,7 @@ $(foreach bin,$(BINS-yes),\
    $(if $(BUILD_OBJS),$(eval $(bin):\
        $(LIB_PATH)/lib$(CODEC_LIB)$(CODEC_LIB_SUF)))\
    $(if $(BUILD_OBJS),$(eval $(call linker_template,$(bin),\
-        $(call objs,$($(notdir $(bin)).SRCS)) \
+        $(call objs,$($(notdir $(bin:$(EXE_SFX)=)).SRCS)) \
        -l$(CODEC_LIB) $(addprefix -l,$(CODEC_EXTRA_LIBS))\
        )))\
    $(if $(LIPO_OBJS),$(eval $(call lipo_bin_template,$(bin))))\
--- a/libs.mk
+++ b/libs.mk
@@ -48,7 +48,6 @@ ifeq ($(CONFIG_VP8_DECODER),yes)
  CODEC_SRCS-yes += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_DX_SRCS))
  CODEC_EXPORTS-yes += $(addprefix $(VP8_PREFIX),$(VP8_DX_EXPORTS))
  CODEC_SRCS-yes += $(VP8_PREFIX)vp8dx.mk vpx/vp8.h vpx/vp8dx.h
-  CODEC_SRCS-$(ARCH_ARM) += $(VP8_PREFIX)vp8dx_arm.mk
  INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8dx.h
  INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP8_PREFIX)/%
  CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8dx.h
@@ -90,6 +89,7 @@ endif
 $(eval $(if $(filter universal%,$(TOOLCHAIN)),LIPO_LIBVPX,BUILD_LIBVPX):=yes)

 CODEC_SRCS-$(BUILD_LIBVPX) += build/make/version.sh
+CODEC_SRCS-$(BUILD_LIBVPX) += build/make/rtcd.sh
 CODEC_SRCS-$(BUILD_LIBVPX) += vpx/vpx_integer.h
 CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/asm_offsets.h
 CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/vpx_timer.h
@@ -183,6 +183,7 @@ vpx.vcproj: $(CODEC_SRCS) vpx.def
 PROJECTS-$(BUILD_LIBVPX) += vpx.vcproj

 vpx.vcproj: vpx_config.asm
+vpx.vcproj: vpx_rtcd.h

 endif
 else
@@ -322,6 +323,18 @@ endif
 $(shell $(SRC_PATH_BARE)/build/make/version.sh "$(SRC_PATH_BARE)" $(BUILD_PFX)vpx_version.h)
 CLEAN-OBJS += $(BUILD_PFX)vpx_version.h

+#
+# Rule to generate runtime cpu detection files
+#
+$(OBJS-yes:.o=.d): vpx_rtcd.h
+vpx_rtcd.h: $(sort $(filter %rtcd_defs.sh,$(CODEC_SRCS)))
+	@echo "    [CREATE] $@"
+	$(qexec)$(SRC_PATH_BARE)/build/make/rtcd.sh --arch=$(TGT_ISA) \
+          --sym=vpx_rtcd \
+          --config=$(target)$(if $(FAT_ARCHS),,-$(TOOLCHAIN)).mk \
+          $(RTCD_OPTIONS) $^ > $@
+CLEAN-OBJS += $(BUILD_PFX)vpx_rtcd.h
+
 CODEC_DOC_SRCS += vpx/vpx_codec.h \
                  vpx/vpx_decoder.h \
                  vpx/vpx_encoder.h \
--- a/third_party/libyuv/source/scale.c
+++ b/third_party/libyuv/source/scale.c
@@ -60,7 +60,7 @@ void SetUseReferenceImpl(int use) {

 #if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
 #define HAS_SCALEROWDOWN2_NEON
-void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */,
+void ScaleRowDown2_NEON(const uint8* src_ptr, int src_stride,
                        uint8* dst, int dst_width) {
  asm volatile (
    "1:                                        \n"
@@ -102,7 +102,7 @@ void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride,
 }

 #define HAS_SCALEROWDOWN4_NEON
-static void ScaleRowDown4_NEON(const uint8* src_ptr, int /* src_stride */,
+static void ScaleRowDown4_NEON(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "1:                                        \n"
@@ -160,7 +160,7 @@ static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride,
 // Down scale from 4 to 3 pixels.  Use the neon multilane read/write
 //  to load up the every 4th pixel into a 4 different registers.
 // Point samples 32 pixels to 24 pixels.
-static void ScaleRowDown34_NEON(const uint8* src_ptr, int /* src_stride */,
+static void ScaleRowDown34_NEON(const uint8* src_ptr, int src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "1:                                        \n"
@@ -284,7 +284,7 @@ const unsigned short mult38_div9[8] __attribute__ ((aligned(16))) =
    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };

 // 32 -> 12
-static void ScaleRowDown38_NEON(const uint8* src_ptr, int,
+static void ScaleRowDown38_NEON(const uint8* src_ptr, int src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vld1.u8      {q3}, [%3]                   \n"
--- a/tools_common.c
+++ b/tools_common.c
@@ -9,15 +9,21 @@
 */
 #include <stdio.h>
 #include "tools_common.h"
-#ifdef _WIN32
+#if defined(_WIN32) || defined(__OS2__)
 #include <io.h>
 #include <fcntl.h>
+
+#ifdef __OS2__
+#define _setmode    setmode
+#define _fileno     fileno
+#define _O_BINARY   O_BINARY
+#endif
 #endif

 FILE* set_binary_mode(FILE *stream)
 {
    (void)stream;
-#ifdef _WIN32
+#if defined(_WIN32) || defined(__OS2__)
    _setmode(_fileno(stream), _O_BINARY);
 #endif
    return stream;
--- a/vp8/common/arm/arm_systemdependent.c
+++ b/vp8/common/arm/arm_systemdependent.c
@@ -1,115 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_config.h"
-#include "vpx_ports/arm.h"
-#include "vp8/common/pragmas.h"
-#include "vp8/common/subpixel.h"
-#include "vp8/common/loopfilter.h"
-#include "vp8/common/recon.h"
-#include "vp8/common/idct.h"
-#include "vp8/common/onyxc_int.h"
-
-void vp8_arch_arm_common_init(VP8_COMMON *ctx)
-{
-#if CONFIG_RUNTIME_CPU_DETECT
-    VP8_COMMON_RTCD *rtcd = &ctx->rtcd;
-    int flags = arm_cpu_caps();
-    rtcd->flags = flags;
-
-    /* Override default functions with fastest ones for this CPU. */
-#if HAVE_ARMV5TE
-    if (flags & HAS_EDSP)
-    {
-    }
-#endif
-
-#if HAVE_ARMV6
-    if (flags & HAS_MEDIA)
-    {
-        rtcd->subpix.sixtap16x16   = vp8_sixtap_predict16x16_armv6;
-        rtcd->subpix.sixtap8x8     = vp8_sixtap_predict8x8_armv6;
-        rtcd->subpix.sixtap8x4     = vp8_sixtap_predict8x4_armv6;
-        rtcd->subpix.sixtap4x4     = vp8_sixtap_predict_armv6;
-        rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_armv6;
-        rtcd->subpix.bilinear8x8   = vp8_bilinear_predict8x8_armv6;
-        rtcd->subpix.bilinear8x4   = vp8_bilinear_predict8x4_armv6;
-        rtcd->subpix.bilinear4x4   = vp8_bilinear_predict4x4_armv6;
-
-        rtcd->idct.idct16       = vp8_short_idct4x4llm_v6_dual;
-        rtcd->idct.iwalsh16     = vp8_short_inv_walsh4x4_v6;
-
-        rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_armv6;
-        rtcd->loopfilter.normal_b_v  = vp8_loop_filter_bv_armv6;
-        rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_armv6;
-        rtcd->loopfilter.normal_b_h  = vp8_loop_filter_bh_armv6;
-        rtcd->loopfilter.simple_mb_v =
-                vp8_loop_filter_simple_vertical_edge_armv6;
-        rtcd->loopfilter.simple_b_v  = vp8_loop_filter_bvs_armv6;
-        rtcd->loopfilter.simple_mb_h =
-                vp8_loop_filter_simple_horizontal_edge_armv6;
-        rtcd->loopfilter.simple_b_h  = vp8_loop_filter_bhs_armv6;
-
-        rtcd->recon.copy16x16   = vp8_copy_mem16x16_v6;
-        rtcd->recon.copy8x8     = vp8_copy_mem8x8_v6;
-        rtcd->recon.copy8x4     = vp8_copy_mem8x4_v6;
-        rtcd->recon.intra4x4_predict = vp8_intra4x4_predict_armv6;
-
-        rtcd->dequant.block               = vp8_dequantize_b_v6;
-        rtcd->dequant.idct_add            = vp8_dequant_idct_add_v6;
-        rtcd->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_v6;
-        rtcd->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_v6;
-
-    }
-#endif
-
-#if HAVE_ARMV7
-    if (flags & HAS_NEON)
-    {
-        rtcd->subpix.sixtap16x16   = vp8_sixtap_predict16x16_neon;
-        rtcd->subpix.sixtap8x8     = vp8_sixtap_predict8x8_neon;
-        rtcd->subpix.sixtap8x4     = vp8_sixtap_predict8x4_neon;
-        rtcd->subpix.sixtap4x4     = vp8_sixtap_predict_neon;
-        rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_neon;
-        rtcd->subpix.bilinear8x8   = vp8_bilinear_predict8x8_neon;
-        rtcd->subpix.bilinear8x4   = vp8_bilinear_predict8x4_neon;
-        rtcd->subpix.bilinear4x4   = vp8_bilinear_predict4x4_neon;
-
-        rtcd->idct.idct16       = vp8_short_idct4x4llm_neon;
-        rtcd->idct.iwalsh16     = vp8_short_inv_walsh4x4_neon;
-
-        rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_neon;
-        rtcd->loopfilter.normal_b_v  = vp8_loop_filter_bv_neon;
-        rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_neon;
-        rtcd->loopfilter.normal_b_h  = vp8_loop_filter_bh_neon;
-        rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_neon;
-        rtcd->loopfilter.simple_b_v  = vp8_loop_filter_bvs_neon;
-        rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_neon;
-        rtcd->loopfilter.simple_b_h  = vp8_loop_filter_bhs_neon;
-
-        rtcd->recon.copy16x16   = vp8_copy_mem16x16_neon;
-        rtcd->recon.copy8x8     = vp8_copy_mem8x8_neon;
-        rtcd->recon.copy8x4     = vp8_copy_mem8x4_neon;
-        rtcd->recon.build_intra_predictors_mby =
-            vp8_build_intra_predictors_mby_neon;
-        rtcd->recon.build_intra_predictors_mby_s =
-            vp8_build_intra_predictors_mby_s_neon;
-
-        rtcd->dequant.block               = vp8_dequantize_b_neon;
-        rtcd->dequant.idct_add            = vp8_dequant_idct_add_neon;
-        rtcd->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_neon;
-        rtcd->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_neon;
-
-    }
-#endif
-
-#endif
-}
--- a/vp8/common/arm/armv6/idct_blk_v6.c
+++ b/vp8/common/arm/armv6/idct_blk_v6.c
@@ -9,8 +9,7 @@
 */

 #include "vpx_config.h"
-#include "vp8/common/idct.h"
-#include "vp8/common/dequantize.h"
+#include "vpx_rtcd.h"


 void vp8_dequant_idct_add_y_block_v6(short *q, short *dq,
--- a/vp8/common/arm/bilinearfilter_arm.c
+++ b/vp8/common/arm/bilinearfilter_arm.c
@@ -8,10 +8,10 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
 #include <math.h>
 #include "vp8/common/filter.h"
-#include "vp8/common/subpixel.h"
 #include "bilinearfilter_arm.h"

 void vp8_filter_block2d_bil_armv6
--- a/vp8/common/arm/dequantize_arm.c
+++ b/vp8/common/arm/dequantize_arm.c
@@ -10,18 +10,17 @@


 #include "vpx_config.h"
-#include "vp8/common/dequantize.h"
-#include "vp8/common/idct.h"
+#include "vp8/common/blockd.h"

-#if HAVE_ARMV7
+#if HAVE_NEON
 extern void vp8_dequantize_b_loop_neon(short *Q, short *DQC, short *DQ);
 #endif

-#if HAVE_ARMV6
+#if HAVE_MEDIA
 extern void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
 #endif

-#if HAVE_ARMV7
+#if HAVE_NEON

 void vp8_dequantize_b_neon(BLOCKD *d, short *DQC)
 {
@@ -32,7 +31,7 @@ void vp8_dequantize_b_neon(BLOCKD *d, short *DQC)
 }
 #endif

-#if HAVE_ARMV6
+#if HAVE_MEDIA
 void vp8_dequantize_b_v6(BLOCKD *d, short *DQC)
 {
    short *DQ  = d->dqcoeff;
--- a/vp8/common/arm/dequantize_arm.h
+++ b/vp8/common/arm/dequantize_arm.h
@@ -1,59 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef DEQUANTIZE_ARM_H
-#define DEQUANTIZE_ARM_H
-
-#if HAVE_ARMV6
-extern prototype_dequant_block(vp8_dequantize_b_v6);
-extern prototype_dequant_idct_add(vp8_dequant_idct_add_v6);
-extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_v6);
-extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_v6);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_dequant_block
-#define vp8_dequant_block vp8_dequantize_b_v6
-
-#undef  vp8_dequant_idct_add
-#define vp8_dequant_idct_add vp8_dequant_idct_add_v6
-
-#undef  vp8_dequant_idct_add_y_block
-#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_v6
-
-#undef  vp8_dequant_idct_add_uv_block
-#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_v6
-#endif
-#endif
-
-#if HAVE_ARMV7
-extern prototype_dequant_block(vp8_dequantize_b_neon);
-extern prototype_dequant_idct_add(vp8_dequant_idct_add_neon);
-extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_neon);
-extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon);
-
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_dequant_block
-#define vp8_dequant_block vp8_dequantize_b_neon
-
-#undef  vp8_dequant_idct_add
-#define vp8_dequant_idct_add vp8_dequant_idct_add_neon
-
-#undef  vp8_dequant_idct_add_y_block
-#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_neon
-
-#undef  vp8_dequant_idct_add_uv_block
-#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_neon
-#endif
-
-#endif
-
-#endif
--- a/vp8/common/arm/filter_arm.c
+++ b/vp8/common/arm/filter_arm.c
@@ -10,9 +10,9 @@


 #include "vpx_config.h"
+#include "vpx_rtcd.h"
 #include <math.h>
 #include "vp8/common/filter.h"
-#include "vp8/common/subpixel.h"
 #include "vpx_ports/mem.h"

 extern void vp8_filter_block2d_first_pass_armv6
@@ -86,8 +86,8 @@ extern void vp8_filter_block2d_second_pass_only_armv6
    const short *vp8_filter
 );

-#if HAVE_ARMV6
-void vp8_sixtap_predict_armv6
+#if HAVE_MEDIA
+void vp8_sixtap_predict4x4_armv6
 (
    unsigned char  *src_ptr,
    int  src_pixels_per_line,
--- a/vp8/common/arm/idct_arm.h
+++ b/vp8/common/arm/idct_arm.h
@@ -1,51 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef IDCT_ARM_H
-#define IDCT_ARM_H
-
-#if HAVE_ARMV6
-extern prototype_idct(vp8_short_idct4x4llm_v6_dual);
-extern prototype_idct_scalar_add(vp8_dc_only_idct_add_v6);
-extern prototype_second_order(vp8_short_inv_walsh4x4_1_v6);
-extern prototype_second_order(vp8_short_inv_walsh4x4_v6);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_idct_idct16
-#define vp8_idct_idct16 vp8_short_idct4x4llm_v6_dual
-
-#undef  vp8_idct_idct1_scalar_add
-#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_v6
-
-#undef  vp8_idct_iwalsh16
-#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_v6
-#endif
-#endif
-
-#if HAVE_ARMV7
-extern prototype_idct(vp8_short_idct4x4llm_neon);
-extern prototype_idct_scalar_add(vp8_dc_only_idct_add_neon);
-extern prototype_second_order(vp8_short_inv_walsh4x4_1_neon);
-extern prototype_second_order(vp8_short_inv_walsh4x4_neon);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_idct_idct16
-#define vp8_idct_idct16 vp8_short_idct4x4llm_neon
-
-#undef  vp8_idct_idct1_scalar_add
-#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_neon
-
-#undef  vp8_idct_iwalsh16
-#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_neon
-#endif
-#endif
-
-#endif
--- a/vp8/common/arm/loopfilter_arm.c
+++ b/vp8/common/arm/loopfilter_arm.c
@@ -10,17 +10,22 @@


 #include "vpx_config.h"
+#include "vpx_rtcd.h"
 #include "vp8/common/loopfilter.h"
 #include "vp8/common/onyxc_int.h"

-#if HAVE_ARMV6
+#define prototype_loopfilter(sym) /* expands to the declaration of an edge-filter function named sym */ \
+    void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
+             const unsigned char *limit, const unsigned char *thresh, int count) /* used below to declare the *_armv6 edge filters */
+
+#if HAVE_MEDIA
 extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6);
 extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6);
 extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6);
 extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6);
 #endif

-#if HAVE_ARMV7
+#if HAVE_NEON
 typedef void loopfilter_y_neon(unsigned char *src, int pitch,
        unsigned char blimit, unsigned char limit, unsigned char thresh);
 typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
@@ -38,8 +43,8 @@ extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon;
 extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon;
 #endif

-#if HAVE_ARMV6
-/*ARMV6 loopfilter functions*/
+#if HAVE_MEDIA
+/* ARMV6/MEDIA loopfilter functions*/
 /* Horizontal MB filtering */
 void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                               int y_stride, int uv_stride, loop_filter_info *lfi)
@@ -113,7 +118,7 @@ void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride,
 }
 #endif

-#if HAVE_ARMV7
+#if HAVE_NEON
 /* NEON loopfilter functions */
 /* Horizontal MB filtering */
 void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
--- a/vp8/common/arm/loopfilter_arm.h
+++ b/vp8/common/arm/loopfilter_arm.h
@@ -1,93 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef LOOPFILTER_ARM_H
-#define LOOPFILTER_ARM_H
-
-#include "vpx_config.h"
-
-#if HAVE_ARMV6
-extern prototype_loopfilter_block(vp8_loop_filter_mbv_armv6);
-extern prototype_loopfilter_block(vp8_loop_filter_bv_armv6);
-extern prototype_loopfilter_block(vp8_loop_filter_mbh_armv6);
-extern prototype_loopfilter_block(vp8_loop_filter_bh_armv6);
-extern prototype_simple_loopfilter(vp8_loop_filter_bvs_armv6);
-extern prototype_simple_loopfilter(vp8_loop_filter_bhs_armv6);
-extern prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_armv6);
-extern prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_armv6);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_lf_normal_mb_v
-#define vp8_lf_normal_mb_v vp8_loop_filter_mbv_armv6
-
-#undef  vp8_lf_normal_b_v
-#define vp8_lf_normal_b_v vp8_loop_filter_bv_armv6
-
-#undef  vp8_lf_normal_mb_h
-#define vp8_lf_normal_mb_h vp8_loop_filter_mbh_armv6
-
-#undef  vp8_lf_normal_b_h
-#define vp8_lf_normal_b_h vp8_loop_filter_bh_armv6
-
-#undef  vp8_lf_simple_mb_v
-#define vp8_lf_simple_mb_v vp8_loop_filter_simple_vertical_edge_armv6
-
-#undef  vp8_lf_simple_b_v
-#define vp8_lf_simple_b_v vp8_loop_filter_bvs_armv6
-
-#undef  vp8_lf_simple_mb_h
-#define vp8_lf_simple_mb_h vp8_loop_filter_simple_horizontal_edge_armv6
-
-#undef  vp8_lf_simple_b_h
-#define vp8_lf_simple_b_h vp8_loop_filter_bhs_armv6
-#endif /* !CONFIG_RUNTIME_CPU_DETECT */
-
-#endif /* HAVE_ARMV6 */
-
-#if HAVE_ARMV7
-extern prototype_loopfilter_block(vp8_loop_filter_mbv_neon);
-extern prototype_loopfilter_block(vp8_loop_filter_bv_neon);
-extern prototype_loopfilter_block(vp8_loop_filter_mbh_neon);
-extern prototype_loopfilter_block(vp8_loop_filter_bh_neon);
-extern prototype_simple_loopfilter(vp8_loop_filter_mbvs_neon);
-extern prototype_simple_loopfilter(vp8_loop_filter_bvs_neon);
-extern prototype_simple_loopfilter(vp8_loop_filter_mbhs_neon);
-extern prototype_simple_loopfilter(vp8_loop_filter_bhs_neon);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_lf_normal_mb_v
-#define vp8_lf_normal_mb_v vp8_loop_filter_mbv_neon
-
-#undef  vp8_lf_normal_b_v
-#define vp8_lf_normal_b_v vp8_loop_filter_bv_neon
-
-#undef  vp8_lf_normal_mb_h
-#define vp8_lf_normal_mb_h vp8_loop_filter_mbh_neon
-
-#undef  vp8_lf_normal_b_h
-#define vp8_lf_normal_b_h vp8_loop_filter_bh_neon
-
-#undef  vp8_lf_simple_mb_v
-#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_neon
-
-#undef  vp8_lf_simple_b_v
-#define vp8_lf_simple_b_v vp8_loop_filter_bvs_neon
-
-#undef  vp8_lf_simple_mb_h
-#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_neon
-
-#undef  vp8_lf_simple_b_h
-#define vp8_lf_simple_b_h vp8_loop_filter_bhs_neon
-#endif /* !CONFIG_RUNTIME_CPU_DETECT */
-
-#endif /* HAVE_ARMV7 */
-
-#endif /* LOOPFILTER_ARM_H */
--- a/vp8/common/arm/neon/idct_blk_neon.c
+++ b/vp8/common/arm/neon/idct_blk_neon.c
@@ -9,8 +9,7 @@
 */

 #include "vpx_config.h"
-#include "vp8/common/idct.h"
-#include "vp8/common/dequantize.h"
+#include "vpx_rtcd.h"

 /* place these declarations here because we don't want to maintain them
 * outside of this scope
--- a/vp8/common/arm/neon/save_reg_neon.asm
+++ b/vp8/common/arm/neon/save_reg_neon.asm
--- a/vp8/common/arm/neon/sixtappredict4x4_neon.asm
+++ b/vp8/common/arm/neon/sixtappredict4x4_neon.asm
@@ -9,7 +9,7 @@
 ;


-    EXPORT  |vp8_sixtap_predict_neon|
+    EXPORT  |vp8_sixtap_predict4x4_neon|
    ARM
    REQUIRE8
    PRESERVE8
@@ -33,7 +33,7 @@ filter4_coeff
 ; stack(r4) unsigned char *dst_ptr,
 ; stack(lr) int  dst_pitch

-|vp8_sixtap_predict_neon| PROC
+|vp8_sixtap_predict4x4_neon| PROC
    push            {r4, lr}

    adr             r12, filter4_coeff
--- a/vp8/common/arm/recon_arm.h
+++ b/vp8/common/arm/recon_arm.h
@@ -1,65 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef RECON_ARM_H
-#define RECON_ARM_H
-
-#if HAVE_ARMV6
-
-extern prototype_copy_block(vp8_copy_mem8x8_v6);
-extern prototype_copy_block(vp8_copy_mem8x4_v6);
-extern prototype_copy_block(vp8_copy_mem16x16_v6);
-extern prototype_intra4x4_predict(vp8_intra4x4_predict_armv6);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_recon_copy8x8
-#define vp8_recon_copy8x8 vp8_copy_mem8x8_v6
-
-#undef  vp8_recon_copy8x4
-#define vp8_recon_copy8x4 vp8_copy_mem8x4_v6
-
-#undef  vp8_recon_copy16x16
-#define vp8_recon_copy16x16 vp8_copy_mem16x16_v6
-
-#undef  vp8_recon_intra4x4_predict
-#define vp8_recon_intra4x4_predict vp8_intra4x4_predict_armv6
-#endif
-#endif
-
-#if HAVE_ARMV7
-
-extern prototype_copy_block(vp8_copy_mem8x8_neon);
-extern prototype_copy_block(vp8_copy_mem8x4_neon);
-extern prototype_copy_block(vp8_copy_mem16x16_neon);
-
-extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_neon);
-extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_s_neon);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_recon_copy8x8
-#define vp8_recon_copy8x8 vp8_copy_mem8x8_neon
-
-#undef  vp8_recon_copy8x4
-#define vp8_recon_copy8x4 vp8_copy_mem8x4_neon
-
-#undef  vp8_recon_copy16x16
-#define vp8_recon_copy16x16 vp8_copy_mem16x16_neon
-
-#undef  vp8_recon_build_intra_predictors_mby
-#define vp8_recon_build_intra_predictors_mby vp8_build_intra_predictors_mby_neon
-
-#undef  vp8_recon_build_intra_predictors_mby_s
-#define vp8_recon_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_neon
-
-#endif
-#endif
-
-#endif
--- a/vp8/common/arm/reconintra_arm.c
+++ b/vp8/common/arm/reconintra_arm.c
@@ -10,12 +10,11 @@


 #include "vpx_config.h"
+#include "vpx_rtcd.h"
 #include "vp8/common/blockd.h"
-#include "vp8/common/reconintra.h"
 #include "vpx_mem/vpx_mem.h"
-#include "vp8/common/recon.h"

-#if HAVE_ARMV7
+#if HAVE_NEON
 extern void vp8_build_intra_predictors_mby_neon_func(
    unsigned char *y_buffer,
    unsigned char *ypred_ptr,
@@ -35,10 +34,7 @@ void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x)

    vp8_build_intra_predictors_mby_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left);
 }
-#endif

-
-#if HAVE_ARMV7
 extern void vp8_build_intra_predictors_mby_s_neon_func(
    unsigned char *y_buffer,
    unsigned char *ypred_ptr,
--- a/vp8/common/arm/subpixel_arm.h
+++ b/vp8/common/arm/subpixel_arm.h
@@ -1,89 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef SUBPIXEL_ARM_H
-#define SUBPIXEL_ARM_H
-
-#if HAVE_ARMV6
-extern prototype_subpixel_predict(vp8_sixtap_predict16x16_armv6);
-extern prototype_subpixel_predict(vp8_sixtap_predict8x8_armv6);
-extern prototype_subpixel_predict(vp8_sixtap_predict8x4_armv6);
-extern prototype_subpixel_predict(vp8_sixtap_predict_armv6);
-extern prototype_subpixel_predict(vp8_bilinear_predict16x16_armv6);
-extern prototype_subpixel_predict(vp8_bilinear_predict8x8_armv6);
-extern prototype_subpixel_predict(vp8_bilinear_predict8x4_armv6);
-extern prototype_subpixel_predict(vp8_bilinear_predict4x4_armv6);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_subpix_sixtap16x16
-#define vp8_subpix_sixtap16x16 vp8_sixtap_predict16x16_armv6
-
-#undef  vp8_subpix_sixtap8x8
-#define vp8_subpix_sixtap8x8 vp8_sixtap_predict8x8_armv6
-
-#undef  vp8_subpix_sixtap8x4
-#define vp8_subpix_sixtap8x4 vp8_sixtap_predict8x4_armv6
-
-#undef  vp8_subpix_sixtap4x4
-#define vp8_subpix_sixtap4x4 vp8_sixtap_predict_armv6
-
-#undef  vp8_subpix_bilinear16x16
-#define vp8_subpix_bilinear16x16 vp8_bilinear_predict16x16_armv6
-
-#undef  vp8_subpix_bilinear8x8
-#define vp8_subpix_bilinear8x8 vp8_bilinear_predict8x8_armv6
-
-#undef  vp8_subpix_bilinear8x4
-#define vp8_subpix_bilinear8x4 vp8_bilinear_predict8x4_armv6
-
-#undef  vp8_subpix_bilinear4x4
-#define vp8_subpix_bilinear4x4 vp8_bilinear_predict4x4_armv6
-#endif
-#endif
-
-#if HAVE_ARMV7
-extern prototype_subpixel_predict(vp8_sixtap_predict16x16_neon);
-extern prototype_subpixel_predict(vp8_sixtap_predict8x8_neon);
-extern prototype_subpixel_predict(vp8_sixtap_predict8x4_neon);
-extern prototype_subpixel_predict(vp8_sixtap_predict_neon);
-extern prototype_subpixel_predict(vp8_bilinear_predict16x16_neon);
-extern prototype_subpixel_predict(vp8_bilinear_predict8x8_neon);
-extern prototype_subpixel_predict(vp8_bilinear_predict8x4_neon);
-extern prototype_subpixel_predict(vp8_bilinear_predict4x4_neon);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_subpix_sixtap16x16
-#define vp8_subpix_sixtap16x16 vp8_sixtap_predict16x16_neon
-
-#undef  vp8_subpix_sixtap8x8
-#define vp8_subpix_sixtap8x8 vp8_sixtap_predict8x8_neon
-
-#undef  vp8_subpix_sixtap8x4
-#define vp8_subpix_sixtap8x4 vp8_sixtap_predict8x4_neon
-
-#undef  vp8_subpix_sixtap4x4
-#define vp8_subpix_sixtap4x4 vp8_sixtap_predict_neon
-
-#undef  vp8_subpix_bilinear16x16
-#define vp8_subpix_bilinear16x16 vp8_bilinear_predict16x16_neon
-
-#undef  vp8_subpix_bilinear8x8
-#define vp8_subpix_bilinear8x8 vp8_bilinear_predict8x8_neon
-
-#undef  vp8_subpix_bilinear8x4
-#define vp8_subpix_bilinear8x4 vp8_bilinear_predict8x4_neon
-
-#undef  vp8_subpix_bilinear4x4
-#define vp8_subpix_bilinear4x4 vp8_bilinear_predict4x4_neon
-#endif
-#endif
-
-#endif
--- a/vp8/common/asm_com_offsets.c
+++ b/vp8/common/asm_com_offsets.c
@@ -35,7 +35,7 @@ END
 /* add asserts for any offset that is not supported by assembly code */
 /* add asserts for any size that is not supported by assembly code */

-#if HAVE_ARMV6
+#if HAVE_MEDIA
 /* switch case in vp8_intra4x4_predict_armv6 is based on these enumerated values */
 ct_assert(B_DC_PRED, B_DC_PRED == 0);
 ct_assert(B_TM_PRED, B_TM_PRED == 1);
@@ -49,7 +49,7 @@ ct_assert(B_HD_PRED, B_HD_PRED == 8);
 ct_assert(B_HU_PRED, B_HU_PRED == 9);
 #endif

-#if HAVE_ARMV7
+#if HAVE_NEON
 /* vp8_yv12_extend_frame_borders_neon makes several assumptions based on this */
 ct_assert(VP8BORDERINPIXELS_VAL, VP8BORDERINPIXELS == 32)
 #endif
--- a/vp8/common/blockd.h
+++ b/vp8/common/blockd.h
@@ -18,7 +18,6 @@ void vpx_log(const char *format, ...);
 #include "vpx_scale/yv12config.h"
 #include "mv.h"
 #include "treecoder.h"
-#include "subpixel.h"
 #include "vpx_ports/mem.h"

 /*#define DCPRED 1*/
@@ -179,28 +178,22 @@ typedef struct
 } LOWER_RES_INFO;
 #endif

-typedef struct
+typedef struct blockd
 {
    short *qcoeff;
    short *dqcoeff;
    unsigned char  *predictor;
    short *dequant;

-    /* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */
-    unsigned char **base_pre;
-    int pre;
-    int pre_stride;
-
-    unsigned char **base_dst;
-    int dst;
-    int dst_stride;
-
+    int offset;
    char *eob;

    union b_mode_info bmi;
 } BLOCKD;

-typedef struct MacroBlockD
+typedef void (*vp8_subpix_fn_t)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); /* pointer type of the subpixel_predict* members of MACROBLOCKD; declared here now that subpixel.h is no longer included */
+
+typedef struct macroblockd
 {
    DECLARE_ALIGNED(16, unsigned char,  predictor[384]);
    DECLARE_ALIGNED(16, short, qcoeff[400]);
@@ -212,6 +205,11 @@ typedef struct MacroBlockD
    DECLARE_ALIGNED(16, short,  dequant_y2[16]);
    DECLARE_ALIGNED(16, short,  dequant_uv[16]);

+    /* position of this macroblock */
+    int mbr;  /* presumably the macroblock row -- TODO confirm against the code that sets it */
+    int mbc;  /* presumably the macroblock column -- TODO confirm */
+    int mbrc; /* NOTE(review): meaning not visible in this hunk (combined row/column index?) -- confirm */
+
    /* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */
    BLOCKD block[25];
    int fullpixel_mask;
@@ -265,11 +263,8 @@ typedef struct MacroBlockD
    int mb_to_top_edge;
    int mb_to_bottom_edge;

-    int ref_frame_cost[MAX_REF_FRAMES];


-    unsigned int frames_since_golden;
-    unsigned int frames_till_alt_ref_frame;
    vp8_subpix_fn_t  subpixel_predict;
    vp8_subpix_fn_t  subpixel_predict8x4;
    vp8_subpix_fn_t  subpixel_predict8x8;
@@ -286,10 +281,6 @@ typedef struct MacroBlockD
     */
    DECLARE_ALIGNED(32, unsigned char, y_buf[22*32]);
 #endif
-
-#if CONFIG_RUNTIME_CPU_DETECT
-    struct VP8_COMMON_RTCD  *rtcd;
-#endif
 } MACROBLOCKD;


--- a/vp8/common/dequantize.c
+++ b/vp8/common/dequantize.c
@@ -10,8 +10,8 @@


 #include "vpx_config.h"
-#include "dequantize.h"
-#include "vp8/common/idct.h"
+#include "vpx_rtcd.h"
+#include "vp8/common/blockd.h"
 #include "vpx_mem/vpx_mem.h"

 void vp8_dequantize_b_c(BLOCKD *d, short *DQC)
--- a/vp8/common/dequantize.h
+++ b/vp8/common/dequantize.h
@@ -1,85 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef DEQUANTIZE_H
-#define DEQUANTIZE_H
-#include "vp8/common/blockd.h"
-
-#define prototype_dequant_block(sym) \
-    void sym(BLOCKD *x, short *DQC)
-
-#define prototype_dequant_idct_add(sym) \
-    void sym(short *input, short *dq, \
-             unsigned char *output, \
-             int stride)
-
-#define prototype_dequant_idct_add_y_block(sym) \
-    void sym(short *q, short *dq, \
-             unsigned char *dst, \
-             int stride, char *eobs)
-
-#define prototype_dequant_idct_add_uv_block(sym) \
-    void sym(short *q, short *dq, \
-             unsigned char *dst_u, \
-             unsigned char *dst_v, int stride, char *eobs)
-
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/dequantize_x86.h"
-#endif
-
-#if ARCH_ARM
-#include "arm/dequantize_arm.h"
-#endif
-
-#ifndef vp8_dequant_block
-#define vp8_dequant_block vp8_dequantize_b_c
-#endif
-extern prototype_dequant_block(vp8_dequant_block);
-
-#ifndef vp8_dequant_idct_add
-#define vp8_dequant_idct_add vp8_dequant_idct_add_c
-#endif
-extern prototype_dequant_idct_add(vp8_dequant_idct_add);
-
-#ifndef vp8_dequant_idct_add_y_block
-#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_c
-#endif
-extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block);
-
-#ifndef vp8_dequant_idct_add_uv_block
-#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_c
-#endif
-extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block);
-
-
-typedef prototype_dequant_block((*vp8_dequant_block_fn_t));
-
-typedef prototype_dequant_idct_add((*vp8_dequant_idct_add_fn_t));
-
-typedef prototype_dequant_idct_add_y_block((*vp8_dequant_idct_add_y_block_fn_t));
-
-typedef prototype_dequant_idct_add_uv_block((*vp8_dequant_idct_add_uv_block_fn_t));
-
-typedef struct
-{
-    vp8_dequant_block_fn_t               block;
-    vp8_dequant_idct_add_fn_t            idct_add;
-    vp8_dequant_idct_add_y_block_fn_t    idct_add_y_block;
-    vp8_dequant_idct_add_uv_block_fn_t   idct_add_uv_block;
-} vp8_dequant_rtcd_vtable_t;
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define DEQUANT_INVOKE(ctx,fn) (ctx)->fn
-#else
-#define DEQUANT_INVOKE(ctx,fn) vp8_dequant_##fn
-#endif
-
-#endif
--- a/vp8/common/filter.c
+++ b/vp8/common/filter.c
@@ -149,7 +149,7 @@ static void filter_block2d
 }


-void vp8_sixtap_predict_c
+void vp8_sixtap_predict4x4_c
 (
    unsigned char  *src_ptr,
    int   src_pixels_per_line,
--- a/vp8/common/generic/systemdependent.c
+++ b/vp8/common/generic/systemdependent.c
@@ -10,30 +10,33 @@


 #include "vpx_config.h"
-#include "vp8/common/subpixel.h"
-#include "vp8/common/loopfilter.h"
-#include "vp8/common/recon.h"
-#include "vp8/common/idct.h"
+#include "vpx_rtcd.h"
+#if ARCH_ARM
+#include "vpx_ports/arm.h"
+#elif ARCH_X86 || ARCH_X86_64
+#include "vpx_ports/x86.h"
+#endif
 #include "vp8/common/onyxc_int.h"

 #if CONFIG_MULTITHREAD
-#if HAVE_UNISTD_H
+#if HAVE_UNISTD_H && !defined(__OS2__)
 #include <unistd.h>
 #elif defined(_WIN32)
 #include <windows.h>
 typedef void (WINAPI *PGNSI)(LPSYSTEM_INFO);
+#elif defined(__OS2__)
+#define INCL_DOS
+#define INCL_DOSSPINLOCK
+#include <os2.h>
 #endif
 #endif

-extern void vp8_arch_x86_common_init(VP8_COMMON *ctx);
-extern void vp8_arch_arm_common_init(VP8_COMMON *ctx);
-
 #if CONFIG_MULTITHREAD
 static int get_cpu_count()
 {
    int core_count = 16;

-#if HAVE_UNISTD_H
+#if HAVE_UNISTD_H && !defined(__OS2__)
 #if defined(_SC_NPROCESSORS_ONLN)
    core_count = sysconf(_SC_NPROCESSORS_ONLN);
 #elif defined(_SC_NPROC_ONLN)
@@ -56,6 +59,21 @@ static int get_cpu_count()

        core_count = sysinfo.dwNumberOfProcessors;
    }
+#elif defined(__OS2__)
+    { /* OS/2: count processors by querying each processor id in turn */
+        ULONG proc_id;
+        ULONG status;
+
+        core_count = 0; /* drop the default of 16 and count from scratch */
+        for (proc_id = 1; ; proc_id++) /* ids are probed from 1 upward; the loop only exits via break */
+        {
+            if (DosGetProcessorStatus(proc_id, &status)) /* non-zero return: stop scanning */
+                break;
+
+            if (status == PROC_ONLINE) /* count only processors reported as online */
+                core_count++;
+        }
+    }
 #else
    /* other platforms */
 #endif
@@ -66,76 +84,15 @@ static int get_cpu_count()

 void vp8_machine_specific_config(VP8_COMMON *ctx)
 {
-#if CONFIG_RUNTIME_CPU_DETECT
-    VP8_COMMON_RTCD *rtcd = &ctx->rtcd;
-
-
-    rtcd->dequant.block             = vp8_dequantize_b_c;
-    rtcd->dequant.idct_add          = vp8_dequant_idct_add_c;
-    rtcd->dequant.idct_add_y_block  = vp8_dequant_idct_add_y_block_c;
-    rtcd->dequant.idct_add_uv_block =
-        vp8_dequant_idct_add_uv_block_c;
-
-
-    rtcd->idct.idct16       = vp8_short_idct4x4llm_c;
-    rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_c;
-    rtcd->idct.iwalsh1      = vp8_short_inv_walsh4x4_1_c;
-    rtcd->idct.iwalsh16     = vp8_short_inv_walsh4x4_c;
-
-    rtcd->recon.copy16x16   = vp8_copy_mem16x16_c;
-    rtcd->recon.copy8x8     = vp8_copy_mem8x8_c;
-    rtcd->recon.copy8x4     = vp8_copy_mem8x4_c;
-
-    rtcd->recon.build_intra_predictors_mby =
-        vp8_build_intra_predictors_mby;
-    rtcd->recon.build_intra_predictors_mby_s =
-        vp8_build_intra_predictors_mby_s;
-    rtcd->recon.build_intra_predictors_mbuv =
-        vp8_build_intra_predictors_mbuv;
-    rtcd->recon.build_intra_predictors_mbuv_s =
-        vp8_build_intra_predictors_mbuv_s;
-    rtcd->recon.intra4x4_predict =
-        vp8_intra4x4_predict_c;
-
-    rtcd->subpix.sixtap16x16   = vp8_sixtap_predict16x16_c;
-    rtcd->subpix.sixtap8x8     = vp8_sixtap_predict8x8_c;
-    rtcd->subpix.sixtap8x4     = vp8_sixtap_predict8x4_c;
-    rtcd->subpix.sixtap4x4     = vp8_sixtap_predict_c;
-    rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_c;
-    rtcd->subpix.bilinear8x8   = vp8_bilinear_predict8x8_c;
-    rtcd->subpix.bilinear8x4   = vp8_bilinear_predict8x4_c;
-    rtcd->subpix.bilinear4x4   = vp8_bilinear_predict4x4_c;
-
-    rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_c;
-    rtcd->loopfilter.normal_b_v  = vp8_loop_filter_bv_c;
-    rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_c;
-    rtcd->loopfilter.normal_b_h  = vp8_loop_filter_bh_c;
-    rtcd->loopfilter.simple_mb_v = vp8_loop_filter_simple_vertical_edge_c;
-    rtcd->loopfilter.simple_b_v  = vp8_loop_filter_bvs_c;
-    rtcd->loopfilter.simple_mb_h = vp8_loop_filter_simple_horizontal_edge_c;
-    rtcd->loopfilter.simple_b_h  = vp8_loop_filter_bhs_c;
-
-#if CONFIG_POSTPROC || (CONFIG_VP8_ENCODER && CONFIG_INTERNAL_STATS)
-    rtcd->postproc.down             = vp8_mbpost_proc_down_c;
-    rtcd->postproc.across           = vp8_mbpost_proc_across_ip_c;
-    rtcd->postproc.downacross       = vp8_post_proc_down_and_across_c;
-    rtcd->postproc.addnoise         = vp8_plane_add_noise_c;
-    rtcd->postproc.blend_mb_inner   = vp8_blend_mb_inner_c;
-    rtcd->postproc.blend_mb_outer   = vp8_blend_mb_outer_c;
-    rtcd->postproc.blend_b          = vp8_blend_b_c;
-#endif
-
-#endif
-
-#if ARCH_X86 || ARCH_X86_64
-    vp8_arch_x86_common_init(ctx);
-#endif
-
-#if ARCH_ARM
-    vp8_arch_arm_common_init(ctx);
-#endif
-
 #if CONFIG_MULTITHREAD
    ctx->processor_core_count = get_cpu_count();
 #endif /* CONFIG_MULTITHREAD */
+
+#if ARCH_ARM
+    ctx->cpu_caps = arm_cpu_caps(); /* store the ARM capability value in the common context (presumably a feature bitmask -- confirm in vpx_ports/arm.h) */
+#elif ARCH_X86 || ARCH_X86_64
+    ctx->cpu_caps = x86_simd_caps(); /* same field, filled from the x86 query (see vpx_ports/x86.h) */
+#endif /* on any other architecture cpu_caps is not assigned here */
+
+    vpx_rtcd(); /* takes the place of the removed rtcd-> table assignments and vp8_arch_*_common_init() calls; presumably sets up the vp8_* dispatch -- confirm in vpx_rtcd.h */
 }
--- a/vp8/common/idct.h
+++ b/vp8/common/idct.h
@@ -1,80 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_IDCT_H
-#define __INC_IDCT_H
-
-#define prototype_second_order(sym) \
-    void sym(short *input, short *output)
-
-#define prototype_idct(sym) \
-    void sym(short *input, unsigned char *pred, int pitch, unsigned char *dst, \
-             int dst_stride)
-
-#define prototype_idct_scalar_add(sym) \
-    void sym(short input, \
-            unsigned char *pred, int pred_stride, \
-            unsigned char *dst, \
-            int dst_stride)
-
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/idct_x86.h"
-#endif
-
-#if ARCH_ARM
-#include "arm/idct_arm.h"
-#endif
-
-#ifndef vp8_idct_idct16
-#define vp8_idct_idct16 vp8_short_idct4x4llm_c
-#endif
-extern prototype_idct(vp8_idct_idct16);
-/* add this prototype to prevent compiler warning about implicit
- * declaration of vp8_short_idct4x4llm_c function in dequantize.c
- * when building, for example, neon optimized version */
-extern prototype_idct(vp8_short_idct4x4llm_c);
-
-#ifndef vp8_idct_idct1_scalar_add
-#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_c
-#endif
-extern prototype_idct_scalar_add(vp8_idct_idct1_scalar_add);
-
-
-#ifndef vp8_idct_iwalsh1
-#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_c
-#endif
-extern prototype_second_order(vp8_idct_iwalsh1);
-
-#ifndef vp8_idct_iwalsh16
-#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_c
-#endif
-extern prototype_second_order(vp8_idct_iwalsh16);
-
-typedef prototype_idct((*vp8_idct_fn_t));
-typedef prototype_idct_scalar_add((*vp8_idct_scalar_add_fn_t));
-typedef prototype_second_order((*vp8_second_order_fn_t));
-
-typedef struct
-{
-    vp8_idct_fn_t            idct16;
-    vp8_idct_scalar_add_fn_t idct1_scalar_add;
-
-    vp8_second_order_fn_t iwalsh1;
-    vp8_second_order_fn_t iwalsh16;
-} vp8_idct_rtcd_vtable_t;
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define IDCT_INVOKE(ctx,fn) (ctx)->fn
-#else
-#define IDCT_INVOKE(ctx,fn) vp8_idct_##fn
-#endif
-
-#endif
--- a/vp8/common/idct_blk.c
+++ b/vp8/common/idct_blk.c
@@ -9,8 +9,7 @@
 */

 #include "vpx_config.h"
-#include "vp8/common/idct.h"
-#include "dequantize.h"
+#include "vpx_rtcd.h"

 void vp8_dequant_idct_add_c(short *input, short *dq,
                            unsigned char *dest, int stride);
--- a/vp8/common/invtrans.h
+++ b/vp8/common/invtrans.h
@@ -13,7 +13,7 @@
 #define __INC_INVTRANS_H

 #include "vpx_config.h"
-#include "idct.h"
+#include "vpx_rtcd.h"
 #include "blockd.h"
 #include "onyxc_int.h"

@@ -33,8 +33,7 @@ static void eob_adjust(char *eobs, short *diff)
    }
 }

-static void vp8_inverse_transform_mby(MACROBLOCKD *xd,
-                                      const VP8_COMMON_RTCD *rtcd)
+static void vp8_inverse_transform_mby(MACROBLOCKD *xd)
 {
    short *DQC = xd->dequant_y1;

@@ -43,19 +42,19 @@ static void vp8_inverse_transform_mby(MACROBLOCKD *xd,
        /* do 2nd order transform on the dc block */
        if (xd->eobs[24] > 1)
        {
-            IDCT_INVOKE(&rtcd->idct, iwalsh16)
+            vp8_short_inv_walsh4x4
                (&xd->block[24].dqcoeff[0], xd->qcoeff);
        }
        else
        {
-            IDCT_INVOKE(&rtcd->idct, iwalsh1)
+            vp8_short_inv_walsh4x4_1
                (&xd->block[24].dqcoeff[0], xd->qcoeff);
        }
        eob_adjust(xd->eobs, xd->qcoeff);

        DQC = xd->dequant_y1_dc;
    }
-    DEQUANT_INVOKE (&rtcd->dequant, idct_add_y_block)
+    vp8_dequant_idct_add_y_block
                    (xd->qcoeff, DQC,
                     xd->dst.y_buffer,
                     xd->dst.y_stride, xd->eobs);
--- a/vp8/common/loopfilter.c
+++ b/vp8/common/loopfilter.c
@@ -10,96 +10,13 @@


 #include "vpx_config.h"
+#include "vpx_rtcd.h"
 #include "loopfilter.h"
 #include "onyxc_int.h"
 #include "vpx_mem/vpx_mem.h"

 typedef unsigned char uc;

-prototype_loopfilter(vp8_loop_filter_horizontal_edge_c);
-prototype_loopfilter(vp8_loop_filter_vertical_edge_c);
-prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_c);
-prototype_loopfilter(vp8_mbloop_filter_vertical_edge_c);
-
-prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_c);
-prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_c);
-
-/* Horizontal MB filtering */
-void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr,
-                           unsigned char *v_ptr, int y_stride, int uv_stride,
-                           loop_filter_info *lfi)
-{
-    vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
-
-    if (u_ptr)
-        vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-
-    if (v_ptr)
-        vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-}
-
-/* Vertical MB Filtering */
-void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr,
-                           unsigned char *v_ptr, int y_stride, int uv_stride,
-                           loop_filter_info *lfi)
-{
-    vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
-
-    if (u_ptr)
-        vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-
-    if (v_ptr)
-        vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-}
-
-/* Horizontal B Filtering */
-void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr,
-                          unsigned char *v_ptr, int y_stride, int uv_stride,
-                          loop_filter_info *lfi)
-{
-    vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-    vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-    vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-
-    if (u_ptr)
-        vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
-
-    if (v_ptr)
-        vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
-}
-
-void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride,
-                           const unsigned char *blimit)
-{
-    vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, blimit);
-    vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, blimit);
-    vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, blimit);
-}
-
-/* Vertical B Filtering */
-void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr,
-                          unsigned char *v_ptr, int y_stride, int uv_stride,
-                          loop_filter_info *lfi)
-{
-    vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-    vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-    vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-
-    if (u_ptr)
-        vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
-
-    if (v_ptr)
-        vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
-}
-
-void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride,
-                           const unsigned char *blimit)
-{
-    vp8_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, blimit);
-    vp8_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, blimit);
-    vp8_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, blimit);
-}
-
 static void lf_init_lut(loop_filter_info_n *lfi)
 {
    int filt_lvl;
@@ -335,39 +252,39 @@ void vp8_loop_filter_frame
                    lfi.hev_thr = lfi_n->hev_thr[hev_index];

                    if (mb_col > 0)
-                        LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_v)
+                        vp8_loop_filter_mbv
                        (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi);

                    if (!skip_lf)
-                        LF_INVOKE(&cm->rtcd.loopfilter, normal_b_v)
+                        vp8_loop_filter_bv
                        (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi);

                    /* don't apply across umv border */
                    if (mb_row > 0)
-                        LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_h)
+                        vp8_loop_filter_mbh
                        (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi);

                    if (!skip_lf)
-                        LF_INVOKE(&cm->rtcd.loopfilter, normal_b_h)
+                        vp8_loop_filter_bh
                        (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi);
                }
                else
                {
                    if (mb_col > 0)
-                        LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v)
+                        vp8_loop_filter_simple_mbv
                        (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);

                    if (!skip_lf)
-                        LF_INVOKE(&cm->rtcd.loopfilter, simple_b_v)
+                        vp8_loop_filter_simple_bv
                        (y_ptr, post->y_stride, lfi_n->blim[filter_level]);

                    /* don't apply across umv border */
                    if (mb_row > 0)
-                        LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_h)
+                        vp8_loop_filter_simple_mbh
                        (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);

                    if (!skip_lf)
-                        LF_INVOKE(&cm->rtcd.loopfilter, simple_b_h)
+                        vp8_loop_filter_simple_bh
                        (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
                }
            }
@@ -446,39 +363,39 @@ void vp8_loop_filter_frame_yonly
                    lfi.hev_thr = lfi_n->hev_thr[hev_index];

                    if (mb_col > 0)
-                        LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_v)
+                        vp8_loop_filter_mbv
                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);

                    if (!skip_lf)
-                        LF_INVOKE(&cm->rtcd.loopfilter, normal_b_v)
+                        vp8_loop_filter_bv
                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);

                    /* don't apply across umv border */
                    if (mb_row > 0)
-                        LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_h)
+                        vp8_loop_filter_mbh
                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);

                    if (!skip_lf)
-                        LF_INVOKE(&cm->rtcd.loopfilter, normal_b_h)
+                        vp8_loop_filter_bh
                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);
                }
                else
                {
                    if (mb_col > 0)
-                        LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v)
+                        vp8_loop_filter_simple_mbv
                        (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);

                    if (!skip_lf)
-                        LF_INVOKE(&cm->rtcd.loopfilter, simple_b_v)
+                        vp8_loop_filter_simple_bv
                        (y_ptr, post->y_stride, lfi_n->blim[filter_level]);

                    /* don't apply across umv border */
                    if (mb_row > 0)
-                        LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_h)
+                        vp8_loop_filter_simple_mbh
                        (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);

                    if (!skip_lf)
-                        LF_INVOKE(&cm->rtcd.loopfilter, simple_b_h)
+                        vp8_loop_filter_simple_bh
                        (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
                }
            }
@@ -578,35 +495,35 @@ void vp8_loop_filter_partial_frame
                    lfi.hev_thr = lfi_n->hev_thr[hev_index];

                    if (mb_col > 0)
-                        LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_v)
+                        vp8_loop_filter_mbv
                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);

                    if (!skip_lf)
-                        LF_INVOKE(&cm->rtcd.loopfilter, normal_b_v)
+                        vp8_loop_filter_bv
                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);

-                    LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_h)
+                    vp8_loop_filter_mbh
                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);

                    if (!skip_lf)
-                        LF_INVOKE(&cm->rtcd.loopfilter, normal_b_h)
+                        vp8_loop_filter_bh
                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);
                }
                else
                {
                    if (mb_col > 0)
-                        LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v)
+                        vp8_loop_filter_simple_mbv
                        (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);

                    if (!skip_lf)
-                        LF_INVOKE(&cm->rtcd.loopfilter, simple_b_v)
+                        vp8_loop_filter_simple_bv
                        (y_ptr, post->y_stride, lfi_n->blim[filter_level]);

-                    LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_h)
+                    vp8_loop_filter_simple_mbh
                        (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);

                    if (!skip_lf)
-                        LF_INVOKE(&cm->rtcd.loopfilter, simple_b_h)
+                        vp8_loop_filter_simple_bh
                        (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
                }
            }
--- a/vp8/common/loopfilter.h
+++ b/vp8/common/loopfilter.h
@@ -14,6 +14,7 @@

 #include "vpx_ports/mem.h"
 #include "vpx_config.h"
+#include "vpx_rtcd.h"

 #define MAX_LOOP_FILTER             63
 /* fraction of total macroblock rows to be used in fast filter level picking */
@@ -46,7 +47,7 @@ typedef struct
    unsigned char mode_lf_lut[10];
 } loop_filter_info_n;

-typedef struct
+typedef struct loop_filter_info
 {
    const unsigned char * mblim;
    const unsigned char * blim;
@@ -55,86 +56,6 @@ typedef struct
 } loop_filter_info;


-#define prototype_loopfilter(sym) \
-    void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
-             const unsigned char *limit, const unsigned char *thresh, int count)
-
-#define prototype_loopfilter_block(sym) \
-    void sym(unsigned char *y, unsigned char *u, unsigned char *v, \
-             int ystride, int uv_stride, loop_filter_info *lfi)
-
-#define prototype_simple_loopfilter(sym) \
-    void sym(unsigned char *y, int ystride, const unsigned char *blimit)
-
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/loopfilter_x86.h"
-#endif
-
-#if ARCH_ARM
-#include "arm/loopfilter_arm.h"
-#endif
-
-#ifndef vp8_lf_normal_mb_v
-#define vp8_lf_normal_mb_v vp8_loop_filter_mbv_c
-#endif
-extern prototype_loopfilter_block(vp8_lf_normal_mb_v);
-
-#ifndef vp8_lf_normal_b_v
-#define vp8_lf_normal_b_v vp8_loop_filter_bv_c
-#endif
-extern prototype_loopfilter_block(vp8_lf_normal_b_v);
-
-#ifndef vp8_lf_normal_mb_h
-#define vp8_lf_normal_mb_h vp8_loop_filter_mbh_c
-#endif
-extern prototype_loopfilter_block(vp8_lf_normal_mb_h);
-
-#ifndef vp8_lf_normal_b_h
-#define vp8_lf_normal_b_h vp8_loop_filter_bh_c
-#endif
-extern prototype_loopfilter_block(vp8_lf_normal_b_h);
-
-#ifndef vp8_lf_simple_mb_v
-#define vp8_lf_simple_mb_v vp8_loop_filter_simple_vertical_edge_c
-#endif
-extern prototype_simple_loopfilter(vp8_lf_simple_mb_v);
-
-#ifndef vp8_lf_simple_b_v
-#define vp8_lf_simple_b_v vp8_loop_filter_bvs_c
-#endif
-extern prototype_simple_loopfilter(vp8_lf_simple_b_v);
-
-#ifndef vp8_lf_simple_mb_h
-#define vp8_lf_simple_mb_h vp8_loop_filter_simple_horizontal_edge_c
-#endif
-extern prototype_simple_loopfilter(vp8_lf_simple_mb_h);
-
-#ifndef vp8_lf_simple_b_h
-#define vp8_lf_simple_b_h vp8_loop_filter_bhs_c
-#endif
-extern prototype_simple_loopfilter(vp8_lf_simple_b_h);
-
-typedef prototype_loopfilter_block((*vp8_lf_block_fn_t));
-typedef prototype_simple_loopfilter((*vp8_slf_block_fn_t));
-
-typedef struct
-{
-    vp8_lf_block_fn_t  normal_mb_v;
-    vp8_lf_block_fn_t  normal_b_v;
-    vp8_lf_block_fn_t  normal_mb_h;
-    vp8_lf_block_fn_t  normal_b_h;
-    vp8_slf_block_fn_t  simple_mb_v;
-    vp8_slf_block_fn_t  simple_b_v;
-    vp8_slf_block_fn_t  simple_mb_h;
-    vp8_slf_block_fn_t  simple_b_h;
-} vp8_loopfilter_rtcd_vtable_t;
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define LF_INVOKE(ctx,fn) (ctx)->fn
-#else
-#define LF_INVOKE(ctx,fn) vp8_lf_##fn
-#endif
-
 typedef void loop_filter_uvfunction
 (
    unsigned char *u,   /* source pointer */
@@ -147,22 +68,22 @@ typedef void loop_filter_uvfunction

 /* assorted loopfilter functions which get used elsewhere */
 struct VP8Common;
-struct MacroBlockD;
+struct macroblockd;

 void vp8_loop_filter_init(struct VP8Common *cm);

 void vp8_loop_filter_frame_init(struct VP8Common *cm,
-                                struct MacroBlockD *mbd,
+                                struct macroblockd *mbd,
                                int default_filt_lvl);

-void vp8_loop_filter_frame(struct VP8Common *cm, struct MacroBlockD *mbd);
+void vp8_loop_filter_frame(struct VP8Common *cm, struct macroblockd *mbd);

 void vp8_loop_filter_partial_frame(struct VP8Common *cm,
-                                   struct MacroBlockD *mbd,
+                                   struct macroblockd *mbd,
                                   int default_filt_lvl);

 void vp8_loop_filter_frame_yonly(struct VP8Common *cm,
-                                 struct MacroBlockD *mbd,
+                                 struct macroblockd *mbd,
                                 int default_filt_lvl);

 void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi,
--- a/vp8/common/loopfilter_filters.c
+++ b/vp8/common/loopfilter_filters.c
@@ -352,3 +352,79 @@ void vp8_loop_filter_simple_vertical_edge_c
    while (++i < 16);

 }
+
+/* Horizontal MB filtering: runs vp8_mbloop_filter_horizontal_edge_c at y_ptr, then at u_ptr and v_ptr when each is non-NULL. */
+void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                           unsigned char *v_ptr, int y_stride, int uv_stride,
+                           loop_filter_info *lfi)
+{   /* Every call uses lfi->mblim, lfi->lim and lfi->hev_thr; the last argument is 2 for y_ptr and 1 for u_ptr/v_ptr. */
+    vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+    if (u_ptr)
+        vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+    if (v_ptr)
+        vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Vertical MB Filtering: runs vp8_mbloop_filter_vertical_edge_c at y_ptr, then at u_ptr and v_ptr when each is non-NULL. */
+void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                           unsigned char *v_ptr, int y_stride, int uv_stride,
+                           loop_filter_info *lfi)
+{   /* Every call uses lfi->mblim, lfi->lim and lfi->hev_thr; the last argument is 2 for y_ptr and 1 for u_ptr/v_ptr. */
+    vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+    if (u_ptr)
+        vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+    if (v_ptr)
+        vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Horizontal B Filtering: runs vp8_loop_filter_horizontal_edge_c at 4, 8 and 12 * y_stride past y_ptr, and at 4 * uv_stride past u_ptr and v_ptr when each is non-NULL. */
+void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                          unsigned char *v_ptr, int y_stride, int uv_stride,
+                          loop_filter_info *lfi)
+{   /* Every call uses lfi->blim, lfi->lim and lfi->hev_thr; the last argument is 2 for y_ptr and 1 for u_ptr/v_ptr. */
+    vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+    if (u_ptr)
+        vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+    if (v_ptr)
+        vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride,
+                           const unsigned char *blimit)
+{   /* Runs vp8_loop_filter_simple_horizontal_edge_c with blimit at 4, 8 and 12 * y_stride past y_ptr; only y_ptr is touched. */
+    vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, blimit);
+    vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, blimit);
+    vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, blimit);
+}
+
+/* Vertical B Filtering: runs vp8_loop_filter_vertical_edge_c at y_ptr + 4, + 8 and + 12, and at u_ptr + 4 and v_ptr + 4 when each pointer is non-NULL. */
+void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                          unsigned char *v_ptr, int y_stride, int uv_stride,
+                          loop_filter_info *lfi)
+{   /* Every call uses lfi->blim, lfi->lim and lfi->hev_thr; the last argument is 2 for y_ptr and 1 for u_ptr/v_ptr. */
+    vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+    if (u_ptr)
+        vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+    if (v_ptr)
+        vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride,
+                           const unsigned char *blimit)
+{   /* Runs vp8_loop_filter_simple_vertical_edge_c with blimit at y_ptr + 4, + 8 and + 12; only y_ptr is touched. */
+    vp8_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, blimit);
+    vp8_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, blimit);
+    vp8_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, blimit);
+}
--- a/vp8/common/mbpitch.c
+++ b/vp8/common/mbpitch.c
@@ -17,33 +17,6 @@ typedef enum
    DEST = 1
 } BLOCKSET;

-static void setup_block
-(
-    BLOCKD *b,
-    int mv_stride,
-    unsigned char **base,
-    int Stride,
-    int offset,
-    BLOCKSET bs
-)
-{
-
-    if (bs == DEST)
-    {
-        b->dst_stride = Stride;
-        b->dst = offset;
-        b->base_dst = base;
-    }
-    else
-    {
-        b->pre_stride = Stride;
-        b->pre = offset;
-        b->base_pre = base;
-    }
-
-}
-
-
 static void setup_macroblock(MACROBLOCKD *x, BLOCKSET bs)
 {
    int block;
@@ -65,17 +38,15 @@ static void setup_macroblock(MACROBLOCKD *x, BLOCKSET bs)

    for (block = 0; block < 16; block++) /* y blocks */
    {
-        setup_block(&x->block[block], x->dst.y_stride, y, x->dst.y_stride,
-                        (block >> 2) * 4 * x->dst.y_stride + (block & 3) * 4, bs);
+        x->block[block].offset =
+            (block >> 2) * 4 * x->dst.y_stride + (block & 3) * 4;
    }

    for (block = 16; block < 20; block++) /* U and V blocks */
    {
-        setup_block(&x->block[block], x->dst.uv_stride, u, x->dst.uv_stride,
-                        ((block - 16) >> 1) * 4 * x->dst.uv_stride + (block & 1) * 4, bs);
-
-        setup_block(&x->block[block+4], x->dst.uv_stride, v, x->dst.uv_stride,
-                        ((block - 16) >> 1) * 4 * x->dst.uv_stride + (block & 1) * 4, bs);
+        x->block[block+4].offset =
+        x->block[block].offset =
+            ((block - 16) >> 1) * 4 * x->dst.uv_stride + (block & 1) * 4;
    }
 }

--- a/vp8/common/mv.h
+++ b/vp8/common/mv.h
@@ -19,7 +19,7 @@ typedef struct
    short col;
 } MV;

-typedef union
+typedef union int_mv
 {
    uint32_t  as_int;
    MV        as_mv;
--- a/vp8/common/onyxc_int.h
+++ b/vp8/common/onyxc_int.h
@@ -13,16 +13,14 @@
 #define __INC_VP8C_INT_H

 #include "vpx_config.h"
+#include "vpx_rtcd.h"
 #include "vpx/internal/vpx_codec_internal.h"
 #include "loopfilter.h"
 #include "entropymv.h"
 #include "entropy.h"
-#include "idct.h"
-#include "recon.h"
 #if CONFIG_POSTPROC
 #include "postproc.h"
 #endif
-#include "dequantize.h"

 /*#ifdef PACKET_TESTING*/
 #include "header.h"
@@ -71,23 +69,6 @@ typedef enum
    BILINEAR = 1
 } INTERPOLATIONFILTERTYPE;

-typedef struct VP8_COMMON_RTCD
-{
-#if CONFIG_RUNTIME_CPU_DETECT
-    vp8_dequant_rtcd_vtable_t        dequant;
-    vp8_idct_rtcd_vtable_t        idct;
-    vp8_recon_rtcd_vtable_t       recon;
-    vp8_subpix_rtcd_vtable_t      subpix;
-    vp8_loopfilter_rtcd_vtable_t  loopfilter;
-#if CONFIG_POSTPROC
-    vp8_postproc_rtcd_vtable_t    postproc;
-#endif
-    int                           flags;
-#else
-    int unused;
-#endif
-} VP8_COMMON_RTCD;
-
 typedef struct VP8Common

 {
@@ -203,15 +184,13 @@ typedef struct VP8Common
    double bitrate;
    double framerate;

-#if CONFIG_RUNTIME_CPU_DETECT
-    VP8_COMMON_RTCD rtcd;
-#endif
 #if CONFIG_MULTITHREAD
    int processor_core_count;
 #endif
 #if CONFIG_POSTPROC
    struct postproc_state  postproc_state;
 #endif
+    int cpu_caps;
 } VP8_COMMON;

 #endif
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -10,15 +10,16 @@


 #include "vpx_config.h"
+#include "vpx_rtcd.h"
 #include "vpx_scale/yv12config.h"
 #include "postproc.h"
 #include "common.h"
-#include "recon.h"
 #include "vpx_scale/yv12extend.h"
 #include "vpx_scale/vpxscale.h"
 #include "systemdependent.h"
 #include "../encoder/variance.h"

+#include <limits.h>
 #include <math.h>
 #include <stdlib.h>
 #include <stdio.h>
@@ -329,21 +330,19 @@ static void vp8_deblock_and_de_macro_block(YV12_BUFFER_CONFIG         *source,
        YV12_BUFFER_CONFIG         *post,
        int                         q,
        int                         low_var_thresh,
-        int                         flag,
-        vp8_postproc_rtcd_vtable_t *rtcd)
+        int                         flag)
 {
    double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
    int ppl = (int)(level + .5);
    (void) low_var_thresh;
    (void) flag;

-    POSTPROC_INVOKE(rtcd, downacross)(source->y_buffer, post->y_buffer, source->y_stride,  post->y_stride, source->y_height, source->y_width,  ppl);
-    POSTPROC_INVOKE(rtcd, across)(post->y_buffer, post->y_stride, post->y_height, post->y_width, q2mbl(q));
-    POSTPROC_INVOKE(rtcd, down)(post->y_buffer, post->y_stride, post->y_height, post->y_width, q2mbl(q));
+    vp8_post_proc_down_and_across(source->y_buffer, post->y_buffer, source->y_stride,  post->y_stride, source->y_height, source->y_width,  ppl);
+    vp8_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height, post->y_width, q2mbl(q));
+    vp8_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height, post->y_width, q2mbl(q));

-
-    POSTPROC_INVOKE(rtcd, downacross)(source->u_buffer, post->u_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);
-    POSTPROC_INVOKE(rtcd, downacross)(source->v_buffer, post->v_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);
+    vp8_post_proc_down_and_across(source->u_buffer, post->u_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);
+    vp8_post_proc_down_and_across(source->v_buffer, post->v_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);

 }

@@ -351,25 +350,23 @@ void vp8_deblock(YV12_BUFFER_CONFIG         *source,
                 YV12_BUFFER_CONFIG         *post,
                 int                         q,
                 int                         low_var_thresh,
-                 int                         flag,
-                 vp8_postproc_rtcd_vtable_t *rtcd)
+                 int                         flag)
 {
    double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
    int ppl = (int)(level + .5);
    (void) low_var_thresh;
    (void) flag;

-    POSTPROC_INVOKE(rtcd, downacross)(source->y_buffer, post->y_buffer, source->y_stride,  post->y_stride, source->y_height, source->y_width,   ppl);
-    POSTPROC_INVOKE(rtcd, downacross)(source->u_buffer, post->u_buffer, source->uv_stride, post->uv_stride,  source->uv_height, source->uv_width, ppl);
-    POSTPROC_INVOKE(rtcd, downacross)(source->v_buffer, post->v_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);
+    vp8_post_proc_down_and_across(source->y_buffer, post->y_buffer, source->y_stride,  post->y_stride, source->y_height, source->y_width,   ppl);
+    vp8_post_proc_down_and_across(source->u_buffer, post->u_buffer, source->uv_stride, post->uv_stride,  source->uv_height, source->uv_width, ppl);
+    vp8_post_proc_down_and_across(source->v_buffer, post->v_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);
 }

 void vp8_de_noise(YV12_BUFFER_CONFIG         *source,
                  YV12_BUFFER_CONFIG         *post,
                  int                         q,
                  int                         low_var_thresh,
-                  int                         flag,
-                  vp8_postproc_rtcd_vtable_t *rtcd)
+                  int                         flag)
 {
    double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
    int ppl = (int)(level + .5);
@@ -377,7 +374,7 @@ void vp8_de_noise(YV12_BUFFER_CONFIG         *source,
    (void) low_var_thresh;
    (void) flag;

-    POSTPROC_INVOKE(rtcd, downacross)(
+    vp8_post_proc_down_and_across(
        source->y_buffer + 2 * source->y_stride + 2,
        source->y_buffer + 2 * source->y_stride + 2,
        source->y_stride,
@@ -385,14 +382,14 @@ void vp8_de_noise(YV12_BUFFER_CONFIG         *source,
        source->y_height - 4,
        source->y_width - 4,
        ppl);
-    POSTPROC_INVOKE(rtcd, downacross)(
+    vp8_post_proc_down_and_across(
        source->u_buffer + 2 * source->uv_stride + 2,
        source->u_buffer + 2 * source->uv_stride + 2,
        source->uv_stride,
        source->uv_stride,
        source->uv_height - 4,
        source->uv_width - 4, ppl);
-    POSTPROC_INVOKE(rtcd, downacross)(
+    vp8_post_proc_down_and_across(
        source->v_buffer + 2 * source->uv_stride + 2,
        source->v_buffer + 2 * source->uv_stride + 2,
        source->uv_stride,
@@ -732,18 +729,18 @@ static void multiframe_quality_enhance_block
    unsigned int act, sse, sad, thr;
    if (blksize == 16)
    {
-        act = (vp8_variance_var16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
-        sad = (vp8_variance_sad16x16(y, y_stride, yd, yd_stride, 0)+128)>>8;
+        act = (vp8_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
+        sad = (vp8_sad16x16(y, y_stride, yd, yd_stride, INT_MAX)+128)>>8;
    }
    else if (blksize == 8)
    {
-        act = (vp8_variance_var8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
-        sad = (vp8_variance_sad8x8(y, y_stride, yd, yd_stride, 0)+32)>>6;
+        act = (vp8_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
+        sad = (vp8_sad8x8(y, y_stride, yd, yd_stride, INT_MAX)+32)>>6;
    }
    else
    {
-        act = (vp8_variance_var4x4(yd, yd_stride, VP8_ZEROS, 0, &sse)+8)>>4;
-        sad = (vp8_variance_sad4x4(y, y_stride, yd, yd_stride, 0)+8)>>4;
+        act = (vp8_variance4x4(yd, yd_stride, VP8_ZEROS, 0, &sse)+8)>>4;
+        sad = (vp8_sad4x4(y, y_stride, yd, yd_stride, INT_MAX)+8)>>4;
    }
    /* thr = qdiff/8 + log2(act) + log4(qprev) */
    thr = (qdiff>>3);
@@ -779,13 +776,13 @@ static void multiframe_quality_enhance_block
    {
        if (blksize == 16)
        {
-            vp8_recon_copy16x16(y, y_stride, yd, yd_stride);
-            vp8_recon_copy8x8(u, uv_stride, ud, uvd_stride);
-            vp8_recon_copy8x8(v, uv_stride, vd, uvd_stride);
+            vp8_copy_mem16x16(y, y_stride, yd, yd_stride);
+            vp8_copy_mem8x8(u, uv_stride, ud, uvd_stride);
+            vp8_copy_mem8x8(v, uv_stride, vd, uvd_stride);
        }
        else if (blksize == 8)
        {
-            vp8_recon_copy8x8(y, y_stride, yd, yd_stride);
+            vp8_copy_mem8x8(y, y_stride, yd, yd_stride);
            for (up = u, udp = ud, i = 0; i < blksizeby2; ++i, up += uv_stride, udp += uvd_stride)
                vpx_memcpy(udp, up, blksizeby2);
            for (vp = v, vdp = vd, i = 0; i < blksizeby2; ++i, vp += uv_stride, vdp += uvd_stride)
@@ -803,12 +800,6 @@ static void multiframe_quality_enhance_block
    }
 }

-#if CONFIG_RUNTIME_CPU_DETECT
-#define RTCD_VTABLE(oci) (&(oci)->rtcd.postproc)
-#else
-#define RTCD_VTABLE(oci) NULL
-#endif
-
 void vp8_multiframe_quality_enhance
 (
    VP8_COMMON *cm
@@ -886,9 +877,9 @@ void vp8_multiframe_quality_enhance
            }
            else
            {
-                vp8_recon_copy16x16(y_ptr, show->y_stride, yd_ptr, dest->y_stride);
-                vp8_recon_copy8x8(u_ptr, show->uv_stride, ud_ptr, dest->uv_stride);
-                vp8_recon_copy8x8(v_ptr, show->uv_stride, vd_ptr, dest->uv_stride);
+                vp8_copy_mem16x16(y_ptr, show->y_stride, yd_ptr, dest->y_stride);
+                vp8_copy_mem8x8(u_ptr, show->uv_stride, ud_ptr, dest->uv_stride);
+                vp8_copy_mem8x8(v_ptr, show->uv_stride, vd_ptr, dest->uv_stride);
            }
            y_ptr += 16;
            u_ptr += 8;
@@ -940,10 +931,16 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
    {
        if ((flags & VP8D_DEBLOCK) || (flags & VP8D_DEMACROBLOCK))
        {
-            if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer_int, oci->Width, oci->Height, VP8BORDERINPIXELS) >= 0)
-            {
-                oci->post_proc_buffer_int_used = 1;
-            }
+            int width = (oci->Width + 15) & ~15;
+            int height = (oci->Height + 15) & ~15;
+
+            if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer_int,
+                                            width, height, VP8BORDERINPIXELS))
+                vpx_internal_error(&oci->error, VPX_CODEC_MEM_ERROR,
+                                   "Failed to allocate MFQE framebuffer");
+
+            oci->post_proc_buffer_int_used = 1;
+
            // insure that postproc is set to all 0's so that post proc
            // doesn't pull random data in from edge
            vpx_memset((&oci->post_proc_buffer_int)->buffer_alloc,126,(&oci->post_proc_buffer)->frame_size);
@@ -967,12 +964,12 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
            if (flags & VP8D_DEMACROBLOCK)
            {
                vp8_deblock_and_de_macro_block(&oci->post_proc_buffer_int, &oci->post_proc_buffer,
-                                               q + (deblock_level - 5) * 10, 1, 0, RTCD_VTABLE(oci));
+                                               q + (deblock_level - 5) * 10, 1, 0);
            }
            else if (flags & VP8D_DEBLOCK)
            {
                vp8_deblock(&oci->post_proc_buffer_int, &oci->post_proc_buffer,
-                            q, 1, 0, RTCD_VTABLE(oci));
+                            q, 1, 0);
            }
        }
        /* Move partially towards the base q of the previous frame */
@@ -981,13 +978,13 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
    else if (flags & VP8D_DEMACROBLOCK)
    {
        vp8_deblock_and_de_macro_block(oci->frame_to_show, &oci->post_proc_buffer,
-                                       q + (deblock_level - 5) * 10, 1, 0, RTCD_VTABLE(oci));
+                                       q + (deblock_level - 5) * 10, 1, 0);
        oci->postproc_state.last_base_qindex = oci->base_qindex;
    }
    else if (flags & VP8D_DEBLOCK)
    {
        vp8_deblock(oci->frame_to_show, &oci->post_proc_buffer,
-                    q, 1, 0, RTCD_VTABLE(oci));
+                    q, 1, 0);
        oci->postproc_state.last_base_qindex = oci->base_qindex;
    }
    else
@@ -1004,7 +1001,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
            fillrd(&oci->postproc_state, 63 - q, noise_level);
        }

-        POSTPROC_INVOKE(RTCD_VTABLE(oci), addnoise)
+        vp8_plane_add_noise
        (oci->post_proc_buffer.y_buffer,
         oci->postproc_state.noise,
         oci->postproc_state.blackclamp,
@@ -1302,7 +1299,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
                                U = B_PREDICTION_MODE_colors[bmi->as_mode][1];
                                V = B_PREDICTION_MODE_colors[bmi->as_mode][2];

-                                POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_b)
+                                vp8_blend_b
                                    (yl+bx, ul+(bx>>1), vl+(bx>>1), Y, U, V, 0xc000, y_stride);
                            }
                            bmi++;
@@ -1319,7 +1316,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
                    U = MB_PREDICTION_MODE_colors[mi->mbmi.mode][1];
                    V = MB_PREDICTION_MODE_colors[mi->mbmi.mode][2];

-                    POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_inner)
+                    vp8_blend_mb_inner
                        (y_ptr+x, u_ptr+(x>>1), v_ptr+(x>>1), Y, U, V, 0xc000, y_stride);
                }

@@ -1358,7 +1355,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
                    U = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][1];
                    V = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][2];

-                    POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_outer)
+                    vp8_blend_mb_outer
                        (y_ptr+x, u_ptr+(x>>1), v_ptr+(x>>1), Y, U, V, 0xc000, y_stride);
                }

--- a/vp8/common/postproc.h
+++ b/vp8/common/postproc.h
@@ -12,92 +12,6 @@
 #ifndef POSTPROC_H
 #define POSTPROC_H

-#define prototype_postproc_inplace(sym)\
-    void sym (unsigned char *dst, int pitch, int rows, int cols,int flimit)
-
-#define prototype_postproc(sym)\
-    void sym (unsigned char *src, unsigned char *dst, int src_pitch,\
-              int dst_pitch, int rows, int cols, int flimit)
-
-#define prototype_postproc_addnoise(sym) \
-    void sym (unsigned char *s, char *noise, char blackclamp[16],\
-              char whiteclamp[16], char bothclamp[16],\
-              unsigned int w, unsigned int h, int pitch)
-
-#define prototype_postproc_blend_mb_inner(sym)\
-    void sym (unsigned char *y, unsigned char *u, unsigned char *v,\
-              int y1, int u1, int v1, int alpha, int stride)
-
-#define prototype_postproc_blend_mb_outer(sym)\
-    void sym (unsigned char *y, unsigned char *u, unsigned char *v,\
-              int y1, int u1, int v1, int alpha, int stride)
-
-#define prototype_postproc_blend_b(sym)\
-    void sym (unsigned char *y, unsigned char *u, unsigned char *v,\
-              int y1, int u1, int v1, int alpha, int stride)
-
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/postproc_x86.h"
-#endif
-
-#ifndef vp8_postproc_down
-#define vp8_postproc_down vp8_mbpost_proc_down_c
-#endif
-extern prototype_postproc_inplace(vp8_postproc_down);
-
-#ifndef vp8_postproc_across
-#define vp8_postproc_across vp8_mbpost_proc_across_ip_c
-#endif
-extern prototype_postproc_inplace(vp8_postproc_across);
-
-#ifndef vp8_postproc_downacross
-#define vp8_postproc_downacross vp8_post_proc_down_and_across_c
-#endif
-extern prototype_postproc(vp8_postproc_downacross);
-
-#ifndef vp8_postproc_addnoise
-#define vp8_postproc_addnoise vp8_plane_add_noise_c
-#endif
-extern prototype_postproc_addnoise(vp8_postproc_addnoise);
-
-#ifndef vp8_postproc_blend_mb_inner
-#define vp8_postproc_blend_mb_inner vp8_blend_mb_inner_c
-#endif
-extern prototype_postproc_blend_mb_inner(vp8_postproc_blend_mb_inner);
-
-#ifndef vp8_postproc_blend_mb_outer
-#define vp8_postproc_blend_mb_outer vp8_blend_mb_outer_c
-#endif
-extern prototype_postproc_blend_mb_outer(vp8_postproc_blend_mb_outer);
-
-#ifndef vp8_postproc_blend_b
-#define vp8_postproc_blend_b vp8_blend_b_c
-#endif
-extern prototype_postproc_blend_b(vp8_postproc_blend_b);
-
-typedef prototype_postproc((*vp8_postproc_fn_t));
-typedef prototype_postproc_inplace((*vp8_postproc_inplace_fn_t));
-typedef prototype_postproc_addnoise((*vp8_postproc_addnoise_fn_t));
-typedef prototype_postproc_blend_mb_inner((*vp8_postproc_blend_mb_inner_fn_t));
-typedef prototype_postproc_blend_mb_outer((*vp8_postproc_blend_mb_outer_fn_t));
-typedef prototype_postproc_blend_b((*vp8_postproc_blend_b_fn_t));
-typedef struct
-{
-    vp8_postproc_inplace_fn_t           down;
-    vp8_postproc_inplace_fn_t           across;
-    vp8_postproc_fn_t                   downacross;
-    vp8_postproc_addnoise_fn_t          addnoise;
-    vp8_postproc_blend_mb_inner_fn_t    blend_mb_inner;
-    vp8_postproc_blend_mb_outer_fn_t    blend_mb_outer;
-    vp8_postproc_blend_b_fn_t           blend_b;
-} vp8_postproc_rtcd_vtable_t;
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define POSTPROC_INVOKE(ctx,fn) (ctx)->fn
-#else
-#define POSTPROC_INVOKE(ctx,fn) vp8_postproc_##fn
-#endif
-
 #include "vpx_ports/mem.h"
 struct postproc_state
 {
@@ -119,13 +33,11 @@ void vp8_de_noise(YV12_BUFFER_CONFIG         *source,
                  YV12_BUFFER_CONFIG         *post,
                  int                         q,
                  int                         low_var_thresh,
-                  int                         flag,
-                  vp8_postproc_rtcd_vtable_t *rtcd);
+                  int                         flag);

 void vp8_deblock(YV12_BUFFER_CONFIG         *source,
                 YV12_BUFFER_CONFIG         *post,
                 int                         q,
                 int                         low_var_thresh,
-                 int                         flag,
-                 vp8_postproc_rtcd_vtable_t *rtcd);
+                 int                         flag);
 #endif
--- a/vp8/common/recon.h
+++ b/vp8/common/recon.h
@@ -1,111 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_RECON_H
-#define __INC_RECON_H
-
-#include "blockd.h"
-
-#define prototype_copy_block(sym) \
-    void sym(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch)
-
-#define prototype_recon_block(sym) \
-    void sym(unsigned char *pred, short *diff, int diff_stride, unsigned char *dst, int pitch)
-
-#define prototype_recon_macroblock(sym) \
-    void sym(const struct vp8_recon_rtcd_vtable *rtcd, MACROBLOCKD *x)
-
-#define prototype_build_intra_predictors(sym) \
-    void sym(MACROBLOCKD *x)
-
-#define prototype_intra4x4_predict(sym) \
-    void sym(unsigned char *src, int src_stride, int b_mode, \
-             unsigned char *dst, int dst_stride)
-
-struct vp8_recon_rtcd_vtable;
-
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/recon_x86.h"
-#endif
-
-#if ARCH_ARM
-#include "arm/recon_arm.h"
-#endif
-
-#ifndef vp8_recon_copy16x16
-#define vp8_recon_copy16x16 vp8_copy_mem16x16_c
-#endif
-extern prototype_copy_block(vp8_recon_copy16x16);
-
-#ifndef vp8_recon_copy8x8
-#define vp8_recon_copy8x8 vp8_copy_mem8x8_c
-#endif
-extern prototype_copy_block(vp8_recon_copy8x8);
-
-#ifndef vp8_recon_copy8x4
-#define vp8_recon_copy8x4 vp8_copy_mem8x4_c
-#endif
-extern prototype_copy_block(vp8_recon_copy8x4);
-
-#ifndef vp8_recon_build_intra_predictors_mby
-#define vp8_recon_build_intra_predictors_mby vp8_build_intra_predictors_mby
-#endif
-extern prototype_build_intra_predictors\
-    (vp8_recon_build_intra_predictors_mby);
-
-#ifndef vp8_recon_build_intra_predictors_mby_s
-#define vp8_recon_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s
-#endif
-extern prototype_build_intra_predictors\
-    (vp8_recon_build_intra_predictors_mby_s);
-
-#ifndef vp8_recon_build_intra_predictors_mbuv
-#define vp8_recon_build_intra_predictors_mbuv vp8_build_intra_predictors_mbuv
-#endif
-extern prototype_build_intra_predictors\
-    (vp8_recon_build_intra_predictors_mbuv);
-
-#ifndef vp8_recon_build_intra_predictors_mbuv_s
-#define vp8_recon_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s
-#endif
-extern prototype_build_intra_predictors\
-    (vp8_recon_build_intra_predictors_mbuv_s);
-
-#ifndef vp8_recon_intra4x4_predict
-#define vp8_recon_intra4x4_predict vp8_intra4x4_predict_c
-#endif
-extern prototype_intra4x4_predict\
-    (vp8_recon_intra4x4_predict);
-
-
-typedef prototype_copy_block((*vp8_copy_block_fn_t));
-typedef prototype_build_intra_predictors((*vp8_build_intra_pred_fn_t));
-typedef prototype_intra4x4_predict((*vp8_intra4x4_pred_fn_t));
-typedef struct vp8_recon_rtcd_vtable
-{
-    vp8_copy_block_fn_t  copy16x16;
-    vp8_copy_block_fn_t  copy8x8;
-    vp8_copy_block_fn_t  copy8x4;
-
-    vp8_build_intra_pred_fn_t  build_intra_predictors_mby_s;
-    vp8_build_intra_pred_fn_t  build_intra_predictors_mby;
-    vp8_build_intra_pred_fn_t  build_intra_predictors_mbuv_s;
-    vp8_build_intra_pred_fn_t  build_intra_predictors_mbuv;
-    vp8_intra4x4_pred_fn_t intra4x4_predict;
-} vp8_recon_rtcd_vtable_t;
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define RECON_INVOKE(ctx,fn) (ctx)->fn
-#else
-#define RECON_INVOKE(ctx,fn) vp8_recon_##fn
-#endif
-
-#endif
--- a/vp8/common/reconinter.c
+++ b/vp8/common/reconinter.c
@@ -9,10 +9,10 @@
 */


+#include <limits.h>
 #include "vpx_config.h"
+#include "vpx_rtcd.h"
 #include "vpx/vpx_integer.h"
-#include "recon.h"
-#include "subpixel.h"
 #include "blockd.h"
 #include "reconinter.h"
 #if CONFIG_RUNTIME_CPU_DETECT
@@ -123,25 +123,19 @@ void vp8_copy_mem8x4_c(
 }


-void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, vp8_subpix_fn_t sppf)
+void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, unsigned char *base_pre, int pre_stride, vp8_subpix_fn_t sppf)
 {
    int r;
-    unsigned char *ptr_base;
-    unsigned char *ptr;
    unsigned char *pred_ptr = d->predictor;
-
-    ptr_base = *(d->base_pre);
+    unsigned char *ptr;
+    ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride + (d->bmi.mv.as_mv.col >> 3);

    if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
    {
-        ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
-        sppf(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, pred_ptr, pitch);
+        sppf(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, pred_ptr, pitch);
    }
    else
    {
-        ptr_base += d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
-        ptr = ptr_base;
-
        for (r = 0; r < 4; r++)
        {
 #if !(CONFIG_FAST_UNALIGNED)
@@ -153,65 +147,53 @@ void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, vp8_subpix_fn_t sppf)
            *(uint32_t *)pred_ptr = *(uint32_t *)ptr ;
 #endif
            pred_ptr     += pitch;
-            ptr         += d->pre_stride;
+            ptr         += pre_stride;
        }
    }
 }

-static void build_inter_predictors4b(MACROBLOCKD *x, BLOCKD *d, unsigned char *dst, int dst_stride)
+static void build_inter_predictors4b(MACROBLOCKD *x, BLOCKD *d, unsigned char *dst, int dst_stride, unsigned char *base_pre, int pre_stride) /* writes an 8x8 prediction for block d into dst; the reference plane base and its stride are supplied by the caller */
 {
-    unsigned char *ptr_base;
    unsigned char *ptr;
-
-    ptr_base = *(d->base_pre);
-    ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
+    ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride + (d->bmi.mv.as_mv.col >> 3); /* source = plane base + block offset + whole-pixel part of the MV (mv >> 3) */

    if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
    {
-        x->subpixel_predict8x8(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, dst_stride);
+        x->subpixel_predict8x8(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, dst_stride); /* low 3 bits of either MV component are non-zero: pass them as the x/y sub-pixel offsets */
    }
    else
    {
-        RECON_INVOKE(&x->rtcd->recon, copy8x8)(ptr, d->pre_stride, dst, dst_stride);
+        vp8_copy_mem8x8(ptr, pre_stride, dst, dst_stride); /* no fractional part: plain 8x8 copy */
    }
 }

-static void build_inter_predictors2b(MACROBLOCKD *x, BLOCKD *d, unsigned char *dst, int dst_stride)
+static void build_inter_predictors2b(MACROBLOCKD *x, BLOCKD *d, unsigned char *dst, int dst_stride, unsigned char *base_pre, int pre_stride) /* writes an 8x4 prediction for block d into dst; the reference plane base and its stride are supplied by the caller */
 {
-    unsigned char *ptr_base;
    unsigned char *ptr;
-
-    ptr_base = *(d->base_pre);
-    ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
+    ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride + (d->bmi.mv.as_mv.col >> 3); /* source = plane base + block offset + whole-pixel part of the MV (mv >> 3) */

    if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
    {
-        x->subpixel_predict8x4(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, dst_stride);
+        x->subpixel_predict8x4(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, dst_stride); /* low 3 bits of either MV component are non-zero: pass them as the x/y sub-pixel offsets */
    }
    else
    {
-        RECON_INVOKE(&x->rtcd->recon, copy8x4)(ptr, d->pre_stride, dst, dst_stride);
+        vp8_copy_mem8x4(ptr, pre_stride, dst, dst_stride); /* no fractional part: plain 8x4 copy */
    }
 }

-static void build_inter_predictors_b(BLOCKD *d, unsigned char *dst, int dst_stride, vp8_subpix_fn_t sppf)
+static void build_inter_predictors_b(BLOCKD *d, unsigned char *dst, int dst_stride, unsigned char *base_pre, int pre_stride, vp8_subpix_fn_t sppf)
 {
    int r;
-    unsigned char *ptr_base;
    unsigned char *ptr;
-
-    ptr_base = *(d->base_pre);
+    ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride + (d->bmi.mv.as_mv.col >> 3);

    if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
    {
-        ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
-        sppf(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, dst_stride);
+        sppf(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, dst_stride);
    }
    else
    {
-        ptr_base += d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
-        ptr = ptr_base;
-
        for (r = 0; r < 4; r++)
        {
 #if !(CONFIG_FAST_UNALIGNED)
@@ -223,7 +205,7 @@ static void build_inter_predictors_b(BLOCKD *d, unsigned char *dst, int dst_stri
            *(uint32_t *)dst = *(uint32_t *)ptr ;
 #endif
            dst     += dst_stride;
-            ptr         += d->pre_stride;
+            ptr     += pre_stride;
        }
    }
 }
@@ -239,22 +221,13 @@ void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x)
    int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
    int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
    int offset;
-    int pre_stride = x->block[16].pre_stride;
+    int pre_stride = x->pre.uv_stride;

    /* calc uv motion vectors */
-    if (mv_row < 0)
-        mv_row -= 1;
-    else
-        mv_row += 1;
-
-    if (mv_col < 0)
-        mv_col -= 1;
-    else
-        mv_col += 1;
-
+    mv_row += 1 | (mv_row >> (sizeof(int) * CHAR_BIT - 1));
+    mv_col += 1 | (mv_col >> (sizeof(int) * CHAR_BIT - 1));
    mv_row /= 2;
    mv_col /= 2;
-
    mv_row &= x->fullpixel_mask;
    mv_col &= x->fullpixel_mask;

@@ -269,8 +242,8 @@ void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x)
    }
    else
    {
-        RECON_INVOKE(&x->rtcd->recon, copy8x8)(uptr, pre_stride, upred_ptr, 8);
-        RECON_INVOKE(&x->rtcd->recon, copy8x8)(vptr, pre_stride, vpred_ptr, 8);
+        vp8_copy_mem8x8(uptr, pre_stride, upred_ptr, 8);
+        vp8_copy_mem8x8(vptr, pre_stride, vpred_ptr, 8);
    }
 }

@@ -278,6 +251,8 @@ void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x)
 void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x)
 {
    int i, j;
+    int pre_stride = x->pre.uv_stride;
+    unsigned char *base_pre;

    /* build uv mvs */
    for (i = 0; i < 2; i++)
@@ -295,8 +270,7 @@ void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x)
                   + x->block[yoffset+4].bmi.mv.as_mv.row
                   + x->block[yoffset+5].bmi.mv.as_mv.row;

-            if (temp < 0) temp -= 4;
-            else temp += 4;
+            temp += 4 + ((temp >> (sizeof(int) * CHAR_BIT - 1)) * 8); /* bias for the / 8 below: +4, or -4 when temp < 0; "* 8" because "<< 3" on the -1 sign mask is undefined behavior in C */

            x->block[uoffset].bmi.mv.as_mv.row = (temp / 8) & x->fullpixel_mask;

@@ -305,29 +279,41 @@ void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x)
                   + x->block[yoffset+4].bmi.mv.as_mv.col
                   + x->block[yoffset+5].bmi.mv.as_mv.col;

-            if (temp < 0) temp -= 4;
-            else temp += 4;
+            temp += 4 + ((temp >> (sizeof(int) * CHAR_BIT - 1)) * 8); /* bias for the / 8 below: +4, or -4 when temp < 0; "* 8" because "<< 3" on the -1 sign mask is undefined behavior in C */

            x->block[uoffset].bmi.mv.as_mv.col = (temp / 8) & x->fullpixel_mask;

-            x->block[voffset].bmi.mv.as_mv.row =
-                x->block[uoffset].bmi.mv.as_mv.row ;
-            x->block[voffset].bmi.mv.as_mv.col =
-                x->block[uoffset].bmi.mv.as_mv.col ;
+            x->block[voffset].bmi.mv.as_int = x->block[uoffset].bmi.mv.as_int;
        }
    }

-    for (i = 16; i < 24; i += 2)
+    base_pre = x->pre.u_buffer;
+    for (i = 16; i < 20; i += 2)
    {
        BLOCKD *d0 = &x->block[i];
        BLOCKD *d1 = &x->block[i+1];

        if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
-            build_inter_predictors2b(x, d0, d0->predictor, 8);
+            build_inter_predictors2b(x, d0, d0->predictor, 8, base_pre, pre_stride);
        else
        {
-            vp8_build_inter_predictors_b(d0, 8, x->subpixel_predict);
-            vp8_build_inter_predictors_b(d1, 8, x->subpixel_predict);
+            vp8_build_inter_predictors_b(d0, 8, base_pre, pre_stride, x->subpixel_predict);
+            vp8_build_inter_predictors_b(d1, 8, base_pre, pre_stride, x->subpixel_predict);
+        }
+    }
+
+    base_pre = x->pre.v_buffer;
+    for (i = 20; i < 24; i += 2)
+    {
+        BLOCKD *d0 = &x->block[i];
+        BLOCKD *d1 = &x->block[i+1];
+
+        if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+            build_inter_predictors2b(x, d0, d0->predictor, 8, base_pre, pre_stride);
+        else
+        {
+            vp8_build_inter_predictors_b(d0, 8, base_pre, pre_stride, x->subpixel_predict);
+            vp8_build_inter_predictors_b(d1, 8, base_pre, pre_stride, x->subpixel_predict);
        }
    }
 }
@@ -342,7 +328,7 @@ void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x,
    unsigned char *ptr;
    int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
    int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
-    int pre_stride = x->block[0].pre_stride;
+    int pre_stride = x->pre.y_stride;

    ptr_base = x->pre.y_buffer;
    ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
@@ -354,7 +340,7 @@ void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x,
    }
    else
    {
-        RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, dst_y,
+        vp8_copy_mem16x16(ptr, pre_stride, dst_y,
            dst_ystride);
    }
 }
@@ -409,7 +395,7 @@ void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x,
    int_mv _16x16mv;

    unsigned char *ptr_base = x->pre.y_buffer;
-    int pre_stride = x->block[0].pre_stride;
+    int pre_stride = x->pre.y_stride;

    _16x16mv.as_int = x->mode_info_context->mbmi.mv.as_int;

@@ -426,23 +412,14 @@ void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x,
    }
    else
    {
-        RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, dst_y, dst_ystride);
+        vp8_copy_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
    }

    /* calc uv motion vectors */
-    if ( _16x16mv.as_mv.row < 0)
-      _16x16mv.as_mv.row -= 1;
-    else
-      _16x16mv.as_mv.row += 1;
-
-    if (_16x16mv.as_mv.col < 0)
-        _16x16mv.as_mv.col -= 1;
-    else
-        _16x16mv.as_mv.col += 1;
-
+    _16x16mv.as_mv.row += 1 | (_16x16mv.as_mv.row >> (sizeof(int) * CHAR_BIT - 1));
+    _16x16mv.as_mv.col += 1 | (_16x16mv.as_mv.col >> (sizeof(int) * CHAR_BIT - 1));
    _16x16mv.as_mv.row /= 2;
    _16x16mv.as_mv.col /= 2;
-
    _16x16mv.as_mv.row &= x->fullpixel_mask;
    _16x16mv.as_mv.col &= x->fullpixel_mask;

@@ -458,19 +435,21 @@ void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x,
    }
    else
    {
-        RECON_INVOKE(&x->rtcd->recon, copy8x8)(uptr, pre_stride, dst_u, dst_uvstride);
-        RECON_INVOKE(&x->rtcd->recon, copy8x8)(vptr, pre_stride, dst_v, dst_uvstride);
+        vp8_copy_mem8x8(uptr, pre_stride, dst_u, dst_uvstride);
+        vp8_copy_mem8x8(vptr, pre_stride, dst_v, dst_uvstride);
    }
 }

 static void build_inter4x4_predictors_mb(MACROBLOCKD *x)
 {
    int i;
+    unsigned char *base_dst = x->dst.y_buffer;
+    unsigned char *base_pre = x->pre.y_buffer;

    if (x->mode_info_context->mbmi.partitioning < 3)
    {
        BLOCKD *b;
-        int dst_stride = x->block[ 0].dst_stride;
+        int dst_stride = x->dst.y_stride;

        x->block[ 0].bmi = x->mode_info_context->bmi[ 0];
        x->block[ 2].bmi = x->mode_info_context->bmi[ 2];
@@ -485,13 +464,13 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *x)
        }

        b = &x->block[ 0];
-        build_inter_predictors4b(x, b, *(b->base_dst) + b->dst, dst_stride);
+        build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, dst_stride); /* NOTE(review): dst_stride is also passed as pre_stride here and in the sibling calls -- assumes x->pre and x->dst share a stride; vp8_build_inter16x16_predictors_mb reads x->pre.y_stride instead -- confirm */
        b = &x->block[ 2];
-        build_inter_predictors4b(x, b, *(b->base_dst) + b->dst, dst_stride);
+        build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, dst_stride);
        b = &x->block[ 8];
-        build_inter_predictors4b(x, b, *(b->base_dst) + b->dst, dst_stride);
+        build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, dst_stride);
        b = &x->block[10];
-        build_inter_predictors4b(x, b, *(b->base_dst) + b->dst, dst_stride);
+        build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, dst_stride);
    }
    else
    {
@@ -499,7 +478,7 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *x)
        {
            BLOCKD *d0 = &x->block[i];
            BLOCKD *d1 = &x->block[i+1];
-            int dst_stride = x->block[ 0].dst_stride;
+            int dst_stride = x->dst.y_stride;

            x->block[i+0].bmi = x->mode_info_context->bmi[i+0];
            x->block[i+1].bmi = x->mode_info_context->bmi[i+1];
@@ -510,31 +489,51 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *x)
            }

            if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
-                build_inter_predictors2b(x, d0, *(d0->base_dst) + d0->dst, dst_stride);
+                build_inter_predictors2b(x, d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride);
            else
            {
-                build_inter_predictors_b(d0, *(d0->base_dst) + d0->dst, dst_stride, x->subpixel_predict);
-                build_inter_predictors_b(d1, *(d1->base_dst) + d1->dst, dst_stride, x->subpixel_predict);
+                build_inter_predictors_b(d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
+                build_inter_predictors_b(d1, base_dst + d1->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
            }

        }

    }
-
-    for (i = 16; i < 24; i += 2)
+    base_dst = x->dst.u_buffer;
+    base_pre = x->pre.u_buffer;
+    for (i = 16; i < 20; i += 2)
    {
        BLOCKD *d0 = &x->block[i];
        BLOCKD *d1 = &x->block[i+1];
-        int dst_stride = x->block[ 16].dst_stride;
+        int dst_stride = x->dst.uv_stride;

        /* Note: uv mvs already clamped in build_4x4uvmvs() */

        if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
-            build_inter_predictors2b(x, d0, *(d0->base_dst) + d0->dst, dst_stride);
+            build_inter_predictors2b(x, d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride);
        else
        {
-            build_inter_predictors_b(d0, *(d0->base_dst) + d0->dst, dst_stride, x->subpixel_predict);
-            build_inter_predictors_b(d1, *(d1->base_dst) + d1->dst, dst_stride, x->subpixel_predict);
+            build_inter_predictors_b(d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
+            build_inter_predictors_b(d1, base_dst + d1->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
+        }
+    }
+
+    base_dst = x->dst.v_buffer;
+    base_pre = x->pre.v_buffer;
+    for (i = 20; i < 24; i += 2)
+    {
+        BLOCKD *d0 = &x->block[i];
+        BLOCKD *d1 = &x->block[i+1];
+        int dst_stride = x->dst.uv_stride;
+
+        /* Note: uv mvs already clamped in build_4x4uvmvs() */
+
+        if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+            build_inter_predictors2b(x, d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride);
+        else
+        {
+            build_inter_predictors_b(d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
+            build_inter_predictors_b(d1, base_dst + d1->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
        }
    }
 }
@@ -559,8 +558,7 @@ void build_4x4uvmvs(MACROBLOCKD *x)
                 + x->mode_info_context->bmi[yoffset + 4].mv.as_mv.row
                 + x->mode_info_context->bmi[yoffset + 5].mv.as_mv.row;

-            if (temp < 0) temp -= 4;
-            else temp += 4;
+            temp += 4 + ((temp >> (sizeof(int) * CHAR_BIT - 1)) * 8); /* bias for the / 8 below: +4, or -4 when temp < 0; "* 8" because "<< 3" on the -1 sign mask is undefined behavior in C */

            x->block[uoffset].bmi.mv.as_mv.row = (temp / 8) & x->fullpixel_mask;

@@ -569,18 +567,14 @@ void build_4x4uvmvs(MACROBLOCKD *x)
                 + x->mode_info_context->bmi[yoffset + 4].mv.as_mv.col
                 + x->mode_info_context->bmi[yoffset + 5].mv.as_mv.col;

-            if (temp < 0) temp -= 4;
-            else temp += 4;
+            temp += 4 + ((temp >> (sizeof(int) * CHAR_BIT - 1)) * 8); /* bias for the / 8 below: +4, or -4 when temp < 0; "* 8" because "<< 3" on the -1 sign mask is undefined behavior in C */

            x->block[uoffset].bmi.mv.as_mv.col = (temp / 8) & x->fullpixel_mask;

            if (x->mode_info_context->mbmi.need_to_clamp_mvs)
                clamp_uvmv_to_umv_border(&x->block[uoffset].bmi.mv.as_mv, x);

-            x->block[voffset].bmi.mv.as_mv.row =
-                x->block[uoffset].bmi.mv.as_mv.row ;
-            x->block[voffset].bmi.mv.as_mv.col =
-                x->block[uoffset].bmi.mv.as_mv.col ;
+            x->block[voffset].bmi.mv.as_int = x->block[uoffset].bmi.mv.as_int;
        }
    }
 }
--- a/vp8/common/reconinter.h
+++ b/vp8/common/reconinter.h
@@ -25,6 +25,8 @@ extern void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x,
                                                unsigned char *dst_y,
                                                int dst_ystride);
 extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch,
+                                         unsigned char *base_pre,
+                                         int pre_stride,
                                         vp8_subpix_fn_t sppf);

 extern void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x);
--- a/vp8/common/reconintra.c
+++ b/vp8/common/reconintra.c
@@ -10,15 +10,15 @@


 #include "vpx_config.h"
-#include "recon.h"
-#include "reconintra.h"
+#include "vpx_rtcd.h"
 #include "vpx_mem/vpx_mem.h"
+#include "blockd.h"

 /* For skip_recon_mb(), add vp8_build_intra_predictors_mby_s(MACROBLOCKD *x) and
 * vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x).
 */

-void vp8_build_intra_predictors_mby(MACROBLOCKD *x)
+void vp8_build_intra_predictors_mby_c(MACROBLOCKD *x)
 {

    unsigned char *yabove_row = x->dst.y_buffer - x->dst.y_stride;
@@ -136,7 +136,7 @@ void vp8_build_intra_predictors_mby(MACROBLOCKD *x)
    }
 }

-void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x)
+void vp8_build_intra_predictors_mby_s_c(MACROBLOCKD *x)
 {

    unsigned char *yabove_row = x->dst.y_buffer - x->dst.y_stride;
@@ -262,7 +262,7 @@ void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x)
    }
 }

-void vp8_build_intra_predictors_mbuv(MACROBLOCKD *x)
+void vp8_build_intra_predictors_mbuv_c(MACROBLOCKD *x)
 {
    unsigned char *uabove_row = x->dst.u_buffer - x->dst.uv_stride;
    unsigned char uleft_col[16];
@@ -400,7 +400,7 @@ void vp8_build_intra_predictors_mbuv(MACROBLOCKD *x)
    }
 }

-void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x)
+void vp8_build_intra_predictors_mbuv_s_c(MACROBLOCKD *x)
 {
    unsigned char *uabove_row = x->dst.u_buffer - x->dst.uv_stride;
    unsigned char uleft_col[16];
--- a/vp8/common/reconintra.h
+++ b/vp8/common/reconintra.h
@@ -1,17 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_RECONINTRA_H
-#define __INC_RECONINTRA_H
-
-extern void init_intra_left_above_pixels(MACROBLOCKD *x);
-
-#endif
--- a/vp8/common/reconintra4x4.c
+++ b/vp8/common/reconintra4x4.c
@@ -9,7 +9,9 @@
 */


-#include "recon.h"
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "blockd.h"

 void vp8_intra4x4_predict_c(unsigned char *src, int src_stride,
                            int b_mode,
@@ -302,12 +304,13 @@ void vp8_intra4x4_predict_c(unsigned char *src, int src_stride,
 */
 void vp8_intra_prediction_down_copy(MACROBLOCKD *x)
 {
-    unsigned char *above_right = *(x->block[0].base_dst) + x->block[0].dst - x->block[0].dst_stride + 16;
+    int dst_stride = x->dst.y_stride;
+    unsigned char *above_right = x->dst.y_buffer - dst_stride + 16;

    unsigned int *src_ptr = (unsigned int *)above_right;
-    unsigned int *dst_ptr0 = (unsigned int *)(above_right + 4 * x->block[0].dst_stride);
-    unsigned int *dst_ptr1 = (unsigned int *)(above_right + 8 * x->block[0].dst_stride);
-    unsigned int *dst_ptr2 = (unsigned int *)(above_right + 12 * x->block[0].dst_stride);
+    unsigned int *dst_ptr0 = (unsigned int *)(above_right + 4 * dst_stride);
+    unsigned int *dst_ptr1 = (unsigned int *)(above_right + 8 * dst_stride);
+    unsigned int *dst_ptr2 = (unsigned int *)(above_right + 12 * dst_stride);

    *dst_ptr0 = *src_ptr;
    *dst_ptr1 = *src_ptr;
--- a/vp8/common/reconintra4x4.h
+++ b/vp8/common/reconintra4x4.h
@@ -12,6 +12,8 @@
 #ifndef __INC_RECONINTRA4x4_H
 #define __INC_RECONINTRA4x4_H

-extern void vp8_intra_prediction_down_copy(MACROBLOCKD *x);
+struct macroblockd;
+
+extern void vp8_intra_prediction_down_copy(struct macroblockd *x);

 #endif
--- a/vp8/decoder/x86/x86_dsystemdependent.c
+++ b/vp8/decoder/x86/x86_dsystemdependent.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
@@ -7,13 +7,6 @@
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
-
-
 #include "vpx_config.h"
-#include "vpx_ports/x86.h"
-#include "vp8/decoder/onyxd_int.h"
-
-void vp8_arch_x86_decode_init(VP8D_COMP *pbi)
-{
-
-}
+#define RTCD_C
+#include "vpx_rtcd.h"
--- a/vp8/common/rtcd_defs.sh
+++ b/vp8/common/rtcd_defs.sh
@@ -0,0 +1,500 @@
+common_forward_decls() {  # writes the C forward declarations below to stdout
+cat <<EOF
+struct blockd;
+struct macroblockd;
+struct loop_filter_info;
+
+/* Encoder forward decls */
+struct block;
+struct macroblock;
+struct variance_vtable;
+union int_mv;
+struct yv12_buffer_config;
+EOF
+}
+forward_decls common_forward_decls  # NOTE(review): forward_decls is defined outside this file; presumably it registers the emitter above - confirm
+
+#
+# Dequant
+#
+prototype void vp8_dequantize_b "struct blockd*, short *dqc"
+specialize vp8_dequantize_b mmx media neon
+vp8_dequantize_b_media=vp8_dequantize_b_v6
+
+prototype void vp8_dequant_idct_add "short *input, short *dq, unsigned char *output, int stride"
+specialize vp8_dequant_idct_add mmx media neon
+vp8_dequant_idct_add_media=vp8_dequant_idct_add_v6
+
+prototype void vp8_dequant_idct_add_y_block "short *q, short *dq, unsigned char *dst, int stride, char *eobs"
+specialize vp8_dequant_idct_add_y_block mmx sse2 media neon
+vp8_dequant_idct_add_y_block_media=vp8_dequant_idct_add_y_block_v6
+
+prototype void vp8_dequant_idct_add_uv_block "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs"
+specialize vp8_dequant_idct_add_uv_block mmx sse2 media neon
+vp8_dequant_idct_add_uv_block_media=vp8_dequant_idct_add_uv_block_v6
+
+#
+# Loopfilter
+#
+prototype void vp8_loop_filter_mbv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
+specialize vp8_loop_filter_mbv mmx sse2 media neon
+vp8_loop_filter_mbv_media=vp8_loop_filter_mbv_armv6
+
+prototype void vp8_loop_filter_bv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
+specialize vp8_loop_filter_bv mmx sse2 media neon
+vp8_loop_filter_bv_media=vp8_loop_filter_bv_armv6
+
+prototype void vp8_loop_filter_mbh "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
+specialize vp8_loop_filter_mbh mmx sse2 media neon
+vp8_loop_filter_mbh_media=vp8_loop_filter_mbh_armv6
+
+prototype void vp8_loop_filter_bh "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
+specialize vp8_loop_filter_bh mmx sse2 media neon
+vp8_loop_filter_bh_media=vp8_loop_filter_bh_armv6
+
+
+prototype void vp8_loop_filter_simple_mbv "unsigned char *y, int ystride, const unsigned char *blimit"
+specialize vp8_loop_filter_simple_mbv mmx sse2 media neon
+vp8_loop_filter_simple_mbv_c=vp8_loop_filter_simple_vertical_edge_c
+vp8_loop_filter_simple_mbv_mmx=vp8_loop_filter_simple_vertical_edge_mmx
+vp8_loop_filter_simple_mbv_sse2=vp8_loop_filter_simple_vertical_edge_sse2
+vp8_loop_filter_simple_mbv_media=vp8_loop_filter_simple_vertical_edge_armv6
+vp8_loop_filter_simple_mbv_neon=vp8_loop_filter_mbvs_neon
+
+prototype void vp8_loop_filter_simple_mbh "unsigned char *y, int ystride, const unsigned char *blimit"
+specialize vp8_loop_filter_simple_mbh mmx sse2 media neon
+vp8_loop_filter_simple_mbh_c=vp8_loop_filter_simple_horizontal_edge_c
+vp8_loop_filter_simple_mbh_mmx=vp8_loop_filter_simple_horizontal_edge_mmx
+vp8_loop_filter_simple_mbh_sse2=vp8_loop_filter_simple_horizontal_edge_sse2
+vp8_loop_filter_simple_mbh_media=vp8_loop_filter_simple_horizontal_edge_armv6
+vp8_loop_filter_simple_mbh_neon=vp8_loop_filter_mbhs_neon
+
+prototype void vp8_loop_filter_simple_bv "unsigned char *y, int ystride, const unsigned char *blimit"
+specialize vp8_loop_filter_simple_bv mmx sse2 media neon
+vp8_loop_filter_simple_bv_c=vp8_loop_filter_bvs_c
+vp8_loop_filter_simple_bv_mmx=vp8_loop_filter_bvs_mmx
+vp8_loop_filter_simple_bv_sse2=vp8_loop_filter_bvs_sse2
+vp8_loop_filter_simple_bv_media=vp8_loop_filter_bvs_armv6
+vp8_loop_filter_simple_bv_neon=vp8_loop_filter_bvs_neon
+
+prototype void vp8_loop_filter_simple_bh "unsigned char *y, int ystride, const unsigned char *blimit"
+specialize vp8_loop_filter_simple_bh mmx sse2 media neon
+vp8_loop_filter_simple_bh_c=vp8_loop_filter_bhs_c
+vp8_loop_filter_simple_bh_mmx=vp8_loop_filter_bhs_mmx
+vp8_loop_filter_simple_bh_sse2=vp8_loop_filter_bhs_sse2
+vp8_loop_filter_simple_bh_media=vp8_loop_filter_bhs_armv6
+vp8_loop_filter_simple_bh_neon=vp8_loop_filter_bhs_neon
+
+#
+# IDCT
+#
+#idct16
+prototype void vp8_short_idct4x4llm "short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride"
+specialize vp8_short_idct4x4llm mmx media neon
+vp8_short_idct4x4llm_media=vp8_short_idct4x4llm_v6_dual
+
+#iwalsh1
+prototype void vp8_short_inv_walsh4x4_1 "short *input, short *output"
+# no asm yet
+
+#iwalsh16
+prototype void vp8_short_inv_walsh4x4 "short *input, short *output"
+specialize vp8_short_inv_walsh4x4 mmx sse2 media neon
+vp8_short_inv_walsh4x4_media=vp8_short_inv_walsh4x4_v6
+
+#idct1_scalar_add
+prototype void vp8_dc_only_idct_add "short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride"
+specialize vp8_dc_only_idct_add	mmx media neon
+vp8_dc_only_idct_add_media=vp8_dc_only_idct_add_v6
+
+#
+# RECON
+#
+prototype void vp8_copy_mem16x16 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
+specialize vp8_copy_mem16x16 mmx sse2 media neon
+vp8_copy_mem16x16_media=vp8_copy_mem16x16_v6
+
+prototype void vp8_copy_mem8x8 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
+specialize vp8_copy_mem8x8 mmx media neon
+vp8_copy_mem8x8_media=vp8_copy_mem8x8_v6
+
+prototype void vp8_copy_mem8x4 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
+specialize vp8_copy_mem8x4 mmx media neon
+vp8_copy_mem8x4_media=vp8_copy_mem8x4_v6
+
+prototype void vp8_build_intra_predictors_mby "struct macroblockd *x"
+specialize vp8_build_intra_predictors_mby sse2 ssse3 neon
+
+prototype void vp8_build_intra_predictors_mby_s "struct macroblockd *x"
+specialize vp8_build_intra_predictors_mby_s sse2 ssse3 neon
+
+prototype void vp8_build_intra_predictors_mbuv "struct macroblockd *x"
+specialize vp8_build_intra_predictors_mbuv sse2 ssse3
+
+prototype void vp8_build_intra_predictors_mbuv_s "struct macroblockd *x"
+specialize vp8_build_intra_predictors_mbuv_s sse2 ssse3
+
+prototype void vp8_intra4x4_predict "unsigned char *src, int src_stride, int b_mode, unsigned char *dst, int dst_stride"
+specialize vp8_intra4x4_predict media
+vp8_intra4x4_predict_media=vp8_intra4x4_predict_armv6
+
+#
+# Postproc
+#
+if [ "$CONFIG_POSTPROC" = "yes" ]; then
+    prototype void vp8_mbpost_proc_down "unsigned char *dst, int pitch, int rows, int cols,int flimit"
+    specialize vp8_mbpost_proc_down mmx sse2
+    vp8_mbpost_proc_down_sse2=vp8_mbpost_proc_down_xmm
+
+    prototype void vp8_mbpost_proc_across_ip "unsigned char *dst, int pitch, int rows, int cols,int flimit"
+    specialize vp8_mbpost_proc_across_ip sse2
+    vp8_mbpost_proc_across_ip_sse2=vp8_mbpost_proc_across_ip_xmm
+
+    prototype void vp8_post_proc_down_and_across "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int rows, int cols, int flimit"
+    specialize vp8_post_proc_down_and_across mmx sse2
+    vp8_post_proc_down_and_across_sse2=vp8_post_proc_down_and_across_xmm
+
+    prototype void vp8_plane_add_noise "unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch"
+    specialize vp8_plane_add_noise mmx sse2
+    vp8_plane_add_noise_sse2=vp8_plane_add_noise_wmt
+
+    prototype void vp8_blend_mb_inner "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride"
+    # no asm yet
+
+    prototype void vp8_blend_mb_outer "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride"
+    # no asm yet
+
+    prototype void vp8_blend_b "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride"
+    # no asm yet
+fi
+
+#
+# Subpixel
+#
+prototype void vp8_sixtap_predict16x16 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
+specialize vp8_sixtap_predict16x16 mmx sse2 ssse3 media neon
+vp8_sixtap_predict16x16_media=vp8_sixtap_predict16x16_armv6
+
+prototype void vp8_sixtap_predict8x8 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
+specialize vp8_sixtap_predict8x8 mmx sse2 ssse3 media neon
+vp8_sixtap_predict8x8_media=vp8_sixtap_predict8x8_armv6
+
+prototype void vp8_sixtap_predict8x4 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
+specialize vp8_sixtap_predict8x4 mmx sse2 ssse3 media neon
+vp8_sixtap_predict8x4_media=vp8_sixtap_predict8x4_armv6
+
+prototype void vp8_sixtap_predict4x4 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
+specialize vp8_sixtap_predict4x4 mmx ssse3 media neon
+vp8_sixtap_predict4x4_media=vp8_sixtap_predict4x4_armv6
+
+prototype void vp8_bilinear_predict16x16 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
+specialize vp8_bilinear_predict16x16 mmx sse2 ssse3 media neon
+vp8_bilinear_predict16x16_media=vp8_bilinear_predict16x16_armv6
+
+prototype void vp8_bilinear_predict8x8 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
+specialize vp8_bilinear_predict8x8 mmx sse2 ssse3 media neon
+vp8_bilinear_predict8x8_media=vp8_bilinear_predict8x8_armv6
+
+prototype void vp8_bilinear_predict8x4 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
+specialize vp8_bilinear_predict8x4 mmx media neon
+vp8_bilinear_predict8x4_media=vp8_bilinear_predict8x4_armv6
+
+prototype void vp8_bilinear_predict4x4 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
+specialize vp8_bilinear_predict4x4 mmx media neon
+vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6
+
+#
+# Encoder functions below this point.
+#
+if [ "$CONFIG_VP8_ENCODER" = "yes" ]; then
+
+#
+# Whole-pixel Variance
+#
+prototype unsigned int vp8_variance4x4 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse"
+specialize vp8_variance4x4 mmx sse2
+vp8_variance4x4_sse2=vp8_variance4x4_wmt
+
+prototype unsigned int vp8_variance8x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse"
+specialize vp8_variance8x8 mmx sse2 media neon
+vp8_variance8x8_sse2=vp8_variance8x8_wmt
+vp8_variance8x8_media=vp8_variance8x8_armv6
+
+prototype unsigned int vp8_variance8x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse"
+specialize vp8_variance8x16 mmx sse2 neon
+vp8_variance8x16_sse2=vp8_variance8x16_wmt
+
+prototype unsigned int vp8_variance16x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse"
+specialize vp8_variance16x8 mmx sse2 neon
+vp8_variance16x8_sse2=vp8_variance16x8_wmt
+
+prototype unsigned int vp8_variance16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse"
+specialize vp8_variance16x16 mmx sse2 media neon
+vp8_variance16x16_sse2=vp8_variance16x16_wmt
+vp8_variance16x16_media=vp8_variance16x16_armv6
+
+#
+# Sub-pixel Variance
+#
+prototype unsigned int vp8_sub_pixel_variance4x4 "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp8_sub_pixel_variance4x4 mmx sse2
+vp8_sub_pixel_variance4x4_sse2=vp8_sub_pixel_variance4x4_wmt
+
+prototype unsigned int vp8_sub_pixel_variance8x8 "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp8_sub_pixel_variance8x8 mmx sse2 media neon
+vp8_sub_pixel_variance8x8_sse2=vp8_sub_pixel_variance8x8_wmt
+vp8_sub_pixel_variance8x8_media=vp8_sub_pixel_variance8x8_armv6
+
+prototype unsigned int vp8_sub_pixel_variance8x16 "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp8_sub_pixel_variance8x16 mmx sse2
+vp8_sub_pixel_variance8x16_sse2=vp8_sub_pixel_variance8x16_wmt
+
+prototype unsigned int vp8_sub_pixel_variance16x8 "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp8_sub_pixel_variance16x8 mmx sse2 ssse3
+vp8_sub_pixel_variance16x8_sse2=vp8_sub_pixel_variance16x8_wmt
+
+prototype unsigned int vp8_sub_pixel_variance16x16 "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp8_sub_pixel_variance16x16 mmx sse2 ssse3 media neon
+vp8_sub_pixel_variance16x16_sse2=vp8_sub_pixel_variance16x16_wmt
+vp8_sub_pixel_variance16x16_media=vp8_sub_pixel_variance16x16_armv6
+
+prototype unsigned int vp8_variance_halfpixvar16x16_h "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse"
+specialize vp8_variance_halfpixvar16x16_h mmx sse2 media neon
+vp8_variance_halfpixvar16x16_h_sse2=vp8_variance_halfpixvar16x16_h_wmt
+vp8_variance_halfpixvar16x16_h_media=vp8_variance_halfpixvar16x16_h_armv6
+
+prototype unsigned int vp8_variance_halfpixvar16x16_v "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse"
+specialize vp8_variance_halfpixvar16x16_v mmx sse2 media neon
+vp8_variance_halfpixvar16x16_v_sse2=vp8_variance_halfpixvar16x16_v_wmt
+vp8_variance_halfpixvar16x16_v_media=vp8_variance_halfpixvar16x16_v_armv6
+
+prototype unsigned int vp8_variance_halfpixvar16x16_hv "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse"
+specialize vp8_variance_halfpixvar16x16_hv mmx sse2 media neon
+vp8_variance_halfpixvar16x16_hv_sse2=vp8_variance_halfpixvar16x16_hv_wmt
+vp8_variance_halfpixvar16x16_hv_media=vp8_variance_halfpixvar16x16_hv_armv6
+
+#
+# Sum of squares (vector)
+#
+prototype unsigned int vp8_get_mb_ss "const short *"
+specialize vp8_get_mb_ss mmx sse2
+
+#
+# SSE (Sum Squared Error)
+#
+prototype unsigned int vp8_sub_pixel_mse16x16 "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp8_sub_pixel_mse16x16 mmx sse2
+vp8_sub_pixel_mse16x16_sse2=vp8_sub_pixel_mse16x16_wmt
+
+prototype unsigned int vp8_mse16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse"
+specialize vp8_mse16x16 mmx sse2 media neon
+vp8_mse16x16_sse2=vp8_mse16x16_wmt
+vp8_mse16x16_media=vp8_mse16x16_armv6
+
+prototype unsigned int vp8_get4x4sse_cs "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride"
+specialize vp8_get4x4sse_cs mmx neon
+
+#
+# Single block SAD
+#
+prototype unsigned int vp8_sad4x4 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad"
+specialize vp8_sad4x4 mmx sse2 neon
+vp8_sad4x4_sse2=vp8_sad4x4_wmt
+
+prototype unsigned int vp8_sad8x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad"
+specialize vp8_sad8x8 mmx sse2 neon
+vp8_sad8x8_sse2=vp8_sad8x8_wmt
+
+prototype unsigned int vp8_sad8x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad"
+specialize vp8_sad8x16 mmx sse2 neon
+vp8_sad8x16_sse2=vp8_sad8x16_wmt
+
+prototype unsigned int vp8_sad16x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad"
+specialize vp8_sad16x8 mmx sse2 neon
+vp8_sad16x8_sse2=vp8_sad16x8_wmt
+
+prototype unsigned int vp8_sad16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad"
+specialize vp8_sad16x16 mmx sse2 sse3 media neon
+vp8_sad16x16_sse2=vp8_sad16x16_wmt
+vp8_sad16x16_media=vp8_sad16x16_armv6
+
+#
+# Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
+#
+prototype void vp8_sad4x4x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
+specialize vp8_sad4x4x3 sse3
+
+prototype void vp8_sad8x8x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
+specialize vp8_sad8x8x3 sse3
+
+prototype void vp8_sad8x16x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
+specialize vp8_sad8x16x3 sse3
+
+prototype void vp8_sad16x8x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
+specialize vp8_sad16x8x3 sse3 ssse3
+
+prototype void vp8_sad16x16x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
+specialize vp8_sad16x16x3 sse3 ssse3
+
+# Note: the only difference in the following prototypes is that they write
+# their results into an array of unsigned short instead of unsigned int.
+prototype void vp8_sad4x4x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
+specialize vp8_sad4x4x8 sse4_1
+vp8_sad4x4x8_sse4_1=vp8_sad4x4x8_sse4
+
+prototype void vp8_sad8x8x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
+specialize vp8_sad8x8x8 sse4_1
+vp8_sad8x8x8_sse4_1=vp8_sad8x8x8_sse4
+
+prototype void vp8_sad8x16x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
+specialize vp8_sad8x16x8 sse4_1
+vp8_sad8x16x8_sse4_1=vp8_sad8x16x8_sse4
+
+prototype void vp8_sad16x8x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
+specialize vp8_sad16x8x8 sse4_1
+vp8_sad16x8x8_sse4_1=vp8_sad16x8x8_sse4
+
+prototype void vp8_sad16x16x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
+specialize vp8_sad16x16x8 sse4_1
+vp8_sad16x16x8_sse4_1=vp8_sad16x16x8_sse4
+
+#
+# Multi-block SAD, comparing a reference to N independent blocks
+#
+prototype void vp8_sad4x4x4d "const unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr[4], int  ref_stride, unsigned int *sad_array"
+specialize vp8_sad4x4x4d sse3
+
+prototype void vp8_sad8x8x4d "const unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr[4], int  ref_stride, unsigned int *sad_array"
+specialize vp8_sad8x8x4d sse3
+
+prototype void vp8_sad8x16x4d "const unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr[4], int  ref_stride, unsigned int *sad_array"
+specialize vp8_sad8x16x4d sse3
+
+prototype void vp8_sad16x8x4d "const unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr[4], int  ref_stride, unsigned int *sad_array"
+specialize vp8_sad16x8x4d sse3
+
+prototype void vp8_sad16x16x4d "const unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr[4], int  ref_stride, unsigned int *sad_array"
+specialize vp8_sad16x16x4d sse3
+
+#
+# Block copy
+#
+case $arch in
+    x86*)
+    prototype void vp8_copy32xn "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n"
+    specialize vp8_copy32xn sse2 sse3
+    ;;
+esac
+
+#
+# Structured Similarity (SSIM)
+#
+if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then
+    [ "$arch" = "x86_64" ] && sse2_on_x86_64=sse2  # request the sse2 variant on x86_64 only
+    # $sse2_on_x86_64 is deliberately unquoted below: when unset it expands to no argument.
+    prototype void vp8_ssim_parms_8x8 "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
+    specialize vp8_ssim_parms_8x8 $sse2_on_x86_64
+
+    prototype void vp8_ssim_parms_16x16 "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
+    specialize vp8_ssim_parms_16x16 $sse2_on_x86_64
+fi
+
+#
+# Forward DCT
+#
+prototype void vp8_short_fdct4x4 "short *input, short *output, int pitch"
+specialize vp8_short_fdct4x4 mmx sse2 media neon
+vp8_short_fdct4x4_media=vp8_short_fdct4x4_armv6
+
+prototype void vp8_short_fdct8x4 "short *input, short *output, int pitch"
+specialize vp8_short_fdct8x4 mmx sse2 media neon
+vp8_short_fdct8x4_media=vp8_short_fdct8x4_armv6
+
+prototype void vp8_short_walsh4x4 "short *input, short *output, int pitch"
+specialize vp8_short_walsh4x4 sse2 media neon
+vp8_short_walsh4x4_media=vp8_short_walsh4x4_armv6
+
+#
+# Quantizer
+#
+prototype void vp8_regular_quantize_b "struct block *, struct blockd *"
+specialize vp8_regular_quantize_b sse2 sse4_1
+vp8_regular_quantize_b_sse4_1=vp8_regular_quantize_b_sse4
+
+prototype void vp8_fast_quantize_b "struct block *, struct blockd *"
+specialize vp8_fast_quantize_b sse2 ssse3 media neon
+vp8_fast_quantize_b_media=vp8_fast_quantize_b_armv6
+
+prototype void vp8_regular_quantize_b_pair "struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2"
+# no asm yet
+
+prototype void vp8_fast_quantize_b_pair "struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2"
+specialize vp8_fast_quantize_b_pair neon
+
+prototype void vp8_quantize_mb "struct macroblock *"
+specialize vp8_quantize_mb neon
+
+prototype void vp8_quantize_mby "struct macroblock *"
+specialize vp8_quantize_mby neon
+
+prototype void vp8_quantize_mbuv "struct macroblock *"
+specialize vp8_quantize_mbuv neon
+
+#
+# Block subtraction
+#
+prototype int vp8_block_error "short *coeff, short *dqcoeff"
+specialize vp8_block_error mmx sse2
+vp8_block_error_sse2=vp8_block_error_xmm
+
+prototype int vp8_mbblock_error "struct macroblock *mb, int dc"
+specialize vp8_mbblock_error mmx sse2
+vp8_mbblock_error_sse2=vp8_mbblock_error_xmm
+
+prototype int vp8_mbuverror "struct macroblock *mb"
+specialize vp8_mbuverror mmx sse2
+vp8_mbuverror_sse2=vp8_mbuverror_xmm
+
+prototype void vp8_subtract_b "struct block *be, struct blockd *bd, int pitch"
+specialize vp8_subtract_b mmx sse2 media neon
+vp8_subtract_b_media=vp8_subtract_b_armv6
+
+prototype void vp8_subtract_mby "short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride"
+specialize vp8_subtract_mby mmx sse2 media neon
+vp8_subtract_mby_media=vp8_subtract_mby_armv6
+
+prototype void vp8_subtract_mbuv "short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride"
+specialize vp8_subtract_mbuv mmx sse2 media neon
+vp8_subtract_mbuv_media=vp8_subtract_mbuv_armv6
+
+#
+# Motion search
+#
+prototype int vp8_full_search_sad "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv"
+specialize vp8_full_search_sad sse3 sse4_1
+vp8_full_search_sad_sse3=vp8_full_search_sadx3
+vp8_full_search_sad_sse4_1=vp8_full_search_sadx8
+
+prototype int vp8_refining_search_sad "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv"
+specialize vp8_refining_search_sad sse3
+vp8_refining_search_sad_sse3=vp8_refining_search_sadx4
+
+prototype int vp8_diamond_search_sad "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv"
+vp8_diamond_search_sad_sse3=vp8_diamond_search_sadx4  # NOTE(review): no "specialize" line names sse3 for vp8_diamond_search_sad - confirm this mapping is actually used
+
+#
+# Alt-ref Noise Reduction (ARNR)
+#
+if [ "$CONFIG_REALTIME_ONLY" != "yes" ]; then
+    prototype void vp8_temporal_filter_apply "unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count"
+    specialize vp8_temporal_filter_apply sse2
+fi
+
+#
+# Pick Loopfilter
+#
+prototype void vp8_yv12_copy_partial_frame "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc"
+specialize vp8_yv12_copy_partial_frame neon
+
+# End of encoder only functions
+fi
--- a/vp8/common/subpixel.h
+++ b/vp8/common/subpixel.h
@@ -1,86 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef SUBPIXEL_H
-#define SUBPIXEL_H
-
-#define prototype_subpixel_predict(sym) \
-    void sym(unsigned char *src, int src_pitch, int xofst, int yofst, \
-             unsigned char *dst, int dst_pitch)
-
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/subpixel_x86.h"
-#endif
-
-#if ARCH_ARM
-#include "arm/subpixel_arm.h"
-#endif
-
-#ifndef vp8_subpix_sixtap16x16
-#define vp8_subpix_sixtap16x16 vp8_sixtap_predict16x16_c
-#endif
-extern prototype_subpixel_predict(vp8_subpix_sixtap16x16);
-
-#ifndef vp8_subpix_sixtap8x8
-#define vp8_subpix_sixtap8x8 vp8_sixtap_predict8x8_c
-#endif
-extern prototype_subpixel_predict(vp8_subpix_sixtap8x8);
-
-#ifndef vp8_subpix_sixtap8x4
-#define vp8_subpix_sixtap8x4 vp8_sixtap_predict8x4_c
-#endif
-extern prototype_subpixel_predict(vp8_subpix_sixtap8x4);
-
-#ifndef vp8_subpix_sixtap4x4
-#define vp8_subpix_sixtap4x4 vp8_sixtap_predict_c
-#endif
-extern prototype_subpixel_predict(vp8_subpix_sixtap4x4);
-
-#ifndef vp8_subpix_bilinear16x16
-#define vp8_subpix_bilinear16x16 vp8_bilinear_predict16x16_c
-#endif
-extern prototype_subpixel_predict(vp8_subpix_bilinear16x16);
-
-#ifndef vp8_subpix_bilinear8x8
-#define vp8_subpix_bilinear8x8 vp8_bilinear_predict8x8_c
-#endif
-extern prototype_subpixel_predict(vp8_subpix_bilinear8x8);
-
-#ifndef vp8_subpix_bilinear8x4
-#define vp8_subpix_bilinear8x4 vp8_bilinear_predict8x4_c
-#endif
-extern prototype_subpixel_predict(vp8_subpix_bilinear8x4);
-
-#ifndef vp8_subpix_bilinear4x4
-#define vp8_subpix_bilinear4x4 vp8_bilinear_predict4x4_c
-#endif
-extern prototype_subpixel_predict(vp8_subpix_bilinear4x4);
-
-typedef prototype_subpixel_predict((*vp8_subpix_fn_t));
-typedef struct
-{
-    vp8_subpix_fn_t  sixtap16x16;
-    vp8_subpix_fn_t  sixtap8x8;
-    vp8_subpix_fn_t  sixtap8x4;
-    vp8_subpix_fn_t  sixtap4x4;
-    vp8_subpix_fn_t  bilinear16x16;
-    vp8_subpix_fn_t  bilinear8x8;
-    vp8_subpix_fn_t  bilinear8x4;
-    vp8_subpix_fn_t  bilinear4x4;
-} vp8_subpix_rtcd_vtable_t;
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define SUBPIX_INVOKE(ctx,fn) (ctx)->fn
-#else
-#define SUBPIX_INVOKE(ctx,fn) vp8_subpix_##fn
-#endif
-
-#endif
--- a/vp8/common/threading.h
+++ b/vp8/common/threading.h
@@ -33,6 +33,29 @@
 #define pthread_getspecific(ts_key) TlsGetValue(ts_key)
 #define pthread_setspecific(ts_key, value) TlsSetValue(ts_key, (void *)value)
 #define pthread_self() GetCurrentThreadId()
+
+#elif defined(__OS2__)
+/* OS/2: map the pthread-style names used by this header onto OS/2 calls. */
+#define INCL_DOS
+#include <os2.h>
+
+#include <stdlib.h>
+#define THREAD_FUNCTION void
+#define THREAD_FUNCTION_RETURN void
+#define THREAD_SPECIFIC_INDEX PULONG
+#define pthread_t TID
+#define pthread_attr_t ULONG
+#define pthread_create(thhandle,attr,thfunc,tharg) /* attr is ignored; yields 1 when _beginthread returns -1 */ \
+    ((int)((*(thhandle)=_beginthread(thfunc,NULL,1024*1024,tharg))==-1))
+#define pthread_join(thread, result) ((int)DosWaitThread(&(thread),0)) /* result is ignored */
+#define pthread_detach(thread) 0
+#define thread_sleep(nms) DosSleep(nms)
+#define pthread_cancel(thread) DosKillThread(thread)
+#define ts_key_create(ts_key, destructor) /* destructor is ignored; no trailing ';' so it works as an expression */ \
+    DosAllocThreadLocalMemory(1, &(ts_key))
+#define pthread_getspecific(ts_key) ((void *)(*(ts_key)))
+#define pthread_setspecific(ts_key, value) (*(ts_key)=(ULONG)(value))
+#define pthread_self() _gettid()
 #else
 #ifdef __APPLE__
 #include <mach/mach_init.h>
@@ -64,6 +87,76 @@
 #define sem_destroy(sem) if(*sem)((int)(CloseHandle(*sem))==TRUE)
 #define thread_sleep(nms) Sleep(nms)

+#elif defined(__OS2__)
+typedef struct  /* state used by the sem_* functions defined below */
+{
+    HEV  event;        /* event semaphore that sem_wait() blocks on */
+    HMTX wait_mutex;   /* held by sem_wait() for its whole duration */
+    HMTX count_mutex;  /* taken around every update of count */
+    int  count;        /* current semaphore value */
+} sem_t;
+
+static inline int sem_init(sem_t *sem, int pshared, unsigned int value)  /* create the event and both mutexes, set count */
+{
+    DosCreateEventSem(NULL, &sem->event, pshared ? DC_SEM_SHARED : 0,
+                      value > 0 ? TRUE : FALSE);  /* last argument: presumably the initial posted state - confirm */
+    DosCreateMutexSem(NULL, &sem->wait_mutex, 0, FALSE);
+    DosCreateMutexSem(NULL, &sem->count_mutex, 0, FALSE);
+
+    sem->count = value;
+
+    return 0;  /* always 0: the Dos* return codes are not checked */
+}
+
+static inline int sem_wait(sem_t * sem)  /* take one unit from the semaphore; always returns 0 */
+{
+    DosRequestMutexSem(sem->wait_mutex, -1);  /* one waiter at a time; -1 presumably means no timeout - confirm */
+
+    DosWaitEventSem(sem->event, -1);  /* wait on the event that sem_post() posts */
+
+    DosRequestMutexSem(sem->count_mutex, -1);
+
+    sem->count--;
+    if (sem->count == 0)  /* last unit taken: reset the event */
+    {
+        ULONG post_count;  /* out-parameter of DosResetEventSem; value is not used */
+
+        DosResetEventSem(sem->event, &post_count);
+    }
+
+    DosReleaseMutexSem(sem->count_mutex);
+
+    DosReleaseMutexSem(sem->wait_mutex);
+
+    return 0;
+}
+
+static inline int sem_post(sem_t * sem)  /* add one unit and post the event; always returns 0 */
+{
+    DosRequestMutexSem(sem->count_mutex, -1);
+
+    if (sem->count < 32768)  /* at 32768 or above the post is dropped, yet 0 is still returned */
+    {
+        sem->count++;
+        DosPostEventSem(sem->event);
+    }
+
+    DosReleaseMutexSem(sem->count_mutex);
+
+    return 0;
+}
+
+static inline int sem_destroy(sem_t * sem)  /* close the three handles created by sem_init() */
+{
+    DosCloseEventSem(sem->event);
+    DosCloseMutexSem(sem->wait_mutex);
+    DosCloseMutexSem(sem->count_mutex);
+
+    return 0;  /* always 0: the close return codes are not checked */
+}
+
+#define thread_sleep(nms) DosSleep(nms)
+
 #else

 #ifdef __APPLE__
--- a/vp8/common/x86/dequantize_x86.h
+++ b/vp8/common/x86/dequantize_x86.h
@@ -1,58 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef DEQUANTIZE_X86_H
-#define DEQUANTIZE_X86_H
-
-
-/* Note:
- *
- * This platform is commonly built for runtime CPU detection. If you modify
- * any of the function mappings present in this file, be sure to also update
- * them in the function pointer initialization code
- */
-#if HAVE_MMX
-extern prototype_dequant_block(vp8_dequantize_b_mmx);
-extern prototype_dequant_idct_add(vp8_dequant_idct_add_mmx);
-extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_mmx);
-extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_mmx);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_dequant_block
-#define vp8_dequant_block vp8_dequantize_b_mmx
-
-#undef  vp8_dequant_idct_add
-#define vp8_dequant_idct_add vp8_dequant_idct_add_mmx
-
-#undef vp8_dequant_idct_add_y_block
-#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_mmx
-
-#undef vp8_dequant_idct_add_uv_block
-#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_mmx
-
-#endif
-#endif
-
-#if HAVE_SSE2
-extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_sse2);
-extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_sse2);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp8_dequant_idct_add_y_block
-#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_sse2
-
-#undef vp8_dequant_idct_add_uv_block
-#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_sse2
-
-#endif
-#endif
-
-#endif
--- a/vp8/common/x86/idct_blk_mmx.c
+++ b/vp8/common/x86/idct_blk_mmx.c
@@ -9,8 +9,8 @@
 */

 #include "vpx_config.h"
-#include "vp8/common/idct.h"
-#include "vp8/common/dequantize.h"
+#include "vpx_rtcd.h"
+#include "vp8/common/blockd.h"

 extern void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q);

--- a/vp8/common/x86/idct_blk_sse2.c
+++ b/vp8/common/x86/idct_blk_sse2.c
@@ -9,8 +9,7 @@
 */

 #include "vpx_config.h"
-#include "vp8/common/idct.h"
-#include "vp8/common/dequantize.h"
+#include "vpx_rtcd.h"

 void vp8_idct_dequant_0_2x_sse2
            (short *q, short *dq ,
--- a/vp8/common/x86/idct_x86.h
+++ b/vp8/common/x86/idct_x86.h
@@ -1,56 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef IDCT_X86_H
-#define IDCT_X86_H
-
-/* Note:
- *
- * This platform is commonly built for runtime CPU detection. If you modify
- * any of the function mappings present in this file, be sure to also update
- * them in the function pointer initialization code
- */
-
-#if HAVE_MMX
-extern prototype_idct(vp8_short_idct4x4llm_mmx);
-extern prototype_idct_scalar_add(vp8_dc_only_idct_add_mmx);
-
-extern prototype_second_order(vp8_short_inv_walsh4x4_mmx);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_idct_idct16
-#define vp8_idct_idct16 vp8_short_idct4x4llm_mmx
-
-#undef  vp8_idct_idct1_scalar_add
-#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_mmx
-
-#undef vp8_idct_iwalsh16
-#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_mmx
-
-#endif
-#endif
-
-#if HAVE_SSE2
-
-extern prototype_second_order(vp8_short_inv_walsh4x4_sse2);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-
-#undef vp8_idct_iwalsh16
-#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_sse2
-
-#endif
-
-#endif
-
-
-
-#endif
--- a/vp8/common/x86/loopfilter_x86.c
+++ b/vp8/common/x86/loopfilter_x86.c
@@ -12,10 +12,19 @@
 #include "vpx_config.h"
 #include "vp8/common/loopfilter.h"

+#define prototype_loopfilter(sym) \
+    void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
+             const unsigned char *limit, const unsigned char *thresh, int count)
+
+#define prototype_simple_loopfilter(sym) \
+    void sym(unsigned char *y, int ystride, const unsigned char *blimit)
+
 prototype_loopfilter(vp8_mbloop_filter_vertical_edge_mmx);
 prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_mmx);
 prototype_loopfilter(vp8_loop_filter_vertical_edge_mmx);
 prototype_loopfilter(vp8_loop_filter_horizontal_edge_mmx);
+prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_mmx);
+prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_mmx);

 #if HAVE_SSE2 && ARCH_X86_64
 prototype_loopfilter(vp8_loop_filter_bv_y_sse2);
--- a/vp8/common/x86/loopfilter_x86.h
+++ b/vp8/common/x86/loopfilter_x86.h
@@ -1,100 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef LOOPFILTER_X86_H
-#define LOOPFILTER_X86_H
-
-/* Note:
- *
- * This platform is commonly built for runtime CPU detection. If you modify
- * any of the function mappings present in this file, be sure to also update
- * them in the function pointer initialization code
- */
-
-#if HAVE_MMX
-extern prototype_loopfilter_block(vp8_loop_filter_mbv_mmx);
-extern prototype_loopfilter_block(vp8_loop_filter_bv_mmx);
-extern prototype_loopfilter_block(vp8_loop_filter_mbh_mmx);
-extern prototype_loopfilter_block(vp8_loop_filter_bh_mmx);
-extern prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_mmx);
-extern prototype_simple_loopfilter(vp8_loop_filter_bvs_mmx);
-extern prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_mmx);
-extern prototype_simple_loopfilter(vp8_loop_filter_bhs_mmx);
-
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_lf_normal_mb_v
-#define vp8_lf_normal_mb_v vp8_loop_filter_mbv_mmx
-
-#undef  vp8_lf_normal_b_v
-#define vp8_lf_normal_b_v vp8_loop_filter_bv_mmx
-
-#undef  vp8_lf_normal_mb_h
-#define vp8_lf_normal_mb_h vp8_loop_filter_mbh_mmx
-
-#undef  vp8_lf_normal_b_h
-#define vp8_lf_normal_b_h vp8_loop_filter_bh_mmx
-
-#undef  vp8_lf_simple_mb_v
-#define vp8_lf_simple_mb_v vp8_loop_filter_simple_vertical_edge_mmx
-
-#undef  vp8_lf_simple_b_v
-#define vp8_lf_simple_b_v vp8_loop_filter_bvs_mmx
-
-#undef  vp8_lf_simple_mb_h
-#define vp8_lf_simple_mb_h vp8_loop_filter_simple_horizontal_edge_mmx
-
-#undef  vp8_lf_simple_b_h
-#define vp8_lf_simple_b_h vp8_loop_filter_bhs_mmx
-#endif
-#endif
-
-
-#if HAVE_SSE2
-extern prototype_loopfilter_block(vp8_loop_filter_mbv_sse2);
-extern prototype_loopfilter_block(vp8_loop_filter_bv_sse2);
-extern prototype_loopfilter_block(vp8_loop_filter_mbh_sse2);
-extern prototype_loopfilter_block(vp8_loop_filter_bh_sse2);
-extern prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_sse2);
-extern prototype_simple_loopfilter(vp8_loop_filter_bvs_sse2);
-extern prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_sse2);
-extern prototype_simple_loopfilter(vp8_loop_filter_bhs_sse2);
-
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_lf_normal_mb_v
-#define vp8_lf_normal_mb_v vp8_loop_filter_mbv_sse2
-
-#undef  vp8_lf_normal_b_v
-#define vp8_lf_normal_b_v vp8_loop_filter_bv_sse2
-
-#undef  vp8_lf_normal_mb_h
-#define vp8_lf_normal_mb_h vp8_loop_filter_mbh_sse2
-
-#undef  vp8_lf_normal_b_h
-#define vp8_lf_normal_b_h vp8_loop_filter_bh_sse2
-
-#undef  vp8_lf_simple_mb_v
-#define vp8_lf_simple_mb_v vp8_loop_filter_simple_vertical_edge_sse2
-
-#undef  vp8_lf_simple_b_v
-#define vp8_lf_simple_b_v vp8_loop_filter_bvs_sse2
-
-#undef  vp8_lf_simple_mb_h
-#define vp8_lf_simple_mb_h vp8_loop_filter_simple_horizontal_edge_sse2
-
-#undef  vp8_lf_simple_b_h
-#define vp8_lf_simple_b_h vp8_loop_filter_bhs_sse2
-#endif
-#endif
-
-
-#endif
--- a/vp8/common/x86/postproc_x86.h
+++ b/vp8/common/x86/postproc_x86.h
@@ -1,64 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef POSTPROC_X86_H
-#define POSTPROC_X86_H
-
-/* Note:
- *
- * This platform is commonly built for runtime CPU detection. If you modify
- * any of the function mappings present in this file, be sure to also update
- * them in the function pointer initialization code
- */
-
-#if HAVE_MMX
-extern prototype_postproc_inplace(vp8_mbpost_proc_down_mmx);
-extern prototype_postproc(vp8_post_proc_down_and_across_mmx);
-extern prototype_postproc_addnoise(vp8_plane_add_noise_mmx);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_postproc_down
-#define vp8_postproc_down vp8_mbpost_proc_down_mmx
-
-#undef  vp8_postproc_downacross
-#define vp8_postproc_downacross vp8_post_proc_down_and_across_mmx
-
-#undef  vp8_postproc_addnoise
-#define vp8_postproc_addnoise vp8_plane_add_noise_mmx
-
-#endif
-#endif
-
-
-#if HAVE_SSE2
-extern prototype_postproc_inplace(vp8_mbpost_proc_down_xmm);
-extern prototype_postproc_inplace(vp8_mbpost_proc_across_ip_xmm);
-extern prototype_postproc(vp8_post_proc_down_and_across_xmm);
-extern prototype_postproc_addnoise(vp8_plane_add_noise_wmt);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_postproc_down
-#define vp8_postproc_down vp8_mbpost_proc_down_xmm
-
-#undef  vp8_postproc_across
-#define vp8_postproc_across vp8_mbpost_proc_across_ip_xmm
-
-#undef  vp8_postproc_downacross
-#define vp8_postproc_downacross vp8_post_proc_down_and_across_xmm
-
-#undef  vp8_postproc_addnoise
-#define vp8_postproc_addnoise vp8_plane_add_noise_wmt
-
-
-#endif
-#endif
-
-#endif
--- a/vp8/common/x86/recon_wrapper_sse2.c
+++ b/vp8/common/x86/recon_wrapper_sse2.c
@@ -9,9 +9,9 @@
 */

 #include "vpx_config.h"
-#include "vp8/common/recon.h"
-#include "recon_x86.h"
+#include "vpx_rtcd.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vp8/common/blockd.h"

 #define build_intra_predictors_mbuv_prototype(sym) \
    void sym(unsigned char *dst, int dst_stride, \
--- a/vp8/common/x86/recon_x86.h
+++ b/vp8/common/x86/recon_x86.h
@@ -1,88 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef RECON_X86_H
-#define RECON_X86_H
-
-/* Note:
- *
- * This platform is commonly built for runtime CPU detection. If you modify
- * any of the function mappings present in this file, be sure to also update
- * them in the function pointer initialization code
- */
-
-#if HAVE_MMX
-extern prototype_copy_block(vp8_copy_mem8x8_mmx);
-extern prototype_copy_block(vp8_copy_mem8x4_mmx);
-extern prototype_copy_block(vp8_copy_mem16x16_mmx);
-
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_recon_copy8x8
-#define vp8_recon_copy8x8 vp8_copy_mem8x8_mmx
-
-#undef  vp8_recon_copy8x4
-#define vp8_recon_copy8x4 vp8_copy_mem8x4_mmx
-
-#undef  vp8_recon_copy16x16
-#define vp8_recon_copy16x16 vp8_copy_mem16x16_mmx
-
-#endif
-#endif
-
-#if HAVE_SSE2
-extern prototype_copy_block(vp8_copy_mem16x16_sse2);
-extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_sse2);
-extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_s_sse2);
-extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_sse2);
-extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_s_sse2);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_recon_copy16x16
-#define vp8_recon_copy16x16 vp8_copy_mem16x16_sse2
-
-#undef  vp8_recon_build_intra_predictors_mbuv
-#define vp8_recon_build_intra_predictors_mbuv vp8_build_intra_predictors_mbuv_sse2
-
-#undef  vp8_recon_build_intra_predictors_mbuv_s
-#define vp8_recon_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_sse2
-
-#undef  vp8_recon_build_intra_predictors_mby
-#define vp8_recon_build_intra_predictors_mby vp8_build_intra_predictors_mby_sse2
-
-#undef  vp8_recon_build_intra_predictors_mby_s
-#define vp8_recon_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_sse2
-
-#endif
-#endif
-
-#if HAVE_SSSE3
-extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_ssse3);
-extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_s_ssse3);
-extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_ssse3);
-extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_s_ssse3);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_recon_build_intra_predictors_mbuv
-#define vp8_recon_build_intra_predictors_mbuv vp8_build_intra_predictors_mbuv_ssse3
-
-#undef  vp8_recon_build_intra_predictors_mbuv_s
-#define vp8_recon_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_ssse3
-
-#undef  vp8_recon_build_intra_predictors_mby
-#define vp8_recon_build_intra_predictors_mby vp8_build_intra_predictors_mby_ssse3
-
-#undef  vp8_recon_build_intra_predictors_mby_s
-#define vp8_recon_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_ssse3
-
-#endif
-#endif
-#endif
--- a/vp8/common/x86/subpixel_x86.h
+++ b/vp8/common/x86/subpixel_x86.h
@@ -1,124 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef SUBPIXEL_X86_H
-#define SUBPIXEL_X86_H
-
-#include "filter_x86.h"
-
-/* Note:
- *
- * This platform is commonly built for runtime CPU detection. If you modify
- * any of the function mappings present in this file, be sure to also update
- * them in the function pointer initialization code
- */
-
-#if HAVE_MMX
-extern prototype_subpixel_predict(vp8_sixtap_predict16x16_mmx);
-extern prototype_subpixel_predict(vp8_sixtap_predict8x8_mmx);
-extern prototype_subpixel_predict(vp8_sixtap_predict8x4_mmx);
-extern prototype_subpixel_predict(vp8_sixtap_predict4x4_mmx);
-extern prototype_subpixel_predict(vp8_bilinear_predict16x16_mmx);
-extern prototype_subpixel_predict(vp8_bilinear_predict8x8_mmx);
-extern prototype_subpixel_predict(vp8_bilinear_predict8x4_mmx);
-extern prototype_subpixel_predict(vp8_bilinear_predict4x4_mmx);
-
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_subpix_sixtap16x16
-#define vp8_subpix_sixtap16x16 vp8_sixtap_predict16x16_mmx
-
-#undef  vp8_subpix_sixtap8x8
-#define vp8_subpix_sixtap8x8 vp8_sixtap_predict8x8_mmx
-
-#undef  vp8_subpix_sixtap8x4
-#define vp8_subpix_sixtap8x4 vp8_sixtap_predict8x4_mmx
-
-#undef  vp8_subpix_sixtap4x4
-#define vp8_subpix_sixtap4x4 vp8_sixtap_predict4x4_mmx
-
-#undef  vp8_subpix_bilinear16x16
-#define vp8_subpix_bilinear16x16 vp8_bilinear_predict16x16_mmx
-
-#undef  vp8_subpix_bilinear8x8
-#define vp8_subpix_bilinear8x8 vp8_bilinear_predict8x8_mmx
-
-#undef  vp8_subpix_bilinear8x4
-#define vp8_subpix_bilinear8x4 vp8_bilinear_predict8x4_mmx
-
-#undef  vp8_subpix_bilinear4x4
-#define vp8_subpix_bilinear4x4 vp8_bilinear_predict4x4_mmx
-
-#endif
-#endif
-
-
-#if HAVE_SSE2
-extern prototype_subpixel_predict(vp8_sixtap_predict16x16_sse2);
-extern prototype_subpixel_predict(vp8_sixtap_predict8x8_sse2);
-extern prototype_subpixel_predict(vp8_sixtap_predict8x4_sse2);
-extern prototype_subpixel_predict(vp8_bilinear_predict16x16_sse2);
-extern prototype_subpixel_predict(vp8_bilinear_predict8x8_sse2);
-
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_subpix_sixtap16x16
-#define vp8_subpix_sixtap16x16 vp8_sixtap_predict16x16_sse2
-
-#undef  vp8_subpix_sixtap8x8
-#define vp8_subpix_sixtap8x8 vp8_sixtap_predict8x8_sse2
-
-#undef  vp8_subpix_sixtap8x4
-#define vp8_subpix_sixtap8x4 vp8_sixtap_predict8x4_sse2
-
-#undef  vp8_subpix_bilinear16x16
-#define vp8_subpix_bilinear16x16 vp8_bilinear_predict16x16_sse2
-
-#undef  vp8_subpix_bilinear8x8
-#define vp8_subpix_bilinear8x8 vp8_bilinear_predict8x8_sse2
-
-#endif
-#endif
-
-#if HAVE_SSSE3
-extern prototype_subpixel_predict(vp8_sixtap_predict16x16_ssse3);
-extern prototype_subpixel_predict(vp8_sixtap_predict8x8_ssse3);
-extern prototype_subpixel_predict(vp8_sixtap_predict8x4_ssse3);
-extern prototype_subpixel_predict(vp8_sixtap_predict4x4_ssse3);
-extern prototype_subpixel_predict(vp8_bilinear_predict16x16_ssse3);
-extern prototype_subpixel_predict(vp8_bilinear_predict8x8_ssse3);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_subpix_sixtap16x16
-#define vp8_subpix_sixtap16x16 vp8_sixtap_predict16x16_ssse3
-
-#undef  vp8_subpix_sixtap8x8
-#define vp8_subpix_sixtap8x8 vp8_sixtap_predict8x8_ssse3
-
-#undef  vp8_subpix_sixtap8x4
-#define vp8_subpix_sixtap8x4 vp8_sixtap_predict8x4_ssse3
-
-#undef  vp8_subpix_sixtap4x4
-#define vp8_subpix_sixtap4x4 vp8_sixtap_predict4x4_ssse3
-
-
-#undef  vp8_subpix_bilinear16x16
-#define vp8_subpix_bilinear16x16 vp8_bilinear_predict16x16_ssse3
-
-#undef  vp8_subpix_bilinear8x8
-#define vp8_subpix_bilinear8x8 vp8_bilinear_predict8x8_ssse3
-
-#endif
-#endif
-
-
-
-#endif
--- a/vp8/common/x86/vp8_asm_stubs.c
+++ b/vp8/common/x86/vp8_asm_stubs.c
@@ -10,8 +10,8 @@


 #include "vpx_config.h"
+#include "vpx_rtcd.h"
 #include "vpx_ports/mem.h"
-#include "vp8/common/subpixel.h"
 #include "filter_x86.h"

 extern const short vp8_six_tap_mmx[8][6*8];
@@ -114,7 +114,6 @@ extern void vp8_filter_block1d8_v6_only_sse2
    unsigned int   output_height,
    const short   *vp8_filter
 );
-extern prototype_subpixel_predict(vp8_bilinear_predict8x8_mmx);


 #if HAVE_MMX
--- a/vp8/common/x86/x86_systemdependent.c
+++ b/vp8/common/x86/x86_systemdependent.c
@@ -1,146 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_config.h"
-#include "vpx_ports/x86.h"
-#include "vp8/common/subpixel.h"
-#include "vp8/common/loopfilter.h"
-#include "vp8/common/recon.h"
-#include "vp8/common/idct.h"
-#include "vp8/common/pragmas.h"
-#include "vp8/common/onyxc_int.h"
-
-void vp8_arch_x86_common_init(VP8_COMMON *ctx)
-{
-#if CONFIG_RUNTIME_CPU_DETECT
-    VP8_COMMON_RTCD *rtcd = &ctx->rtcd;
-    int flags = x86_simd_caps();
-
-    /* Note:
-     *
-     * This platform can be built without runtime CPU detection as well. If
-     * you modify any of the function mappings present in this file, be sure
-     * to also update them in static mapings (<arch>/filename_<arch>.h)
-     */
-
-    /* Override default functions with fastest ones for this CPU. */
-#if HAVE_MMX
-
-    if (flags & HAS_MMX)
-    {
-        rtcd->dequant.block               = vp8_dequantize_b_mmx;
-        rtcd->dequant.idct_add            = vp8_dequant_idct_add_mmx;
-        rtcd->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_mmx;
-        rtcd->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_mmx;
-
-        rtcd->idct.idct16       = vp8_short_idct4x4llm_mmx;
-        rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_mmx;
-        rtcd->idct.iwalsh16     = vp8_short_inv_walsh4x4_mmx;
-
-        rtcd->recon.copy8x8     = vp8_copy_mem8x8_mmx;
-        rtcd->recon.copy8x4     = vp8_copy_mem8x4_mmx;
-        rtcd->recon.copy16x16   = vp8_copy_mem16x16_mmx;
-
-        rtcd->subpix.sixtap16x16   = vp8_sixtap_predict16x16_mmx;
-        rtcd->subpix.sixtap8x8     = vp8_sixtap_predict8x8_mmx;
-        rtcd->subpix.sixtap8x4     = vp8_sixtap_predict8x4_mmx;
-        rtcd->subpix.sixtap4x4     = vp8_sixtap_predict4x4_mmx;
-        rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_mmx;
-        rtcd->subpix.bilinear8x8   = vp8_bilinear_predict8x8_mmx;
-        rtcd->subpix.bilinear8x4   = vp8_bilinear_predict8x4_mmx;
-        rtcd->subpix.bilinear4x4   = vp8_bilinear_predict4x4_mmx;
-
-        rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_mmx;
-        rtcd->loopfilter.normal_b_v  = vp8_loop_filter_bv_mmx;
-        rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_mmx;
-        rtcd->loopfilter.normal_b_h  = vp8_loop_filter_bh_mmx;
-        rtcd->loopfilter.simple_mb_v = vp8_loop_filter_simple_vertical_edge_mmx;
-        rtcd->loopfilter.simple_b_v  = vp8_loop_filter_bvs_mmx;
-        rtcd->loopfilter.simple_mb_h = vp8_loop_filter_simple_horizontal_edge_mmx;
-        rtcd->loopfilter.simple_b_h  = vp8_loop_filter_bhs_mmx;
-
-#if CONFIG_POSTPROC
-        rtcd->postproc.down        = vp8_mbpost_proc_down_mmx;
-        /*rtcd->postproc.across      = vp8_mbpost_proc_across_ip_c;*/
-        rtcd->postproc.downacross  = vp8_post_proc_down_and_across_mmx;
-        rtcd->postproc.addnoise    = vp8_plane_add_noise_mmx;
-#endif
-    }
-
-#endif
-#if HAVE_SSE2
-
-    if (flags & HAS_SSE2)
-    {
-        rtcd->recon.copy16x16   = vp8_copy_mem16x16_sse2;
-        rtcd->recon.build_intra_predictors_mbuv =
-            vp8_build_intra_predictors_mbuv_sse2;
-        rtcd->recon.build_intra_predictors_mbuv_s =
-            vp8_build_intra_predictors_mbuv_s_sse2;
-        rtcd->recon.build_intra_predictors_mby =
-            vp8_build_intra_predictors_mby_sse2;
-        rtcd->recon.build_intra_predictors_mby_s =
-            vp8_build_intra_predictors_mby_s_sse2;
-
-        rtcd->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_sse2;
-        rtcd->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_sse2;
-
-        rtcd->idct.iwalsh16     = vp8_short_inv_walsh4x4_sse2;
-
-        rtcd->subpix.sixtap16x16   = vp8_sixtap_predict16x16_sse2;
-        rtcd->subpix.sixtap8x8     = vp8_sixtap_predict8x8_sse2;
-        rtcd->subpix.sixtap8x4     = vp8_sixtap_predict8x4_sse2;
-        rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_sse2;
-        rtcd->subpix.bilinear8x8   = vp8_bilinear_predict8x8_sse2;
-
-        rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_sse2;
-        rtcd->loopfilter.normal_b_v  = vp8_loop_filter_bv_sse2;
-        rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_sse2;
-        rtcd->loopfilter.normal_b_h  = vp8_loop_filter_bh_sse2;
-        rtcd->loopfilter.simple_mb_v = vp8_loop_filter_simple_vertical_edge_sse2;
-        rtcd->loopfilter.simple_b_v  = vp8_loop_filter_bvs_sse2;
-        rtcd->loopfilter.simple_mb_h = vp8_loop_filter_simple_horizontal_edge_sse2;
-        rtcd->loopfilter.simple_b_h  = vp8_loop_filter_bhs_sse2;
-
-#if CONFIG_POSTPROC
-        rtcd->postproc.down        = vp8_mbpost_proc_down_xmm;
-        rtcd->postproc.across      = vp8_mbpost_proc_across_ip_xmm;
-        rtcd->postproc.downacross  = vp8_post_proc_down_and_across_xmm;
-        rtcd->postproc.addnoise    = vp8_plane_add_noise_wmt;
-#endif
-    }
-
-#endif
-
-#if HAVE_SSSE3
-
-    if (flags & HAS_SSSE3)
-    {
-        rtcd->subpix.sixtap16x16   = vp8_sixtap_predict16x16_ssse3;
-        rtcd->subpix.sixtap8x8     = vp8_sixtap_predict8x8_ssse3;
-        rtcd->subpix.sixtap8x4     = vp8_sixtap_predict8x4_ssse3;
-        rtcd->subpix.sixtap4x4     = vp8_sixtap_predict4x4_ssse3;
-        rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_ssse3;
-        rtcd->subpix.bilinear8x8   = vp8_bilinear_predict8x8_ssse3;
-
-        rtcd->recon.build_intra_predictors_mbuv =
-            vp8_build_intra_predictors_mbuv_ssse3;
-        rtcd->recon.build_intra_predictors_mbuv_s =
-            vp8_build_intra_predictors_mbuv_s_ssse3;
-        rtcd->recon.build_intra_predictors_mby =
-            vp8_build_intra_predictors_mby_ssse3;
-        rtcd->recon.build_intra_predictors_mby_s =
-            vp8_build_intra_predictors_mby_s_ssse3;
-    }
-#endif
-
-#endif
-}
--- a/vp8/decoder/arm/arm_dsystemdependent.c
+++ b/vp8/decoder/arm/arm_dsystemdependent.c
@@ -1,39 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_config.h"
-#include "vpx_ports/arm.h"
-#include "vp8/decoder/onyxd_int.h"
-
-void vp8_arch_arm_decode_init(VP8D_COMP *pbi)
-{
-#if CONFIG_RUNTIME_CPU_DETECT
-    int flags = pbi->common.rtcd.flags;
-
-#if HAVE_ARMV5TE
-    if (flags & HAS_EDSP)
-    {
-    }
-#endif
-
-#if HAVE_ARMV6
-    if (flags & HAS_MEDIA)
-    {
-    }
-#endif
-
-#if HAVE_ARMV7
-    if (flags & HAS_NEON)
-    {
-    }
-#endif
-#endif
-}
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -9,13 +9,12 @@
 */


+#include "vpx_config.h"
+#include "vpx_rtcd.h"
 #include "onyxd_int.h"
 #include "vp8/common/header.h"
-#include "vp8/common/reconintra.h"
 #include "vp8/common/reconintra4x4.h"
-#include "vp8/common/recon.h"
 #include "vp8/common/reconinter.h"
-#include "vp8/common/dequantize.h"
 #include "detokenize.h"
 #include "vp8/common/invtrans.h"
 #include "vp8/common/alloccommon.h"
@@ -31,8 +30,6 @@
 #include "error_concealment.h"
 #endif
 #include "vpx_mem/vpx_mem.h"
-#include "vp8/common/idct.h"
-
 #include "vp8/common/threading.h"
 #include "decoderthreading.h"
 #include "dboolhuff.h"
@@ -96,11 +93,6 @@ void mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd)
    }
 }

-#if CONFIG_RUNTIME_CPU_DETECT
-#define RTCD_VTABLE(x) (&(pbi)->common.rtcd.x)
-#else
-#define RTCD_VTABLE(x) NULL
-#endif

 static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
                              unsigned int mb_idx)
@@ -164,16 +156,17 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
    /* do prediction */
    if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
    {
-        RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_mbuv_s)(xd);
+        vp8_build_intra_predictors_mbuv_s(xd);

        if (mode != B_PRED)
        {
-            RECON_INVOKE(&pbi->common.rtcd.recon,
-                         build_intra_predictors_mby_s)(xd);
+            vp8_build_intra_predictors_mby_s(xd);
        }
        else
        {
            short *DQC = xd->dequant_y1;
+            int dst_stride = xd->dst.y_stride;
+            unsigned char *base_dst = xd->dst.y_buffer;

            /* clear out residual eob info */
            if(xd->mode_info_context->mbmi.mb_skip_coeff)
@@ -186,24 +179,24 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
                BLOCKD *b = &xd->block[i];
                int b_mode = xd->mode_info_context->bmi[i].as_mode;

-                RECON_INVOKE(RTCD_VTABLE(recon), intra4x4_predict)
-                              ( *(b->base_dst) + b->dst, b->dst_stride, b_mode,
-                                *(b->base_dst) + b->dst, b->dst_stride );
+
+                vp8_intra4x4_predict (base_dst + b->offset, dst_stride, b_mode,
+                                      base_dst + b->offset, dst_stride );

                if (xd->eobs[i])
                {
                    if (xd->eobs[i] > 1)
                    {
-                        DEQUANT_INVOKE(&pbi->common.rtcd.dequant, idct_add)
+                    vp8_dequant_idct_add
                            (b->qcoeff, DQC,
-                            *(b->base_dst) + b->dst, b->dst_stride);
+                                base_dst + b->offset, dst_stride);
                    }
                    else
                    {
-                        IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
+                        vp8_dc_only_idct_add
                            (b->qcoeff[0] * DQC[0],
-                            *(b->base_dst) + b->dst, b->dst_stride,
-                            *(b->base_dst) + b->dst, b->dst_stride);
+                                base_dst + b->offset, dst_stride,
+                                base_dst + b->offset, dst_stride);
                        ((int *)b->qcoeff)[0] = 0;
                    }
                }
@@ -237,10 +230,9 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
                /* do 2nd order transform on the dc block */
                if (xd->eobs[24] > 1)
                {
-                    DEQUANT_INVOKE(&pbi->common.rtcd.dequant, block)(b,
-                        xd->dequant_y2);
+                    vp8_dequantize_b(b, xd->dequant_y2);

-                    IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0],
+                    vp8_short_inv_walsh4x4(&b->dqcoeff[0],
                        xd->qcoeff);
                    ((int *)b->qcoeff)[0] = 0;
                    ((int *)b->qcoeff)[1] = 0;
@@ -254,7 +246,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
                else
                {
                    b->dqcoeff[0] = b->qcoeff[0] * xd->dequant_y2[0];
-                    IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0],
+                    vp8_short_inv_walsh4x4_1(&b->dqcoeff[0],
                        xd->qcoeff);
                    ((int *)b->qcoeff)[0] = 0;
                }
@@ -265,13 +257,13 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
                DQC = xd->dequant_y1_dc;
            }

-            DEQUANT_INVOKE (&pbi->common.rtcd.dequant, idct_add_y_block)
+            vp8_dequant_idct_add_y_block
                            (xd->qcoeff, DQC,
                             xd->dst.y_buffer,
                             xd->dst.y_stride, xd->eobs);
        }

-        DEQUANT_INVOKE (&pbi->common.rtcd.dequant, idct_add_uv_block)
+        vp8_dequant_idct_add_uv_block
                        (xd->qcoeff+16*16, xd->dequant_uv,
                         xd->dst.u_buffer, xd->dst.v_buffer,
                         xd->dst.uv_stride, xd->eobs+16);
@@ -619,17 +611,17 @@ static void init_frame(VP8D_COMP *pbi)
        /* To enable choice of different interploation filters */
        if (pc->mcomp_filter_type == SIXTAP)
        {
-            xd->subpixel_predict      = SUBPIX_INVOKE(RTCD_VTABLE(subpix), sixtap4x4);
-            xd->subpixel_predict8x4   = SUBPIX_INVOKE(RTCD_VTABLE(subpix), sixtap8x4);
-            xd->subpixel_predict8x8   = SUBPIX_INVOKE(RTCD_VTABLE(subpix), sixtap8x8);
-            xd->subpixel_predict16x16 = SUBPIX_INVOKE(RTCD_VTABLE(subpix), sixtap16x16);
+            xd->subpixel_predict        = vp8_sixtap_predict4x4;
+            xd->subpixel_predict8x4     = vp8_sixtap_predict8x4;
+            xd->subpixel_predict8x8     = vp8_sixtap_predict8x8;
+            xd->subpixel_predict16x16   = vp8_sixtap_predict16x16;
        }
        else
        {
-            xd->subpixel_predict      = SUBPIX_INVOKE(RTCD_VTABLE(subpix), bilinear4x4);
-            xd->subpixel_predict8x4   = SUBPIX_INVOKE(RTCD_VTABLE(subpix), bilinear8x4);
-            xd->subpixel_predict8x8   = SUBPIX_INVOKE(RTCD_VTABLE(subpix), bilinear8x8);
-            xd->subpixel_predict16x16 = SUBPIX_INVOKE(RTCD_VTABLE(subpix), bilinear16x16);
+            xd->subpixel_predict        = vp8_bilinear_predict4x4;
+            xd->subpixel_predict8x4     = vp8_bilinear_predict8x4;
+            xd->subpixel_predict8x8     = vp8_bilinear_predict8x8;
+            xd->subpixel_predict16x16   = vp8_bilinear_predict16x16;
        }

        if (pbi->decoded_key_frame && pbi->ec_enabled && !pbi->ec_active)
--- a/vp8/decoder/error_concealment.c
+++ b/vp8/decoder/error_concealment.c
@@ -12,7 +12,6 @@
 #include "onyxd_int.h"
 #include "decodemv.h"
 #include "vpx_mem/vpx_mem.h"
-#include "vp8/common/recon.h"
 #include "vp8/common/findnearmv.h"

 #include <assert.h>
--- a/vp8/decoder/generic/dsystemdependent.c
+++ b/vp8/decoder/generic/dsystemdependent.c
@@ -1,33 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_config.h"
-#include "vp8/common/dequantize.h"
-#include "vp8/decoder/onyxd_int.h"
-
-extern void vp8_arch_x86_decode_init(VP8D_COMP *pbi);
-extern void vp8_arch_arm_decode_init(VP8D_COMP *pbi);
-
-void vp8_dmachine_specific_config(VP8D_COMP *pbi)
-{
-    /* Pure C: */
-#if CONFIG_RUNTIME_CPU_DETECT
-    pbi->mb.rtcd                               = &pbi->common.rtcd;
-#endif
-
-#if ARCH_X86 || ARCH_X86_64
-    vp8_arch_x86_decode_init(pbi);
-#endif
-
-#if ARCH_ARM
-    vp8_arch_arm_decode_init(pbi);
-#endif
-}
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c
@@ -76,7 +76,6 @@ struct VP8D_COMP * vp8dx_create_decompressor(VP8D_CONFIG *oxcf)
    vp8dx_initialize();

    vp8_create_common(&pbi->common);
-    vp8_dmachine_specific_config(pbi);

    pbi->common.current_video_frame = 0;
    pbi->ready_for_new_data = 1;
@@ -211,7 +210,7 @@ vpx_codec_err_t vp8dx_set_reference(VP8D_COMP *pbi, VP8_REFFRAME ref_frame_flag,
 }

 /*For ARM NEON, d8-d15 are callee-saved registers, and need to be saved by us.*/
-#if HAVE_ARMV7
+#if HAVE_NEON
 extern void vp8_push_neon(int64_t *store);
 extern void vp8_pop_neon(int64_t *store);
 #endif
@@ -298,7 +297,7 @@ static int swap_frame_buffers (VP8_COMMON *cm)

 int vp8dx_receive_compressed_data(VP8D_COMP *pbi, unsigned long size, const unsigned char *source, int64_t time_stamp)
 {
-#if HAVE_ARMV7
+#if HAVE_NEON
    int64_t dx_store_reg[8];
 #endif
    VP8_COMMON *cm = &pbi->common;
@@ -387,9 +386,9 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, unsigned long size, const unsi
        return 0;
    }

-#if HAVE_ARMV7
+#if HAVE_NEON
 #if CONFIG_RUNTIME_CPU_DETECT
-    if (cm->rtcd.flags & HAS_NEON)
+    if (cm->cpu_caps & HAS_NEON)
 #endif
    {
        vp8_push_neon(dx_store_reg);
@@ -400,9 +399,9 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, unsigned long size, const unsi

    if (setjmp(pbi->common.error.jmp))
    {
-#if HAVE_ARMV7
+#if HAVE_NEON
 #if CONFIG_RUNTIME_CPU_DETECT
-        if (cm->rtcd.flags & HAS_NEON)
+        if (cm->cpu_caps & HAS_NEON)
 #endif
        {
            vp8_pop_neon(dx_store_reg);
@@ -429,9 +428,9 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, unsigned long size, const unsi

    if (retcode < 0)
    {
-#if HAVE_ARMV7
+#if HAVE_NEON
 #if CONFIG_RUNTIME_CPU_DETECT
-        if (cm->rtcd.flags & HAS_NEON)
+        if (cm->cpu_caps & HAS_NEON)
 #endif
        {
            vp8_pop_neon(dx_store_reg);
@@ -450,9 +449,9 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, unsigned long size, const unsi
    {
        if (swap_frame_buffers (cm))
        {
-#if HAVE_ARMV7
+#if HAVE_NEON
 #if CONFIG_RUNTIME_CPU_DETECT
-            if (cm->rtcd.flags & HAS_NEON)
+            if (cm->cpu_caps & HAS_NEON)
 #endif
            {
                vp8_pop_neon(dx_store_reg);
@@ -468,9 +467,9 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, unsigned long size, const unsi
    {
        if (swap_frame_buffers (cm))
        {
-#if HAVE_ARMV7
+#if HAVE_NEON
 #if CONFIG_RUNTIME_CPU_DETECT
-            if (cm->rtcd.flags & HAS_NEON)
+            if (cm->cpu_caps & HAS_NEON)
 #endif
            {
                vp8_pop_neon(dx_store_reg);
@@ -558,9 +557,9 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, unsigned long size, const unsi
    }
 #endif

-#if HAVE_ARMV7
+#if HAVE_NEON
 #if CONFIG_RUNTIME_CPU_DETECT
-    if (cm->rtcd.flags & HAS_NEON)
+    if (cm->cpu_caps & HAS_NEON)
 #endif
    {
        vp8_pop_neon(dx_store_reg);
--- a/vp8/decoder/onyxd_int.h
+++ b/vp8/decoder/onyxd_int.h
@@ -17,7 +17,6 @@
 #include "vp8/common/onyxc_int.h"
 #include "vp8/common/threading.h"

-
 #if CONFIG_ERROR_CONCEALMENT
 #include "ec_types.h"
 #endif
@@ -114,8 +113,6 @@ typedef struct VP8D_COMP
 } VP8D_COMP;

 int vp8_decode_frame(VP8D_COMP *cpi);
-void vp8_dmachine_specific_config(VP8D_COMP *pbi);
-

 #if CONFIG_DEBUG
 #define CHECK_MEM_ERROR(lval,expr) do {\
--- a/vp8/decoder/reconintra_mt.c
+++ b/vp8/decoder/reconintra_mt.c
@@ -10,8 +10,7 @@


 #include "vpx_config.h"
-#include "vp8/common/recon.h"
-#include "vp8/common/reconintra.h"
+#include "vpx_rtcd.h"
 #include "vpx_mem/vpx_mem.h"
 #include "onyxd_int.h"

@@ -618,12 +617,15 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
    unsigned char top_left; /* = Above[-1]; */

    BLOCKD *x = &xd->block[num];
+    int dst_stride = xd->dst.y_stride;
+    unsigned char *base_dst = xd->dst.y_buffer;
+

    /*Caution: For some b_mode, it needs 8 pixels (4 above + 4 above-right).*/
    if (num < 4 && pbi->common.filter_level)
        Above = pbi->mt_yabove_row[mb_row] + mb_col*16 + num*4 + 32;
    else
-        Above = *(x->base_dst) + x->dst - x->dst_stride;
+        Above = base_dst + x->offset - dst_stride;

    if (num%4==0 && pbi->common.filter_level)
    {
@@ -631,10 +633,10 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
            Left[i] = pbi->mt_yleft_col[mb_row][num + i];
    }else
    {
-        Left[0] = (*(x->base_dst))[x->dst - 1];
-        Left[1] = (*(x->base_dst))[x->dst - 1 + x->dst_stride];
-        Left[2] = (*(x->base_dst))[x->dst - 1 + 2 * x->dst_stride];
-        Left[3] = (*(x->base_dst))[x->dst - 1 + 3 * x->dst_stride];
+        Left[0] = (base_dst)[x->offset - 1];
+        Left[1] = (base_dst)[x->offset - 1 + dst_stride];
+        Left[2] = (base_dst)[x->offset - 1 + 2 * dst_stride];
+        Left[3] = (base_dst)[x->offset - 1 + 3 * dst_stride];
    }

    if ((num==4 || num==8 || num==12) && pbi->common.filter_level)
@@ -919,19 +921,22 @@ void vp8mt_intra_prediction_down_copy(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row
    unsigned int *dst_ptr0;
    unsigned int *dst_ptr1;
    unsigned int *dst_ptr2;
+    int dst_stride = x->dst.y_stride;
+    unsigned char *base_dst = x->dst.y_buffer;
+

    if (pbi->common.filter_level)
        above_right = pbi->mt_yabove_row[mb_row] + mb_col*16 + 32 +16;
    else
-        above_right = *(x->block[0].base_dst) + x->block[0].dst - x->block[0].dst_stride + 16;
+        above_right = base_dst + x->block[0].offset - dst_stride + 16;

    src_ptr = (unsigned int *)above_right;
    /*dst_ptr0 = (unsigned int *)(above_right + 4 * x->block[0].dst_stride);
    dst_ptr1 = (unsigned int *)(above_right + 8 * x->block[0].dst_stride);
    dst_ptr2 = (unsigned int *)(above_right + 12 * x->block[0].dst_stride);*/
-    dst_ptr0 = (unsigned int *)(*(x->block[0].base_dst) + x->block[0].dst + 16 + 3 * x->block[0].dst_stride);
-    dst_ptr1 = (unsigned int *)(*(x->block[0].base_dst) + x->block[0].dst + 16 + 7 * x->block[0].dst_stride);
-    dst_ptr2 = (unsigned int *)(*(x->block[0].base_dst) + x->block[0].dst + 16 + 11 * x->block[0].dst_stride);
+    dst_ptr0 = (unsigned int *)(base_dst + x->block[0].offset + 16 + 3 * dst_stride);
+    dst_ptr1 = (unsigned int *)(base_dst + x->block[0].offset + 16 + 7 * dst_stride);
+    dst_ptr2 = (unsigned int *)(base_dst + x->block[0].offset + 16 + 11 * dst_stride);
    *dst_ptr0 = *src_ptr;
    *dst_ptr1 = *src_ptr;
    *dst_ptr2 = *src_ptr;
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -9,6 +9,8 @@
 */


+#include "vpx_config.h"
+#include "vpx_rtcd.h"
 #if !defined(WIN32) && CONFIG_OS_SUPPORT == 1
 # include <unistd.h>
 #endif
@@ -28,12 +30,6 @@

 extern void mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd);

-#if CONFIG_RUNTIME_CPU_DETECT
-#define RTCD_VTABLE(x) (&(pbi)->common.rtcd.x)
-#else
-#define RTCD_VTABLE(x) NULL
-#endif
-
 static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count)
 {
    VP8_COMMON *const pc = & pbi->common;
@@ -42,9 +38,6 @@ static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_D
    for (i = 0; i < count; i++)
    {
        MACROBLOCKD *mbd = &mbrd[i].mbd;
-#if CONFIG_RUNTIME_CPU_DETECT
-        mbd->rtcd = xd->rtcd;
-#endif
        mbd->subpixel_predict        = xd->subpixel_predict;
        mbd->subpixel_predict8x4     = xd->subpixel_predict8x4;
        mbd->subpixel_predict8x8     = xd->subpixel_predict8x8;
@@ -54,9 +47,6 @@ static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_D
        mbd->mode_info_stride  = pc->mode_info_stride;

        mbd->frame_type = pc->frame_type;
-        mbd->frames_since_golden      = pc->frames_since_golden;
-        mbd->frames_till_alt_ref_frame  = pc->frames_till_alt_ref_frame;
-
        mbd->pre = pc->yv12_fb[pc->lst_fb_idx];
        mbd->dst = pc->yv12_fb[pc->new_fb_idx];

@@ -178,29 +168,31 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
    if (xd->mode_info_context->mbmi.mode == B_PRED)
    {
        short *DQC = xd->dequant_y1;
+        int dst_stride = xd->dst.y_stride;
+        unsigned char *base_dst = xd->dst.y_buffer;

        for (i = 0; i < 16; i++)
        {
            BLOCKD *b = &xd->block[i];
            int b_mode = xd->mode_info_context->bmi[i].as_mode;

-            vp8mt_predict_intra4x4(pbi, xd, b_mode, *(b->base_dst) + b->dst,
-                                   b->dst_stride, mb_row, mb_col, i);
+            vp8mt_predict_intra4x4(pbi, xd, b_mode, base_dst + b->offset,
+                                   dst_stride, mb_row, mb_col, i);

            if (xd->eobs[i] )
            {
                if (xd->eobs[i] > 1)
                {
-                    DEQUANT_INVOKE(&pbi->common.rtcd.dequant, idct_add)
+                    vp8_dequant_idct_add
                        (b->qcoeff, DQC,
-                        *(b->base_dst) + b->dst, b->dst_stride);
+                        base_dst + b->offset, dst_stride);
                }
                else
                {
-                    IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
+                    vp8_dc_only_idct_add
                        (b->qcoeff[0] * DQC[0],
-                        *(b->base_dst) + b->dst, b->dst_stride,
-                        *(b->base_dst) + b->dst, b->dst_stride);
+                        base_dst + b->offset, dst_stride,
+                        base_dst + b->offset, dst_stride);
                    ((int *)b->qcoeff)[0] = 0;
                }
            }
@@ -217,9 +209,9 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
            /* do 2nd order transform on the dc block */
            if (xd->eobs[24] > 1)
            {
-                DEQUANT_INVOKE(&pbi->common.rtcd.dequant, block)(b, xd->dequant_y2);
+                vp8_dequantize_b(b, xd->dequant_y2);

-                IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0],
+                vp8_short_inv_walsh4x4(&b->dqcoeff[0],
                    xd->qcoeff);
                ((int *)b->qcoeff)[0] = 0;
                ((int *)b->qcoeff)[1] = 0;
@@ -233,7 +225,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
            else
            {
                b->dqcoeff[0] = b->qcoeff[0] * xd->dequant_y2[0];
-                IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], xd->qcoeff);
+                vp8_short_inv_walsh4x4_1(&b->dqcoeff[0], xd->qcoeff);
                ((int *)b->qcoeff)[0] = 0;
            }

@@ -241,13 +233,13 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
            DQC = xd->dequant_y1_dc;
        }

-        DEQUANT_INVOKE (&pbi->common.rtcd.dequant, idct_add_y_block)
+        vp8_dequant_idct_add_y_block
                        (xd->qcoeff, DQC,
                         xd->dst.y_buffer,
                         xd->dst.y_stride, xd->eobs);
    }

-    DEQUANT_INVOKE (&pbi->common.rtcd.dequant, idct_add_uv_block)
+    vp8_dequant_idct_add_uv_block
                    (xd->qcoeff+16*16, xd->dequant_uv,
                     xd->dst.u_buffer, xd->dst.v_buffer,
                     xd->dst.uv_stride, xd->eobs+16);
@@ -440,39 +432,39 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data)
                                    lfi.hev_thr = lfi_n->hev_thr[hev_index];

                                    if (mb_col > 0)
-                                        LF_INVOKE(&pc->rtcd.loopfilter, normal_mb_v)
+                                        vp8_loop_filter_mbv
                                        (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);

                                    if (!skip_lf)
-                                        LF_INVOKE(&pc->rtcd.loopfilter, normal_b_v)
+                                        vp8_loop_filter_bv
                                        (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);

                                    /* don't apply across umv border */
                                    if (mb_row > 0)
-                                        LF_INVOKE(&pc->rtcd.loopfilter, normal_mb_h)
+                                        vp8_loop_filter_mbh
                                        (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);

                                    if (!skip_lf)
-                                        LF_INVOKE(&pc->rtcd.loopfilter, normal_b_h)
+                                        vp8_loop_filter_bh
                                        (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer,  recon_y_stride, recon_uv_stride, &lfi);
                                }
                                else
                                {
                                    if (mb_col > 0)
-                                        LF_INVOKE(&pc->rtcd.loopfilter, simple_mb_v)
+                                        vp8_loop_filter_simple_mbv
                                        (xd->dst.y_buffer, recon_y_stride, lfi_n->mblim[filter_level]);

                                    if (!skip_lf)
-                                        LF_INVOKE(&pc->rtcd.loopfilter, simple_b_v)
+                                        vp8_loop_filter_simple_bv
                                        (xd->dst.y_buffer, recon_y_stride, lfi_n->blim[filter_level]);

                                    /* don't apply across umv border */
                                    if (mb_row > 0)
-                                        LF_INVOKE(&pc->rtcd.loopfilter, simple_mb_h)
+                                        vp8_loop_filter_simple_mbh
                                        (xd->dst.y_buffer, recon_y_stride, lfi_n->mblim[filter_level]);

                                    if (!skip_lf)
-                                        LF_INVOKE(&pc->rtcd.loopfilter, simple_b_h)
+                                        vp8_loop_filter_simple_bh
                                        (xd->dst.y_buffer, recon_y_stride, lfi_n->blim[filter_level]);
                                }
                            }
@@ -938,39 +930,39 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
                            lfi.hev_thr = lfi_n->hev_thr[hev_index];

                            if (mb_col > 0)
-                                LF_INVOKE(&pc->rtcd.loopfilter, normal_mb_v)
+                                vp8_loop_filter_mbv
                                (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);

                            if (!skip_lf)
-                                LF_INVOKE(&pc->rtcd.loopfilter, normal_b_v)
+                                vp8_loop_filter_bv
                                (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);

                            /* don't apply across umv border */
                            if (mb_row > 0)
-                                LF_INVOKE(&pc->rtcd.loopfilter, normal_mb_h)
+                                vp8_loop_filter_mbh
                                (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);

                            if (!skip_lf)
-                                LF_INVOKE(&pc->rtcd.loopfilter, normal_b_h)
+                                vp8_loop_filter_bh
                                (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer,  recon_y_stride, recon_uv_stride, &lfi);
                        }
                        else
                        {
                            if (mb_col > 0)
-                                LF_INVOKE(&pc->rtcd.loopfilter, simple_mb_v)
+                                vp8_loop_filter_simple_mbv
                                (xd->dst.y_buffer, recon_y_stride, lfi_n->mblim[filter_level]);

                            if (!skip_lf)
-                                LF_INVOKE(&pc->rtcd.loopfilter, simple_b_v)
+                                vp8_loop_filter_simple_bv
                                (xd->dst.y_buffer, recon_y_stride, lfi_n->blim[filter_level]);

                            /* don't apply across umv border */
                            if (mb_row > 0)
-                                LF_INVOKE(&pc->rtcd.loopfilter, simple_mb_h)
+                                vp8_loop_filter_simple_mbh
                                (xd->dst.y_buffer, recon_y_stride, lfi_n->mblim[filter_level]);

                            if (!skip_lf)
-                                LF_INVOKE(&pc->rtcd.loopfilter, simple_b_h)
+                                vp8_loop_filter_simple_bh
                                (xd->dst.y_buffer, recon_y_stride, lfi_n->blim[filter_level]);
                        }
                    }
--- a/vp8/encoder/arm/arm_csystemdependent.c
+++ b/vp8/encoder/arm/arm_csystemdependent.c
@@ -1,137 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_config.h"
-#include "vpx_ports/arm.h"
-#include "vp8/encoder/variance.h"
-#include "vp8/encoder/onyx_int.h"
-
-extern void (*vp8_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
-extern void vp8_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
-extern void vp8_yv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
-
-void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
-{
-#if CONFIG_RUNTIME_CPU_DETECT
-    int flags = cpi->common.rtcd.flags;
-
-#if HAVE_ARMV5TE
-    if (flags & HAS_EDSP)
-    {
-    }
-#endif
-
-#if HAVE_ARMV6
-    if (flags & HAS_MEDIA)
-    {
-        cpi->rtcd.variance.sad16x16              = vp8_sad16x16_armv6;
-        /*cpi->rtcd.variance.sad16x8               = vp8_sad16x8_c;
-        cpi->rtcd.variance.sad8x16               = vp8_sad8x16_c;
-        cpi->rtcd.variance.sad8x8                = vp8_sad8x8_c;
-        cpi->rtcd.variance.sad4x4                = vp8_sad4x4_c;*/
-
-        /*cpi->rtcd.variance.var4x4                = vp8_variance4x4_c;*/
-        cpi->rtcd.variance.var8x8                = vp8_variance8x8_armv6;
-        /*cpi->rtcd.variance.var8x16               = vp8_variance8x16_c;
-        cpi->rtcd.variance.var16x8               = vp8_variance16x8_c;*/
-        cpi->rtcd.variance.var16x16              = vp8_variance16x16_armv6;
-
-        /*cpi->rtcd.variance.subpixvar4x4          = vp8_sub_pixel_variance4x4_c;*/
-        cpi->rtcd.variance.subpixvar8x8          = vp8_sub_pixel_variance8x8_armv6;
-        /*cpi->rtcd.variance.subpixvar8x16         = vp8_sub_pixel_variance8x16_c;
-        cpi->rtcd.variance.subpixvar16x8         = vp8_sub_pixel_variance16x8_c;*/
-        cpi->rtcd.variance.subpixvar16x16        = vp8_sub_pixel_variance16x16_armv6;
-        cpi->rtcd.variance.halfpixvar16x16_h     = vp8_variance_halfpixvar16x16_h_armv6;
-        cpi->rtcd.variance.halfpixvar16x16_v     = vp8_variance_halfpixvar16x16_v_armv6;
-        cpi->rtcd.variance.halfpixvar16x16_hv    = vp8_variance_halfpixvar16x16_hv_armv6;
-
-        cpi->rtcd.variance.mse16x16              = vp8_mse16x16_armv6;
-        /*cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_c;*/
-
-        /*cpi->rtcd.variance.get4x4sse_cs          = vp8_get4x4sse_cs_c;*/
-
-        cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_armv6;
-        cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_armv6;
-        cpi->rtcd.fdct.fast4x4                   = vp8_short_fdct4x4_armv6;
-        cpi->rtcd.fdct.fast8x4                   = vp8_short_fdct8x4_armv6;
-        cpi->rtcd.fdct.walsh_short4x4            = vp8_short_walsh4x4_armv6;
-
-        /*cpi->rtcd.encodemb.berr                  = vp8_block_error_c;
-        cpi->rtcd.encodemb.mberr                 = vp8_mbblock_error_c;
-        cpi->rtcd.encodemb.mbuverr               = vp8_mbuverror_c;*/
-        cpi->rtcd.encodemb.subb                  = vp8_subtract_b_armv6;
-        cpi->rtcd.encodemb.submby                = vp8_subtract_mby_armv6;
-        cpi->rtcd.encodemb.submbuv               = vp8_subtract_mbuv_armv6;
-
-        /*cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b;*/
-        cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_armv6;
-    }
-#endif
-
-#if HAVE_ARMV7
-    if (flags & HAS_NEON)
-    {
-        cpi->rtcd.variance.sad16x16              = vp8_sad16x16_neon;
-        cpi->rtcd.variance.sad16x8               = vp8_sad16x8_neon;
-        cpi->rtcd.variance.sad8x16               = vp8_sad8x16_neon;
-        cpi->rtcd.variance.sad8x8                = vp8_sad8x8_neon;
-        cpi->rtcd.variance.sad4x4                = vp8_sad4x4_neon;
-
-        /*cpi->rtcd.variance.var4x4                = vp8_variance4x4_c;*/
-        cpi->rtcd.variance.var8x8                = vp8_variance8x8_neon;
-        cpi->rtcd.variance.var8x16               = vp8_variance8x16_neon;
-        cpi->rtcd.variance.var16x8               = vp8_variance16x8_neon;
-        cpi->rtcd.variance.var16x16              = vp8_variance16x16_neon;
-
-        /*cpi->rtcd.variance.subpixvar4x4          = vp8_sub_pixel_variance4x4_c;*/
-        cpi->rtcd.variance.subpixvar8x8          = vp8_sub_pixel_variance8x8_neon;
-        /*cpi->rtcd.variance.subpixvar8x16         = vp8_sub_pixel_variance8x16_c;
-        cpi->rtcd.variance.subpixvar16x8         = vp8_sub_pixel_variance16x8_c;*/
-        cpi->rtcd.variance.subpixvar16x16        = vp8_sub_pixel_variance16x16_neon;
-        cpi->rtcd.variance.halfpixvar16x16_h     = vp8_variance_halfpixvar16x16_h_neon;
-        cpi->rtcd.variance.halfpixvar16x16_v     = vp8_variance_halfpixvar16x16_v_neon;
-        cpi->rtcd.variance.halfpixvar16x16_hv    = vp8_variance_halfpixvar16x16_hv_neon;
-
-        cpi->rtcd.variance.mse16x16              = vp8_mse16x16_neon;
-        /*cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_c;*/
-
-        cpi->rtcd.variance.get4x4sse_cs          = vp8_get4x4sse_cs_neon;
-
-        cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_neon;
-        cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_neon;
-        cpi->rtcd.fdct.fast4x4                   = vp8_short_fdct4x4_neon;
-        cpi->rtcd.fdct.fast8x4                   = vp8_short_fdct8x4_neon;
-        cpi->rtcd.fdct.walsh_short4x4            = vp8_short_walsh4x4_neon;
-
-        /*cpi->rtcd.encodemb.berr                  = vp8_block_error_c;
-        cpi->rtcd.encodemb.mberr                 = vp8_mbblock_error_c;
-        cpi->rtcd.encodemb.mbuverr               = vp8_mbuverror_c;*/
-        cpi->rtcd.encodemb.subb                  = vp8_subtract_b_neon;
-        cpi->rtcd.encodemb.submby                = vp8_subtract_mby_neon;
-        cpi->rtcd.encodemb.submbuv               = vp8_subtract_mbuv_neon;
-
-        /*cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b;
-        cpi->rtcd.quantize.quantb_pair           = vp8_regular_quantize_b_pair;*/
-        cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_neon;
-        cpi->rtcd.quantize.fastquantb_pair       = vp8_fast_quantize_b_pair_neon;
-    }
-#endif /* HAVE_ARMV7 */
-#endif /* CONFIG_RUNTIME_CPU_DETECT */
-
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
-    if (flags & HAS_NEON)
-#endif
-    {
-        vp8_yv12_copy_partial_frame_ptr = vp8_yv12_copy_partial_frame_neon;
-    }
-#endif
-}
--- a/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
@@ -144,7 +144,7 @@ loop
    ldr     r6, [sp, #40]       ; get address of sse
    mul     r0, r8, r8          ; sum * sum
    str     r11, [r6]           ; store sse
-    sub     r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))

    ldmfd   sp!, {r4-r12, pc}

--- a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
@@ -169,7 +169,7 @@ loop
    ldr     r6, [sp, #40]       ; get address of sse
    mul     r0, r8, r8          ; sum * sum
    str     r11, [r6]           ; store sse
-    sub     r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))

    ldmfd   sp!, {r4-r12, pc}

--- a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
@@ -210,7 +210,7 @@ loop
    ldr     r6, [sp, #40]       ; get address of sse
    mul     r0, r8, r8          ; sum * sum
    str     r11, [r6]           ; store sse
-    sub     r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))

    ldmfd   sp!, {r4-r12, pc}

--- a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
@@ -171,7 +171,7 @@ loop
    ldr     r6, [sp, #40]       ; get address of sse
    mul     r0, r8, r8          ; sum * sum
    str     r11, [r6]           ; store sse
-    sub     r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))

    ldmfd   sp!, {r4-r12, pc}

--- a/vp8/encoder/arm/dct_arm.c
+++ b/vp8/encoder/arm/dct_arm.c
@@ -9,9 +9,9 @@
 */

 #include "vpx_config.h"
-#include "vp8/encoder/dct.h"
+#include "vpx_rtcd.h"

-#if HAVE_ARMV6
+#if HAVE_MEDIA

 void vp8_short_fdct8x4_armv6(short *input, short *output, int pitch)
 {
@@ -19,4 +19,4 @@ void vp8_short_fdct8x4_armv6(short *input, short *output, int pitch)
    vp8_short_fdct4x4_armv6(input + 4, output + 16, pitch);
 }

-#endif /* HAVE_ARMV6 */
+#endif /* HAVE_MEDIA */
--- a/vp8/encoder/arm/dct_arm.h
+++ b/vp8/encoder/arm/dct_arm.h
@@ -1,65 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef DCT_ARM_H
-#define DCT_ARM_H
-
-#if HAVE_ARMV6
-extern prototype_fdct(vp8_short_walsh4x4_armv6);
-extern prototype_fdct(vp8_short_fdct4x4_armv6);
-extern prototype_fdct(vp8_short_fdct8x4_armv6);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_fdct_walsh_short4x4
-#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_armv6
-
-#undef  vp8_fdct_short4x4
-#define vp8_fdct_short4x4 vp8_short_fdct4x4_armv6
-
-#undef  vp8_fdct_short8x4
-#define vp8_fdct_short8x4 vp8_short_fdct8x4_armv6
-
-#undef  vp8_fdct_fast4x4
-#define vp8_fdct_fast4x4 vp8_short_fdct4x4_armv6
-
-#undef  vp8_fdct_fast8x4
-#define vp8_fdct_fast8x4 vp8_short_fdct8x4_armv6
-#endif
-
-#endif /* HAVE_ARMV6 */
-
-#if HAVE_ARMV7
-extern prototype_fdct(vp8_short_fdct4x4_neon);
-extern prototype_fdct(vp8_short_fdct8x4_neon);
-extern prototype_fdct(vp8_fast_fdct4x4_neon);
-extern prototype_fdct(vp8_fast_fdct8x4_neon);
-extern prototype_fdct(vp8_short_walsh4x4_neon);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_fdct_short4x4
-#define vp8_fdct_short4x4 vp8_short_fdct4x4_neon
-
-#undef  vp8_fdct_short8x4
-#define vp8_fdct_short8x4 vp8_short_fdct8x4_neon
-
-#undef  vp8_fdct_fast4x4
-#define vp8_fdct_fast4x4 vp8_short_fdct4x4_neon
-
-#undef  vp8_fdct_fast8x4
-#define vp8_fdct_fast8x4 vp8_short_fdct8x4_neon
-
-#undef  vp8_fdct_walsh_short4x4
-#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_neon
-#endif
-
-#endif
-
-#endif
--- a/vp8/encoder/arm/encodemb_arm.h
+++ b/vp8/encoder/arm/encodemb_arm.h
@@ -1,64 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef ENCODEMB_ARM_H
-#define ENCODEMB_ARM_H
-
-#if HAVE_ARMV6
-extern prototype_subb(vp8_subtract_b_armv6);
-extern prototype_submby(vp8_subtract_mby_armv6);
-extern prototype_submbuv(vp8_subtract_mbuv_armv6);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_encodemb_subb
-#define vp8_encodemb_subb vp8_subtract_b_armv6
-
-#undef  vp8_encodemb_submby
-#define vp8_encodemb_submby vp8_subtract_mby_armv6
-
-#undef  vp8_encodemb_submbuv
-#define vp8_encodemb_submbuv vp8_subtract_mbuv_armv6
-#endif
-
-#endif /* HAVE_ARMV6 */
-
-#if HAVE_ARMV7
-//extern prototype_berr(vp8_block_error_c);
-//extern prototype_mberr(vp8_mbblock_error_c);
-//extern prototype_mbuverr(vp8_mbuverror_c);
-
-extern prototype_subb(vp8_subtract_b_neon);
-extern prototype_submby(vp8_subtract_mby_neon);
-extern prototype_submbuv(vp8_subtract_mbuv_neon);
-
-//#undef  vp8_encodemb_berr
-//#define vp8_encodemb_berr vp8_block_error_c
-
-//#undef  vp8_encodemb_mberr
-//#define vp8_encodemb_mberr vp8_mbblock_error_c
-
-//#undef  vp8_encodemb_mbuverr
-//#define vp8_encodemb_mbuverr vp8_mbuverror_c
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_encodemb_subb
-#define vp8_encodemb_subb vp8_subtract_b_neon
-
-#undef  vp8_encodemb_submby
-#define vp8_encodemb_submby vp8_subtract_mby_neon
-
-#undef  vp8_encodemb_submbuv
-#define vp8_encodemb_submbuv vp8_subtract_mbuv_neon
-#endif
-
-#endif
-
-#endif
--- a/vp8/encoder/arm/neon/variance_neon.asm
+++ b/vp8/encoder/arm/neon/variance_neon.asm
@@ -77,14 +77,14 @@ variance16x16_neon_loop
    ;vmov.32        r1, d1[0]
    ;mul            r0, r0, r0
    ;str            r1, [r12]
-    ;sub            r0, r1, r0, asr #8
+    ;sub            r0, r1, r0, lsr #8

-    ;sum is in [-255x256, 255x256]. sumxsum is 32-bit. Shift to right should
-    ;have sign-bit exension, which is vshr.s. Have to use s32 to make it right.
+    ; While sum is signed, sum * sum is never negative, so it must be treated
+    ; as unsigned: a logical shift avoids propagating the top bit as a sign bit.
    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [r12]              ;store sse
-    vshr.s32        d10, d10, #8
-    vsub.s32        d0, d1, d10
+    vshr.u32        d10, d10, #8
+    vsub.u32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    bx              lr
@@ -145,8 +145,8 @@ variance16x8_neon_loop

    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [r12]              ;store sse
-    vshr.s32        d10, d10, #7
-    vsub.s32        d0, d1, d10
+    vshr.u32        d10, d10, #7
+    vsub.u32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    bx              lr
@@ -200,8 +200,8 @@ variance8x16_neon_loop

    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [r12]              ;store sse
-    vshr.s32        d10, d10, #7
-    vsub.s32        d0, d1, d10
+    vshr.u32        d10, d10, #7
+    vsub.u32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    bx              lr
@@ -265,8 +265,8 @@ variance8x8_neon_loop

    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [r12]              ;store sse
-    vshr.s32        d10, d10, #6
-    vsub.s32        d0, d1, d10
+    vshr.u32        d10, d10, #6
+    vsub.u32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    bx              lr
--- a/vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm
+++ b/vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm
@@ -405,8 +405,8 @@ sub_pixel_variance16x16_neon_loop

    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [r6]               ;store sse
-    vshr.s32        d10, d10, #8
-    vsub.s32        d0, d1, d10
+    vshr.u32        d10, d10, #8
+    vsub.u32        d0, d1, d10

    add             sp, sp, #528
    vmov.32         r0, d0[0]                   ;return
--- a/vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm
+++ b/vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm
@@ -112,8 +112,8 @@ vp8_filt_fpo16x16s_4_0_loop_neon

    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [lr]               ;store sse
-    vshr.s32        d10, d10, #8
-    vsub.s32        d0, d1, d10
+    vshr.u32        d10, d10, #8
+    vsub.u32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    pop             {pc}
@@ -208,8 +208,8 @@ vp8_filt_spo16x16s_0_4_loop_neon

    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [lr]               ;store sse
-    vshr.s32        d10, d10, #8
-    vsub.s32        d0, d1, d10
+    vshr.u32        d10, d10, #8
+    vsub.u32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    pop             {pc}
@@ -327,8 +327,8 @@ vp8_filt16x16s_4_4_loop_neon

    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [lr]               ;store sse
-    vshr.s32        d10, d10, #8
-    vsub.s32        d0, d1, d10
+    vshr.u32        d10, d10, #8
+    vsub.u32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    pop             {pc}
@@ -560,8 +560,8 @@ sub_pixel_variance16x16s_neon_loop

    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [lr]               ;store sse
-    vshr.s32        d10, d10, #8
-    vsub.s32        d0, d1, d10
+    vshr.u32        d10, d10, #8
+    vsub.u32        d0, d1, d10

    add             sp, sp, #256
    vmov.32         r0, d0[0]                   ;return
--- a/vp8/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm
+++ b/vp8/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm
@@ -206,8 +206,8 @@ sub_pixel_variance8x8_neon_loop

    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [lr]               ;store sse
-    vshr.s32        d10, d10, #6
-    vsub.s32        d0, d1, d10
+    vshr.u32        d10, d10, #6
+    vsub.u32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    pop             {r4-r5, pc}
--- a/vp8/encoder/arm/quantize_arm.c
+++ b/vp8/encoder/arm/quantize_arm.c
@@ -9,14 +9,16 @@
 */


+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vp8/encoder/block.h"
 #include <math.h>
 #include "vpx_mem/vpx_mem.h"
-
 #include "vp8/encoder/quantize.h"
 #include "vp8/common/entropy.h"


-#if HAVE_ARMV7
+#if HAVE_NEON

 /* vp8_quantize_mbX functions here differs from corresponding ones in
 * quantize.c only by using quantize_b_pair function pointer instead of
@@ -59,4 +61,4 @@ void vp8_quantize_mbuv_neon(MACROBLOCK *x)
                           &x->e_mbd.block[i], &x->e_mbd.block[i+1]);
 }

-#endif /* HAVE_ARMV7 */
+#endif /* HAVE_NEON */
--- a/vp8/encoder/arm/quantize_arm.h
+++ b/vp8/encoder/arm/quantize_arm.h
@@ -1,52 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef QUANTIZE_ARM_H
-#define QUANTIZE_ARM_H
-
-#if HAVE_ARMV6
-
-extern prototype_quantize_block(vp8_fast_quantize_b_armv6);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_quantize_fastquantb
-#define vp8_quantize_fastquantb vp8_fast_quantize_b_armv6
-#endif
-
-#endif /* HAVE_ARMV6 */
-
-
-#if HAVE_ARMV7
-
-extern prototype_quantize_block(vp8_fast_quantize_b_neon);
-extern prototype_quantize_block_pair(vp8_fast_quantize_b_pair_neon);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_quantize_fastquantb
-#define vp8_quantize_fastquantb vp8_fast_quantize_b_neon
-
-#undef  vp8_quantize_fastquantb_pair
-#define vp8_quantize_fastquantb_pair vp8_fast_quantize_b_pair_neon
-
-#undef vp8_quantize_mb
-#define vp8_quantize_mb vp8_quantize_mb_neon
-
-#undef vp8_quantize_mbuv
-#define vp8_quantize_mbuv vp8_quantize_mbuv_neon
-
-#undef vp8_quantize_mby
-#define vp8_quantize_mby vp8_quantize_mby_neon
-#endif
-
-#endif /* HAVE_ARMV7 */
-
-#endif
-
--- a/vp8/encoder/arm/variance_arm.c
+++ b/vp8/encoder/arm/variance_arm.c
@@ -9,10 +9,11 @@
 */

 #include "vpx_config.h"
+#include "vpx_rtcd.h"
 #include "vp8/encoder/variance.h"
 #include "vp8/common/filter.h"

-#if HAVE_ARMV6
+#if HAVE_MEDIA
 #include "vp8/common/arm/bilinearfilter_arm.h"

 unsigned int vp8_sub_pixel_variance8x8_armv6
@@ -91,10 +92,10 @@ unsigned int vp8_sub_pixel_variance16x16_armv6
    return var;
 }

-#endif /* HAVE_ARMV6 */
+#endif /* HAVE_MEDIA */


-#if HAVE_ARMV7
+#if HAVE_NEON

 unsigned int vp8_sub_pixel_variance16x16_neon
 (
--- a/vp8/encoder/arm/variance_arm.h
+++ b/vp8/encoder/arm/variance_arm.h
@@ -1,155 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VARIANCE_ARM_H
-#define VARIANCE_ARM_H
-
-#if HAVE_ARMV6
-
-extern prototype_sad(vp8_sad16x16_armv6);
-extern prototype_variance(vp8_variance16x16_armv6);
-extern prototype_variance(vp8_variance8x8_armv6);
-extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_armv6);
-extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_armv6);
-extern prototype_variance(vp8_variance_halfpixvar16x16_h_armv6);
-extern prototype_variance(vp8_variance_halfpixvar16x16_v_armv6);
-extern prototype_variance(vp8_variance_halfpixvar16x16_hv_armv6);
-extern prototype_variance(vp8_mse16x16_armv6);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-
-#undef  vp8_variance_sad16x16
-#define vp8_variance_sad16x16 vp8_sad16x16_armv6
-
-#undef  vp8_variance_subpixvar16x16
-#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_armv6
-
-#undef  vp8_variance_subpixvar8x8
-#define vp8_variance_subpixvar8x8 vp8_sub_pixel_variance8x8_armv6
-
-#undef  vp8_variance_var16x16
-#define vp8_variance_var16x16 vp8_variance16x16_armv6
-
-#undef  vp8_variance_mse16x16
-#define vp8_variance_mse16x16 vp8_mse16x16_armv6
-
-#undef  vp8_variance_var8x8
-#define vp8_variance_var8x8 vp8_variance8x8_armv6
-
-#undef  vp8_variance_halfpixvar16x16_h
-#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_armv6
-
-#undef  vp8_variance_halfpixvar16x16_v
-#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_armv6
-
-#undef  vp8_variance_halfpixvar16x16_hv
-#define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_armv6
-
-#endif /* !CONFIG_RUNTIME_CPU_DETECT */
-
-#endif /* HAVE_ARMV6 */
-
-
-#if HAVE_ARMV7
-extern prototype_sad(vp8_sad4x4_neon);
-extern prototype_sad(vp8_sad8x8_neon);
-extern prototype_sad(vp8_sad8x16_neon);
-extern prototype_sad(vp8_sad16x8_neon);
-extern prototype_sad(vp8_sad16x16_neon);
-
-//extern prototype_variance(vp8_variance4x4_c);
-extern prototype_variance(vp8_variance8x8_neon);
-extern prototype_variance(vp8_variance8x16_neon);
-extern prototype_variance(vp8_variance16x8_neon);
-extern prototype_variance(vp8_variance16x16_neon);
-
-//extern prototype_subpixvariance(vp8_sub_pixel_variance4x4_c);
-extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_neon);
-//extern prototype_subpixvariance(vp8_sub_pixel_variance8x16_c);
-//extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_c);
-extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_neon);
-extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_neon_func);
-extern prototype_variance(vp8_variance_halfpixvar16x16_h_neon);
-extern prototype_variance(vp8_variance_halfpixvar16x16_v_neon);
-extern prototype_variance(vp8_variance_halfpixvar16x16_hv_neon);
-
-//extern prototype_getmbss(vp8_get_mb_ss_c);
-extern prototype_variance(vp8_mse16x16_neon);
-extern prototype_get16x16prederror(vp8_get4x4sse_cs_neon);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_variance_sad4x4
-#define vp8_variance_sad4x4 vp8_sad4x4_neon
-
-#undef  vp8_variance_sad8x8
-#define vp8_variance_sad8x8 vp8_sad8x8_neon
-
-#undef  vp8_variance_sad8x16
-#define vp8_variance_sad8x16 vp8_sad8x16_neon
-
-#undef  vp8_variance_sad16x8
-#define vp8_variance_sad16x8 vp8_sad16x8_neon
-
-#undef  vp8_variance_sad16x16
-#define vp8_variance_sad16x16 vp8_sad16x16_neon
-
-//#undef  vp8_variance_var4x4
-//#define vp8_variance_var4x4 vp8_variance4x4_c
-
-#undef  vp8_variance_var8x8
-#define vp8_variance_var8x8 vp8_variance8x8_neon
-
-#undef  vp8_variance_var8x16
-#define vp8_variance_var8x16 vp8_variance8x16_neon
-
-#undef  vp8_variance_var16x8
-#define vp8_variance_var16x8 vp8_variance16x8_neon
-
-#undef  vp8_variance_var16x16
-#define vp8_variance_var16x16 vp8_variance16x16_neon
-
-//#undef  vp8_variance_subpixvar4x4
-//#define vp8_variance_subpixvar4x4 vp8_sub_pixel_variance4x4_c
-
-#undef  vp8_variance_subpixvar8x8
-#define vp8_variance_subpixvar8x8 vp8_sub_pixel_variance8x8_neon
-
-//#undef  vp8_variance_subpixvar8x16
-//#define vp8_variance_subpixvar8x16 vp8_sub_pixel_variance8x16_c
-
-//#undef  vp8_variance_subpixvar16x8
-//#define vp8_variance_subpixvar16x8 vp8_sub_pixel_variance16x8_c
-
-#undef  vp8_variance_subpixvar16x16
-#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_neon
-
-#undef  vp8_variance_halfpixvar16x16_h
-#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_neon
-
-#undef  vp8_variance_halfpixvar16x16_v
-#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_neon
-
-#undef  vp8_variance_halfpixvar16x16_hv
-#define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_neon
-
-//#undef  vp8_variance_getmbss
-//#define vp8_variance_getmbss vp8_get_mb_ss_c
-
-#undef  vp8_variance_mse16x16
-#define vp8_variance_mse16x16 vp8_mse16x16_neon
-
-#undef  vp8_variance_get4x4sse_cs
-#define vp8_variance_get4x4sse_cs vp8_get4x4sse_cs_neon
-#endif
-
-#endif
-
-#endif
--- a/vp8/encoder/asm_enc_offsets.c
+++ b/vp8/encoder/asm_enc_offsets.c
@@ -88,7 +88,7 @@ END
 * change they will have to be adjusted.
 */

-#if HAVE_ARMV5TE
+#if HAVE_EDSP
 ct_assert(TOKENEXTRA_SZ, sizeof(TOKENEXTRA) == 8)
 ct_assert(vp8_extra_bit_struct_sz, sizeof(vp8_extra_bit_struct) == 16)
 #endif
--- a/vp8/encoder/bitstream.h
+++ b/vp8/encoder/bitstream.h
@@ -12,7 +12,7 @@
 #ifndef __INC_BITSTREAM_H
 #define __INC_BITSTREAM_H

-#if HAVE_ARMV5TE
+#if HAVE_EDSP
 void vp8cx_pack_tokens_armv5(vp8_writer *w, const TOKENEXTRA *p, int xcount,
                             vp8_token *,
                             vp8_extra_bit_struct *,
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -25,7 +25,7 @@ typedef struct
    int offset;
 } search_site;

-typedef struct
+typedef struct block
 {
    // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
    short *src_diff;
@@ -57,7 +57,7 @@ typedef struct
    } bmi[16];
 } PARTITION_INFO;

-typedef struct
+typedef struct macroblock
 {
    DECLARE_ALIGNED(16, short, src_diff[400]);       // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y
    DECLARE_ALIGNED(16, short, coeff[400]);     // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y
@@ -74,6 +74,8 @@ typedef struct
    PARTITION_INFO *pi;   /* Corresponds to upper left visible macroblock */
    PARTITION_INFO *pip;  /* Base of allocated array */

+    int ref_frame_cost[MAX_REF_FRAMES];
+
    search_site *ss;
    int ss_count;
    int searches_per_step;
@@ -117,8 +119,8 @@ typedef struct
    int optimize;
    int q_index;

-    void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);
-    void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
+    void (*short_fdct4x4)(short *input, short *output, int pitch);
+    void (*short_fdct8x4)(short *input, short *output, int pitch);
    void (*short_walsh4x4)(short *input, short *output, int pitch);
    void (*quantize_b)(BLOCK *b, BLOCKD *d);
    void (*quantize_b_pair)(BLOCK *b1, BLOCK *b2, BLOCKD *d0, BLOCKD *d1);
--- a/vp8/encoder/dct.h
+++ b/vp8/encoder/dct.h
@@ -1,65 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_DCT_H
-#define __INC_DCT_H
-
-#define prototype_fdct(sym) void (sym)(short *input, short *output, int pitch)
-
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/dct_x86.h"
-#endif
-
-#if ARCH_ARM
-#include "arm/dct_arm.h"
-#endif
-
-#ifndef vp8_fdct_short4x4
-#define vp8_fdct_short4x4  vp8_short_fdct4x4_c
-#endif
-extern prototype_fdct(vp8_fdct_short4x4);
-
-#ifndef vp8_fdct_short8x4
-#define vp8_fdct_short8x4  vp8_short_fdct8x4_c
-#endif
-extern prototype_fdct(vp8_fdct_short8x4);
-
-// There is no fast4x4 (for now)
-#ifndef vp8_fdct_fast4x4
-#define vp8_fdct_fast4x4  vp8_short_fdct4x4_c
-#endif
-
-#ifndef vp8_fdct_fast8x4
-#define vp8_fdct_fast8x4  vp8_short_fdct8x4_c
-#endif
-
-#ifndef vp8_fdct_walsh_short4x4
-#define vp8_fdct_walsh_short4x4  vp8_short_walsh4x4_c
-#endif
-extern prototype_fdct(vp8_fdct_walsh_short4x4);
-
-typedef prototype_fdct(*vp8_fdct_fn_t);
-typedef struct
-{
-    vp8_fdct_fn_t    short4x4;
-    vp8_fdct_fn_t    short8x4;
-    vp8_fdct_fn_t    fast4x4;
-    vp8_fdct_fn_t    fast8x4;
-    vp8_fdct_fn_t    walsh_short4x4;
-} vp8_fdct_rtcd_vtable_t;
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define FDCT_INVOKE(ctx,fn) (ctx)->fn
-#else
-#define FDCT_INVOKE(ctx,fn) vp8_fdct_##fn
-#endif
-
-#endif
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -24,20 +24,11 @@
 #include "rdopt.h"
 #include "pickinter.h"
 #include "vp8/common/findnearmv.h"
-#include "vp8/common/reconintra.h"
 #include <stdio.h>
 #include <limits.h>
-#include "vp8/common/subpixel.h"
 #include "vp8/common/invtrans.h"
 #include "vpx_ports/vpx_timer.h"

-#if CONFIG_RUNTIME_CPU_DETECT
-#define RTCD(x)     &cpi->common.rtcd.x
-#define IF_RTCD(x)  (x)
-#else
-#define RTCD(x)     NULL
-#define IF_RTCD(x)  NULL
-#endif
 extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) ;
 extern void vp8_calc_ref_frame_costs(int *ref_frame_cost,
                                     int prob_intra,
@@ -98,7 +89,7 @@ static unsigned int tt_activity_measure( VP8_COMP *cpi, MACROBLOCK *x )
     *  lambda using a non-linear combination (e.g., the smallest, or second
     *  smallest, etc.).
     */
-    act =     VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16)(x->src.y_buffer,
+    act =  vp8_variance16x16(x->src.y_buffer,
                    x->src.y_stride, VP8_VAR_OFFS, 0, &sse);
    act = act<<4;

@@ -299,7 +290,7 @@ static void build_activity_map( VP8_COMP *cpi )
            recon_yoffset += 16;
 #endif
            //Copy current mb to a buffer
-            RECON_INVOKE(&xd->rtcd->recon, copy16x16)(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
+            vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);

            // measure activity
            mb_activity = mb_activity_measure( cpi, x, mb_row, mb_col );
@@ -420,6 +411,10 @@ void encode_mb_row(VP8_COMP *cpi,
    // for each macroblock col in image
    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
    {
+        xd->mbr = mb_row;
+        xd->mbc = mb_col;
+        xd->mbrc = mb_row * cm->mb_cols + mb_col;
+
        // Distance of Mb to the left & right edges, specified in
        // 1/8th pel units as they are always compared to values
        // that are in 1/8th pel units
@@ -441,7 +436,7 @@ void encode_mb_row(VP8_COMP *cpi,
        x->rdmult = cpi->RDMULT;

        //Copy current mb to a buffer
-        RECON_INVOKE(&xd->rtcd->recon, copy16x16)(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
+        vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);

 #if CONFIG_MULTITHREAD
        if ((cpi->b_multi_threaded != 0) && (mb_row != 0))
@@ -604,9 +599,6 @@ void init_encode_frame_mb_context(VP8_COMP *cpi)

    xd->frame_type = cm->frame_type;

-    xd->frames_since_golden = cm->frames_since_golden;
-    xd->frames_till_alt_ref_frame = cm->frames_till_alt_ref_frame;
-
    // reset intra mode contexts
    if (cm->frame_type == KEY_FRAME)
        vp8_init_mbmode_probs(cm);
@@ -641,18 +633,18 @@ void init_encode_frame_mb_context(VP8_COMP *cpi)

    // Special case treatment when GF and ARF are not sensible options for reference
    if (cpi->ref_frame_flags == VP8_LAST_FLAG)
-        vp8_calc_ref_frame_costs(xd->ref_frame_cost,
+        vp8_calc_ref_frame_costs(x->ref_frame_cost,
                                 cpi->prob_intra_coded,255,128);
    else if ((cpi->oxcf.number_of_layers > 1) &&
               (cpi->ref_frame_flags == VP8_GOLD_FLAG))
-        vp8_calc_ref_frame_costs(xd->ref_frame_cost,
+        vp8_calc_ref_frame_costs(x->ref_frame_cost,
                                 cpi->prob_intra_coded,1,255);
    else if ((cpi->oxcf.number_of_layers > 1) &&
                (cpi->ref_frame_flags == VP8_ALT_FLAG))
-        vp8_calc_ref_frame_costs(xd->ref_frame_cost,
+        vp8_calc_ref_frame_costs(x->ref_frame_cost,
                                 cpi->prob_intra_coded,1,1);
    else
-        vp8_calc_ref_frame_costs(xd->ref_frame_cost,
+        vp8_calc_ref_frame_costs(x->ref_frame_cost,
                                 cpi->prob_intra_coded,
                                 cpi->prob_last_coded,
                                 cpi->prob_gf_coded);
@@ -687,25 +679,17 @@ void vp8_encode_frame(VP8_COMP *cpi)
    // Functions setup for all frame types so we can use MC in AltRef
    if (cm->mcomp_filter_type == SIXTAP)
    {
-        xd->subpixel_predict        = SUBPIX_INVOKE(
-                                        &cpi->common.rtcd.subpix, sixtap4x4);
-        xd->subpixel_predict8x4     = SUBPIX_INVOKE(
-                                        &cpi->common.rtcd.subpix, sixtap8x4);
-        xd->subpixel_predict8x8     = SUBPIX_INVOKE(
-                                        &cpi->common.rtcd.subpix, sixtap8x8);
-        xd->subpixel_predict16x16   = SUBPIX_INVOKE(
-                                        &cpi->common.rtcd.subpix, sixtap16x16);
+        xd->subpixel_predict        = vp8_sixtap_predict4x4;
+        xd->subpixel_predict8x4     = vp8_sixtap_predict8x4;
+        xd->subpixel_predict8x8     = vp8_sixtap_predict8x8;
+        xd->subpixel_predict16x16   = vp8_sixtap_predict16x16;
    }
    else
    {
-        xd->subpixel_predict        = SUBPIX_INVOKE(
-                                        &cpi->common.rtcd.subpix, bilinear4x4);
-        xd->subpixel_predict8x4     = SUBPIX_INVOKE(
-                                        &cpi->common.rtcd.subpix, bilinear8x4);
-        xd->subpixel_predict8x8     = SUBPIX_INVOKE(
-                                        &cpi->common.rtcd.subpix, bilinear8x8);
-        xd->subpixel_predict16x16   = SUBPIX_INVOKE(
-                                      &cpi->common.rtcd.subpix, bilinear16x16);
+        xd->subpixel_predict        = vp8_bilinear_predict4x4;
+        xd->subpixel_predict8x4     = vp8_bilinear_predict8x4;
+        xd->subpixel_predict8x8     = vp8_bilinear_predict8x8;
+        xd->subpixel_predict16x16   = vp8_bilinear_predict16x16;
    }

    // Reset frame count of inter 0,0 motion vector useage.
@@ -1107,19 +1091,19 @@ int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
    }

    if (x->e_mbd.mode_info_context->mbmi.mode == B_PRED)
-        vp8_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x);
+        vp8_encode_intra4x4mby(x);
    else
-        vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
+        vp8_encode_intra16x16mby(x);

-    vp8_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
+    vp8_encode_intra16x16mbuv(x);

    sum_intra_stats(cpi, x);
    vp8_tokenize_mb(cpi, &x->e_mbd, t);

    if (xd->mode_info_context->mbmi.mode != B_PRED)
-        vp8_inverse_transform_mby(xd, IF_RTCD(&cpi->common.rtcd));
+        vp8_inverse_transform_mby(xd);

-    DEQUANT_INVOKE (&cpi->common.rtcd.dequant, idct_add_uv_block)
+    vp8_dequant_idct_add_uv_block
                    (xd->qcoeff+16*16, xd->dequant_uv,
                     xd->dst.u_buffer, xd->dst.v_buffer,
                     xd->dst.uv_stride, xd->eobs+16);
@@ -1131,6 +1115,8 @@ extern int cnt_pm;

 extern void vp8_fix_contexts(MACROBLOCKD *x);

+#include "valgrind/memcheck.h"
+
 int vp8cx_encode_inter_macroblock
 (
    VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
@@ -1150,6 +1136,15 @@ int vp8cx_encode_inter_macroblock
    else
        x->encode_breakout = cpi->oxcf.encode_breakout;

+    if (cpi->external_modeinfo)
+    {
+        vp8_rd_use_external_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate,
+                                 &distortion, &intra_error);
+        VALGRIND_CHECK_VALUE_IS_DEFINED(rate);
+        VALGRIND_CHECK_VALUE_IS_DEFINED(distortion);
+        VALGRIND_CHECK_VALUE_IS_DEFINED(intra_error);
+    }
+    else
    if (cpi->sf.RD)
    {
        int zbin_mode_boost_enabled = cpi->zbin_mode_boost_enabled;
@@ -1157,10 +1152,8 @@ int vp8cx_encode_inter_macroblock
        /* Are we using the fast quantizer for the mode selection? */
        if(cpi->sf.use_fastquant_for_pick)
        {
-            cpi->mb.quantize_b      = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
-                                                      fastquantb);
-            cpi->mb.quantize_b_pair = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
-                                                      fastquantb_pair);
+            cpi->mb.quantize_b      = vp8_fast_quantize_b;
+            cpi->mb.quantize_b_pair = vp8_fast_quantize_b_pair;

            /* the fast quantizer does not use zbin_extra, so
             * do not recalculate */
@@ -1172,10 +1165,8 @@ int vp8cx_encode_inter_macroblock
        /* switch back to the regular quantizer for the encode */
        if (cpi->sf.improved_quant)
        {
-            cpi->mb.quantize_b      = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
-                                                      quantb);
-            cpi->mb.quantize_b_pair = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
-                                                      quantb_pair);
+            cpi->mb.quantize_b      = vp8_regular_quantize_b;
+            cpi->mb.quantize_b_pair = vp8_regular_quantize_b_pair;
        }

        /* restore cpi->zbin_mode_boost_enabled */
@@ -1253,15 +1244,15 @@ int vp8cx_encode_inter_macroblock

    if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
    {
-        vp8_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
+        vp8_encode_intra16x16mbuv(x);

        if (xd->mode_info_context->mbmi.mode == B_PRED)
        {
-            vp8_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x);
+            vp8_encode_intra4x4mby(x);
        }
        else
        {
-            vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
+            vp8_encode_intra16x16mby(x);
        }

        sum_intra_stats(cpi, x);
@@ -1283,7 +1274,7 @@ int vp8cx_encode_inter_macroblock

        if (!x->skip)
        {
-            vp8_encode_inter16x16(IF_RTCD(&cpi->rtcd), x);
+            vp8_encode_inter16x16(x);

            // Clear mb_skip_coeff if mb_no_coeff_skip is not set
            if (!cpi->common.mb_no_coeff_skip)
@@ -1302,9 +1293,9 @@ int vp8cx_encode_inter_macroblock
        vp8_tokenize_mb(cpi, xd, t);

        if (xd->mode_info_context->mbmi.mode != B_PRED)
-            vp8_inverse_transform_mby(xd, IF_RTCD(&cpi->common.rtcd));
+            vp8_inverse_transform_mby(xd);

-        DEQUANT_INVOKE (&cpi->common.rtcd.dequant, idct_add_uv_block)
+        vp8_dequant_idct_add_uv_block
                        (xd->qcoeff+16*16, xd->dequant_uv,
                         xd->dst.u_buffer, xd->dst.v_buffer,
                         xd->dst.uv_stride, xd->eobs+16);
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c
@@ -10,23 +10,14 @@


 #include "vpx_config.h"
-#include "vp8/common/idct.h"
+#include "vpx_rtcd.h"
 #include "quantize.h"
-#include "vp8/common/reconintra.h"
 #include "vp8/common/reconintra4x4.h"
 #include "encodemb.h"
 #include "vp8/common/invtrans.h"
-#include "vp8/common/recon.h"
-#include "dct.h"
 #include "encodeintra.h"


-#if CONFIG_RUNTIME_CPU_DETECT
-#define IF_RTCD(x) (x)
-#else
-#define IF_RTCD(x) NULL
-#endif
-
 int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred)
 {

@@ -36,60 +27,58 @@ int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred)

    if (use_dc_pred)
    {
-        const VP8_ENCODER_RTCD *rtcd = IF_RTCD(&cpi->rtcd);
-
        x->e_mbd.mode_info_context->mbmi.mode = DC_PRED;
        x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
        x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;

-        vp8_encode_intra16x16mby(rtcd, x);
+        vp8_encode_intra16x16mby(x);

-        vp8_inverse_transform_mby(&x->e_mbd, IF_RTCD(&cpi->common.rtcd));
+        vp8_inverse_transform_mby(&x->e_mbd);
    }
    else
    {
        for (i = 0; i < 16; i++)
        {
            x->e_mbd.block[i].bmi.as_mode = B_DC_PRED;
-            vp8_encode_intra4x4block(IF_RTCD(&cpi->rtcd), x, i);
+            vp8_encode_intra4x4block(x, i);
        }
    }

-    intra_pred_var = VARIANCE_INVOKE(&cpi->rtcd.variance, getmbss)(x->src_diff);
+    intra_pred_var = vp8_get_mb_ss(x->src_diff);

    return intra_pred_var;
 }

-void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd,
-                              MACROBLOCK *x, int ib)
+void vp8_encode_intra4x4block(MACROBLOCK *x, int ib)
 {
    BLOCKD *b = &x->e_mbd.block[ib];
    BLOCK *be = &x->block[ib];
+    int dst_stride = x->e_mbd.dst.y_stride;
+    unsigned char *base_dst = x->e_mbd.dst.y_buffer;

-    RECON_INVOKE(&rtcd->common->recon, intra4x4_predict)
-                (*(b->base_dst) + b->dst, b->dst_stride,
+    vp8_intra4x4_predict(base_dst + b->offset, dst_stride,
                 b->bmi.as_mode, b->predictor, 16);

-    ENCODEMB_INVOKE(&rtcd->encodemb, subb)(be, b, 16);
+    vp8_subtract_b(be, b, 16);

-    x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32);
+    x->short_fdct4x4(be->src_diff, be->coeff, 32);

    x->quantize_b(be, b);

    if (*b->eob > 1)
    {
-        IDCT_INVOKE(IF_RTCD(&rtcd->common->idct), idct16)(b->dqcoeff,
-            b->predictor, 16, *(b->base_dst) + b->dst, b->dst_stride);
+      vp8_short_idct4x4llm(b->dqcoeff,
+            b->predictor, 16, base_dst + b->offset, dst_stride);
    }
    else
    {
-        IDCT_INVOKE(IF_RTCD(&rtcd->common->idct), idct1_scalar_add)
-            (b->dqcoeff[0], b->predictor, 16, *(b->base_dst) + b->dst,
-                b->dst_stride);
+      vp8_dc_only_idct_add
+            (b->dqcoeff[0], b->predictor, 16, base_dst + b->offset,
+                dst_stride);
    }
 }

-void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb)
+void vp8_encode_intra4x4mby(MACROBLOCK *mb)
 {
    int i;

@@ -97,18 +86,18 @@ void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb)
    vp8_intra_prediction_down_copy(x);

    for (i = 0; i < 16; i++)
-        vp8_encode_intra4x4block(rtcd, mb, i);
+        vp8_encode_intra4x4block(mb, i);
    return;
 }

-void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
+void vp8_encode_intra16x16mby(MACROBLOCK *x)
 {
    BLOCK *b = &x->block[0];
    MACROBLOCKD *xd = &x->e_mbd;

-    RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mby_s)(&x->e_mbd);
+    vp8_build_intra_predictors_mby_s(&x->e_mbd);

-    ENCODEMB_INVOKE(&rtcd->encodemb, submby) (x->src_diff, *(b->base_src),
+    vp8_subtract_mby(x->src_diff, *(b->base_src),
        b->src_stride, xd->dst.y_buffer, xd->dst.y_stride);

    vp8_transform_intra_mby(x);
@@ -116,16 +105,16 @@ void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
    vp8_quantize_mby(x);

    if (x->optimize)
-        vp8_optimize_mby(x, rtcd);
+        vp8_optimize_mby(x);
 }

-void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
+void vp8_encode_intra16x16mbuv(MACROBLOCK *x)
 {
    MACROBLOCKD *xd = &x->e_mbd;

-    RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mbuv_s)(&x->e_mbd);
+    vp8_build_intra_predictors_mbuv_s(&x->e_mbd);

-    ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer,
+    vp8_subtract_mbuv(x->src_diff, x->src.u_buffer,
        x->src.v_buffer, x->src.uv_stride, xd->dst.u_buffer,
        xd->dst.v_buffer, xd->dst.uv_stride);

@@ -134,5 +123,5 @@ void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
    vp8_quantize_mbuv(x);

    if (x->optimize)
-        vp8_optimize_mbuv(x, rtcd);
+        vp8_optimize_mbuv(x);
 }
--- a/vp8/encoder/encodeintra.h
+++ b/vp8/encoder/encodeintra.h
@@ -14,9 +14,8 @@
 #include "onyx_int.h"

 int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred);
-void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *, MACROBLOCK *x);
-void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *, MACROBLOCK *x);
-void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *, MACROBLOCK *mb);
-void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd,
-                              MACROBLOCK *x, int ib);
+void vp8_encode_intra16x16mby(MACROBLOCK *x);
+void vp8_encode_intra16x16mbuv(MACROBLOCK *x);
+void vp8_encode_intra4x4mby(MACROBLOCK *mb);
+void vp8_encode_intra4x4block(MACROBLOCK *x, int ib);
 #endif
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c
@@ -10,22 +10,15 @@


 #include "vpx_config.h"
+#include "vpx_rtcd.h"
 #include "encodemb.h"
 #include "vp8/common/reconinter.h"
 #include "quantize.h"
 #include "tokenize.h"
 #include "vp8/common/invtrans.h"
-#include "vp8/common/recon.h"
-#include "vp8/common/reconintra.h"
-#include "dct.h"
 #include "vpx_mem/vpx_mem.h"
 #include "rdopt.h"

-#if CONFIG_RUNTIME_CPU_DETECT
-#define IF_RTCD(x) (x)
-#else
-#define IF_RTCD(x) NULL
-#endif
 void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch)
 {
    unsigned char *src_ptr = (*(be->base_src) + be->src);
@@ -100,13 +93,13 @@ void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride,
    }
 }

-static void vp8_subtract_mb(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
+static void vp8_subtract_mb(MACROBLOCK *x)
 {
    BLOCK *b = &x->block[0];

-    ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src),
+    vp8_subtract_mby(x->src_diff, *(b->base_src),
        b->src_stride, x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride);
-    ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer,
+    vp8_subtract_mbuv(x->src_diff, x->src.u_buffer,
        x->src.v_buffer, x->src.uv_stride, x->e_mbd.dst.u_buffer,
        x->e_mbd.dst.v_buffer, x->e_mbd.dst.uv_stride);
 }
@@ -128,7 +121,7 @@ void vp8_transform_mbuv(MACROBLOCK *x)

    for (i = 16; i < 24; i += 2)
    {
-        x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
+        x->short_fdct8x4(&x->block[i].src_diff[0],
            &x->block[i].coeff[0], 16);
    }
 }
@@ -140,7 +133,7 @@ void vp8_transform_intra_mby(MACROBLOCK *x)

    for (i = 0; i < 16; i += 2)
    {
-        x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
+        x->short_fdct8x4(&x->block[i].src_diff[0],
            &x->block[i].coeff[0], 32);
    }

@@ -160,7 +153,7 @@ static void transform_mb(MACROBLOCK *x)

    for (i = 0; i < 16; i += 2)
    {
-        x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
+        x->short_fdct8x4(&x->block[i].src_diff[0],
            &x->block[i].coeff[0], 32);
    }

@@ -170,7 +163,7 @@ static void transform_mb(MACROBLOCK *x)

    for (i = 16; i < 24; i += 2)
    {
-        x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
+        x->short_fdct8x4(&x->block[i].src_diff[0],
            &x->block[i].coeff[0], 16);
    }

@@ -188,7 +181,7 @@ static void transform_mby(MACROBLOCK *x)

    for (i = 0; i < 16; i += 2)
    {
-        x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
+        x->short_fdct8x4(&x->block[i].src_diff[0],
            &x->block[i].coeff[0], 32);
    }

@@ -229,8 +222,7 @@ static const int plane_rd_mult[4]=
 };

 static void optimize_b(MACROBLOCK *mb, int ib, int type,
-                       ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                       const VP8_ENCODER_RTCD *rtcd)
+                       ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l)
 {
    BLOCK *b;
    BLOCKD *d;
@@ -511,7 +503,7 @@ static void check_reset_2nd_coeffs(MACROBLOCKD *x, int type,
    }
 }

-static void optimize_mb(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
+static void optimize_mb(MACROBLOCK *x)
 {
    int b;
    int type;
@@ -534,27 +526,27 @@ static void optimize_mb(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
    for (b = 0; b < 16; b++)
    {
        optimize_b(x, b, type,
-            ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
+            ta + vp8_block2above[b], tl + vp8_block2left[b]);
    }

    for (b = 16; b < 24; b++)
    {
        optimize_b(x, b, PLANE_TYPE_UV,
-            ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
+            ta + vp8_block2above[b], tl + vp8_block2left[b]);
    }

    if (has_2nd_order)
    {
        b=24;
        optimize_b(x, b, PLANE_TYPE_Y2,
-            ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
+            ta + vp8_block2above[b], tl + vp8_block2left[b]);
        check_reset_2nd_coeffs(&x->e_mbd, PLANE_TYPE_Y2,
            ta + vp8_block2above[b], tl + vp8_block2left[b]);
    }
 }


-void vp8_optimize_mby(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
+void vp8_optimize_mby(MACROBLOCK *x)
 {
    int b;
    int type;
@@ -583,7 +575,7 @@ void vp8_optimize_mby(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
    for (b = 0; b < 16; b++)
    {
        optimize_b(x, b, type,
-            ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
+            ta + vp8_block2above[b], tl + vp8_block2left[b]);
    }


@@ -591,13 +583,13 @@ void vp8_optimize_mby(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
    {
        b=24;
        optimize_b(x, b, PLANE_TYPE_Y2,
-            ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
+            ta + vp8_block2above[b], tl + vp8_block2left[b]);
        check_reset_2nd_coeffs(&x->e_mbd, PLANE_TYPE_Y2,
            ta + vp8_block2above[b], tl + vp8_block2left[b]);
    }
 }

-void vp8_optimize_mbuv(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
+void vp8_optimize_mbuv(MACROBLOCK *x)
 {
    int b;
    ENTROPY_CONTEXT_PLANES t_above, t_left;
@@ -619,38 +611,38 @@ void vp8_optimize_mbuv(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
    for (b = 16; b < 24; b++)
    {
        optimize_b(x, b, PLANE_TYPE_UV,
-            ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
+            ta + vp8_block2above[b], tl + vp8_block2left[b]);
    }
 }

-void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
+void vp8_encode_inter16x16(MACROBLOCK *x)
 {
    vp8_build_inter_predictors_mb(&x->e_mbd);

-    vp8_subtract_mb(rtcd, x);
+    vp8_subtract_mb(x);

    transform_mb(x);

    vp8_quantize_mb(x);

    if (x->optimize)
-        optimize_mb(x, rtcd);
+        optimize_mb(x);
 }

 /* this funciton is used by first pass only */
-void vp8_encode_inter16x16y(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
+void vp8_encode_inter16x16y(MACROBLOCK *x)
 {
    BLOCK *b = &x->block[0];

    vp8_build_inter16x16_predictors_mby(&x->e_mbd, x->e_mbd.dst.y_buffer,
                                        x->e_mbd.dst.y_stride);

-    ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src),
+    vp8_subtract_mby(x->src_diff, *(b->base_src),
        b->src_stride, x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride);

    transform_mby(x);

    vp8_quantize_mby(x);

-    vp8_inverse_transform_mby(&x->e_mbd, IF_RTCD(rtcd->common));
+    vp8_inverse_transform_mby(&x->e_mbd);
 }
--- a/vp8/encoder/encodemb.h
+++ b/vp8/encoder/encodemb.h
@@ -12,98 +12,15 @@
 #ifndef __INC_ENCODEMB_H
 #define __INC_ENCODEMB_H

-
-#include "vpx_config.h"
-#include "block.h"
-
-#define prototype_mberr(sym) \
-    int (sym)(MACROBLOCK *mb, int dc)
-
-#define prototype_berr(sym) \
-    int (sym)(short *coeff, short *dqcoeff)
-
-#define prototype_mbuverr(sym) \
-    int (sym)(MACROBLOCK *mb)
-
-#define prototype_subb(sym) \
-    void (sym)(BLOCK *be,BLOCKD *bd, int pitch)
-
-#define prototype_submby(sym) \
-    void (sym)(short *diff, unsigned char *src, int src_stride, \
-        unsigned char *pred, int pred_stride)
-
-#define prototype_submbuv(sym) \
-    void (sym)(short *diff, unsigned char *usrc, unsigned char *vsrc,\
-               int src_stride, unsigned char *upred, unsigned char *vpred,\
-               int pred_stride)
-
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/encodemb_x86.h"
-#endif
-
-#if ARCH_ARM
-#include "arm/encodemb_arm.h"
-#endif
-
-#ifndef vp8_encodemb_berr
-#define vp8_encodemb_berr vp8_block_error_c
-#endif
-extern prototype_berr(vp8_encodemb_berr);
-
-#ifndef vp8_encodemb_mberr
-#define vp8_encodemb_mberr vp8_mbblock_error_c
-#endif
-extern prototype_mberr(vp8_encodemb_mberr);
-
-#ifndef vp8_encodemb_mbuverr
-#define vp8_encodemb_mbuverr vp8_mbuverror_c
-#endif
-extern prototype_mbuverr(vp8_encodemb_mbuverr);
-
-#ifndef vp8_encodemb_subb
-#define vp8_encodemb_subb vp8_subtract_b_c
-#endif
-extern prototype_subb(vp8_encodemb_subb);
-
-#ifndef vp8_encodemb_submby
-#define vp8_encodemb_submby vp8_subtract_mby_c
-#endif
-extern prototype_submby(vp8_encodemb_submby);
-
-#ifndef vp8_encodemb_submbuv
-#define vp8_encodemb_submbuv vp8_subtract_mbuv_c
-#endif
-extern prototype_submbuv(vp8_encodemb_submbuv);
-
-
-typedef struct
-{
-    prototype_berr(*berr);
-    prototype_mberr(*mberr);
-    prototype_mbuverr(*mbuverr);
-    prototype_subb(*subb);
-    prototype_submby(*submby);
-    prototype_submbuv(*submbuv);
-} vp8_encodemb_rtcd_vtable_t;
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define ENCODEMB_INVOKE(ctx,fn) (ctx)->fn
-#else
-#define ENCODEMB_INVOKE(ctx,fn) vp8_encodemb_##fn
-#endif
-
-
-
 #include "onyx_int.h"
-struct VP8_ENCODER_RTCD;
-void vp8_encode_inter16x16(const struct VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x);
+void vp8_encode_inter16x16(MACROBLOCK *x);

 void vp8_build_dcblock(MACROBLOCK *b);
 void vp8_transform_mb(MACROBLOCK *mb);
 void vp8_transform_mbuv(MACROBLOCK *x);
 void vp8_transform_intra_mby(MACROBLOCK *x);

-void vp8_optimize_mby(MACROBLOCK *x, const struct VP8_ENCODER_RTCD *rtcd);
-void vp8_optimize_mbuv(MACROBLOCK *x, const struct VP8_ENCODER_RTCD *rtcd);
-void vp8_encode_inter16x16y(const struct VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x);
+void vp8_optimize_mby(MACROBLOCK *x);
+void vp8_optimize_mbuv(MACROBLOCK *x);
+void vp8_encode_inter16x16y(MACROBLOCK *x);
 #endif
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -148,7 +148,7 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
                    x->rdmult = cpi->RDMULT;

                    //Copy current mb to a buffer
-                    RECON_INVOKE(&xd->rtcd->recon, copy16x16)(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
+                    vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);

                    if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
                        vp8_activity_masking(cpi, x);
@@ -304,8 +304,8 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
    z->mv_row_max    = x->mv_row_max;
    */

-    z->vp8_short_fdct4x4     = x->vp8_short_fdct4x4;
-    z->vp8_short_fdct8x4     = x->vp8_short_fdct8x4;
+    z->short_fdct4x4     = x->short_fdct4x4;
+    z->short_fdct8x4     = x->short_fdct8x4;
    z->short_walsh4x4    = x->short_walsh4x4;
    z->quantize_b        = x->quantize_b;
    z->quantize_b_pair   = x->quantize_b_pair;
@@ -425,9 +425,6 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
        mbd->subpixel_predict8x4     = xd->subpixel_predict8x4;
        mbd->subpixel_predict8x8     = xd->subpixel_predict8x8;
        mbd->subpixel_predict16x16   = xd->subpixel_predict16x16;
-#if CONFIG_RUNTIME_CPU_DETECT
-        mbd->rtcd                   = xd->rtcd;
-#endif
        mb->gf_active_ptr            = x->gf_active_ptr;

        vpx_memset(mbr_ei[i].segment_counts, 0, sizeof(mbr_ei[i].segment_counts));
@@ -440,9 +437,6 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi,

        mbd->frame_type = cm->frame_type;

-        mbd->frames_since_golden = cm->frames_since_golden;
-        mbd->frames_till_alt_ref_frame = cm->frames_till_alt_ref_frame;
-
        mb->src = * cpi->Source;
        mbd->pre = cm->yv12_fb[cm->lst_fb_idx];
        mbd->dst = cm->yv12_fb[cm->new_fb_idx];
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -31,12 +31,6 @@

 //#define OUTPUT_FPF 1

-#if CONFIG_RUNTIME_CPU_DETECT
-#define IF_RTCD(x) (x)
-#else
-#define IF_RTCD(x) NULL
-#endif
-
 extern void vp8_build_block_offsets(MACROBLOCK *x);
 extern void vp8_setup_block_ptrs(MACROBLOCK *x);
 extern void vp8cx_frame_init_quantizer(VP8_COMP *cpi);
@@ -402,14 +396,14 @@ static void zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x, YV12_BUFFER_CONFIG
    unsigned char *src_ptr = (*(b->base_src) + b->src);
    int src_stride = b->src_stride;
    unsigned char *ref_ptr;
-    int ref_stride=d->pre_stride;
+    int ref_stride = x->e_mbd.pre.y_stride;

    // Set up pointers for this macro block recon buffer
    xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;

-    ref_ptr = (unsigned char *)(*(d->base_pre) + d->pre );
+    ref_ptr = (unsigned char *)(xd->pre.y_buffer + d->offset );

-    VARIANCE_INVOKE(IF_RTCD(&cpi->rtcd.variance), mse16x16) ( src_ptr, src_stride, ref_ptr, ref_stride, (unsigned int *)(best_motion_err));
+    vp8_mse16x16 ( src_ptr, src_stride, ref_ptr, ref_stride, (unsigned int *)(best_motion_err));
 }

 static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x,
@@ -433,7 +427,7 @@ static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x,
    int new_mv_mode_penalty = 256;

    // override the default variance function to use MSE
-    v_fn_ptr.vf    = VARIANCE_INVOKE(IF_RTCD(&cpi->rtcd.variance), mse16x16);
+    v_fn_ptr.vf    = vp8_mse16x16;

    // Set up pointers for this macro block recon buffer
    xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
@@ -576,7 +570,7 @@ void vp8_first_pass(VP8_COMP *cpi)
            xd->left_available = (mb_col != 0);

            //Copy current mb to a buffer
-            RECON_INVOKE(&xd->rtcd->recon, copy16x16)(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
+            vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);

            // do intra 16x16 prediction
            this_error = vp8_encode_intra(cpi, x, use_dc_pred);
@@ -674,7 +668,7 @@ void vp8_first_pass(VP8_COMP *cpi)
                    d->bmi.mv.as_mv.col <<= 3;
                    this_error = motion_error;
                    vp8_set_mbmode_and_mvs(x, NEWMV, &d->bmi.mv);
-                    vp8_encode_inter16x16y(IF_RTCD(&cpi->rtcd), x);
+                    vp8_encode_inter16x16y(x);
                    sum_mvr += d->bmi.mv.as_mv.row;
                    sum_mvr_abs += abs(d->bmi.mv.as_mv.row);
                    sum_mvc += d->bmi.mv.as_mv.col;
--- a/vp8/encoder/generic/csystemdependent.c
+++ b/vp8/encoder/generic/csystemdependent.c
@@ -1,116 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_config.h"
-#include "vp8/encoder/variance.h"
-#include "vp8/encoder/onyx_int.h"
-
-
-void vp8_arch_x86_encoder_init(VP8_COMP *cpi);
-void vp8_arch_arm_encoder_init(VP8_COMP *cpi);
-
-void (*vp8_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc,
-                                        YV12_BUFFER_CONFIG *dst_ybc);
-extern void vp8_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc,
-                                        YV12_BUFFER_CONFIG *dst_ybc);
-
-void vp8_cmachine_specific_config(VP8_COMP *cpi)
-{
-#if CONFIG_RUNTIME_CPU_DETECT
-    cpi->rtcd.common                    = &cpi->common.rtcd;
-    cpi->rtcd.variance.sad16x16              = vp8_sad16x16_c;
-    cpi->rtcd.variance.sad16x8               = vp8_sad16x8_c;
-    cpi->rtcd.variance.sad8x16               = vp8_sad8x16_c;
-    cpi->rtcd.variance.sad8x8                = vp8_sad8x8_c;
-    cpi->rtcd.variance.sad4x4                = vp8_sad4x4_c;
-
-    cpi->rtcd.variance.sad16x16x3            = vp8_sad16x16x3_c;
-    cpi->rtcd.variance.sad16x8x3             = vp8_sad16x8x3_c;
-    cpi->rtcd.variance.sad8x16x3             = vp8_sad8x16x3_c;
-    cpi->rtcd.variance.sad8x8x3              = vp8_sad8x8x3_c;
-    cpi->rtcd.variance.sad4x4x3              = vp8_sad4x4x3_c;
-
-    cpi->rtcd.variance.sad16x16x8            = vp8_sad16x16x8_c;
-    cpi->rtcd.variance.sad16x8x8             = vp8_sad16x8x8_c;
-    cpi->rtcd.variance.sad8x16x8             = vp8_sad8x16x8_c;
-    cpi->rtcd.variance.sad8x8x8              = vp8_sad8x8x8_c;
-    cpi->rtcd.variance.sad4x4x8              = vp8_sad4x4x8_c;
-
-    cpi->rtcd.variance.sad16x16x4d           = vp8_sad16x16x4d_c;
-    cpi->rtcd.variance.sad16x8x4d            = vp8_sad16x8x4d_c;
-    cpi->rtcd.variance.sad8x16x4d            = vp8_sad8x16x4d_c;
-    cpi->rtcd.variance.sad8x8x4d             = vp8_sad8x8x4d_c;
-    cpi->rtcd.variance.sad4x4x4d             = vp8_sad4x4x4d_c;
-#if ARCH_X86 || ARCH_X86_64
-    cpi->rtcd.variance.copy32xn              = vp8_copy32xn_c;
-#endif
-    cpi->rtcd.variance.var4x4                = vp8_variance4x4_c;
-    cpi->rtcd.variance.var8x8                = vp8_variance8x8_c;
-    cpi->rtcd.variance.var8x16               = vp8_variance8x16_c;
-    cpi->rtcd.variance.var16x8               = vp8_variance16x8_c;
-    cpi->rtcd.variance.var16x16              = vp8_variance16x16_c;
-
-    cpi->rtcd.variance.subpixvar4x4          = vp8_sub_pixel_variance4x4_c;
-    cpi->rtcd.variance.subpixvar8x8          = vp8_sub_pixel_variance8x8_c;
-    cpi->rtcd.variance.subpixvar8x16         = vp8_sub_pixel_variance8x16_c;
-    cpi->rtcd.variance.subpixvar16x8         = vp8_sub_pixel_variance16x8_c;
-    cpi->rtcd.variance.subpixvar16x16        = vp8_sub_pixel_variance16x16_c;
-    cpi->rtcd.variance.halfpixvar16x16_h     = vp8_variance_halfpixvar16x16_h_c;
-    cpi->rtcd.variance.halfpixvar16x16_v     = vp8_variance_halfpixvar16x16_v_c;
-    cpi->rtcd.variance.halfpixvar16x16_hv    = vp8_variance_halfpixvar16x16_hv_c;
-    cpi->rtcd.variance.subpixmse16x16        = vp8_sub_pixel_mse16x16_c;
-
-    cpi->rtcd.variance.mse16x16              = vp8_mse16x16_c;
-    cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_c;
-
-    cpi->rtcd.variance.get4x4sse_cs          = vp8_get4x4sse_cs_c;
-
-    cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_c;
-    cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_c;
-    cpi->rtcd.fdct.fast4x4                   = vp8_short_fdct4x4_c;
-    cpi->rtcd.fdct.fast8x4                   = vp8_short_fdct8x4_c;
-    cpi->rtcd.fdct.walsh_short4x4            = vp8_short_walsh4x4_c;
-
-    cpi->rtcd.encodemb.berr                  = vp8_block_error_c;
-    cpi->rtcd.encodemb.mberr                 = vp8_mbblock_error_c;
-    cpi->rtcd.encodemb.mbuverr               = vp8_mbuverror_c;
-    cpi->rtcd.encodemb.subb                  = vp8_subtract_b_c;
-    cpi->rtcd.encodemb.submby                = vp8_subtract_mby_c;
-    cpi->rtcd.encodemb.submbuv               = vp8_subtract_mbuv_c;
-
-    cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b;
-    cpi->rtcd.quantize.quantb_pair           = vp8_regular_quantize_b_pair;
-    cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_c;
-    cpi->rtcd.quantize.fastquantb_pair       = vp8_fast_quantize_b_pair_c;
-    cpi->rtcd.search.full_search             = vp8_full_search_sad;
-    cpi->rtcd.search.refining_search         = vp8_refining_search_sad;
-    cpi->rtcd.search.diamond_search          = vp8_diamond_search_sad;
-#if !(CONFIG_REALTIME_ONLY)
-    cpi->rtcd.temporal.apply                 = vp8_temporal_filter_apply_c;
-#endif
-#if CONFIG_INTERNAL_STATS
-    cpi->rtcd.variance.ssimpf_8x8            = vp8_ssim_parms_8x8_c;
-    cpi->rtcd.variance.ssimpf_16x16          = vp8_ssim_parms_16x16_c;
-#endif
-#endif
-
-    // Pure C:
-    vp8_yv12_copy_partial_frame_ptr = vp8_yv12_copy_partial_frame;
-
-#if ARCH_X86 || ARCH_X86_64
-    vp8_arch_x86_encoder_init(cpi);
-#endif
-
-#if ARCH_ARM
-    vp8_arch_arm_encoder_init(cpi);
-#endif
-
-}
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -211,10 +211,13 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,

    int y_stride;
    int offset;
+    int pre_stride = x->e_mbd.pre.y_stride;
+    unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+

 #if ARCH_X86 || ARCH_X86_64
    MACROBLOCKD *xd = &x->e_mbd;
-    unsigned char *y0 = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
+    unsigned char *y0 = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col;
    unsigned char *y;
    int buf_r1, buf_r2, buf_c1, buf_c2;

@@ -226,11 +229,11 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
    y_stride = 32;

    /* Copy to intermediate buffer before searching. */
-    vfp->copymem(y0 - buf_c1 - d->pre_stride*buf_r1, d->pre_stride, xd->y_buf, y_stride, 16+buf_r1+buf_r2);
+    vfp->copymem(y0 - buf_c1 - pre_stride*buf_r1, pre_stride, xd->y_buf, y_stride, 16+buf_r1+buf_r2);
    y = xd->y_buf + y_stride*buf_r1 +buf_c1;
 #else
-    unsigned char *y = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
-    y_stride = d->pre_stride;
+    unsigned char *y = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col;
+    y_stride = pre_stride;
 #endif

    offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
@@ -347,19 +350,21 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
    int whichdir ;
    int thismse;
    int y_stride;
+    int pre_stride = x->e_mbd.pre.y_stride;
+    unsigned char *base_pre = x->e_mbd.pre.y_buffer;

 #if ARCH_X86 || ARCH_X86_64
    MACROBLOCKD *xd = &x->e_mbd;
-    unsigned char *y0 = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
+    unsigned char *y0 = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col;
    unsigned char *y;

    y_stride = 32;
    /* Copy 18 rows x 32 cols area to intermediate buffer before searching. */
-     vfp->copymem(y0 - 1 - d->pre_stride, d->pre_stride, xd->y_buf, y_stride, 18);
+     vfp->copymem(y0 - 1 - pre_stride, pre_stride, xd->y_buf, y_stride, 18);
     y = xd->y_buf + y_stride + 1;
 #else
-     unsigned char *y = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
-     y_stride = d->pre_stride;
+     unsigned char *y = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col;
+     y_stride = pre_stride;
 #endif

    // central mv
@@ -662,19 +667,21 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
    int whichdir ;
    int thismse;
    int y_stride;
+    int pre_stride = x->e_mbd.pre.y_stride;
+    unsigned char *base_pre = x->e_mbd.pre.y_buffer;

 #if ARCH_X86 || ARCH_X86_64
    MACROBLOCKD *xd = &x->e_mbd;
-    unsigned char *y0 = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
+    unsigned char *y0 = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col;
    unsigned char *y;

    y_stride = 32;
    /* Copy 18 rows x 32 cols area to intermediate buffer before searching. */
-    vfp->copymem(y0 - 1 - d->pre_stride, d->pre_stride, xd->y_buf, y_stride, 18);
+    vfp->copymem(y0 - 1 - pre_stride, pre_stride, xd->y_buf, y_stride, 18);
    y = xd->y_buf + y_stride + 1;
 #else
-    unsigned char *y = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
-    y_stride = d->pre_stride;
+    unsigned char *y = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col;
+    y_stride = pre_stride;
 #endif

    // central mv
@@ -842,7 +849,10 @@ int vp8_hex_search

    unsigned char *what = (*(b->base_src) + b->src);
    int what_stride = b->src_stride;
-    int in_what_stride = d->pre_stride;
+    int pre_stride = x->e_mbd.pre.y_stride;
+    unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+
+    int in_what_stride = pre_stride;
    int br, bc;
    int_mv this_mv;
    unsigned int bestsad = 0x7fffffff;
@@ -865,8 +875,8 @@ int vp8_hex_search
    bc = ref_mv->as_mv.col;

    // Work out the start point for the search
-    base_offset = (unsigned char *)(*(d->base_pre) + d->pre);
-    this_offset = base_offset + (br * (d->pre_stride)) + bc;
+    base_offset = (unsigned char *)(base_pre + d->offset);
+    this_offset = base_offset + (br * (pre_stride)) + bc;
    this_mv.as_mv.row = br;
    this_mv.as_mv.col = bc;
    bestsad = vfp->sdf( what, what_stride, this_offset,
@@ -1009,7 +1019,7 @@ cal_neighbors:
 #undef CHECK_POINT
 #undef CHECK_BETTER

-int vp8_diamond_search_sad
+int vp8_diamond_search_sad_c
 (
    MACROBLOCK *x,
    BLOCK *b,
@@ -1029,7 +1039,9 @@ int vp8_diamond_search_sad
    unsigned char *what = (*(b->base_src) + b->src);
    int what_stride = b->src_stride;
    unsigned char *in_what;
-    int in_what_stride = d->pre_stride;
+    int pre_stride = x->e_mbd.pre.y_stride;
+    unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+    int in_what_stride = pre_stride;
    unsigned char *best_address;

    int tot_steps;
@@ -1061,7 +1073,7 @@ int vp8_diamond_search_sad
    best_mv->as_mv.col = ref_col;

    // Work out the start point for the search
-    in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col);
+    in_what = (unsigned char *)(base_pre + d->offset + (ref_row * pre_stride) + ref_col);
    best_address = in_what;

    // Check the starting position
@@ -1150,7 +1162,9 @@ int vp8_diamond_search_sadx4
    unsigned char *what = (*(b->base_src) + b->src);
    int what_stride = b->src_stride;
    unsigned char *in_what;
-    int in_what_stride = d->pre_stride;
+    int pre_stride = x->e_mbd.pre.y_stride;
+    unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+    int in_what_stride = pre_stride;
    unsigned char *best_address;

    int tot_steps;
@@ -1182,7 +1196,7 @@ int vp8_diamond_search_sadx4
    best_mv->as_mv.col = ref_col;

    // Work out the start point for the search
-    in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col);
+    in_what = (unsigned char *)(base_pre + d->offset + (ref_row * pre_stride) + ref_col);
    best_address = in_what;

    // Check the starting position
@@ -1292,7 +1306,7 @@ int vp8_diamond_search_sadx4
        + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
 }

-int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+int vp8_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
                        int sad_per_bit, int distance,
                        vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2],
                        int_mv *center_mv)
@@ -1300,8 +1314,10 @@ int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
    unsigned char *what = (*(b->base_src) + b->src);
    int what_stride = b->src_stride;
    unsigned char *in_what;
-    int in_what_stride = d->pre_stride;
-    int mv_stride = d->pre_stride;
+    int pre_stride = x->e_mbd.pre.y_stride;
+    unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+    int in_what_stride = pre_stride;
+    int mv_stride = pre_stride;
    unsigned char *bestaddress;
    int_mv *best_mv = &d->bmi.mv;
    int_mv this_mv;
@@ -1325,8 +1341,8 @@ int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
    fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;

    // Work out the mid point for the search
-    in_what = *(d->base_pre) + d->pre;
-    bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
+    in_what = base_pre + d->offset;
+    bestaddress = in_what + (ref_row * pre_stride) + ref_col;

    best_mv->as_mv.row = ref_row;
    best_mv->as_mv.col = ref_col;
@@ -1392,8 +1408,10 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
    unsigned char *what = (*(b->base_src) + b->src);
    int what_stride = b->src_stride;
    unsigned char *in_what;
-    int in_what_stride = d->pre_stride;
-    int mv_stride = d->pre_stride;
+    int pre_stride = x->e_mbd.pre.y_stride;
+    unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+    int in_what_stride = pre_stride;
+    int mv_stride = pre_stride;
    unsigned char *bestaddress;
    int_mv *best_mv = &d->bmi.mv;
    int_mv this_mv;
@@ -1419,8 +1437,8 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
    fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;

    // Work out the mid point for the search
-    in_what = *(d->base_pre) + d->pre;
-    bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
+    in_what = base_pre + d->offset;
+    bestaddress = in_what + (ref_row * pre_stride) + ref_col;

    best_mv->as_mv.row = ref_row;
    best_mv->as_mv.col = ref_col;
@@ -1521,9 +1539,11 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
 {
    unsigned char *what = (*(b->base_src) + b->src);
    int what_stride = b->src_stride;
+    int pre_stride = x->e_mbd.pre.y_stride;
+    unsigned char *base_pre = x->e_mbd.pre.y_buffer;
    unsigned char *in_what;
-    int in_what_stride = d->pre_stride;
-    int mv_stride = d->pre_stride;
+    int in_what_stride = pre_stride;
+    int mv_stride = pre_stride;
    unsigned char *bestaddress;
    int_mv *best_mv = &d->bmi.mv;
    int_mv this_mv;
@@ -1550,8 +1570,8 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
    fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;

    // Work out the mid point for the search
-    in_what = *(d->base_pre) + d->pre;
-    bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
+    in_what = base_pre + d->offset;
+    bestaddress = in_what + (ref_row * pre_stride) + ref_col;

    best_mv->as_mv.row = ref_row;
    best_mv->as_mv.col = ref_col;
@@ -1674,7 +1694,7 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
        return INT_MAX;
 }

-int vp8_refining_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+int vp8_refining_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
                            int error_per_bit, int search_range,
                            vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2],
                            int_mv *center_mv)
@@ -1684,10 +1704,12 @@ int vp8_refining_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
    short this_row_offset, this_col_offset;

    int what_stride = b->src_stride;
-    int in_what_stride = d->pre_stride;
+    int pre_stride = x->e_mbd.pre.y_stride;
+    unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+    int in_what_stride = pre_stride;
    unsigned char *what = (*(b->base_src) + b->src);
-    unsigned char *best_address = (unsigned char *)(*(d->base_pre) + d->pre +
-        (ref_mv->as_mv.row * (d->pre_stride)) + ref_mv->as_mv.col);
+    unsigned char *best_address = (unsigned char *)(base_pre + d->offset +
+        (ref_mv->as_mv.row * pre_stride) + ref_mv->as_mv.col);
    unsigned char *check_here;
    unsigned int thissad;
    int_mv this_mv;
@@ -1761,10 +1783,12 @@ int vp8_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
    short this_row_offset, this_col_offset;

    int what_stride = b->src_stride;
-    int in_what_stride = d->pre_stride;
+    int pre_stride = x->e_mbd.pre.y_stride;
+    unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+    int in_what_stride = pre_stride;
    unsigned char *what = (*(b->base_src) + b->src);
-    unsigned char *best_address = (unsigned char *)(*(d->base_pre) + d->pre +
-        (ref_mv->as_mv.row * (d->pre_stride)) + ref_mv->as_mv.col);
+    unsigned char *best_address = (unsigned char *)(base_pre + d->offset +
+        (ref_mv->as_mv.row * pre_stride) + ref_mv->as_mv.col);
    unsigned char *check_here;
    unsigned int thissad;
    int_mv this_mv;
--- a/vp8/encoder/mcomp.h
+++ b/vp8/encoder/mcomp.h
@@ -50,98 +50,51 @@ typedef int (fractional_mv_step_fp)
    (MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *bestmv, int_mv *ref_mv,
     int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2],
     int *distortion, unsigned int *sse);
+
 extern fractional_mv_step_fp vp8_find_best_sub_pixel_step_iteratively;
 extern fractional_mv_step_fp vp8_find_best_sub_pixel_step;
 extern fractional_mv_step_fp vp8_find_best_half_pixel_step;
 extern fractional_mv_step_fp vp8_skip_fractional_mv_step;

-#define prototype_full_search_sad(sym)\
-    int (sym)\
-    (\
-     MACROBLOCK *x, \
-     BLOCK *b, \
-     BLOCKD *d, \
-     int_mv *ref_mv, \
-     int sad_per_bit, \
-     int distance, \
-     vp8_variance_fn_ptr_t *fn_ptr, \
-     int *mvcost[2], \
-     int_mv *center_mv \
-    )
+typedef int (*vp8_full_search_fn_t)
+    (
+     MACROBLOCK *x,
+     BLOCK *b,
+     BLOCKD *d,
+     int_mv *ref_mv,
+     int sad_per_bit,
+     int distance,
+     vp8_variance_fn_ptr_t *fn_ptr,
+     int *mvcost[2],
+     int_mv *center_mv
+    );

-#define prototype_refining_search_sad(sym)\
-    int (sym)\
-    (\
-     MACROBLOCK *x, \
-     BLOCK *b, \
-     BLOCKD *d, \
-     int_mv *ref_mv, \
-     int sad_per_bit, \
-     int distance, \
-     vp8_variance_fn_ptr_t *fn_ptr, \
-     int *mvcost[2], \
-     int_mv *center_mv \
-    )
+typedef int (*vp8_refining_search_fn_t)
+    (
+     MACROBLOCK *x,
+     BLOCK *b,
+     BLOCKD *d,
+     int_mv *ref_mv,
+     int sad_per_bit,
+     int distance,
+     vp8_variance_fn_ptr_t *fn_ptr,
+     int *mvcost[2],
+     int_mv *center_mv
+    );

-#define prototype_diamond_search_sad(sym)\
-    int (sym)\
-    (\
-     MACROBLOCK *x, \
-     BLOCK *b, \
-     BLOCKD *d, \
-     int_mv *ref_mv, \
-     int_mv *best_mv, \
-     int search_param, \
-     int sad_per_bit, \
-     int *num00, \
-     vp8_variance_fn_ptr_t *fn_ptr, \
-     int *mvcost[2], \
-     int_mv *center_mv \
-    )
-
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/mcomp_x86.h"
-#endif
-
-typedef prototype_full_search_sad(*vp8_full_search_fn_t);
-extern prototype_full_search_sad(vp8_full_search_sad);
-extern prototype_full_search_sad(vp8_full_search_sadx3);
-extern prototype_full_search_sad(vp8_full_search_sadx8);
-
-typedef prototype_refining_search_sad(*vp8_refining_search_fn_t);
-extern prototype_refining_search_sad(vp8_refining_search_sad);
-extern prototype_refining_search_sad(vp8_refining_search_sadx4);
-
-typedef prototype_diamond_search_sad(*vp8_diamond_search_fn_t);
-extern prototype_diamond_search_sad(vp8_diamond_search_sad);
-extern prototype_diamond_search_sad(vp8_diamond_search_sadx4);
-
-#ifndef vp8_search_full_search
-#define vp8_search_full_search vp8_full_search_sad
-#endif
-extern prototype_full_search_sad(vp8_search_full_search);
-
-#ifndef vp8_search_refining_search
-#define vp8_search_refining_search vp8_refining_search_sad
-#endif
-extern prototype_refining_search_sad(vp8_search_refining_search);
-
-#ifndef vp8_search_diamond_search
-#define vp8_search_diamond_search vp8_diamond_search_sad
-#endif
-extern prototype_diamond_search_sad(vp8_search_diamond_search);
-
-typedef struct
-{
-    prototype_full_search_sad(*full_search);
-    prototype_refining_search_sad(*refining_search);
-    prototype_diamond_search_sad(*diamond_search);
-} vp8_search_rtcd_vtable_t;
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define SEARCH_INVOKE(ctx,fn) (ctx)->fn
-#else
-#define SEARCH_INVOKE(ctx,fn) vp8_search_##fn
-#endif
+typedef int (*vp8_diamond_search_fn_t)
+    (
+     MACROBLOCK *x,
+     BLOCK *b,
+     BLOCKD *d,
+     int_mv *ref_mv,
+     int_mv *best_mv,
+     int search_param,
+     int sad_per_bit,
+     int *num00,
+     vp8_variance_fn_ptr_t *fn_ptr,
+     int *mvcost[2],
+     int_mv *center_mv
+    );

 #endif
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
John Koleszar	b76c4c6ba7	WIP: reuse splitmv segmentation (partial) still not reusing split mvs, but reusing partition size (8x8, 8x16, etc) Change-Id: I4655b06fcdcbc71a97bff07def78297ae8e5104c	2012-03-22 15:06:53 -07:00
John Koleszar	7f6a695771	WIP: force keyframe based on frame refresh flags Change-Id: I78d3001ebc02cb5a06d256b5d9ec6aa96f1f8bc0	2012-03-22 15:05:35 -07:00
John Koleszar	a764f61f70	WIP: read modemv from a different file Change-Id: I0531a3ed133cea42cbad63daeb37815a92b00d83	2012-03-21 17:23:25 -07:00
John Koleszar	50c5e81c7c	WIP: reuse splitmv directly (disabled for now) Change-Id: Ia3a6c09ea5eee886b515ff74c6813a1d50e18b08	2012-03-21 17:11:19 -07:00
John Koleszar	6463c7cf72	WIP: add support for using the modemv from the original stream directly Change-Id: I4e849a67eaeb654cd1f5c2ac907145cac040a532	2012-03-21 16:08:12 -07:00
John Koleszar	7070dab445	wip: reuse mode/mv in multistream file Add --read-mvinfo and --write-mvinfo to pass modes and motion vectors between encodes Change-Id: I8d73fbd43d27f765bb2ff3026f4a2191b81c46a9	2012-03-09 18:00:12 -08:00
John Koleszar	0a9b65ba1c	vpxenc: accept webm input Change-Id: Id71cccb267fc29acbbe51d6e7378825cb692b2d3	2012-03-08 12:19:09 -08:00
John Koleszar	0a50a29121	vpxenc: generate multistream file Add the ability to append mode/mv records onto the bitstream. Change-Id: I83d3125ffe8e6c25dd9e2fa900c963f4f571e6f9	2012-03-08 12:19:09 -08:00
John Koleszar	118b445bab	vpxenc: support scaling prior to encoding Scales the input of the encoder using libyuv's "box filter". Each stream may have a different width and height specified. If the width (or height) parameter is missing (or is explicitly set to 0) then the value will be calculated based on the specified height (or width) and the input file's dimensions, preserving its aspect ratio. Leaving the height unspecified behaves similarly. Note: This functionality still does not take advantage of the accelerated multi-resolution encoder support with CONFIG_MULTI_RES_ENCODING. Change-Id: Ic7026810b13be030826be80dc6f7fc4aaf0c35d0	2012-03-01 14:44:24 -08:00
John Koleszar	3e78b5f2b8	libyuv: fix compilation on ARM ScaleRowDown2_NEON, ScaleRowDown4_NEON, ScaleRowDown34_NEON, ScaleRowDown38_NEON had anonymous parameters, which are not valid in C. Change-Id: If55f765e0c410f35b01a23c2bb9aea8966c0109d	2012-03-01 14:44:23 -08:00
John Koleszar	a6f538cefa	vpx_timer: increase resolution There's no useful reason to limit this timer to 1 second. Change-Id: Idd1960268624e8bdfe958d99833ae6482fdb423e	2012-03-01 13:57:17 -08:00
John Koleszar	9e50ed7f27	vpxenc: initial implementation of multistream support Add the ability to specify multiple output streams on the command line. Streams are delimited by --, and most parameters inherit from previous streams. In this implementation, resizing streams is still not supported. It does not make use of the new multistream support in the encoder either. Two pass support runs all streams independently, though it's theoretically possible that we could combine firstpass runs in the future. The logic required for this is too tricky to do as part of this initial implementation. This is mostly an effort to get the parameter passing and independent streams working from the application's perspective, and a later commit will add the rescaling and multiresolution support. Change-Id: Ibf18c2355f54189fc91952c734c899e5c072b3e0	2012-02-16 12:30:01 -08:00
John Koleszar	732cb9a643	vpxenc: factor out input open/close Simplify some of the file I/O for later commits which will add multistream support Change-Id: Idf1a05f3a29c95331d0c4a6ea5960904e4897fd4	2012-02-16 12:30:00 -08:00
John Koleszar	c535025c12	vpxenc: add warning()/fatal() helpers Cosmetic. Allows exiting with an error message without opening a new scope. Change-Id: If227b29b825f0241acea79dd38f19e524552ee18	2012-02-16 12:26:58 -08:00
John Koleszar	efd54f8f41	vpxenc: factor out global config options This is a first step towards specifying multiple output streams with one command line. Change-Id: Iac784d3911bf553694d024bbd0c3d547261e914b	2012-02-15 16:11:35 -08:00
John Koleszar	e6df50031e	Merge "support changing resolution with vpx_codec_enc_config_set"	2012-02-10 16:18:00 -08:00
Johann	169823428f	Missed some variance casts Change-Id: I9fb510f9421fb3c317a8e32e3058cee977ddf9fa	2012-02-10 11:07:33 -08:00
Johann	12d45f62f6	Merge "max_sad check is not always implemented"	2012-02-10 10:28:00 -08:00
Johann	8c50a70a95	max_sad check is not always implemented As an optimization some architectures use the max_sad argument to break out early from the SAD. Pass in INT_MAX instead of 0 to prevent this. Change-Id: I653c476834b97771578d63f231233d445388629d	2012-02-09 16:19:10 -08:00
Johann	fea3556e20	Fix variance overflow In the variance calculations the difference is summed and later squared. When the sum exceeds sqrt(2^31) the value is treated as a negative when it is shifted which gives incorrect results. To fix this we cast the result of the multiplication as unsigned. The alternative fix is to shift sum down by 4 before multiplying. However that will reduce precision. For 16x16 blocks the maximum sum is 65280 and sqrt(2^31) is 46340 (and change). PPC change is untested. Change-Id: I1bad27ea0720067def6d71a6da5f789508cec265	2012-02-09 12:38:31 -08:00
John Koleszar	2e0d55314c	Merge "Add OS/2 supports"	2012-02-08 11:00:55 -08:00
KO Myung-Hun	2dad8d65d9	Add OS/2 supports Change-Id: I792d5236451905eb20a8ebe444ef5b2274e4f7a4	2012-02-08 09:44:42 -08:00
John Koleszar	51acb01167	support changing resolution with vpx_codec_enc_config_set Allow the application to change the frame size during encoding. This is only supported when not using lagged compress. Change-Id: I89b585d703d5fd728a9e3dedf997f1b595d0db0f	2012-02-07 17:09:40 -08:00
John Koleszar	417b852967	Align internal mfqe framebuffer dimensions MFQE postproc crashed with stream dimensions not a multiple of 16. The buffer was memset unconditionally, so if the buffer allocation fails we end up trying to write to NULL. This patch traps an allocation failure with vpx_internal_error(), and aligns the buffer dimensions to what vp8_yv12_alloc_frame_buffer() expects. Change-Id: I3915d597cd66886a24f4ef39752751ebe6425066	2012-02-07 10:40:26 -08:00
Adrian Grange	45f4b87e8e	Fixed bug in 5-layer multi-layer encode The 5-layer encode must have a keyframe every 16 frames. The KF flag was being reset after the encode of the first frame, which it should not do for the 5-layer case (mode=6). Change-Id: I207d6e689d347fe3fd1075b97a817e82f7ad53b9	2012-02-06 15:02:33 -08:00
Adrian Grange	9df0d29823	Merge "Added 2 temporal patterns with new parameters"	2012-02-06 14:45:33 -08:00
Yunqing Wang	a040eb37e4	Merge "Allow to skip highest-resolution encoding in multi-resolution encoder"	2012-02-06 13:58:11 -08:00
Yunqing Wang	fa1a9290e6	Allow to skip highest-resolution encoding in multi-resolution encoder Sometimes, a user doesn't have enough bandwidth to send high-resolution (i.e. HD) video even though the camera catches HD video. This change allowed users to skip highest-resolution encoding by setting that level's target bit rate to 0. To test it, modify the following line in vp8_multi_resolution_encoder.c. unsigned int target_bitrate[NUM_ENCODERS]={1400, 500, 100}; To skip the highest-resolution level, change it to unsigned int target_bitrate[NUM_ENCODERS]={0, 500, 100}; To skip the first and second highest resolution levels, change it to unsigned int target_bitrate[NUM_ENCODERS]={0, 0, 100}; This change also fixed a small problem in mapping, which slightly helped quality and performance. Change-Id: I977bae9a9fbfba85c8be4bd5af01539f2b84bc81	2012-02-03 13:39:05 -05:00
Scott LaVarnway	d8ebdcd89d	Moved ref_frame_cost from MACROBLOCKD to MACROBLOCK Change-Id: I05788522e9cde4322cfb12032483bdbf184bdf0b	2012-02-02 13:40:08 -05:00
Scott LaVarnway	11c706488b	Removed frames_till_alt_ref_frame from MACROBLOCKD Change-Id: Ieb05270ac332a4cc38ec4b7b995fc0150e0fffdf	2012-02-02 13:34:13 -05:00
Scott LaVarnway	e2000cc5ca	Removed frames_since_golden from MACROBLOCKD Change-Id: I10efa441d663fceb6bc97a3bfad518cd3d9a5128	2012-02-02 13:28:41 -05:00
Scott LaVarnway	07c6eb18ad	Merge "Improved uv mv calculations in build inter predictor"	2012-01-31 10:43:49 -08:00
Scott LaVarnway	749bc98618	BLOCKD structure cleanup Removed redundancies. All of the information can be found in the MACROBLOCKD structure. Change-Id: I7556392c6f67b43bef2a5e9932180a737466ef93	2012-01-31 11:02:39 -05:00
John Koleszar	57d459ba82	RTCD: remove unimplemented vp8_short_walsh4x4_mmx This function does not exist. Change-Id: I84b72fb17d572d5cccee92220467b84c15842d4d	2012-01-30 12:55:45 -08:00
John Koleszar	8aae246089	RTCD: finalize removal of old RTCD system This is the final commit in the series converting to the new RTCD system. It removes the encoder csystemdependent files and the remaining global function pointers that didn't conform to the old RTCD system. Change-Id: I9649706f1bb89f0cbf431ab0e3e7552d37be4d8e	2012-01-30 12:10:48 -08:00
John Koleszar	109b69a706	RTCD: add arnr functions This commit continues the process of converting to the new RTCD system. It removes the last of the VP8_ENCODER_RTCD struct references. Change-Id: I2a44f52d7cccf5177e1ca98a028ead570d045395	2012-01-30 12:10:48 -08:00
John Koleszar	0b0bc8d098	RTCD: add motion search functions This commit continues the process of converting to the new RTCD system. Change-Id: Ia5828b7ecc80db55b21916704aa3d54cbb98f625	2012-01-30 12:10:47 -08:00
John Koleszar	be8af188d0	RTCD: add block subtraction functions This commit continues the process of converting to the new RTCD system. Change-Id: Id8a287fdd4bd050ea4452e1582ad85520f3081be	2012-01-30 12:10:47 -08:00
John Koleszar	61311e6103	RTCD: add quantizer functions This commit continues the process of converting to the new RTCD system. Change-Id: Iba9df4c03a508e51c37201c621be43523fae87d9	2012-01-30 12:10:46 -08:00
John Koleszar	510e0ab467	RTCD: add FDCT functions This commit continues the process of converting to the new RTCD system. Change-Id: I3f9c07db65eb206f6363d21bdb80e871570da767	2012-01-30 12:10:42 -08:00
John Koleszar	83a91e789c	RTCD: add variance functions This commit continues the process of converting to the new RTCD system. Change-Id: Ie5c1aa480637e98dc3918fb562ff45c37a66c538	2012-01-30 12:08:30 -08:00
John Koleszar	f103dcefaf	RTCD: add subpixel functions This commit continues the process of converting to the new RTCD system. Change-Id: I6c519ab61e4f4e0ebcc796f2df061f945c48cefe	2012-01-30 12:08:29 -08:00
John Koleszar	2a8f57f50d	RTCD: add postproc functions This commit continues the process of converting to the new RTCD system. Change-Id: If54eb5cb5d1b0cac6c4c0633a9e99c93ca860ba2	2012-01-30 12:08:29 -08:00
John Koleszar	fdb61a4531	RTCD: add recon functions This commit continues the process of converting to the new RTCD system. Change-Id: I9bfcf9bef65c3d4ba0fb9a3e1532bad1463a10d6	2012-01-30 12:08:28 -08:00
John Koleszar	ab77b4e898	RTCD: add remaining IDCT functions This commit continues the process of converting to the new RTCD system. Change-Id: I03c4dbf30dfd3558b0e256ff9d3ff4c012aadc80	2012-01-30 12:08:22 -08:00
John Koleszar	55f74c59c7	RTCD: add loopfilter functions This commit continues the process of converting to the new RTCD system. Change-Id: Ic8a4047d72ff3a54ec98977dd90e70c13213db71	2012-01-30 12:06:31 -08:00
John Koleszar	a910049aea	New RTCD implementation This is a proof of concept RTCD implementation to replace the current system of nested includes, prototypes, INVOKE macros, etc. Currently only the decoder specific functions are implemented in the new system. Additional functions will be added in subsequent commits. Overview: RTCD "functions" are implemented as either a global function pointer or a macro (when only one eligible specialization available). Functions which have RTCD specializations are listed using a simple DSL identifying the function's base name, its prototype, and the architecture extensions that specializations are available for. Advantages over the old system: - No INVOKE macros. A call to an RTCD function looks like an ordinary function call. - No need to pass vtables around. - If there is only one eligible function to call, the function is called directly, rather than indirecting through a function pointer. - Supports the notion of "required" extensions, so in combination with the above, on x86_64 if the best function available is sse2 or lower it will be called directly, since all x86_64 platforms implement sse2. - Elides all references to functions which will never be called, which could reduce binary size. For example if sse2 is required and there are both mmx and sse2 implementations of a certain function, the code will have no link time references to the mmx code. - Significantly easier to add a new function, just one file to edit. Disadvantages: - Requires global writable data (though this is not a new requirement) - 1 new generated source file. Change-Id: Iae6edab65315f79c168485c96872641c5aa09d55	2012-01-30 12:06:27 -08:00
John Koleszar	57cc35dd60	Merge Duclair release into master branch Change-Id: Ibf577972e8cd10488d44385ff74f136a07466c0c	2012-01-27 14:08:40 -08:00
John Koleszar	9951f46133	Merge "Hook up VP8D_GET_LAST_REF_USED"	2012-01-27 11:31:35 -08:00
Adrian Grange	8978358358	Added 2 temporal patterns with new parameters Added new 2 and 3 layer prediction frame patterns to vp8_scalable_patterns.c and modified the coding parameters. Change-Id: I18798fd7326a79d2ad1e1d5b6c26f5516b6d247f	2012-01-26 16:50:15 -08:00
John Koleszar	319f7c4d56	Merge changes I17e1a348,Iad710941 * changes: Correct clamping in use of vp8_find_near_mvs() Revert "Multithreaded encoder, late sync loopfilter"	2012-01-26 14:33:28 -08:00
Attila Nagy	294aa37745	Rename save_neon_reg.asm as save_reg_neon.asm Easier to filter out all NEON asm. Change-Id: I0022dae8321a9608e864b09d4181414c5fff4610	2012-01-26 09:44:00 +02:00
Fritz Koenig	c14754be1d	Merge "Disconnect ARM tgt_isa from dsp extensions"	2012-01-23 11:13:33 -08:00
Scott LaVarnway	8a6af9f98f	Improved uv mv calculations in build inter predictor Changed calculations to use shifts instead of if-then-else. Eliminates branches. Change-Id: I11b75e8bb305301ffd9cb577fb7df059a3cf9ea4	2012-01-23 11:34:43 -05:00
Fritz Koenig	892102842a	Disconnect ARM tgt_isa from dsp extensions A processor with ARMv7 instructions does not necessarily have NEON dsp extensions. This CL has the added side effect of allowing the ability to enable/disable the dsp extensions cleanly. Change-Id: Ie1e879b8fe131885bc3d4138a0acc9ffe73a36df	2012-01-20 10:38:15 -08:00