Compare commits

...

70 Commits

Author SHA1 Message Date
John Koleszar
744a58bc1c vpx_codec_dec_init: check that the iface is a decoder
Make sure the given interface is actually a decoder interface before
initializing it.

Change-Id: Ie48d737f2956cc2f0891666de5ea87251e96bc49
2011-03-24 15:05:10 +02:00
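
The guard described above is a capability check performed before any decoder state is touched. A minimal sketch of the idea using the public libvpx API (an illustration, not the actual patch):

    #include "vpx/vpx_decoder.h"

    /* Refuse interfaces that do not advertise decoding capability. */
    static vpx_codec_err_t checked_dec_init(vpx_codec_ctx_t *ctx,
                                            vpx_codec_iface_t *iface,
                                            const vpx_codec_dec_cfg_t *cfg,
                                            vpx_codec_flags_t flags)
    {
        if (!(vpx_codec_get_caps(iface) & VPX_CODEC_CAP_DECODER))
            return VPX_CODEC_INCAPABLE;
        return vpx_codec_dec_init(ctx, iface, cfg, flags);
    }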
John Koleszar
86b5556f5a Remove unused vp8_get4x4sse_cs_mmx declaration
This declaration did not match the prototype_sad() prototype, but was
unused in this translation unit, so it is removed instead. Fixes
issue 290.

Change-Id: I168854f88a85f73ca9aaf61d1e5dc0f43fc3fdb3
2011-03-24 15:05:10 +02:00
John Koleszar
4375b4ac39 Allow specifying --end-usage by enum name
Map an enum to the --end-usage values, so you can specify
--end-usage=cq instead of --end-usage=2. The numerical values still
work for historical scripts, etc., but this is more user-friendly.

Change-Id: I445ecd9638f801f5924a71eabf449bee293cdd34
2011-03-24 15:05:10 +02:00
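
A name-to-enum mapping of the kind described could be sketched as below; the enum values come from vpx_encoder.h, while parse_end_usage itself is a hypothetical helper, not the actual vpxenc code:

    #include <string.h>
    #include "vpx/vpx_encoder.h"

    /* Accept "vbr"/"cbr"/"cq" for --end-usage; return nonzero to fall
     * back to the historical numeric parsing. */
    static int parse_end_usage(const char *val, enum vpx_rc_mode *out)
    {
        static const struct { const char *name; enum vpx_rc_mode mode; }
        map[] = { { "vbr", VPX_VBR }, { "cbr", VPX_CBR }, { "cq", VPX_CQ } };
        size_t i;
        for (i = 0; i < sizeof(map) / sizeof(map[0]); i++)
            if (!strcmp(val, map[i].name)) { *out = map[i].mode; return 0; }
        return 1;
    }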
Tero Rintaluoma
71595edd47 ARMv6 optimized fdct4x4
Optimized fdct4x4 (8x4) for ARMv6 instruction set.
  - No interlocks in Cortex-A8 pipeline
  - One interlock cycle in ARM11 pipeline
  - About 2.16 times faster than current C-code compiled with -O3

Change-Id: I60484ecd144365da45bb68a960d30196b59952b8
2011-03-24 15:05:10 +02:00
Attila Nagy
848dddee15 Fix multithreaded encoding for 1 MB wide frame
Thread synchronization was not correct when the frame width was 1 MB.
The number of allocated encoding threads is limited by sync_range.
There is no point in having more, because each thread lags sync_range MBs
behind the thread processing the row above.

http://code.google.com/p/webm/issues/detail?id=302

Change-Id: Icaf67a883beecc5ebf2f11e9be47b6997fdf6f26
2011-03-24 15:05:09 +02:00
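
The limit follows directly from the lag the message describes: each row's thread trails the row above by sync_range macroblocks, so at most mb_cols / sync_range rows can be worked on at once. A sketch of that cap (the helper is hypothetical, not the patch):

    /* At most mb_cols / sync_range threads can make progress at once,
     * since each row's thread trails the row above by sync_range MBs. */
    static int usable_encoding_threads(int requested, int mb_cols,
                                       int sync_range)
    {
        int max_useful = mb_cols / sync_range;
        if (max_useful < 1)
            max_useful = 1;
        return requested < max_useful ? requested : max_useful;
    }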
John Koleszar
f1ba70e199 Increase static linkage, remove unused functions
A large number of functions were defined with external linkage, even
though they were only used from within one file. This patch changes
their linkage to static and removes the vp8_ prefix from their names,
which should make it more obvious to the reader that the function is
contained within the current translation unit. Functions that were
not referenced were removed.

These symbols were identified by:

  $ nm -A libvpx.a | sort -k3 | uniq -c -f2 | grep ' [A-Z] ' \
    | sort | grep '^ *1 '

Change-Id: I59609f58ab65312012c047036ae1e0634f795779
2011-03-24 15:05:09 +02:00
Attila Nagy
a22df2e29d use semaphore for partition thread synch
Change-Id: If368371097d93614ae497d99be2d39c7b0eb5f47
2011-03-18 13:25:51 +02:00
Ralph Giles
185557344a Set bounds from the array when iterating mmaps.
The mmap allocation code in vp8_dx_iface.c was inconsistent.
The static array vp8_mem_req_segs defines two descriptors,
but only the first is real. The second is a sentinel and
isn't actually allocated, so vpx_codec_alg_priv is declared
with mmaps[NELEMENTS(vp8_mem_req_segs)-1]. Some functions
use this reduced upper bound when iterating through the mmap
array, but these two functions did not.

Instead, this commit calls NELEMENTS(...->mmaps) to directly
query the bounds of the dereferenced array.

This fixes an array-bounds warning from gcc 4.6 on
vp8_xma_set_mmap.

Change-Id: I918e2721b401d134c1a9764c978912bdb3188be1
2011-03-17 14:52:05 -07:00
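
The underlying pattern is a static descriptor table carrying a sentinel entry, paired with a member array sized one short of it; bounding loops by the member array itself is what keeps the sentinel out of reach. A self-contained illustration of the idiom (not libvpx's actual tables):

    #include <stdio.h>

    #define NELEMENTS(x) ((int)(sizeof(x) / sizeof((x)[0])))

    struct seg { int id; };
    /* The last entry is a sentinel and must never be iterated over. */
    static const struct seg mem_req_segs[] = { { 1 }, { 0 } };

    struct alg_priv { struct seg mmaps[NELEMENTS(mem_req_segs) - 1]; };

    int main(void)
    {
        struct alg_priv ctx = { { { 1 } } };
        int i;
        /* Bound the loop by the dereferenced member array itself. */
        for (i = 0; i < NELEMENTS(ctx.mmaps); i++)
            printf("mmap entry %d\n", ctx.mmaps[i].id);
        return 0;
    }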
Ralph Giles
de5182eef3 Remove commented-out VP6 code from vp8_finalize_mmaps
Change-Id: I48642c380353043bed96026f56de5908fcee270a
2011-03-17 14:51:31 -07:00
John Koleszar
8431e768c9 Merge "Fix "used uninitialized" warning in vp8_pack_bitstream()" 2011-03-17 14:25:04 -07:00
John Koleszar
de50520a8c apple: include proper mach primitives
Fixes implicit declaration warning for 'mach_task_self'. This change
is an update to Change I9991dedd1ccfddc092eca86705ecbc3b764b799d,
which fixed this issue for the decoder but not the encoder.

Change-Id: I9df033e81f9520c4f975b7a7cf6c643d12e87c96
2011-03-16 13:59:32 -04:00
Attila Nagy
346b3e7ce9 Remove echoing in obj_int_extract rule
Change-Id: I9965170b40e2f32e9d84895c33a529b0d7dacdc1
2011-03-16 12:56:52 +02:00
Attila Nagy
71bcd9f1af Add vp8_variance8x8_armv6 and vp8_sub_pixel_variance8x8_armv6 functions
Change-Id: I08edaffc62514907fa5e90e1689269e467c857f5
2011-03-15 15:50:44 +02:00
Gaute Strokkenes
6795e256c1 Avoid misspelling "dependent".
Change-Id: Ib0c280e1fcfd977e11e4390807b2c8077a87500c
2011-03-15 12:58:29 +00:00
John Koleszar
8c48c943e7 Merge "Fix an unused variable warning." 2011-03-14 14:13:53 -07:00
Ralph Giles
aa4a90c880 Improve grammar in a comment.
Change-Id: I18bfda6d420626f2718e096e338c1d0bf0ba029d
2011-03-14 13:41:50 -07:00
Johann
2ec0cfbe99 obj_int_extract: win64 does not prefix symbols
obj_int_extract was unconditionally skipping the first character in the
symbol. Make sure it's actually an '_' first.

Change-Id: Icfe527eb8a0028faeabaa1dcedf8cd8f51c92754
2011-03-14 16:11:02 -04:00
Johann
d0ec28b3d3 Merge "Add vp8_mse16x16_armv6 function" 2011-03-14 12:47:42 -07:00
Attila Nagy
e54dcfe88d Add vp8_mse16x16_armv6 function
Change-Id: I77e9f2f521a71089228f96e2db72524189364ffb
2011-03-14 14:38:31 +02:00
Rafael Ávila de Espíndola
52f6e28e9e Fix build with xcode4 and simplify GLOBAL.
Without this change I get link errors in firefox's libxul. It looks
like the linker expect a particular pattern for getting the GOT. This
patch changes webm to use the same pattern used by the compiler.

Change-Id: Iea8c2e134ad45c1dc7d221ff885a8429bfa4e057
2011-03-12 10:45:22 -05:00
Johann
3788b3564c Merge "Move build_intra_predictors_mby to RTCD framework" 2011-03-11 10:23:48 -08:00
John Koleszar
27972d2c1d Move build_intra_predictors_mby to RTCD framework
The vp8_build_intra_predictors_mby and vp8_build_intra_predictors_mby_s
functions had global function pointers rather than using the RTCD
framework. This can show up as a potential data race with tools such as
helgrind. See https://bugzilla.mozilla.org/show_bug.cgi?id=640935
for an example.

Change-Id: I29c407f828ac2bddfc039f852f138de5de888534
2011-03-11 13:04:50 -05:00
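
The difference is where the dispatch pointer lives: a writable process-wide global that every codec instance assigns at init (a race when two instances initialize concurrently), versus a field in the per-instance runtime-CPU-detect table. A schematic comparison (struct names hypothetical, not the libvpx definitions):

    /* Before: one global pointer shared by all instances; concurrent
     * initialization of two decoders races on this write. */
    void (*build_intra_predictors_mby_ptr)(void *mbd);

    /* After (RTCD style): the pointer is part of per-context state,
     * filled in while that context is still private to one thread. */
    struct recon_rtcd_vtable {
        void (*build_intra_predictors_mby)(void *mbd);
        void (*build_intra_predictors_mby_s)(void *mbd);
    };
    struct codec_common {
        struct recon_rtcd_vtable recon;
    };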
Johann
5c60a646f3 Merge "ARMv6 optimized quantization" 2011-03-11 08:29:00 -08:00
John Koleszar
75051c8b59 Merge "Only enable ssim_opt.asm on X86_64" 2011-03-11 08:28:05 -08:00
John Koleszar
5db0eeea21 Only enable ssim_opt.asm on X86_64
Fix compiling on 32 bit x86.

Change-Id: I6210573e1d9287ac49acbe3d7e5181e309316107
2011-03-11 11:27:08 -05:00
Paul Wilkins
6e73748492 Clean up of vp8_init_config()
Clean up vp8_init_config() a bit and remove null pointer case,
as this code can't be called any more and is not an adequate
trap anyway, as a null pointer would cause exceptions before
hitting the test.

Change-Id: I937c00167cc039b3aa3f645f29c319d58ae8d3ee
2011-03-11 11:06:51 -05:00
John Koleszar
170b87390e Merge "1 Pass CQ and VBR bug fixes" 2011-03-11 08:06:09 -08:00
Paul Wilkins
2ae91fbef0 1 Pass CQ and VBR bug fixes
Issue 291 highlighted the fact that CQ mode was not working
as expected in 1-pass mode.

This commit fixes that specific problem but in so doing I also
uncovered an overflow issue in the VBR code for 1 pass and
some data values not being correctly initialized.

For some clips (particularly short clips), the resulting
improvement is dramatic.

Change-Id: Ieefd6c6e4776eb8f1b0550dbfdfb72f86b33c960
2011-03-11 10:59:34 -05:00
John Koleszar
e34e417d94 Merge "Fix incorrect macroblock counts in twopass rate control" 2011-03-11 06:06:04 -08:00
Yunqing Wang
3c9dd6c3ef Merge "Align SAD output array to be 16-byte aligned" 2011-03-11 05:56:02 -08:00
John Koleszar
c5c5dcd0be Merge "vp8cx - psnr converted to call assemblerized sse" 2011-03-11 05:54:00 -08:00
John Koleszar
29c46b64a2 Merge "vp8cx- alternate ssim function with optimizations" 2011-03-11 05:53:41 -08:00
Jim Bankoski
3dc382294b vp8cx - psnr converted to call assemblerized sse
Change-Id: Ie388d4618c44b131f96b9fe526618b457f020dfa
2011-03-11 08:51:22 -05:00
Jim Bankoski
3f6f7289aa vp8cx- alternate ssim function with optimizations
Change-Id: I91921b0a90dbaddc7010380b038955be347964b3
2011-03-11 08:51:21 -05:00
Yunqing Wang
b2aa401776 Align SAD output array to be 16-byte aligned
Use aligned store.

Change-Id: Icab4c0c53da811d0c52bb7e8134927f249ba2499
2011-03-11 08:24:23 -05:00
Yunqing Wang
76ec21928c Merge "Encoder loopfilter running in its own thread" 2011-03-11 04:55:05 -08:00
Attila Nagy
9c836daf65 Fix "used uninitialized" warning in vp8_pack_bitstream()
Change-Id: Iadcbdba717439f47a2c24e65fd69a3a1464174b5
2011-03-11 12:36:28 +02:00
Attila Nagy
3ae2465788 Encoder loopfilter running in its own thread
In multithreaded mode the loopfilter runs in its own thread (filter level
calculation and frame filtering). Filtering is mostly done in parallel with the
bitstream packing. Before starting the packing the loopfilter level has
to be calculated. Also any needed reference frame copying is done in the
filter thread.

Currently the encoder will create n+1 threads, where n > 1 is the number of
threads specified by the application and 1 is the extra filter thread. With n = 1
the encoder runs in single thread mode. There will never be more than n threads
running concurrently.

Change-Id: I4fb29b559a40275d6d3babb8727245c40fba931b
2011-03-11 10:52:51 +02:00
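
Schematically, the handoff works as the message describes: the filter thread first computes the loopfilter level (which the packer must wait for), then filters the frame while bitstream packing proceeds in parallel. A semaphore sketch under those assumptions (all names hypothetical):

    #include <semaphore.h>

    static sem_t sem_start_lpf, sem_lpf_level_done;

    static void *loopfilter_thread(void *arg)
    {
        for (;;) {
            sem_wait(&sem_start_lpf);      /* a frame is ready to filter */
            /* pick_filter_level(arg); */  /* packer needs this value... */
            sem_post(&sem_lpf_level_done); /* ...so publish it first */
            /* filter_and_copy_refs(arg); */ /* overlaps packing */
        }
        return arg;
    }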
Tero Rintaluoma
7ab08e1fee ARMv6 optimized quantization
Adds new ARMv6 optimized function vp8_fast_quantize_b_armv6
to the encoder.

Change-Id: I40277ec8f82e8a6cbc453cf295a0cc9b2504b21e
2011-03-11 10:48:42 +02:00
Johann
128d2c23b3 obj_int_extract for Visual Studio
Enable extraction of assembly offsets from compiled examples in MSVS.
This will allow us to remove some stub functions from x86 assembly since
we will be able to reliably determine structure offsets at compile time.

see ARM code for examples:
vp8/encoder/arm/armv5te/
vpx_scale/arm/neon/

Change-Id: I1852dc6b56ede0bf1dddb5552196222a7c6a902f
2011-03-10 18:49:54 -05:00
Adrian Grange
6daacdb785 Added missing format specifier in print statement
The printout of firstpass stats for a frame had one fewer
format specifier than arguments.

Change-Id: I5a42c85aa79c471e1a70afd75e24a91546b7a1cd
2011-03-10 12:43:49 -08:00
Adrian Grange
ed40ff9e2d Removed firstpass motion map
The firstpass motion map consists of an 8-bit flag for
each MB indicating how strongly the firstpass code
believes it should be filtered during the second pass
ARNR filtering.

For long or large format material the motion map can
become extremely large and hamper the operation of
the encoding process.

This change removes the motion map altogether, leaving
the second pass to rely on the magnitude of the motion
compensated error to determine the filter weight to
use for the MB during ARNR filtering.

Tests on the derf set indicate that the effect of this
change is neutral, with some small wins and losses. The
motion map has therefore been removed based on
a cost/benefit evaluation.

Change-Id: I53e07d236f5ce09a6f0c54e7c4ffbb490fb870f6
2011-03-10 11:32:48 -08:00
James Berry
f3e9e2a0f8 Fix incorrect macroblock counts in twopass rate control
The previous calculation of macroblock count (w*h)/256
is not correct when the width/height are not multiples of
16. Use the precalculated macroblock count from
cpi->common instead. This manifested itself as a divide
by zero when the number of pixels was less than 256.
num_mbs is updated in estimate_max_q, estimate_q,
estimate_kf_group_q, and estimate_cq.

Change-Id: I92ff98587864c801b1ee5485cfead964673a9973
2011-03-10 13:33:06 -05:00
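
For intuition: (w*h)/256 truncates, while the precalculated count rounds each dimension up to whole macroblocks. A sketch of the rounding (mb_count is a hypothetical helper; vp8 actually stores mb_rows/mb_cols in cpi->common):

    /* Count whole 16x16 macroblocks, rounding each dimension up. */
    static int mb_count(int width, int height)
    {
        return ((width + 15) / 16) * ((height + 15) / 16);
    }
    /* e.g. a 16x8 frame: (16*8)/256 == 0, which later divides by zero,
     * but mb_count(16, 8) == 1. */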
Yunqing Wang
a0306ea660 Merge "Add vp8_sub_pixel_variance16x8_ssse3 function" 2011-03-09 12:26:37 -08:00
John Koleszar
c5a049babd Merge branch 'bali'
Change-Id: Icf18b4981afb12ef255fca431d4ba45860dd22c9
2011-03-09 14:11:54 -05:00
John Koleszar
5c24071504 Add missing filter.h to build system
Missing file causes 'make dist' to not include a complete copy of the
source.

Change-Id: I3f55aeb5a86d0e81234e4e4588cb8086ba4cfc4a
2011-03-09 13:43:31 -05:00
Johann
43baf7ff21 Merge "fix obj_int_extract for MinGW" 2011-03-09 09:44:49 -08:00
Yunqing Wang
7b8e7f0f3a Add vp8_sub_pixel_variance16x8_ssse3 function
Added SSSE3 function

Change-Id: I8c304c92458618d93fda3a2f62bd09ccb63e75ad
2011-03-09 12:33:21 -05:00
Yunqing Wang
4561109a69 Remove unused functions
Removed some unused functions

Change-Id: Ifdfc27453e53cfc75997b38492901d193a16b245
2011-03-09 10:45:03 -05:00
Yunqing Wang
7966dd5287 Merge "Improve SSE2 half-pixel filter funtions" 2011-03-09 07:23:06 -08:00
John Koleszar
fa836faede Merge "Configuration updates:Making a clear distinction between Init and Change" 2011-03-09 05:07:11 -08:00
Ralph Giles
56efffdcd1 Fix an unused variable warning.
Move the update of the loopfilter info to the same block where it
is used. GCC 4.5 is not able trace the initialization of the local
filter_info across the other calls between the two conditionals on
pbi->common and issues an uninitialized variable warning.

Change-Id: Ie4487b3714a096b3fb21608f6b0c74e745e3c6fc
2011-03-08 14:56:15 -08:00
Johann
fb037ec05b fix obj_int_extract for MinGW
failed to find headers in the source directory

output to stdout instead of a hardcoded file

MinGW doesn't support _sopen_s

_fstat catches non-existent files

Change-Id: I24e0aacc6f6f26e6bcfc25f9ee7821aa3c8cc7e7
2011-03-08 17:44:52 -05:00
Yunqing Wang
419f638910 Improve SSE2 half-pixel filter functions
Rewrote these functions to process 16 pixels at a time instead of 8.

Change-Id: Ic67e80124467a446a3df4cfecfb76a4248602adb
2011-03-08 16:25:06 -05:00
Johann
95adf3df77 Merge "64bit mach-o support" 2011-03-08 12:29:32 -08:00
Yunqing Wang
859abd6b5d Merge "Add zero offset checking in SSE2 sub-pixel filter function" 2011-03-08 12:26:58 -08:00
Yunqing Wang
8432a1729f Add zero offset checking in SSE2 sub-pixel filter function
Skip filter at zero offset.

Change-Id: I95fc7e211869bc0ab5bcfb7ab2e3259d1c0ccf38
2011-03-08 15:22:07 -05:00
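
When both offsets are zero the interpolation degenerates to a plain copy, so both filter passes can be skipped. A sketch of the guard (copy_block and subpel_filter are hypothetical stand-ins for the real kernels):

    static void predict(const unsigned char *src, int src_stride,
                        unsigned char *dst, int dst_stride,
                        int w, int h, int xoffset, int yoffset)
    {
        if (xoffset == 0 && yoffset == 0)       /* no subpel motion */
            copy_block(src, src_stride, dst, dst_stride, w, h);
        else
            subpel_filter(src, src_stride, dst, dst_stride, w, h,
                          xoffset, yoffset);
    }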
Yunqing Wang
e8f7b0f7f5 Merge "Write SSSE3 sub-pixel filter function" 2011-03-08 10:58:30 -08:00
Yunqing Wang
244e2e1451 Write SSSE3 sub-pixel filter function
1. Process 16 pixels at one time instead of 8.
2. Add a check for both xoffset=0 and yoffset=0, which happens
   during motion search.
This change gave encoder 1%~3% performance gain.

Change-Id: Idaa39506b48f4f8b2fbbeb45aae8226fa32afb3e
2011-03-08 13:29:01 -05:00
Johann
5091e01ea1 64bit mach-o support
enable parsing 64bit mach-o files (OS X)

also fixes --enable-debug issue!

Change-Id: I250ee69745cd2365e3e63264f9365cd58fbb6678
2011-03-08 11:42:30 -05:00
Johann
ddd260eb62 64bit elf support
enable parsing 64bit elf files

Change-Id: I7981f4769cf1b822f288fe2e32166254e4394bab
2011-03-08 10:49:35 -05:00
Ralph Giles
e6948bf0f9 Fix a multi-line format-string warning.
GCC 4.5 and 4.6 both issue a warning about the multi-line format
string introduced in bc9c30a0, which also changed the whitespace
in the associated stt file by line-wrapping the long format string.

Instead, use multiple string constants, which the compiler will
concatenate. This maintains the original formatting, but remains
legible within the standard line length.

Change-Id: I27c9f92d46be82d408105a3a4091f145f677e00e
2011-03-08 07:14:12 -08:00
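
String-literal concatenation is standard C: adjacent constants are merged at translation time, so the format remains a single string while each source line stays short. For instance:

    #include <stdio.h>

    int main(void)
    {
        /* Adjacent literals are concatenated into one format string. */
        printf("%-12s"
               " %5d\n", "frames", 300);
        return 0;
    }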
Paul Wilkins
de87c420ef Corrected minor typos.
Change-Id: Icc9f12bd1e1bdaf51256dc8a90d08aa9be89ef34
2011-03-08 14:46:22 +00:00
Paul Wilkins
0eccee4378 Merge changes I00c3e823,If8bca004
* changes:
  Improved key frame detection.
  Improved KF insertion after fades to still.
2011-03-08 06:40:11 -08:00
John Koleszar
5d1d9911cb correct zbin boost for splitmv mode
Disable zbin boost in SPLITMV mode as intended. Was incorrectly looking
at vp8_ref_frame_order instead of vp8_mode_order when comparing against
SPLITMV. This condition should have always been false, as SPLITMV is
not in the range of valid reference frames.

Change-Id: I0408cc7595eff68f00efef6d008e79f5b60d14bf
2011-03-07 20:58:37 -05:00
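
The bug class is indexing two parallel tables with the same index but testing the wrong one; since vp8_ref_frame_order holds reference frames, comparing its entries against a prediction mode like SPLITMV can never match. A self-contained schematic (enum values hypothetical, table names from the message):

    enum { ZEROMV, NEARESTMV, SPLITMV };            /* prediction modes */
    enum { INTRA_FRAME, LAST_FRAME, GOLDEN_FRAME }; /* reference frames */

    static const int vp8_mode_order[]      = { ZEROMV, NEARESTMV, SPLITMV };
    static const int vp8_ref_frame_order[] = { LAST_FRAME, LAST_FRAME,
                                               GOLDEN_FRAME };

    static int zbin_boost_enabled(int mode_index)
    {
        /* was: vp8_ref_frame_order[mode_index] == SPLITMV -- never true */
        return vp8_mode_order[mode_index] != SPLITMV;
    }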
John Koleszar
1016b856d1 Merge "Fix format-string warning" 2011-03-07 13:25:28 -08:00
Ralph Giles
fe9a604b1e Fix format-string warning
Cast size_t to (unsigned long) and print it with the %lu format
string, which is more portable than C99's explicit %zu for size_t.

This truncates on Windows x64 but otherwise works on 32 and 64 bit
platforms. In practice the stats file is unlikely to be so large.

Change-Id: I0432b3acf85fc6ba4ad50640942e1ca4614b21cb
2011-03-07 13:21:39 -08:00
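
The portable idiom the message describes, in miniature:

    #include <stdio.h>

    int main(void)
    {
        size_t stats_sz = 123456;
        /* %lu plus a cast works on pre-C99 compilers that lack %zu;
         * it truncates above 4 GiB on LLP64 Windows, which is an
         * acceptable trade-off for a stats file size. */
        printf("stats file is %lu bytes\n", (unsigned long)stats_sz);
        return 0;
    }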
Paul Wilkins
bc9c30a003 Improved key frame detection.
In some cases where clips have been encoded with
borders (e.g. some wide-screen content with a border top
and bottom, or slide shows containing portrait-format
photographs with borders left and right), key frames were
not being correctly detected.

The new code looks to measure cases where a portion of
the image can be coded equally easily using intra or inter
modes and where the resulting error score is also very low.
These "neutral" areas are then discounted in the key frame
detection code.

Change-Id: I00c3e8230772b8213cdc08020e1990cf83b780d8
2011-03-07 15:58:07 +00:00
Paul Wilkins
9fc8cb39aa Improved KF insertion after fades to still.
This code extends what was previously done for GFs, to pick
cases where inserting a key frame after a fade (or other
transition or complex motion) followed by a still section will
be beneficial and will reduce the number of forced key frames.

Change-Id: If8bca00457f0d5f83dc3318a587f61c17d90f135
2011-03-07 15:11:09 +00:00
Mikhal Shemer
84f7f20985 Configuration updates: making a clear distinction between Init and Change
Change-Id: I7b2fb326e1aabc08b032177a7b914a5b8bb7376f
2011-03-03 10:35:09 -08:00
88 changed files with 3881 additions and 2537 deletions

View File

@@ -152,8 +152,8 @@ endif
# Rule to extract assembly constants from C sources
#
obj_int_extract: build/make/obj_int_extract.c
$(if $(quiet),echo " [HOSTCC] $@")
$(qexec)$(HOSTCC) -I. -o $@ $<
$(if $(quiet),@echo " [HOSTCC] $@")
$(qexec)$(HOSTCC) -I. -I$(SRC_PATH_BARE) -o $@ $<
CLEAN-OBJS += obj_int_extract
#
@@ -255,7 +255,7 @@ ifeq ($(filter clean,$(MAKECMDGOALS)),)
endif
#
# Configuration dependant rules
# Configuration dependent rules
#
$(call pairmap,install_map_templates,$(INSTALL_MAPS))
@@ -332,7 +332,7 @@ ifneq ($(call enabled,DIST-SRCS),)
DIST-SRCS-$(CONFIG_MSVS) += build/x86-msvs/yasm.rules
DIST-SRCS-$(CONFIG_RVCT) += build/make/armlink_adapter.sh
#
# This isn't really ARCH_ARM dependent, it's dependant on whether we're
# This isn't really ARCH_ARM dependent, it's dependent on whether we're
# using assembly code or not (CONFIG_OPTIMIZATIONS maybe). Just use
# this for now.
DIST-SRCS-$(ARCH_ARM) += build/make/obj_int_extract.c

View File

@@ -83,7 +83,7 @@ Build options:
${toggle_werror} treat warnings as errors, if possible
(not available with all compilers)
${toggle_optimizations} turn on/off compiler optimization flags
${toggle_pic} turn on/off Position Independant Code
${toggle_pic} turn on/off Position Independent Code
${toggle_ccache} turn on/off compiler cache
${toggle_debug} enable/disable debug mode
${toggle_gprof} enable/disable gprof profiling instrumentation
@@ -957,7 +957,7 @@ process_common_toolchain() {
enabled small && check_add_cflags -O2 || check_add_cflags -O3
fi
# Position Independant Code (PIC) support, for building relocatable
# Position Independent Code (PIC) support, for building relocatable
# shared objects
enabled gcc && enabled pic && check_add_cflags -fPIC

View File

@@ -33,6 +33,7 @@ Options:
--proj-guid=GUID GUID to use for the project
--module-def=filename File containing export definitions (for DLLs)
--ver=version Version (7,8,9) of visual studio to generate for
--src-path-bare=dir Path to root of source tree
-Ipath/to/include Additional include directories
-DFLAG[=value] Preprocessor macros to define
-Lpath/to/lib Additional library search paths
@@ -191,6 +192,8 @@ for opt in "$@"; do
;;
--lib) proj_kind="lib"
;;
--src-path-bare=*) src_path_bare="$optval"
;;
--static-crt) use_static_runtime=true
;;
--ver=*)
@@ -335,6 +338,35 @@ generate_vcproj() {
case "$target" in
x86*)
case "$name" in
obj_int_extract)
tag Tool \
Name="VCCLCompilerTool" \
Optimization="0" \
AdditionalIncludeDirectories="$incs" \
PreprocessorDefinitions="WIN32;DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE" \
RuntimeLibrary="$debug_runtime" \
WarningLevel="3" \
Detect64BitPortabilityProblems="true" \
DebugInformationFormat="1" \
;;
vpx)
tag Tool \
Name="VCPreBuildEventTool" \
CommandLine="call obj_int_extract.bat $src_path_bare" \
tag Tool \
Name="VCCLCompilerTool" \
Optimization="0" \
AdditionalIncludeDirectories="$incs" \
PreprocessorDefinitions="WIN32;_DEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;$defines" \
RuntimeLibrary="$debug_runtime" \
UsePrecompiledHeader="0" \
WarningLevel="3" \
DebugInformationFormat="1" \
Detect64BitPortabilityProblems="true" \
$uses_asm && tag Tool Name="YASM" IncludePaths="$incs" Debug="1"
;;
*)
tag Tool \
Name="VCCLCompilerTool" \
@@ -358,6 +390,12 @@ generate_vcproj() {
case "$target" in
x86*)
case "$name" in
obj_int_extract)
tag Tool \
Name="VCLinkerTool" \
OutputFile="${name}.exe" \
GenerateDebugInformation="true" \
;;
*)
tag Tool \
Name="VCLinkerTool" \
@@ -406,6 +444,34 @@ generate_vcproj() {
case "$target" in
x86*)
case "$name" in
obj_int_extract)
tag Tool \
Name="VCCLCompilerTool" \
AdditionalIncludeDirectories="$incs" \
PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE" \
RuntimeLibrary="$release_runtime" \
UsePrecompiledHeader="0" \
WarningLevel="3" \
Detect64BitPortabilityProblems="true" \
DebugInformationFormat="0" \
;;
vpx)
tag Tool \
Name="VCPreBuildEventTool" \
CommandLine="call obj_int_extract.bat $src_path_bare" \
tag Tool \
Name="VCCLCompilerTool" \
AdditionalIncludeDirectories="$incs" \
PreprocessorDefinitions="WIN32;NDEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;$defines" \
RuntimeLibrary="$release_runtime" \
UsePrecompiledHeader="0" \
WarningLevel="3" \
DebugInformationFormat="0" \
Detect64BitPortabilityProblems="true" \
$uses_asm && tag Tool Name="YASM" IncludePaths="$incs"
;;
*)
tag Tool \
Name="VCCLCompilerTool" \
@@ -428,6 +494,12 @@ generate_vcproj() {
case "$target" in
x86*)
case "$name" in
obj_int_extract)
tag Tool \
Name="VCLinkerTool" \
OutputFile="${name}.exe" \
GenerateDebugInformation="true" \
;;
*)
tag Tool \
Name="VCLinkerTool" \

View File

@@ -14,7 +14,7 @@
#include "vpx_config.h"
#if defined(_MSC_VER)
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <io.h>
#include <share.h>
#include "vpx/vpx_integer.h"
@@ -59,20 +59,47 @@ int parse_macho(uint8_t *base_buf, size_t sz)
struct mach_header header;
uint8_t *buf = base_buf;
int base_data_section = 0;
int bits = 0;
/* We can read in mach_header for 32 and 64 bit architectures
* because it's identical to mach_header_64 except for the last
* element (uint32_t reserved), which we don't use. Then, when
* we know which architecture we're looking at, increment buf
* appropriately.
*/
memcpy(&header, buf, sizeof(struct mach_header));
buf += sizeof(struct mach_header);
if (header.magic != MH_MAGIC)
if (header.magic == MH_MAGIC)
{
log_msg("Bad magic number for object file. 0x%x expected, 0x%x found.\n",
header.magic, MH_MAGIC);
goto bail;
if (header.cputype == CPU_TYPE_ARM
|| header.cputype == CPU_TYPE_X86)
{
bits = 32;
buf += sizeof(struct mach_header);
}
else
{
log_msg("Bad cputype for object file. Currently only tested for CPU_TYPE_[ARM|X86].\n");
goto bail;
}
}
if (header.cputype != CPU_TYPE_ARM)
else if (header.magic == MH_MAGIC_64)
{
log_msg("Bad cputype for object file. Currently only tested for CPU_TYPE_ARM.\n");
if (header.cputype == CPU_TYPE_X86_64)
{
bits = 64;
buf += sizeof(struct mach_header_64);
}
else
{
log_msg("Bad cputype for object file. Currently only tested for CPU_TYPE_X86_64.\n");
goto bail;
}
}
else
{
log_msg("Bad magic number for object file. 0x%x or 0x%x expected, 0x%x found.\n",
MH_MAGIC, MH_MAGIC_64, header.magic);
goto bail;
}
@@ -85,8 +112,6 @@ int parse_macho(uint8_t *base_buf, size_t sz)
for (i = 0; i < header.ncmds; i++)
{
struct load_command lc;
struct symtab_command sc;
struct segment_command seg_c;
memcpy(&lc, buf, sizeof(struct load_command));
@@ -94,50 +119,99 @@ int parse_macho(uint8_t *base_buf, size_t sz)
{
uint8_t *seg_buf = buf;
struct section s;
struct segment_command seg_c;
memcpy(&seg_c, buf, sizeof(struct segment_command));
memcpy(&seg_c, seg_buf, sizeof(struct segment_command));
seg_buf += sizeof(struct segment_command);
for (j = 0; j < seg_c.nsects; j++)
/* Although each section is given it's own offset, nlist.n_value
* references the offset of the first section. This isn't
* apparent without debug information because the offset of the
* data section is the same as the first section. However, with
* debug sections mixed in, the offset of the debug section
* increases but n_value still references the first section.
*/
if (seg_c.nsects < 1)
{
memcpy(&s, seg_buf + (j * sizeof(struct section)), sizeof(struct section));
// Need to get this offset which is the start of the symbol table
// before matching the strings up with symbols.
base_data_section = s.offset;
log_msg("Not enough sections\n");
goto bail;
}
memcpy(&s, seg_buf, sizeof(struct section));
base_data_section = s.offset;
}
else if (lc.cmd == LC_SEGMENT_64)
{
uint8_t *seg_buf = buf;
struct section_64 s;
struct segment_command_64 seg_c;
memcpy(&seg_c, seg_buf, sizeof(struct segment_command_64));
seg_buf += sizeof(struct segment_command_64);
/* Explanation in LG_SEGMENT */
if (seg_c.nsects < 1)
{
log_msg("Not enough sections\n");
goto bail;
}
memcpy(&s, seg_buf, sizeof(struct section_64));
base_data_section = s.offset;
}
else if (lc.cmd == LC_SYMTAB)
{
uint8_t *sym_buf = base_buf;
uint8_t *str_buf = base_buf;
if (base_data_section != 0)
{
struct symtab_command sc;
uint8_t *sym_buf = base_buf;
uint8_t *str_buf = base_buf;
memcpy(&sc, buf, sizeof(struct symtab_command));
if (sc.cmdsize != sizeof(struct symtab_command))
{
log_msg("Can't find symbol table!\n");
goto bail;
}
sym_buf += sc.symoff;
str_buf += sc.stroff;
for (j = 0; j < sc.nsyms; j++)
{
struct nlist nl;
int val;
/* Location of string is cacluated each time from the
* start of the string buffer. On darwin the symbols
* are prefixed by "_", so we bump the pointer by 1.
* The target value is defined as an int in asm_*_offsets.c,
* which is 4 bytes on all targets we currently use.
*/
if (bits == 32)
{
struct nlist nl;
int val;
memcpy(&nl, sym_buf + (j * sizeof(struct nlist)), sizeof(struct nlist));
memcpy(&nl, sym_buf, sizeof(struct nlist));
sym_buf += sizeof(struct nlist);
val = *((int *)(base_buf + base_data_section + nl.n_value));
memcpy(&val, base_buf + base_data_section + nl.n_value,
sizeof(val));
printf("%-40s EQU %5d\n",
str_buf + nl.n_un.n_strx + 1, val);
}
else /* if (bits == 64) */
{
struct nlist_64 nl;
int val;
// Location of string is cacluated each time from the
// start of the string buffer. On darwin the symbols
// are prefixed by "_". On other platforms it is not
// so it needs to be removed. That is the reason for
// the +1.
printf("%-40s EQU %5d\n", str_buf + nl.n_un.n_strx + 1, val);
memcpy(&nl, sym_buf, sizeof(struct nlist_64));
sym_buf += sizeof(struct nlist_64);
memcpy(&val, base_buf + base_data_section + nl.n_value,
sizeof(val));
printf("%-40s EQU %5d\n",
str_buf + nl.n_un.n_strx + 1, val);
}
}
}
}
@@ -218,7 +292,7 @@ bail:
return EXIT_FAILURE;
}
#else
#elif defined(__ELF__)
#include "elf.h"
#define COPY_STRUCT(dst, buf, ofst, sz) do {\
@@ -237,212 +311,420 @@ bail:
typedef struct
{
uint8_t *buf; /* Buffer containing ELF data */
size_t sz; /* Buffer size */
int le_data; /* Data is little-endian */
Elf32_Ehdr hdr;
uint8_t *buf; /* Buffer containing ELF data */
size_t sz; /* Buffer size */
int le_data; /* Data is little-endian */
unsigned char e_ident[EI_NIDENT]; /* Magic number and other info */
int bits; /* 32 or 64 */
Elf32_Ehdr hdr32;
Elf64_Ehdr hdr64;
} elf_obj_t;
int parse_elf32_header(elf_obj_t *elf)
int parse_elf_header(elf_obj_t *elf)
{
int res;
/* Verify ELF32 header */
COPY_STRUCT(&elf->hdr, elf->buf, 0, elf->sz);
res = elf->hdr.e_ident[EI_MAG0] == ELFMAG0;
res &= elf->hdr.e_ident[EI_MAG1] == ELFMAG1;
res &= elf->hdr.e_ident[EI_MAG2] == ELFMAG2;
res &= elf->hdr.e_ident[EI_MAG3] == ELFMAG3;
res &= elf->hdr.e_ident[EI_CLASS] == ELFCLASS32;
res &= elf->hdr.e_ident[EI_DATA] == ELFDATA2LSB
|| elf->hdr.e_ident[EI_DATA] == ELFDATA2MSB;
/* Verify ELF Magic numbers */
COPY_STRUCT(&elf->e_ident, elf->buf, 0, elf->sz);
res = elf->e_ident[EI_MAG0] == ELFMAG0;
res &= elf->e_ident[EI_MAG1] == ELFMAG1;
res &= elf->e_ident[EI_MAG2] == ELFMAG2;
res &= elf->e_ident[EI_MAG3] == ELFMAG3;
res &= elf->e_ident[EI_CLASS] == ELFCLASS32
|| elf->e_ident[EI_CLASS] == ELFCLASS64;
res &= elf->e_ident[EI_DATA] == ELFDATA2LSB;
if (!res) goto bail;
elf->le_data = elf->hdr.e_ident[EI_DATA] == ELFDATA2LSB;
elf->le_data = elf->e_ident[EI_DATA] == ELFDATA2LSB;
ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_type);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_machine);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_version);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_entry);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_phoff);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_shoff);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_flags);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_ehsize);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_phentsize);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_phnum);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_shentsize);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_shnum);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_shstrndx);
return 0;
bail:
return 1;
}
int parse_elf32_section(elf_obj_t *elf, int idx, Elf32_Shdr *hdr)
{
if (idx >= elf->hdr.e_shnum)
goto bail;
COPY_STRUCT(hdr, elf->buf, elf->hdr.e_shoff + idx * elf->hdr.e_shentsize,
elf->sz);
ENDIAN_ASSIGN_IN_PLACE(hdr->sh_name);
ENDIAN_ASSIGN_IN_PLACE(hdr->sh_type);
ENDIAN_ASSIGN_IN_PLACE(hdr->sh_flags);
ENDIAN_ASSIGN_IN_PLACE(hdr->sh_addr);
ENDIAN_ASSIGN_IN_PLACE(hdr->sh_offset);
ENDIAN_ASSIGN_IN_PLACE(hdr->sh_size);
ENDIAN_ASSIGN_IN_PLACE(hdr->sh_link);
ENDIAN_ASSIGN_IN_PLACE(hdr->sh_info);
ENDIAN_ASSIGN_IN_PLACE(hdr->sh_addralign);
ENDIAN_ASSIGN_IN_PLACE(hdr->sh_entsize);
return 0;
bail:
return 1;
}
char *parse_elf32_string_table(elf_obj_t *elf, int s_idx, int idx)
{
Elf32_Shdr shdr;
if (parse_elf32_section(elf, s_idx, &shdr))
/* Read in relevant values */
if (elf->e_ident[EI_CLASS] == ELFCLASS32)
{
log_msg("Failed to parse ELF string table: section %d, index %d\n",
s_idx, idx);
return "";
elf->bits = 32;
COPY_STRUCT(&elf->hdr32, elf->buf, 0, elf->sz);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_type);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_machine);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_version);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_entry);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_phoff);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shoff);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_flags);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_ehsize);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_phentsize);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_phnum);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shentsize);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shnum);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shstrndx);
}
else /* if (elf->e_ident[EI_CLASS] == ELFCLASS64) */
{
elf->bits = 64;
COPY_STRUCT(&elf->hdr64, elf->buf, 0, elf->sz);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_type);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_machine);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_version);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_entry);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_phoff);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shoff);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_flags);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_ehsize);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_phentsize);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_phnum);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shentsize);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shnum);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shstrndx);
}
return (char *)(elf->buf + shdr.sh_offset + idx);
return 0;
bail:
log_msg("Failed to parse ELF file header");
return 1;
}
int parse_elf32_symbol(elf_obj_t *elf, unsigned int ofst, Elf32_Sym *sym)
int parse_elf_section(elf_obj_t *elf, int idx, Elf32_Shdr *hdr32, Elf64_Shdr *hdr64)
{
COPY_STRUCT(sym, elf->buf, ofst, elf->sz);
ENDIAN_ASSIGN_IN_PLACE(sym->st_name);
ENDIAN_ASSIGN_IN_PLACE(sym->st_value);
ENDIAN_ASSIGN_IN_PLACE(sym->st_size);
ENDIAN_ASSIGN_IN_PLACE(sym->st_info);
ENDIAN_ASSIGN_IN_PLACE(sym->st_other);
ENDIAN_ASSIGN_IN_PLACE(sym->st_shndx);
if (hdr32)
{
if (idx >= elf->hdr32.e_shnum)
goto bail;
COPY_STRUCT(hdr32, elf->buf, elf->hdr32.e_shoff + idx * elf->hdr32.e_shentsize,
elf->sz);
ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_name);
ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_type);
ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_flags);
ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_addr);
ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_offset);
ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_size);
ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_link);
ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_info);
ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_addralign);
ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_entsize);
}
else /* if (hdr64) */
{
if (idx >= elf->hdr64.e_shnum)
goto bail;
COPY_STRUCT(hdr64, elf->buf, elf->hdr64.e_shoff + idx * elf->hdr64.e_shentsize,
elf->sz);
ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_name);
ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_type);
ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_flags);
ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_addr);
ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_offset);
ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_size);
ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_link);
ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_info);
ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_addralign);
ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_entsize);
}
return 0;
bail:
return 1;
}
int parse_elf32(uint8_t *buf, size_t sz, output_fmt_t mode)
char *parse_elf_string_table(elf_obj_t *elf, int s_idx, int idx)
{
elf_obj_t elf;
Elf32_Shdr shdr;
if (elf->bits == 32)
{
Elf32_Shdr shdr;
if (parse_elf_section(elf, s_idx, &shdr, NULL))
{
log_msg("Failed to parse ELF string table: section %d, index %d\n",
s_idx, idx);
return "";
}
return (char *)(elf->buf + shdr.sh_offset + idx);
}
else /* if (elf->bits == 64) */
{
Elf64_Shdr shdr;
if (parse_elf_section(elf, s_idx, NULL, &shdr))
{
log_msg("Failed to parse ELF string table: section %d, index %d\n",
s_idx, idx);
return "";
}
return (char *)(elf->buf + shdr.sh_offset + idx);
}
}
int parse_elf_symbol(elf_obj_t *elf, unsigned int ofst, Elf32_Sym *sym32, Elf64_Sym *sym64)
{
if (sym32)
{
COPY_STRUCT(sym32, elf->buf, ofst, elf->sz);
ENDIAN_ASSIGN_IN_PLACE(sym32->st_name);
ENDIAN_ASSIGN_IN_PLACE(sym32->st_value);
ENDIAN_ASSIGN_IN_PLACE(sym32->st_size);
ENDIAN_ASSIGN_IN_PLACE(sym32->st_info);
ENDIAN_ASSIGN_IN_PLACE(sym32->st_other);
ENDIAN_ASSIGN_IN_PLACE(sym32->st_shndx);
}
else /* if (sym64) */
{
COPY_STRUCT(sym64, elf->buf, ofst, elf->sz);
ENDIAN_ASSIGN_IN_PLACE(sym64->st_name);
ENDIAN_ASSIGN_IN_PLACE(sym64->st_value);
ENDIAN_ASSIGN_IN_PLACE(sym64->st_size);
ENDIAN_ASSIGN_IN_PLACE(sym64->st_info);
ENDIAN_ASSIGN_IN_PLACE(sym64->st_other);
ENDIAN_ASSIGN_IN_PLACE(sym64->st_shndx);
}
return 0;
bail:
return 1;
}
int parse_elf(uint8_t *buf, size_t sz, output_fmt_t mode)
{
elf_obj_t elf;
unsigned int ofst;
int i;
Elf32_Off strtab_off; /* save String Table offset for later use */
int i;
Elf32_Off strtab_off32;
Elf64_Off strtab_off64; /* save String Table offset for later use */
memset(&elf, 0, sizeof(elf));
elf.buf = buf;
elf.sz = sz;
/* Parse Header */
if (parse_elf32_header(&elf))
{
log_msg("Parse error: File does not appear to be valid ELF32\n");
return 1;
}
if (parse_elf_header(&elf))
goto bail;
for (i = 0; i < elf.hdr.e_shnum; i++)
if (elf.bits == 32)
{
parse_elf32_section(&elf, i, &shdr);
if (shdr.sh_type == SHT_STRTAB)
Elf32_Shdr shdr;
for (i = 0; i < elf.hdr32.e_shnum; i++)
{
char strtsb_name[128];
parse_elf_section(&elf, i, &shdr, NULL);
strcpy(strtsb_name, (char *)(elf.buf + shdr.sh_offset + shdr.sh_name));
if (!(strcmp(strtsb_name, ".shstrtab")))
if (shdr.sh_type == SHT_STRTAB)
{
log_msg("found section: %s\n", strtsb_name);
strtab_off = shdr.sh_offset;
break;
char strtsb_name[128];
strcpy(strtsb_name, (char *)(elf.buf + shdr.sh_offset + shdr.sh_name));
if (!(strcmp(strtsb_name, ".shstrtab")))
{
/* log_msg("found section: %s\n", strtsb_name); */
strtab_off32 = shdr.sh_offset;
break;
}
}
}
}
else /* if (elf.bits == 64) */
{
Elf64_Shdr shdr;
for (i = 0; i < elf.hdr64.e_shnum; i++)
{
parse_elf_section(&elf, i, NULL, &shdr);
if (shdr.sh_type == SHT_STRTAB)
{
char strtsb_name[128];
strcpy(strtsb_name, (char *)(elf.buf + shdr.sh_offset + shdr.sh_name));
if (!(strcmp(strtsb_name, ".shstrtab")))
{
/* log_msg("found section: %s\n", strtsb_name); */
strtab_off64 = shdr.sh_offset;
break;
}
}
}
}
/* Parse all Symbol Tables */
for (i = 0; i < elf.hdr.e_shnum; i++)
if (elf.bits == 32)
{
parse_elf32_section(&elf, i, &shdr);
if (shdr.sh_type == SHT_SYMTAB)
Elf32_Shdr shdr;
for (i = 0; i < elf.hdr32.e_shnum; i++)
{
for (ofst = shdr.sh_offset;
ofst < shdr.sh_offset + shdr.sh_size;
ofst += shdr.sh_entsize)
parse_elf_section(&elf, i, &shdr, NULL);
if (shdr.sh_type == SHT_SYMTAB)
{
Elf32_Sym sym;
parse_elf32_symbol(&elf, ofst, &sym);
/* For all OBJECTS (data objects), extract the value from the
* proper data segment.
*/
if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT && sym.st_name)
log_msg("found data object %s\n",
parse_elf32_string_table(&elf,
shdr.sh_link,
sym.st_name));
if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT
&& sym.st_size == 4)
for (ofst = shdr.sh_offset;
ofst < shdr.sh_offset + shdr.sh_size;
ofst += shdr.sh_entsize)
{
Elf32_Shdr dhdr;
int32_t val;
char section_name[128];
Elf32_Sym sym;
parse_elf32_section(&elf, sym.st_shndx, &dhdr);
parse_elf_symbol(&elf, ofst, &sym, NULL);
/* For explanition - refer to _MSC_VER version of code */
strcpy(section_name, (char *)(elf.buf + strtab_off + dhdr.sh_name));
log_msg("Section_name: %s, Section_type: %d\n", section_name, dhdr.sh_type);
/* For all OBJECTS (data objects), extract the value from the
* proper data segment.
*/
/* if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT && sym.st_name)
log_msg("found data object %s\n",
parse_elf_string_table(&elf,
shdr.sh_link,
sym.st_name));
*/
if (!(strcmp(section_name, ".bss")))
if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT
&& sym.st_size == 4)
{
val = 0;
}
else
{
memcpy(&val,
elf.buf + dhdr.sh_offset + sym.st_value,
sizeof(val));
Elf32_Shdr dhdr;
int val = 0;
char section_name[128];
parse_elf_section(&elf, sym.st_shndx, &dhdr, NULL);
/* For explanition - refer to _MSC_VER version of code */
strcpy(section_name, (char *)(elf.buf + strtab_off32 + dhdr.sh_name));
/* log_msg("Section_name: %s, Section_type: %d\n", section_name, dhdr.sh_type); */
if (strcmp(section_name, ".bss"))
{
if (sizeof(val) != sym.st_size)
{
/* The target value is declared as an int in
* asm_*_offsets.c, which is 4 bytes on all
* targets we currently use. Complain loudly if
* this is not true.
*/
log_msg("Symbol size is wrong\n");
goto bail;
}
memcpy(&val,
elf.buf + dhdr.sh_offset + sym.st_value,
sym.st_size);
}
if (!elf.le_data)
{
log_msg("Big Endian data not supported yet!\n");
goto bail;
}
switch (mode)
{
case OUTPUT_FMT_RVDS:
printf("%-40s EQU %5d\n",
parse_elf_string_table(&elf,
shdr.sh_link,
sym.st_name),
val);
break;
case OUTPUT_FMT_GAS:
printf(".equ %-40s, %5d\n",
parse_elf_string_table(&elf,
shdr.sh_link,
sym.st_name),
val);
break;
default:
printf("%s = %d\n",
parse_elf_string_table(&elf,
shdr.sh_link,
sym.st_name),
val);
}
}
}
}
}
}
else /* if (elf.bits == 64) */
{
Elf64_Shdr shdr;
for (i = 0; i < elf.hdr64.e_shnum; i++)
{
parse_elf_section(&elf, i, NULL, &shdr);
if (!elf.le_data)
{
log_msg("Big Endian data not supported yet!\n");
goto bail;
}\
if (shdr.sh_type == SHT_SYMTAB)
{
for (ofst = shdr.sh_offset;
ofst < shdr.sh_offset + shdr.sh_size;
ofst += shdr.sh_entsize)
{
Elf64_Sym sym;
switch (mode)
parse_elf_symbol(&elf, ofst, NULL, &sym);
/* For all OBJECTS (data objects), extract the value from the
* proper data segment.
*/
/* if (ELF64_ST_TYPE(sym.st_info) == STT_OBJECT && sym.st_name)
log_msg("found data object %s\n",
parse_elf_string_table(&elf,
shdr.sh_link,
sym.st_name));
*/
if (ELF64_ST_TYPE(sym.st_info) == STT_OBJECT
&& sym.st_size == 4)
{
case OUTPUT_FMT_RVDS:
printf("%-40s EQU %5d\n",
parse_elf32_string_table(&elf,
shdr.sh_link,
sym.st_name),
val);
break;
case OUTPUT_FMT_GAS:
printf(".equ %-40s, %5d\n",
parse_elf32_string_table(&elf,
shdr.sh_link,
sym.st_name),
val);
break;
default:
printf("%s = %d\n",
parse_elf32_string_table(&elf,
shdr.sh_link,
sym.st_name),
val);
Elf64_Shdr dhdr;
int val = 0;
char section_name[128];
parse_elf_section(&elf, sym.st_shndx, NULL, &dhdr);
/* For explanition - refer to _MSC_VER version of code */
strcpy(section_name, (char *)(elf.buf + strtab_off64 + dhdr.sh_name));
/* log_msg("Section_name: %s, Section_type: %d\n", section_name, dhdr.sh_type); */
if ((strcmp(section_name, ".bss")))
{
if (sizeof(val) != sym.st_size)
{
/* The target value is declared as an int in
* asm_*_offsets.c, which is 4 bytes on all
* targets we currently use. Complain loudly if
* this is not true.
*/
log_msg("Symbol size is wrong\n");
goto bail;
}
memcpy(&val,
elf.buf + dhdr.sh_offset + sym.st_value,
sym.st_size);
}
if (!elf.le_data)
{
log_msg("Big Endian data not supported yet!\n");
goto bail;
}
switch (mode)
{
case OUTPUT_FMT_RVDS:
printf("%-40s EQU %5d\n",
parse_elf_string_table(&elf,
shdr.sh_link,
sym.st_name),
val);
break;
case OUTPUT_FMT_GAS:
printf(".equ %-40s, %5d\n",
parse_elf_string_table(&elf,
shdr.sh_link,
sym.st_name),
val);
break;
default:
printf("%s = %d\n",
parse_elf_string_table(&elf,
shdr.sh_link,
sym.st_name),
val);
}
}
}
}
@@ -454,7 +736,7 @@ int parse_elf32(uint8_t *buf, size_t sz, output_fmt_t mode)
return 0;
bail:
log_msg("Parse error: File does not appear to be valid ELF32\n");
log_msg("Parse error: File does not appear to be valid ELF32 or ELF64\n");
return 1;
}
@@ -521,8 +803,7 @@ int main(int argc, char **argv)
goto bail;
}
res = parse_elf32(file_buf, stat_buf.st_size, mode);
//res = parse_coff(file_buf, stat_buf.st_size);
res = parse_elf(file_buf, stat_buf.st_size, mode);
free(file_buf);
if (!res)
@@ -535,7 +816,7 @@ bail:
#endif
#if defined(_MSC_VER)
#if defined(_MSC_VER) || defined(__MINGW32__)
/* See "Microsoft Portable Executable and Common Object File Format Specification"
for reference.
*/
@@ -549,7 +830,6 @@ int parse_coff(unsigned __int8 *buf, size_t sz)
unsigned int i;
unsigned __int8 *ptr;
unsigned __int32 symoffset;
FILE *fp;
char **sectionlist; //this array holds all section names in their correct order.
//it is used to check if the symbol is in .bss or .data section.
@@ -560,9 +840,18 @@ int parse_coff(unsigned __int8 *buf, size_t sz)
strtab_ptr = symtab_ptr + symtab_sz * 18;
if (nsections > 96)
goto bail;
{
log_msg("Too many sections\n");
return 1;
}
sectionlist = malloc(nsections * sizeof * sectionlist);
sectionlist = malloc(nsections * sizeof(sectionlist));
if (sectionlist == NULL)
{
log_msg("Allocating first level of section list failed\n");
return 1;
}
//log_msg("COFF: Found %u symbols in %u sections.\n", symtab_sz, nsections);
@@ -580,6 +869,12 @@ int parse_coff(unsigned __int8 *buf, size_t sz)
//log_msg("COFF: Parsing section %s\n",sectionname);
sectionlist[i] = malloc(strlen(sectionname) + 1);
if (sectionlist[i] == NULL)
{
log_msg("Allocating storage for %s failed\n", sectionname);
goto bail;
}
strcpy(sectionlist[i], sectionname);
if (!strcmp(sectionname, ".data")) sectionrawdata_ptr = get_le32(ptr + 20);
@@ -590,14 +885,6 @@ int parse_coff(unsigned __int8 *buf, size_t sz)
//log_msg("COFF: Symbol table at offset %u\n", symtab_ptr);
//log_msg("COFF: raw data pointer ofset for section .data is %u\n", sectionrawdata_ptr);
fp = fopen("assembly_offsets.asm", "w");
if (fp == NULL)
{
perror("open file");
goto bail;
}
/* The compiler puts the data with non-zero offset in .data section, but puts the data with
zero offset in .bss section. So, if the data in in .bss section, set offset=0.
Note from Wiki: In an object module compiled from C, the bss section contains
@@ -631,13 +918,23 @@ int parse_coff(unsigned __int8 *buf, size_t sz)
char name[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
strncpy(name, ptr, 8);
//log_msg("COFF: Parsing symbol %s\n",name);
fprintf(fp, "%-40s EQU ", name);
/* The 64bit Windows compiler doesn't prefix with an _.
* Check what's there, and bump if necessary
*/
if (name[0] == '_')
printf("%-40s EQU ", name + 1);
else
printf("%-40s EQU ", name);
}
else
{
//log_msg("COFF: Parsing symbol %s\n",
// buf + strtab_ptr + get_le32(ptr+4));
fprintf(fp, "%-40s EQU ", buf + strtab_ptr + get_le32(ptr + 4));
if ((buf + strtab_ptr + get_le32(ptr + 4))[0] == '_')
printf("%-40s EQU ",
buf + strtab_ptr + get_le32(ptr + 4) + 1);
else
printf("%-40s EQU ", buf + strtab_ptr + get_le32(ptr + 4));
}
if (!(strcmp(sectionlist[section-1], ".bss")))
@@ -654,14 +951,13 @@ int parse_coff(unsigned __int8 *buf, size_t sz)
//log_msg(" Address: %u\n",get_le32(ptr+8));
//log_msg(" Offset: %u\n", symoffset);
fprintf(fp, "%5d\n", symoffset);
printf("%5d\n", symoffset);
}
ptr += 18;
}
fprintf(fp, " END\n");
fclose(fp);
printf(" END\n");
for (i = 0; i < nsections; i++)
{
@@ -711,11 +1007,7 @@ int main(int argc, char **argv)
else
f = argv[1];
if (_sopen_s(&fd, f, _O_BINARY, _SH_DENYNO, _S_IREAD | _S_IWRITE))
{
perror("Unable to open file");
goto bail;
}
fd = _sopen(f, _O_BINARY, _SH_DENYNO, _S_IREAD | _S_IWRITE);
if (_fstat(fd, &stat_buf))
{

View File

@@ -0,0 +1,15 @@
REM Copyright (c) 2011 The WebM project authors. All Rights Reserved.
REM
REM Use of this source code is governed by a BSD-style license
REM that can be found in the LICENSE file in the root of the source
REM tree. An additional intellectual property rights grant can be found
REM in the file PATENTS. All contributing project authors may
REM be found in the AUTHORS file in the root of the source tree.
echo on
cl /I "./" /I "%1" /nologo /c "%1/vp8/common/asm_com_offsets.c"
cl /I "./" /I "%1" /nologo /c "%1/vp8/decoder/asm_dec_offsets.c"
cl /I "./" /I "%1" /nologo /c "%1/vp8/encoder/asm_enc_offsets.c"
obj_int_extract.exe rvds "asm_com_offsets.obj" > "asm_com_offsets.asm"
obj_int_extract.exe rvds "asm_dec_offsets.obj" > "asm_dec_offsets.asm"
obj_int_extract.exe rvds "asm_enc_offsets.obj" > "asm_enc_offsets.asm"

libs.mk
View File

@@ -9,7 +9,13 @@
##
ASM:=$(if $(filter yes,$(CONFIG_GCC)),.asm.s,.asm)
# ARM assembly files are written in RVCT-style. We use some make magic to
# filter those files to allow GCC compilation
ifeq ($(ARCH_ARM),yes)
ASM:=$(if $(filter yes,$(CONFIG_GCC)),.asm.s,.asm)
else
ASM:=.asm
endif
CODEC_SRCS-yes += libs.mk
@@ -126,6 +132,23 @@ INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(call enabled,CODEC_EXPORTS)
ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
ifeq ($(CONFIG_MSVS),yes)
obj_int_extract.vcproj: $(SRC_PATH_BARE)/build/make/obj_int_extract.c
@cp $(SRC_PATH_BARE)/build/x86-msvs/obj_int_extract.bat .
@echo " [CREATE] $@"
$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \
--exe \
--target=$(TOOLCHAIN) \
--name=obj_int_extract \
--ver=$(CONFIG_VS_VERSION) \
--proj-guid=E1360C65-D375-4335-8057-7ED99CC3F9B2 \
$(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
--out=$@ $^ \
-I. \
-I"$(SRC_PATH_BARE)" \
PROJECTS-$(BUILD_LIBVPX) += obj_int_extract.vcproj
PROJECTS-$(BUILD_LIBVPX) += obj_int_extract.bat
vpx.def: $(call enabled,CODEC_EXPORTS)
@echo " [CREATE] $@"
$(SRC_PATH_BARE)/build/make/gen_msvs_def.sh\
@@ -135,15 +158,16 @@ CLEAN-OBJS += vpx.def
vpx.vcproj: $(CODEC_SRCS) vpx.def
@echo " [CREATE] $@"
$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh\
--lib\
--target=$(TOOLCHAIN)\
$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \
--lib \
--target=$(TOOLCHAIN) \
$(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
--name=vpx\
--proj-guid=DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74\
--module-def=vpx.def\
--ver=$(CONFIG_VS_VERSION)\
--out=$@ $(CFLAGS) $^\
--name=vpx \
--proj-guid=DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74 \
--module-def=vpx.def \
--ver=$(CONFIG_VS_VERSION) \
--out=$@ $(CFLAGS) $^ \
--src-path-bare="$(SRC_PATH_BARE)" \
PROJECTS-$(BUILD_LIBVPX) += vpx.vcproj
@@ -207,36 +231,38 @@ endif
#
# Add assembler dependencies for configuration and offsets
#
$(filter %.s.o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
$(filter %.asm.o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
$(filter %.s.o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
$(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
#
# Calculate platform- and compiler-specific offsets for hand coded assembly
#
ifeq ($(ARCH_ARM), yes)
asm_com_offsets.asm: obj_int_extract
asm_com_offsets.asm: $(VP8_PREFIX)common/asm_com_offsets.c.o
ifeq ($(CONFIG_EXTERNAL_BUILD),) # Visual Studio uses obj_int_extract.bat
ifeq ($(ARCH_ARM), yes)
asm_com_offsets.asm: obj_int_extract
asm_com_offsets.asm: $(VP8_PREFIX)common/asm_com_offsets.c.o
./obj_int_extract rvds $< $(ADS2GAS) > $@
OBJS-yes += $(VP8_PREFIX)common/asm_com_offsets.c.o
CLEAN-OBJS += asm_com_offsets.asm
$(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_com_offsets.asm
OBJS-yes += $(VP8_PREFIX)common/asm_com_offsets.c.o
CLEAN-OBJS += asm_com_offsets.asm
$(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_com_offsets.asm
ifeq ($(CONFIG_VP8_ENCODER), yes)
asm_enc_offsets.asm: obj_int_extract
asm_enc_offsets.asm: $(VP8_PREFIX)encoder/asm_enc_offsets.c.o
ifeq ($(CONFIG_VP8_ENCODER), yes)
asm_enc_offsets.asm: obj_int_extract
asm_enc_offsets.asm: $(VP8_PREFIX)encoder/asm_enc_offsets.c.o
./obj_int_extract rvds $< $(ADS2GAS) > $@
OBJS-yes += $(VP8_PREFIX)encoder/asm_enc_offsets.c.o
CLEAN-OBJS += asm_enc_offsets.asm
$(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_enc_offsets.asm
endif
OBJS-yes += $(VP8_PREFIX)encoder/asm_enc_offsets.c.o
CLEAN-OBJS += asm_enc_offsets.asm
$(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_enc_offsets.asm
endif
ifeq ($(CONFIG_VP8_DECODER), yes)
asm_dec_offsets.asm: obj_int_extract
asm_dec_offsets.asm: $(VP8_PREFIX)decoder/asm_dec_offsets.c.o
ifeq ($(CONFIG_VP8_DECODER), yes)
asm_dec_offsets.asm: obj_int_extract
asm_dec_offsets.asm: $(VP8_PREFIX)decoder/asm_dec_offsets.c.o
./obj_int_extract rvds $< $(ADS2GAS) > $@
OBJS-yes += $(VP8_PREFIX)decoder/asm_dec_offsets.c.o
CLEAN-OBJS += asm_dec_offsets.asm
$(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_dec_offsets.asm
OBJS-yes += $(VP8_PREFIX)decoder/asm_dec_offsets.c.o
CLEAN-OBJS += asm_dec_offsets.asm
$(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_dec_offsets.asm
endif
endif
endif

View File

@@ -13,8 +13,9 @@ vpx.sln: $(wildcard *.vcproj)
@echo " [CREATE] $@"
$(SRC_PATH_BARE)/build/make/gen_msvs_sln.sh \
$(if $(filter %vpx.vcproj,$^),\
$(foreach vcp,$(filter-out %vpx.vcproj,$^),\
$(foreach vcp,$(filter-out %vpx.vcproj %obj_int_extract.vcproj,$^),\
--dep=$(vcp:.vcproj=):vpx)) \
--dep=vpx:obj_int_extract \
--ver=$(CONFIG_VS_VERSION)\
--out=$@ $^
vpx.sln.mk: vpx.sln

View File

@@ -20,7 +20,7 @@
extern void vp8_init_scan_order_mask();
void vp8_update_mode_info_border(MODE_INFO *mi, int rows, int cols)
static void update_mode_info_border(MODE_INFO *mi, int rows, int cols)
{
int i;
vpx_memset(mi - cols - 2, 0, sizeof(MODE_INFO) * (cols + 1));
@@ -119,7 +119,7 @@ int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height)
return 1;
}
vp8_update_mode_info_border(oci->mi, oci->mb_rows, oci->mb_cols);
update_mode_info_border(oci->mi, oci->mb_rows, oci->mb_cols);
return 0;
}

View File

@@ -19,14 +19,6 @@
#include "vp8/common/idct.h"
#include "vp8/common/onyxc_int.h"
extern void (*vp8_build_intra_predictors_mby_ptr)(MACROBLOCKD *x);
extern void vp8_build_intra_predictors_mby(MACROBLOCKD *x);
extern void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x);
extern void (*vp8_build_intra_predictors_mby_s_ptr)(MACROBLOCKD *x);
extern void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x);
extern void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x);
void vp8_arch_arm_common_init(VP8_COMMON *ctx)
{
#if CONFIG_RUNTIME_CPU_DETECT
@@ -106,31 +98,12 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
rtcd->recon.recon2 = vp8_recon2b_neon;
rtcd->recon.recon4 = vp8_recon4b_neon;
rtcd->recon.recon_mb = vp8_recon_mb_neon;
rtcd->recon.build_intra_predictors_mby =
vp8_build_intra_predictors_mby_neon;
rtcd->recon.build_intra_predictors_mby_s =
vp8_build_intra_predictors_mby_s_neon;
}
#endif
#endif
#if HAVE_ARMV6
#if CONFIG_RUNTIME_CPU_DETECT
if (has_media)
#endif
{
vp8_build_intra_predictors_mby_ptr = vp8_build_intra_predictors_mby;
vp8_build_intra_predictors_mby_s_ptr = vp8_build_intra_predictors_mby_s;
}
#endif
#if HAVE_ARMV7
#if CONFIG_RUNTIME_CPU_DETECT
if (has_neon)
#endif
{
vp8_build_intra_predictors_mby_ptr =
vp8_build_intra_predictors_mby_neon;
vp8_build_intra_predictors_mby_s_ptr =
vp8_build_intra_predictors_mby_s_neon;
}
#endif
}

View File

@@ -53,6 +53,9 @@ extern prototype_copy_block(vp8_copy_mem16x16_neon);
extern prototype_recon_macroblock(vp8_recon_mb_neon);
extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_neon);
extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_s_neon);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_recon_recon
#define vp8_recon_recon vp8_recon_b_neon
@@ -74,6 +77,13 @@ extern prototype_recon_macroblock(vp8_recon_mb_neon);
#undef vp8_recon_recon_mb
#define vp8_recon_recon_mb vp8_recon_mb_neon
#undef vp8_recon_build_intra_predictors_mby
#define vp8_recon_build_intra_predictors_mby vp8_build_intra_predictors_mby_neon
#undef vp8_recon_build_intra_predictors_mby_s
#define vp8_recon_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_neon
#endif
#endif

View File

@@ -38,7 +38,7 @@ DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]) =
{ 0, -1, 12, 123, -6, 0 },
};
void vp8_filter_block2d_first_pass
static void filter_block2d_first_pass
(
unsigned char *src_ptr,
int *output_ptr,
@@ -82,7 +82,7 @@ void vp8_filter_block2d_first_pass
}
}
void vp8_filter_block2d_second_pass
static void filter_block2d_second_pass
(
int *src_ptr,
unsigned char *output_ptr,
@@ -129,7 +129,7 @@ void vp8_filter_block2d_second_pass
}
void vp8_filter_block2d
static void filter_block2d
(
unsigned char *src_ptr,
unsigned char *output_ptr,
@@ -142,39 +142,13 @@ void vp8_filter_block2d
int FData[9*4]; /* Temp data buffer used in filtering */
/* First filter 1-D horizontally... */
vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 4, HFilter);
filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 4, HFilter);
/* then filter verticaly... */
vp8_filter_block2d_second_pass(FData + 8, output_ptr, output_pitch, 4, 4, 4, 4, VFilter);
filter_block2d_second_pass(FData + 8, output_ptr, output_pitch, 4, 4, 4, 4, VFilter);
}
void vp8_block_variation_c
(
unsigned char *src_ptr,
int src_pixels_per_line,
int *HVar,
int *VVar
)
{
int i, j;
unsigned char *Ptr = src_ptr;
for (i = 0; i < 4; i++)
{
for (j = 0; j < 4; j++)
{
*HVar += abs((int)Ptr[j] - (int)Ptr[j+1]);
*VVar += abs((int)Ptr[j] - (int)Ptr[j+src_pixels_per_line]);
}
Ptr += src_pixels_per_line;
}
}
void vp8_sixtap_predict_c
(
unsigned char *src_ptr,
@@ -191,7 +165,7 @@ void vp8_sixtap_predict_c
HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
vp8_filter_block2d(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter);
filter_block2d(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter);
}
void vp8_sixtap_predict8x8_c
(
@@ -211,11 +185,11 @@ void vp8_sixtap_predict8x8_c
VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
/* First filter 1-D horizontally... */
vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 13, 8, HFilter);
filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 13, 8, HFilter);
/* then filter verticaly... */
vp8_filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter);
filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter);
}
@@ -237,11 +211,11 @@ void vp8_sixtap_predict8x4_c
VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
/* First filter 1-D horizontally... */
vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 8, HFilter);
filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 8, HFilter);
/* then filter verticaly... */
vp8_filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 4, 8, VFilter);
filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 4, 8, VFilter);
}
@@ -264,10 +238,10 @@ void vp8_sixtap_predict16x16_c
VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
/* First filter 1-D horizontally... */
vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 21, 16, HFilter);
filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 21, 16, HFilter);
/* then filter verticaly... */
vp8_filter_block2d_second_pass(FData + 32, dst_ptr, dst_pitch, 16, 16, 16, 16, VFilter);
filter_block2d_second_pass(FData + 32, dst_ptr, dst_pitch, 16, 16, 16, 16, VFilter);
}
@@ -294,7 +268,7 @@ void vp8_sixtap_predict16x16_c
* Two filter taps should sum to VP8_FILTER_WEIGHT.
*
****************************************************************************/
void vp8_filter_block2d_bil_first_pass
static void filter_block2d_bil_first_pass
(
unsigned char *src_ptr,
unsigned short *dst_ptr,
@@ -345,7 +319,7 @@ void vp8_filter_block2d_bil_first_pass
* Two filter taps should sum to VP8_FILTER_WEIGHT.
*
****************************************************************************/
void vp8_filter_block2d_bil_second_pass
static void filter_block2d_bil_second_pass
(
unsigned short *src_ptr,
unsigned char *dst_ptr,
@@ -399,7 +373,7 @@ void vp8_filter_block2d_bil_second_pass
* SPECIAL NOTES : The largest block size can be handled here is 16x16
*
****************************************************************************/
void vp8_filter_block2d_bil
static void filter_block2d_bil
(
unsigned char *src_ptr,
unsigned char *dst_ptr,
@@ -415,10 +389,10 @@ void vp8_filter_block2d_bil
unsigned short FData[17*16]; /* Temp data buffer used in filtering */
/* First filter 1-D horizontally... */
vp8_filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
/* then 1-D vertically... */
vp8_filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
}
@@ -444,19 +418,19 @@ void vp8_bilinear_predict4x4_c
unsigned char temp2[16];
bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4);
vp8_filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);
filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);
for (i = 0; i < 16; i++)
{
if (temp1[i] != temp2[i])
{
bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4);
vp8_filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);
filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);
}
}
}
#endif
vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
}
@@ -476,7 +450,7 @@ void vp8_bilinear_predict8x8_c
HFilter = vp8_bilinear_filters[xoffset];
VFilter = vp8_bilinear_filters[yoffset];
vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
}
@@ -496,7 +470,7 @@ void vp8_bilinear_predict8x4_c
HFilter = vp8_bilinear_filters[xoffset];
VFilter = vp8_bilinear_filters[yoffset];
vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
}
@@ -516,5 +490,5 @@ void vp8_bilinear_predict16x16_c
HFilter = vp8_bilinear_filters[xoffset];
VFilter = vp8_bilinear_filters[yoffset];
vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
}
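
The bilinear path above is a two-pass separable filter whose tap pairs sum to VP8_FILTER_WEIGHT (128), so each pass is a weighted average followed by a rounded shift. A minimal standalone sketch of the first pass, with the rounding and scaling written out (illustrative; the shipped filter_block2d_bil_first_pass additionally parameterizes the pixel step and sizes the 16-bit intermediate buffer as Height + 1 rows):

#define VP8_FILTER_WEIGHT 128
#define VP8_FILTER_SHIFT    7

/* Horizontal bilinear pass: 8-bit pixels in, 16-bit intermediates out.
 * Assumes filter[0] + filter[1] == VP8_FILTER_WEIGHT. */
static void bil_first_pass_sketch(const unsigned char *src, unsigned short *dst,
                                  int src_stride, int height, int width,
                                  const short *filter)
{
    int i, j;

    for (i = 0; i < height; i++)
    {
        for (j = 0; j < width; j++)
            dst[j] = (unsigned short)((src[j] * filter[0] +
                                       src[j + 1] * filter[1] +
                                       VP8_FILTER_WEIGHT / 2) >> VP8_FILTER_SHIFT);
        src += src_stride;
        dst += width;
    }
}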

View File

@@ -11,6 +11,13 @@
#include "findnearmv.h"
const unsigned char vp8_mbsplit_offset[4][16] = {
{ 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{ 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{ 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
};
/* Predict motion vectors using those from already-decoded nearby blocks.
Note that we only consider one 4x4 subblock from each candidate 16x16
macroblock. */
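
The table maps each SPLITMV partitioning (two halves, two vertical halves, four quarters, sixteen blocks) to the index of the first 4x4 subblock in every partition — the one consulted as a motion-vector candidate. A hypothetical accessor showing how a row is consumed (partition_mv_sketch and its arguments are illustrative, not part of the source):

/* Return the candidate MV for partition 'part' of split configuration 's'
 * by looking up its representative 4x4 subblock. Illustrative helper. */
static const MV *partition_mv_sketch(const MODE_INFO *mi, int s, int part)
{
    int b = vp8_mbsplit_offset[s][part];  /* first subblock of the partition */
    return &mi->bmi[b].mv.as_mv;
}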

View File

@@ -70,4 +70,6 @@ const B_MODE_INFO *vp8_left_bmi(const MODE_INFO *cur_mb, int b);
const B_MODE_INFO *vp8_above_bmi(const MODE_INFO *cur_mb, int b, int mi_stride);
extern const unsigned char vp8_mbsplit_offset[4][16];
#endif

View File

@@ -20,12 +20,6 @@
extern void vp8_arch_x86_common_init(VP8_COMMON *ctx);
extern void vp8_arch_arm_common_init(VP8_COMMON *ctx);
void (*vp8_build_intra_predictors_mby_ptr)(MACROBLOCKD *x);
extern void vp8_build_intra_predictors_mby(MACROBLOCKD *x);
void (*vp8_build_intra_predictors_mby_s_ptr)(MACROBLOCKD *x);
extern void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x);
void vp8_machine_specific_config(VP8_COMMON *ctx)
{
#if CONFIG_RUNTIME_CPU_DETECT
@@ -45,6 +39,10 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
rtcd->recon.recon4 = vp8_recon4b_c;
rtcd->recon.recon_mb = vp8_recon_mb_c;
rtcd->recon.recon_mby = vp8_recon_mby_c;
rtcd->recon.build_intra_predictors_mby =
vp8_build_intra_predictors_mby;
rtcd->recon.build_intra_predictors_mby_s =
vp8_build_intra_predictors_mby_s;
rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_c;
rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_c;
@@ -75,9 +73,6 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
#endif
#endif
/* Pure C: */
vp8_build_intra_predictors_mby_ptr = vp8_build_intra_predictors_mby;
vp8_build_intra_predictors_mby_s_ptr = vp8_build_intra_predictors_mby_s;
#if ARCH_X86 || ARCH_X86_64
vp8_arch_x86_common_init(ctx);

View File

@@ -17,7 +17,7 @@ typedef enum
DEST = 1
} BLOCKSET;
void vp8_setup_block
static void setup_block
(
BLOCKD *b,
int mv_stride,
@@ -43,7 +43,8 @@ void vp8_setup_block
}
void vp8_setup_macroblock(MACROBLOCKD *x, BLOCKSET bs)
static void setup_macroblock(MACROBLOCKD *x, BLOCKSET bs)
{
int block;
@@ -64,16 +65,16 @@ void vp8_setup_macroblock(MACROBLOCKD *x, BLOCKSET bs)
for (block = 0; block < 16; block++) /* y blocks */
{
vp8_setup_block(&x->block[block], x->dst.y_stride, y, x->dst.y_stride,
setup_block(&x->block[block], x->dst.y_stride, y, x->dst.y_stride,
(block >> 2) * 4 * x->dst.y_stride + (block & 3) * 4, bs);
}
for (block = 16; block < 20; block++) /* U and V blocks */
{
vp8_setup_block(&x->block[block], x->dst.uv_stride, u, x->dst.uv_stride,
setup_block(&x->block[block], x->dst.uv_stride, u, x->dst.uv_stride,
((block - 16) >> 1) * 4 * x->dst.uv_stride + (block & 1) * 4, bs);
vp8_setup_block(&x->block[block+4], x->dst.uv_stride, v, x->dst.uv_stride,
setup_block(&x->block[block+4], x->dst.uv_stride, v, x->dst.uv_stride,
((block - 16) >> 1) * 4 * x->dst.uv_stride + (block & 1) * 4, bs);
}
}
@@ -124,6 +125,6 @@ void vp8_build_block_doffsets(MACROBLOCKD *x)
{
/* handle the destination pitch features */
vp8_setup_macroblock(x, DEST);
vp8_setup_macroblock(x, PRED);
setup_macroblock(x, DEST);
setup_macroblock(x, PRED);
}

View File

@@ -211,7 +211,7 @@ void vp8_post_proc_down_and_across_c
}
}
int vp8_q2mbl(int x)
static int q2mbl(int x)
{
if (x < 20) x = 20;
@@ -314,8 +314,8 @@ static void vp8_deblock_and_de_macro_block(YV12_BUFFER_CONFIG *source,
(void) flag;
POSTPROC_INVOKE(rtcd, downacross)(source->y_buffer, post->y_buffer, source->y_stride, post->y_stride, source->y_height, source->y_width, ppl);
POSTPROC_INVOKE(rtcd, across)(post->y_buffer, post->y_stride, post->y_height, post->y_width, vp8_q2mbl(q));
POSTPROC_INVOKE(rtcd, down)(post->y_buffer, post->y_stride, post->y_height, post->y_width, vp8_q2mbl(q));
POSTPROC_INVOKE(rtcd, across)(post->y_buffer, post->y_stride, post->y_height, post->y_width, q2mbl(q));
POSTPROC_INVOKE(rtcd, down)(post->y_buffer, post->y_stride, post->y_height, post->y_width, q2mbl(q));
POSTPROC_INVOKE(rtcd, downacross)(source->u_buffer, post->u_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);
POSTPROC_INVOKE(rtcd, downacross)(source->v_buffer, post->v_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);

View File

@@ -23,6 +23,9 @@
#define prototype_recon_macroblock(sym) \
void sym(const struct vp8_recon_rtcd_vtable *rtcd, MACROBLOCKD *x)
#define prototype_build_intra_predictors(sym) \
void sym(MACROBLOCKD *x)
struct vp8_recon_rtcd_vtable;
#if ARCH_X86 || ARCH_X86_64
@@ -73,9 +76,23 @@ extern prototype_recon_macroblock(vp8_recon_recon_mb);
#endif
extern prototype_recon_macroblock(vp8_recon_recon_mby);
#ifndef vp8_recon_build_intra_predictors_mby
#define vp8_recon_build_intra_predictors_mby vp8_build_intra_predictors_mby
#endif
extern prototype_build_intra_predictors\
(vp8_recon_build_intra_predictors_mby);
#ifndef vp8_recon_build_intra_predictors_mby_s
#define vp8_recon_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s
#endif
extern prototype_build_intra_predictors\
(vp8_recon_build_intra_predictors_mby_s);
typedef prototype_copy_block((*vp8_copy_block_fn_t));
typedef prototype_recon_block((*vp8_recon_fn_t));
typedef prototype_recon_macroblock((*vp8_recon_mb_fn_t));
typedef prototype_build_intra_predictors((*vp8_build_intra_pred_fn_t));
typedef struct vp8_recon_rtcd_vtable
{
vp8_copy_block_fn_t copy16x16;
@@ -86,6 +103,8 @@ typedef struct vp8_recon_rtcd_vtable
vp8_recon_fn_t recon4;
vp8_recon_mb_fn_t recon_mb;
vp8_recon_mb_fn_t recon_mby;
vp8_build_intra_pred_fn_t build_intra_predictors_mby_s;
vp8_build_intra_pred_fn_t build_intra_predictors_mby;
} vp8_recon_rtcd_vtable_t;
#if CONFIG_RUNTIME_CPU_DETECT
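
These hunks fold the mby intra-prediction entry points into the recon RTCD vtable instead of leaving them as bare global function pointers. A condensed sketch of the pattern (the INVOKE macro is written from the usual libvpx RTCD form; its exact definition sits outside the shown hunks):

#define prototype_build_intra_predictors(sym) void sym(MACROBLOCKD *x)

typedef prototype_build_intra_predictors((*vp8_build_intra_pred_fn_t));

typedef struct
{
    vp8_build_intra_pred_fn_t build_intra_predictors_mby;  /* per-CPU impl */
} recon_vtable_sketch_t;

#if CONFIG_RUNTIME_CPU_DETECT
#define RECON_INVOKE(ctx, fn) (ctx)->fn       /* indirect call via vtable */
#else
#define RECON_INVOKE(ctx, fn) vp8_recon_##fn  /* collapses to a direct call */
#endif

/* call site, as in decodframe.c below:
 *   RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_mby)(xd);
 */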

View File

@@ -168,7 +168,7 @@ void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, vp8_subpix_fn_t sppf)
}
}
void vp8_build_inter_predictors4b(MACROBLOCKD *x, BLOCKD *d, int pitch)
static void build_inter_predictors4b(MACROBLOCKD *x, BLOCKD *d, int pitch)
{
unsigned char *ptr_base;
unsigned char *ptr;
@@ -187,7 +187,7 @@ void vp8_build_inter_predictors4b(MACROBLOCKD *x, BLOCKD *d, int pitch)
}
}
void vp8_build_inter_predictors2b(MACROBLOCKD *x, BLOCKD *d, int pitch)
static void build_inter_predictors2b(MACROBLOCKD *x, BLOCKD *d, int pitch)
{
unsigned char *ptr_base;
unsigned char *ptr;
@@ -246,7 +246,7 @@ void vp8_build_inter_predictors_mbuv(MACROBLOCKD *x)
BLOCKD *d1 = &x->block[i+1];
if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
vp8_build_inter_predictors2b(x, d0, 8);
build_inter_predictors2b(x, d0, 8);
else
{
vp8_build_inter_predictors_b(d0, 8, x->subpixel_predict);
@@ -291,7 +291,7 @@ void vp8_build_inter_predictors_mby(MACROBLOCKD *x)
for (i = 0; i < 4; i++)
{
BLOCKD *d = &x->block[bbb[i]];
vp8_build_inter_predictors4b(x, d, 16);
build_inter_predictors4b(x, d, 16);
}
}
@@ -303,7 +303,7 @@ void vp8_build_inter_predictors_mby(MACROBLOCKD *x)
BLOCKD *d1 = &x->block[i+1];
if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
vp8_build_inter_predictors2b(x, d0, 16);
build_inter_predictors2b(x, d0, 16);
else
{
vp8_build_inter_predictors_b(d0, 16, x->subpixel_predict);
@@ -372,7 +372,7 @@ void vp8_build_inter_predictors_mb(MACROBLOCKD *x)
for (i = 0; i < 4; i++)
{
BLOCKD *d = &x->block[bbb[i]];
vp8_build_inter_predictors4b(x, d, 16);
build_inter_predictors4b(x, d, 16);
}
}
else
@@ -383,7 +383,7 @@ void vp8_build_inter_predictors_mb(MACROBLOCKD *x)
BLOCKD *d1 = &x->block[i+1];
if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
vp8_build_inter_predictors2b(x, d0, 16);
build_inter_predictors2b(x, d0, 16);
else
{
vp8_build_inter_predictors_b(d0, 16, x->subpixel_predict);
@@ -400,7 +400,7 @@ void vp8_build_inter_predictors_mb(MACROBLOCKD *x)
BLOCKD *d1 = &x->block[i+1];
if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
vp8_build_inter_predictors2b(x, d0, 8);
build_inter_predictors2b(x, d0, 8);
else
{
vp8_build_inter_predictors_b(d0, 8, x->subpixel_predict);
@@ -600,7 +600,7 @@ void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x)
for (i = 0; i < 4; i++)
{
BLOCKD *d = &x->block[bbb[i]];
/*vp8_build_inter_predictors4b(x, d, 16);*/
/*build_inter_predictors4b(x, d, 16);*/
{
unsigned char *ptr_base;
@@ -630,7 +630,7 @@ void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x)
if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
{
/*vp8_build_inter_predictors2b(x, d0, 16);*/
/*build_inter_predictors2b(x, d0, 16);*/
unsigned char *ptr_base;
unsigned char *ptr;
unsigned char *pred_ptr = d0->predictor;
@@ -662,7 +662,7 @@ void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x)
if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
{
/*vp8_build_inter_predictors2b(x, d0, 8);*/
/*build_inter_predictors2b(x, d0, 8);*/
unsigned char *ptr_base;
unsigned char *ptr;
unsigned char *pred_ptr = d0->predictor;

View File

@@ -14,13 +14,6 @@
extern void init_intra_left_above_pixels(MACROBLOCKD *x);
extern void (*vp8_build_intra_predictors_mby_ptr)(MACROBLOCKD *x);
extern void vp8_build_intra_predictors_mby(MACROBLOCKD *x);
extern void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x);
extern void (*vp8_build_intra_predictors_mby_s_ptr)(MACROBLOCKD *x);
extern void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x);
extern void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x);
extern void vp8_build_intra_predictors_mbuv(MACROBLOCKD *x);
extern void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x);

View File

@@ -38,6 +38,7 @@
#define pthread_self() GetCurrentThreadId()
#else
#ifdef __APPLE__
#include <mach/mach_init.h>
#include <mach/semaphore.h>
#include <mach/task.h>
#include <time.h>

View File

@@ -113,97 +113,6 @@ nextrow:
ret
;
; THIS FUNCTION APPEARS TO BE UNUSED
;
;void vp8_filter_block1d_v6_mmx
;(
; short *src_ptr,
; unsigned char *output_ptr,
; unsigned int pixels_per_line,
; unsigned int pixel_step,
; unsigned int output_height,
; unsigned int output_width,
; short * vp8_filter
;)
global sym(vp8_filter_block1d_v6_mmx)
sym(vp8_filter_block1d_v6_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
GET_GOT rbx
push rsi
push rdi
; end prolog
movq mm5, [GLOBAL(rd)]
push rbx
mov rbx, arg(6) ;vp8_filter
movq mm1, [rbx + 16] ; do both the negative taps first!!!
movq mm2, [rbx + 32] ;
movq mm6, [rbx + 48] ;
movq mm7, [rbx + 64] ;
movsxd rdx, dword ptr arg(2) ;pixels_per_line
mov rdi, arg(1) ;output_ptr
mov rsi, arg(0) ;src_ptr
sub rsi, rdx
sub rsi, rdx
movsxd rcx, DWORD PTR arg(4) ;output_height
movsxd rax, DWORD PTR arg(5) ;output_width ; destination pitch?
pxor mm0, mm0 ; mm0 = 00000000
nextrow_v:
movq mm3, [rsi+rdx] ; mm3 = p0..p8 = row -1
pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 2
pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers.
paddsw mm3, mm4 ; mm3 += mm4
movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 0
pmullw mm4, mm2 ; mm4 *= kernel 2 modifiers.
paddsw mm3, mm4 ; mm3 += mm4
movq mm4, [rsi] ; mm4 = p0..p3 = row -2
pmullw mm4, [rbx] ; mm4 *= kernel 0 modifiers.
paddsw mm3, mm4 ; mm3 += mm4
add rsi, rdx ; move source forward 1 line to avoid 3 * pitch
movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 1
pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers.
paddsw mm3, mm4 ; mm3 += mm4
movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 3
pmullw mm4, [rbx +80] ; mm4 *= kernel 5 modifiers.
paddsw mm3, mm4 ; mm3 += mm4
paddsw mm3, mm5 ; mm3 += round value
psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
packuswb mm3, mm0 ; pack and saturate
movd [rdi],mm3 ; store the results in the destination
add rdi,rax;
dec rcx ; decrement count
jnz nextrow_v ; next row
pop rbx
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
;void vp8_filter_block1dc_v6_mmx
;(
; short *src_ptr,

View File

@@ -228,15 +228,8 @@ unsigned int vp8_mv_cont_count[5][4] =
};
#endif
unsigned char vp8_mbsplit_offset[4][16] = {
{ 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{ 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{ 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
};
unsigned char vp8_mbsplit_fill_count[4] = {8, 8, 4, 1};
unsigned char vp8_mbsplit_fill_offset[4][16] = {
static const unsigned char mbsplit_fill_count[4] = {8, 8, 4, 1};
static const unsigned char mbsplit_fill_offset[4][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
{ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15},
{ 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15},
@@ -246,7 +239,7 @@ unsigned char vp8_mbsplit_fill_offset[4][16] = {
void vp8_mb_mode_mv_init(VP8D_COMP *pbi)
static void mb_mode_mv_init(VP8D_COMP *pbi)
{
vp8_reader *const bc = & pbi->bc;
MV_CONTEXT *const mvc = pbi->common.fc.mvc;
@@ -287,7 +280,7 @@ void vp8_mb_mode_mv_init(VP8D_COMP *pbi)
}
}
void vp8_read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
int mb_row, int mb_col)
{
const MV Zero = { 0, 0};
@@ -405,10 +398,10 @@ void vp8_read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
/* Fill (uniform) modes, mvs of jth subset.
Must do it here because ensuing subsets can
refer back to us via "left" or "above". */
unsigned char *fill_offset;
unsigned int fill_count = vp8_mbsplit_fill_count[s];
const unsigned char *fill_offset;
unsigned int fill_count = mbsplit_fill_count[s];
fill_offset = &vp8_mbsplit_fill_offset[s][(unsigned char)j * vp8_mbsplit_fill_count[s]];
fill_offset = &mbsplit_fill_offset[s][(unsigned char)j * mbsplit_fill_count[s]];
do {
mi->bmi[ *fill_offset] = bmi;
@@ -525,7 +518,7 @@ void vp8_decode_mode_mvs(VP8D_COMP *pbi)
MODE_INFO *mi = pbi->common.mi;
int mb_row = -1;
vp8_mb_mode_mv_init(pbi);
mb_mode_mv_init(pbi);
while (++mb_row < pbi->common.mb_rows)
{
@@ -543,11 +536,11 @@ void vp8_decode_mode_mvs(VP8D_COMP *pbi)
while (++mb_col < pbi->common.mb_cols)
{
/*vp8_read_mb_modes_mv(pbi, xd->mode_info_context, &xd->mode_info_context->mbmi, mb_row, mb_col);*/
/*read_mb_modes_mv(pbi, xd->mode_info_context, &xd->mode_info_context->mbmi, mb_row, mb_col);*/
if(pbi->common.frame_type == KEY_FRAME)
vp8_kfread_modes(pbi, mi, mb_row, mb_col);
else
vp8_read_mb_modes_mv(pbi, mi, &mi->mbmi, mb_row, mb_col);
read_mb_modes_mv(pbi, mi, &mi->mbmi, mb_row, mb_col);
mi++; /* next macroblock */
}

View File

@@ -115,8 +115,8 @@ static void skip_recon_mb(VP8D_COMP *pbi, MACROBLOCKD *xd)
{
vp8_build_intra_predictors_mbuv_s(xd);
vp8_build_intra_predictors_mby_s_ptr(xd);
RECON_INVOKE(&pbi->common.rtcd.recon,
build_intra_predictors_mby_s)(xd);
}
else
{
@@ -175,7 +175,7 @@ void clamp_mvs(MACROBLOCKD *xd)
}
void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)
static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)
{
int eobtotal = 0;
int i, do_clamp = xd->mode_info_context->mbmi.need_to_clamp_mvs;
@@ -214,7 +214,8 @@ void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)
if (xd->mode_info_context->mbmi.mode != B_PRED)
{
vp8_build_intra_predictors_mby_ptr(xd);
RECON_INVOKE(&pbi->common.rtcd.recon,
build_intra_predictors_mby)(xd);
} else {
vp8_intra_prediction_down_copy(xd);
}
@@ -319,10 +320,8 @@ FILE *vpxlog = 0;
void vp8_decode_mb_row(VP8D_COMP *pbi,
VP8_COMMON *pc,
int mb_row,
MACROBLOCKD *xd)
static void
decode_mb_row(VP8D_COMP *pbi, VP8_COMMON *pc, int mb_row, MACROBLOCKD *xd)
{
int i;
@@ -394,7 +393,7 @@ void vp8_decode_mb_row(VP8D_COMP *pbi,
else
pbi->debugoutput =0;
*/
vp8_decode_macroblock(pbi, xd);
decode_macroblock(pbi, xd);
/* check if the boolean decoder has suffered an error */
xd->corrupted |= vp8dx_bool_error(xd->current_bc);
@@ -900,7 +899,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)
ibc = 0;
}
vp8_decode_mb_row(pbi, pc, mb_row, xd);
decode_mb_row(pbi, pc, mb_row, xd);
}
}

View File

@@ -19,7 +19,13 @@
#define BOOL_DATA UINT8
#define OCB_X PREV_COEF_CONTEXTS * ENTROPY_NODES
DECLARE_ALIGNED(16, UINT8, vp8_coef_bands_x[16]) = { 0, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X, 6 * OCB_X, 4 * OCB_X, 5 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 7 * OCB_X};
DECLARE_ALIGNED(16, static const unsigned char, coef_bands_x[16]) =
{
0 * OCB_X, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X,
6 * OCB_X, 4 * OCB_X, 5 * OCB_X, 6 * OCB_X,
6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X,
6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 7 * OCB_X
};
#define EOB_CONTEXT_NODE 0
#define ZERO_CONTEXT_NODE 1
#define ONE_CONTEXT_NODE 2
@@ -135,7 +141,7 @@ DECLARE_ALIGNED(16, extern const unsigned char, vp8dx_bitreader_norm[256]);
Prob = coef_probs; \
if(c<15) {\
++c; \
Prob += vp8_coef_bands_x[c]; \
Prob += coef_bands_x[c]; \
goto branch; \
} goto BLOCK_FINISHED; /*for malformed input */\
} \
@@ -244,7 +250,7 @@ BLOCK_LOOP:
Prob += v * ENTROPY_NODES;
DO_WHILE:
Prob += vp8_coef_bands_x[c];
Prob += coef_bands_x[c];
DECODE_AND_BRANCH_IF_ZERO(Prob[EOB_CONTEXT_NODE], BLOCK_FINISHED);
CHECK_0_:
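
Note that the new static table keeps its entries pre-multiplied by OCB_X (PREV_COEF_CONTEXTS * ENTROPY_NODES), so the DECODE macros above can advance the probability pointer with one add per coefficient. An illustrative comparison with simplified names:

/* Equivalent lookups; the pre-scaled table avoids a multiply in the
 * inner token-decoding loop. Names here are illustrative. */
static const unsigned char *prob_for_coeff(const unsigned char *coef_probs,
                                           const unsigned char *bands_x, int c)
{
    return coef_probs + bands_x[c];       /* pre-scaled table */
    /* vs. coef_probs + band[c] * OCB_X   -- unscaled band index */
}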

View File

@@ -37,43 +37,6 @@
extern void vp8_init_loop_filter(VP8_COMMON *cm);
extern void vp8cx_init_de_quantizer(VP8D_COMP *pbi);
#if CONFIG_DEBUG
void vp8_recon_write_yuv_frame(unsigned char *name, YV12_BUFFER_CONFIG *s)
{
FILE *yuv_file = fopen((char *)name, "ab");
unsigned char *src = s->y_buffer;
int h = s->y_height;
do
{
fwrite(src, s->y_width, 1, yuv_file);
src += s->y_stride;
}
while (--h);
src = s->u_buffer;
h = s->uv_height;
do
{
fwrite(src, s->uv_width, 1, yuv_file);
src += s->uv_stride;
}
while (--h);
src = s->v_buffer;
h = s->uv_height;
do
{
fwrite(src, s->uv_width, 1, yuv_file);
src += s->uv_stride;
}
while (--h);
fclose(yuv_file);
}
#endif
void vp8dx_initialize()
{
@@ -155,35 +118,6 @@ void vp8dx_remove_decompressor(VP8D_PTR ptr)
}
void vp8dx_set_setting(VP8D_PTR comp, VP8D_SETTING oxst, int x)
{
VP8D_COMP *pbi = (VP8D_COMP *) comp;
(void) pbi;
(void) x;
switch (oxst)
{
case VP8D_OK:
break;
}
}
int vp8dx_get_setting(VP8D_PTR comp, VP8D_SETTING oxst)
{
VP8D_COMP *pbi = (VP8D_COMP *) comp;
(void) pbi;
switch (oxst)
{
case VP8D_OK:
break;
}
return -1;
}
int vp8dx_get_reference(VP8D_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
{
VP8D_COMP *pbi = (VP8D_COMP *) ptr;
@@ -203,6 +137,8 @@ int vp8dx_get_reference(VP8D_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_C
return 0;
}
int vp8dx_set_reference(VP8D_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
{
VP8D_COMP *pbi = (VP8D_COMP *) ptr;
@@ -459,12 +395,6 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
}
#if 0
/* DEBUG code */
/*vp8_recon_write_yuv_frame("recon.yuv", cm->frame_to_show);*/
if (cm->current_video_frame <= 5)
write_dx_frame_to_file(cm->frame_to_show, cm->current_video_frame);
#endif
vp8_clear_system_state();

View File

@@ -113,6 +113,7 @@ typedef struct VP8Decompressor
pthread_t *h_decoding_thread;
sem_t *h_event_start_decoding;
sem_t h_event_end_decoding;
sem_t *h_mb_counter;
/* end of threading data */
#endif

View File

@@ -12,9 +12,6 @@
#if !defined(WIN32) && CONFIG_OS_SUPPORT == 1
# include <unistd.h>
#endif
#ifdef __APPLE__
#include <mach/mach_init.h>
#endif
#include "onyxd_int.h"
#include "vpx_mem/vpx_mem.h"
#include "vp8/common/threading.h"
@@ -36,7 +33,7 @@ extern void vp8_build_uvmvs(MACROBLOCKD *x, int fullpixel);
#define RTCD_VTABLE(x) NULL
#endif
void vp8_setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count)
static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count)
{
VP8_COMMON *const pc = & pbi->common;
int i, j;
@@ -90,7 +87,7 @@ void vp8_setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC
}
void vp8mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int mb_col)
static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int mb_col)
{
int eobtotal = 0;
int i, do_clamp = xd->mode_info_context->mbmi.need_to_clamp_mvs;
@@ -217,7 +214,7 @@ void vp8mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int mb
}
THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
static THREAD_FUNCTION thread_decoding_proc(void *p_data)
{
int ithread = ((DECODETHREAD_DATA *)p_data)->ithread;
VP8D_COMP *pbi = (VP8D_COMP *)(((DECODETHREAD_DATA *)p_data)->ptr1);
@@ -278,6 +275,7 @@ THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
for (mb_col = 0; mb_col < pc->mb_cols; mb_col++)
{
/*
if ((mb_col & (nsync-1)) == 0)
{
while (mb_col > (*last_row_current_mb_col - nsync) && *last_row_current_mb_col != pc->mb_cols - 1)
@@ -286,6 +284,8 @@ THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
thread_sleep(0);
}
}
*/
sem_wait(&pbi->h_mb_counter[ithread]);
if (xd->mode_info_context->mbmi.mode == SPLITMV || xd->mode_info_context->mbmi.mode == B_PRED)
{
@@ -296,18 +296,6 @@ THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
}
}
if(pbi->common.filter_level)
{
/*update loopfilter info*/
Segment = (alt_flt_enabled) ? xd->mode_info_context->mbmi.segment_id : 0;
filter_level = pbi->mt_baseline_filter_level[Segment];
/* Distance of Mb to the various image edges.
* These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
* Apply any context driven MB level adjustment
*/
filter_level = vp8_adjust_mb_lf_value(xd, filter_level);
}
/* Distance of Mb to the various image edges.
* These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
*/
@@ -333,7 +321,7 @@ THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
xd->pre.v_buffer = pc->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
vp8_build_uvmvs(xd, pc->full_pixel);
vp8mt_decode_macroblock(pbi, xd, mb_row, mb_col);
decode_macroblock(pbi, xd, mb_row, mb_col);
if (pbi->common.filter_level)
{
@@ -362,7 +350,16 @@ THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
}
}
/* loopfilter on this macroblock. */
/* update loopfilter info */
Segment = (alt_flt_enabled) ? xd->mode_info_context->mbmi.segment_id : 0;
filter_level = pbi->mt_baseline_filter_level[Segment];
/* Distance of Mb to the various image edges.
* These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
* Apply any context driven MB level adjustment
*/
filter_level = vp8_adjust_mb_lf_value(xd, filter_level);
/* loopfilter on this macroblock. */
if (filter_level)
{
if (mb_col > 0)
@@ -389,6 +386,9 @@ THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
/*pbi->mb_row_di[ithread].current_mb_col = mb_col;*/
pbi->mt_current_mb_col[mb_row] = mb_col;
if (mb_row != pbi->common.mb_rows-1)
sem_post(&pbi->h_mb_counter[ithread+1]);
}
/* adjust to the next row of mbs */
@@ -444,6 +444,7 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi)
CHECK_MEM_ERROR(pbi->h_decoding_thread, vpx_malloc(sizeof(pthread_t) * pbi->decoding_thread_count));
CHECK_MEM_ERROR(pbi->h_event_start_decoding, vpx_malloc(sizeof(sem_t) * pbi->decoding_thread_count));
CHECK_MEM_ERROR(pbi->h_mb_counter, vpx_malloc(sizeof(sem_t) * (pbi->decoding_thread_count + 1)));
CHECK_MEM_ERROR(pbi->mb_row_di, vpx_memalign(32, sizeof(MB_ROW_DEC) * pbi->decoding_thread_count));
vpx_memset(pbi->mb_row_di, 0, sizeof(MB_ROW_DEC) * pbi->decoding_thread_count);
CHECK_MEM_ERROR(pbi->de_thread_data, vpx_malloc(sizeof(DECODETHREAD_DATA) * pbi->decoding_thread_count));
@@ -456,9 +457,12 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi)
pbi->de_thread_data[ithread].ptr1 = (void *)pbi;
pbi->de_thread_data[ithread].ptr2 = (void *) &pbi->mb_row_di[ithread];
pthread_create(&pbi->h_decoding_thread[ithread], 0, vp8_thread_decoding_proc, (&pbi->de_thread_data[ithread]));
pthread_create(&pbi->h_decoding_thread[ithread], 0, thread_decoding_proc, (&pbi->de_thread_data[ithread]));
}
for (ithread = 0; ithread < pbi->decoding_thread_count + 1; ithread++)
sem_init(&pbi->h_mb_counter[ithread], 0, 0);
sem_init(&pbi->h_event_end_decoding, 0, 0);
pbi->allocated_decoding_thread_count = pbi->decoding_thread_count;
@@ -621,6 +625,9 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi)
sem_destroy(&pbi->h_event_start_decoding[i]);
}
for (i = 0; i < pbi->decoding_thread_count + 1; i++)
sem_destroy(&pbi->h_mb_counter[i]);
sem_destroy(&pbi->h_event_end_decoding);
vpx_free(pbi->h_decoding_thread);
@@ -629,6 +636,11 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi)
vpx_free(pbi->h_event_start_decoding);
pbi->h_event_start_decoding = NULL;
if (pbi->h_mb_counter)
{
vpx_free(pbi->h_mb_counter);
pbi->h_mb_counter = NULL;
}
vpx_free(pbi->mb_row_di);
pbi->mb_row_di = NULL ;
@@ -638,7 +650,7 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi)
}
void vp8mt_lpf_init( VP8D_COMP *pbi, int default_filt_lvl)
static void lpf_init( VP8D_COMP *pbi, int default_filt_lvl)
{
VP8_COMMON *cm = &pbi->common;
MACROBLOCKD *mbd = &pbi->mb;
@@ -721,14 +733,17 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
vpx_memset(pbi->mt_uleft_col[i], (unsigned char)129, 8);
vpx_memset(pbi->mt_vleft_col[i], (unsigned char)129, 8);
}
vp8mt_lpf_init(pbi, pc->filter_level);
lpf_init(pbi, pc->filter_level);
}
vp8_setup_decoding_thread_data(pbi, xd, pbi->mb_row_di, pbi->decoding_thread_count);
setup_decoding_thread_data(pbi, xd, pbi->mb_row_di, pbi->decoding_thread_count);
for (i = 0; i < pbi->decoding_thread_count; i++)
sem_post(&pbi->h_event_start_decoding[i]);
for (mb_row = 0; mb_row < pc->mb_rows; mb_row += (pbi->decoding_thread_count + 1))
{
@@ -761,6 +776,7 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
for (mb_col = 0; mb_col < pc->mb_cols; mb_col++)
{
/*
if ( mb_row > 0 && (mb_col & (nsync-1)) == 0){
while (mb_col > (*last_row_current_mb_col - nsync) && *last_row_current_mb_col != pc->mb_cols - 1)
{
@@ -768,6 +784,9 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
thread_sleep(0);
}
}
*/
if(mb_row > 0)
sem_wait(&pbi->h_mb_counter[pbi->decoding_thread_count]);
if (xd->mode_info_context->mbmi.mode == SPLITMV || xd->mode_info_context->mbmi.mode == B_PRED)
{
@@ -778,18 +797,6 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
}
}
if(pbi->common.filter_level)
{
/* update loopfilter info */
Segment = (alt_flt_enabled) ? xd->mode_info_context->mbmi.segment_id : 0;
filter_level = pbi->mt_baseline_filter_level[Segment];
/* Distance of Mb to the various image edges.
* These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
* Apply any context driven MB level adjustment
*/
filter_level = vp8_adjust_mb_lf_value(xd, filter_level);
}
/* Distance of Mb to the various image edges.
* These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
*/
@@ -821,7 +828,7 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
}
vp8_build_uvmvs(xd, pc->full_pixel);
vp8mt_decode_macroblock(pbi, xd, mb_row, mb_col);
decode_macroblock(pbi, xd, mb_row, mb_col);
/* check if the boolean decoder has suffered an error */
xd->corrupted |= vp8dx_bool_error(xd->current_bc);
@@ -853,6 +860,15 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
}
}
/* update loopfilter info */
Segment = (alt_flt_enabled) ? xd->mode_info_context->mbmi.segment_id : 0;
filter_level = pbi->mt_baseline_filter_level[Segment];
/* Distance of Mb to the various image edges.
* These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
* Apply any context driven MB level adjustment
*/
filter_level = vp8_adjust_mb_lf_value(xd, filter_level);
/* loopfilter on this macroblock. */
if (filter_level)
{
@@ -879,6 +895,10 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
xd->above_context++;
pbi->mt_current_mb_col[mb_row] = mb_col;
/* macroblock counter */
if (mb_row != pbi->common.mb_rows-1)
sem_post(&pbi->h_mb_counter[0]);
}
/* adjust to the next row of mbs */
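
The commented-out busy-wait on last_row_current_mb_col is replaced by counting semaphores: every finished macroblock posts one unit to the row below, and each row waits one unit per macroblock (row 0, which has no row above it, skips the wait), so a row can never overtake the row above it. A minimal sketch of the handshake under that assumption, with illustrative names:

#include <semaphore.h>

/* One row's decode loop: 'wait_on' is this row's counter, 'post_to' is the
 * next row's. The row above must post once per macroblock it completes. */
static void decode_row_sketch(sem_t *wait_on, sem_t *post_to,
                              int mb_cols, int is_last_row)
{
    int mb_col;

    for (mb_col = 0; mb_col < mb_cols; mb_col++)
    {
        sem_wait(wait_on);        /* block until the row above is ahead */
        /* ... decode macroblock (mb_row, mb_col) ... */
        if (!is_last_row)
            sem_post(post_to);    /* release one macroblock for the row below */
    }
}

In the diff the counters form a ring through the worker threads and the main decode loop (threads wait on h_mb_counter[ithread] and post to h_mb_counter[ithread+1]; the main loop waits on the last counter and posts to h_mb_counter[0]), but the invariant between adjacent rows is the same.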

View File

@@ -17,7 +17,7 @@
#if HAVE_MMX
void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q);
void vp8_dequantize_b_mmx(BLOCKD *d)
static void dequantize_b_mmx(BLOCKD *d)
{
short *sq = (short *) d->qcoeff;
short *dq = (short *) d->dqcoeff;
@@ -41,7 +41,7 @@ void vp8_arch_x86_decode_init(VP8D_COMP *pbi)
#if HAVE_MMX
if (flags & HAS_MMX)
{
pbi->dequant.block = vp8_dequantize_b_mmx;
pbi->dequant.block = dequantize_b_mmx;
pbi->dequant.idct_add = vp8_dequant_idct_add_mmx;
pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_mmx;
pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_mmx;

View File

@@ -35,23 +35,23 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.sad8x8 = vp8_sad8x8_c;
cpi->rtcd.variance.sad4x4 = vp8_sad4x4_c;*/
/*cpi->rtcd.variance.var4x4 = vp8_variance4x4_c;
cpi->rtcd.variance.var8x8 = vp8_variance8x8_c;
cpi->rtcd.variance.var8x16 = vp8_variance8x16_c;
/*cpi->rtcd.variance.var4x4 = vp8_variance4x4_c;*/
cpi->rtcd.variance.var8x8 = vp8_variance8x8_armv6;
/*cpi->rtcd.variance.var8x16 = vp8_variance8x16_c;
cpi->rtcd.variance.var16x8 = vp8_variance16x8_c;*/
cpi->rtcd.variance.var16x16 = vp8_variance16x16_armv6;
/*cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_c;
cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_c;
cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_c;
/*cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_c;*/
cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_armv6;
/*cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_c;
cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_c;*/
cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_armv6;
cpi->rtcd.variance.halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_armv6;
cpi->rtcd.variance.halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_armv6;
cpi->rtcd.variance.halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_armv6;
/*cpi->rtcd.variance.mse16x16 = vp8_mse16x16_c;
cpi->rtcd.variance.getmbss = vp8_get_mb_ss_c;*/
cpi->rtcd.variance.mse16x16 = vp8_mse16x16_armv6;
/*cpi->rtcd.variance.getmbss = vp8_get_mb_ss_c;*/
/*cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_c;
cpi->rtcd.variance.get8x8var = vp8_get8x8var_c;
@@ -59,9 +59,9 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_c;*/
/*cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c;
cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c;
cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_c;
cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_c;*/
cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c;*/
cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_armv6;
cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_armv6;
cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_armv6;
/*cpi->rtcd.encodemb.berr = vp8_block_error_c;
@@ -71,8 +71,8 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
cpi->rtcd.encodemb.submby = vp8_subtract_mby_c;
cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_c;*/
/*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;
cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_c;*/
/*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;*/
cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_armv6;
}
#endif

View File

@@ -0,0 +1,262 @@
;
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_fast_fdct4x4_armv6|
ARM
REQUIRE8
PRESERVE8
AREA |.text|, CODE, READONLY
; void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
|vp8_fast_fdct4x4_armv6| PROC
stmfd sp!, {r4 - r12, lr}
; PART 1
; coeffs 0-3
ldrd r4, r5, [r0] ; [i1 | i0] [i3 | i2]
ldr r10, c7500
ldr r11, c14500
ldr r12, c0x22a453a0 ; [2217*4 | 5352*4]
ldr lr, c0x00080008
ror r5, r5, #16 ; [i2 | i3]
qadd16 r6, r4, r5 ; [i1+i2 | i0+i3] = [b1 | a1] without shift
qsub16 r7, r4, r5 ; [i1-i2 | i0-i3] = [c1 | d1] without shift
add r0, r0, r2 ; update input pointer
qadd16 r7, r7, r7 ; 2*[c1|d1] --> we can use smlad and smlsd
; with 2217*4 and 5352*4 without losing the
; sign bit (overflow)
smuad r4, r6, lr ; o0 = (i1+i2)*8 + (i0+i3)*8
smusd r5, r6, lr ; o2 = (i1+i2)*8 - (i0+i3)*8
smlad r6, r7, r12, r11 ; o1 = (c1 * 2217 + d1 * 5352 + 14500)
smlsdx r7, r7, r12, r10 ; o3 = (d1 * 2217 - c1 * 5352 + 7500)
ldrd r8, r9, [r0] ; [i5 | i4] [i7 | i6]
pkhbt r3, r4, r6, lsl #4 ; [o1 | o0], keep in register for PART 2
pkhbt r6, r5, r7, lsl #4 ; [o3 | o2]
str r6, [r1, #4]
; coeffs 4-7
ror r9, r9, #16 ; [i6 | i7]
qadd16 r6, r8, r9 ; [i5+i6 | i4+i7] = [b1 | a1] without shift
qsub16 r7, r8, r9 ; [i5-i6 | i4-i7] = [c1 | d1] without shift
add r0, r0, r2 ; update input pointer
qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd
; with 2217*4 and 5352*4 without losing the
; sign bit (overflow)
smuad r9, r6, lr ; o4 = (i5+i6)*8 + (i4+i7)*8
smusd r8, r6, lr ; o6 = (i5+i6)*8 - (i4+i7)*8
smlad r6, r7, r12, r11 ; o5 = (c1 * 2217 + d1 * 5352 + 14500)
smlsdx r7, r7, r12, r10 ; o7 = (d1 * 2217 - c1 * 5352 + 7500)
ldrd r4, r5, [r0] ; [i9 | i8] [i11 | i10]
pkhbt r9, r9, r6, lsl #4 ; [o5 | o4], keep in register for PART 2
pkhbt r6, r8, r7, lsl #4 ; [o7 | o6]
str r6, [r1, #12]
; coeffs 8-11
ror r5, r5, #16 ; [i10 | i11]
qadd16 r6, r4, r5 ; [i9+i10 | i8+i11]=[b1 | a1] without shift
qsub16 r7, r4, r5 ; [i9-i10 | i8-i11]=[c1 | d1] without shift
add r0, r0, r2 ; update input pointer
qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd
; with 2217*4 and 5352*4 without losing the
; sign bit (overflow)
smuad r2, r6, lr ; o8 = (i9+i10)*8 + (i8+i11)*8
smusd r8, r6, lr ; o10 = (i9+i10)*8 - (i8+i11)*8
smlad r6, r7, r12, r11 ; o9 = (c1 * 2217 + d1 * 5352 + 14500)
smlsdx r7, r7, r12, r10 ; o11 = (d1 * 2217 - c1 * 5352 + 7500)
ldrd r4, r5, [r0] ; [i13 | i12] [i15 | i14]
pkhbt r2, r2, r6, lsl #4 ; [o9 | o8], keep in register for PART 2
pkhbt r6, r8, r7, lsl #4 ; [o11 | o10]
str r6, [r1, #20]
; coeffs 12-15
ror r5, r5, #16 ; [i14 | i15]
qadd16 r6, r4, r5 ; [i13+i14 | i12+i15]=[b1|a1] without shift
qsub16 r7, r4, r5 ; [i13-i14 | i12-i15]=[c1|d1] without shift
qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd
; with 2217*4 and 5352*4 without losing the
; sign bit (overflow)
smuad r4, r6, lr ; o12 = (i13+i14)*8 + (i12+i15)*8
smusd r5, r6, lr ; o14 = (i13+i14)*8 - (i12+i15)*8
smlad r6, r7, r12, r11 ; o13 = (c1 * 2217 + d1 * 5352 + 14500)
smlsdx r7, r7, r12, r10 ; o15 = (d1 * 2217 - c1 * 5352 + 7500)
pkhbt r0, r4, r6, lsl #4 ; [o13 | o12], keep in register for PART 2
pkhbt r6, r5, r7, lsl #4 ; [o15 | o14]
str r6, [r1, #28]
; PART 2 -------------------------------------------------
ldr r11, c12000
ldr r10, c51000
ldr lr, c0x00070007
qadd16 r4, r3, r0 ; a1 = [i1+i13 | i0+i12]
qadd16 r5, r9, r2 ; b1 = [i5+i9 | i4+i8]
qsub16 r6, r9, r2 ; c1 = [i5-i9 | i4-i8]
qsub16 r7, r3, r0 ; d1 = [i1-i13 | i0-i12]
qadd16 r4, r4, lr ; a1 + 7
add r0, r11, #0x10000 ; add (d!=0)
qadd16 r2, r4, r5 ; a1 + b1 + 7
qsub16 r3, r4, r5 ; a1 - b1 + 7
ldr r12, c0x08a914e8 ; [2217 | 5352]
lsl r8, r2, #16 ; prepare bottom halfword for scaling
asr r2, r2, #4 ; scale top halfword
lsl r9, r3, #16 ; prepare bottom halfword for scaling
asr r3, r3, #4 ; scale top halfword
pkhtb r4, r2, r8, asr #20 ; pack and scale bottom halfword
pkhtb r5, r3, r9, asr #20 ; pack and scale bottom halfword
smulbt r2, r6, r12 ; [ ------ | c1*2217]
str r4, [r1, #0] ; [ o1 | o0]
smultt r3, r6, r12 ; [c1*2217 | ------ ]
str r5, [r1, #16] ; [ o9 | o8]
smlabb r8, r7, r12, r2 ; [ ------ | d1*5352]
smlatb r9, r7, r12, r3 ; [d1*5352 | ------ ]
smulbb r2, r6, r12 ; [ ------ | c1*5352]
smultb r3, r6, r12 ; [c1*5352 | ------ ]
lsls r6, r7, #16 ; d1 != 0 ?
addeq r8, r8, r11 ; c1_b*2217+d1_b*5352+12000 + (d==0)
addne r8, r8, r0 ; c1_b*2217+d1_b*5352+12000 + (d!=0)
asrs r6, r7, #16
addeq r9, r9, r11 ; c1_t*2217+d1_t*5352+12000 + (d==0)
addne r9, r9, r0 ; c1_t*2217+d1_t*5352+12000 + (d!=0)
smlabt r4, r7, r12, r10 ; [ ------ | d1*2217] + 51000
smlatt r5, r7, r12, r10 ; [d1*2217 | ------ ] + 51000
pkhtb r9, r9, r8, asr #16
sub r4, r4, r2
sub r5, r5, r3
ldr r3, [r1, #4] ; [i3 | i2]
pkhtb r5, r5, r4, asr #16 ; [o13|o12]
str r9, [r1, #8] ; [o5 | o4]
ldr r9, [r1, #12] ; [i7 | i6]
ldr r8, [r1, #28] ; [i15|i14]
ldr r2, [r1, #20] ; [i11|i10]
str r5, [r1, #24] ; [o13|o12]
qadd16 r4, r3, r8 ; a1 = [i3+i15 | i2+i14]
qadd16 r5, r9, r2 ; b1 = [i7+i11 | i6+i10]
qadd16 r4, r4, lr ; a1 + 7
qsub16 r6, r9, r2 ; c1 = [i7-i11 | i6-i10]
qadd16 r2, r4, r5 ; a1 + b1 + 7
qsub16 r7, r3, r8 ; d1 = [i3-i15 | i2-i14]
qsub16 r3, r4, r5 ; a1 - b1 + 7
lsl r8, r2, #16 ; prepare bottom halfword for scaling
asr r2, r2, #4 ; scale top halfword
lsl r9, r3, #16 ; prepare bottom halfword for scaling
asr r3, r3, #4 ; scale top halfword
pkhtb r4, r2, r8, asr #20 ; pack and scale bottom halfword
pkhtb r5, r3, r9, asr #20 ; pack and scale bottom halfword
smulbt r2, r6, r12 ; [ ------ | c1*2217]
str r4, [r1, #4] ; [ o3 | o2]
smultt r3, r6, r12 ; [c1*2217 | ------ ]
str r5, [r1, #20] ; [ o11 | o10]
smlabb r8, r7, r12, r2 ; [ ------ | d1*5352]
smlatb r9, r7, r12, r3 ; [d1*5352 | ------ ]
smulbb r2, r6, r12 ; [ ------ | c1*5352]
smultb r3, r6, r12 ; [c1*5352 | ------ ]
lsls r6, r7, #16 ; d1 != 0 ?
addeq r8, r8, r11 ; c1_b*2217+d1_b*5352+12000 + (d==0)
addne r8, r8, r0 ; c1_b*2217+d1_b*5352+12000 + (d!=0)
asrs r6, r7, #16
addeq r9, r9, r11 ; c1_t*2217+d1_t*5352+12000 + (d==0)
addne r9, r9, r0 ; c1_t*2217+d1_t*5352+12000 + (d!=0)
smlabt r4, r7, r12, r10 ; [ ------ | d1*2217] + 51000
smlatt r5, r7, r12, r10 ; [d1*2217 | ------ ] + 51000
pkhtb r9, r9, r8, asr #16
sub r4, r4, r2
sub r5, r5, r3
str r9, [r1, #12] ; [o7 | o6]
pkhtb r5, r5, r4, asr #16 ; [o15|o14]
str r5, [r1, #28] ; [o15|o14]
ldmfd sp!, {r4 - r12, pc}
ENDP
; Used constants
c7500
DCD 7500
c14500
DCD 14500
c0x22a453a0
DCD 0x22a453a0
c0x00080008
DCD 0x00080008
c12000
DCD 12000
c51000
DCD 51000
c0x00070007
DCD 0x00070007
c0x08a914e8
DCD 0x08a914e8
END
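
The constants loaded in PART 1 (5352, 2217 and the rounding terms 14500/7500) are the standard VP8 4x4 forward-DCT coefficients; the 2217*4/5352*4 forms and the doubled c1/d1 exist only so the packed SIMD multiplies keep their sign bits. A scalar C sketch of one first-pass row, reconstructed from the instruction comments (the authoritative scalar code lives in vp8/encoder/dct.c; treat this as illustrative):

/* One row of the first pass: four 16-bit inputs to four outputs. */
static void fdct4_row_sketch(const short *in, short *out)
{
    int a1 = (in[0] + in[3]) << 3;   /* sums scaled by 8, as in the smuad path */
    int b1 = (in[1] + in[2]) << 3;
    int c1 = (in[1] - in[2]) << 3;
    int d1 = (in[0] - in[3]) << 3;

    out[0] = (short)(a1 + b1);
    out[2] = (short)(a1 - b1);
    out[1] = (short)((c1 * 2217 + d1 * 5352 + 14500) >> 12);
    out[3] = (short)((d1 * 2217 - c1 * 5352 + 7500) >> 12);
}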

View File

@@ -0,0 +1,224 @@
;
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_fast_quantize_b_armv6|
INCLUDE asm_enc_offsets.asm
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 BLOCK *b
; r1 BLOCKD *d
|vp8_fast_quantize_b_armv6| PROC
stmfd sp!, {r1, r4-r11, lr}
ldr r3, [r0, #vp8_block_coeff] ; coeff
ldr r4, [r0, #vp8_block_quant_fast] ; quant_fast
ldr r5, [r0, #vp8_block_round] ; round
ldr r6, [r1, #vp8_blockd_qcoeff] ; qcoeff
ldr r7, [r1, #vp8_blockd_dqcoeff] ; dqcoeff
ldr r8, [r1, #vp8_blockd_dequant] ; dequant
ldr r2, loop_count ; loop_count=0x1000000. 'lsls' instruction
; is used to update the counter so that
; it can be used to mark nonzero
; quantized coefficient pairs.
mov r1, #0 ; flags for quantized coeffs
; PART 1: quantization and dequantization loop
loop
ldr r9, [r3], #4 ; [z1 | z0]
ldr r10, [r5], #4 ; [r1 | r0]
ldr r11, [r4], #4 ; [q1 | q0]
ssat16 lr, #1, r9 ; [sz1 | sz0]
eor r9, r9, lr ; [z1 ^ sz1 | z0 ^ sz0]
ssub16 r9, r9, lr ; x = (z ^ sz) - sz
sadd16 r9, r9, r10 ; [x1+r1 | x0+r0]
ldr r12, [r3], #4 ; [z3 | z2]
smulbb r0, r9, r11 ; [(x0+r0)*q0]
smultt r9, r9, r11 ; [(x1+r1)*q1]
ldr r10, [r5], #4 ; [r3 | r2]
ssat16 r11, #1, r12 ; [sz3 | sz2]
eor r12, r12, r11 ; [z3 ^ sz3 | z2 ^ sz2]
pkhtb r0, r9, r0, asr #16 ; [y1 | y0]
ldr r9, [r4], #4 ; [q3 | q2]
ssub16 r12, r12, r11 ; x = (z ^ sz) - sz
sadd16 r12, r12, r10 ; [x3+r3 | x2+r2]
eor r0, r0, lr ; [(y1 ^ sz1) | (y0 ^ sz0)]
smulbb r10, r12, r9 ; [(x2+r2)*q2]
smultt r12, r12, r9 ; [(x3+r3)*q3]
ssub16 r0, r0, lr ; x = (y ^ sz) - sz
cmp r0, #0 ; check if zero
orrne r1, r1, r2, lsr #24 ; add flag for nonzero coeffs
str r0, [r6], #4 ; *qcoeff++ = x
ldr r9, [r8], #4 ; [dq1 | dq0]
pkhtb r10, r12, r10, asr #16 ; [y3 | y2]
eor r10, r10, r11 ; [(y3 ^ sz3) | (y2 ^ sz2)]
ssub16 r10, r10, r11 ; x = (y ^ sz) - sz
cmp r10, #0 ; check if zero
orrne r1, r1, r2, lsr #23 ; add flag for nonzero coeffs
str r10, [r6], #4 ; *qcoeff++ = x
ldr r11, [r8], #4 ; [dq3 | dq2]
smulbb r12, r0, r9 ; [x0*dq0]
smultt r0, r0, r9 ; [x1*dq1]
smulbb r9, r10, r11 ; [x2*dq2]
smultt r10, r10, r11 ; [x3*dq3]
lsls r2, r2, #2 ; update loop counter
strh r12, [r7, #0] ; dqcoeff[0] = [x0*dq0]
strh r0, [r7, #2] ; dqcoeff[1] = [x1*dq1]
strh r9, [r7, #4] ; dqcoeff[2] = [x2*dq2]
strh r10, [r7, #6] ; dqcoeff[3] = [x3*dq3]
add r7, r7, #8 ; dqcoeff += 8
bne loop
; PART 2: check position for eob...
mov lr, #0 ; init eob
cmp r1, #0 ; coeffs after quantization?
ldr r11, [sp, #0] ; restore BLOCKD pointer
beq end ; skip eob calculations if all zero
ldr r0, [r11, #vp8_blockd_qcoeff]
; check shortcut for nonzero qcoeffs
tst r1, #0x80
bne quant_coeff_15_14
tst r1, #0x20
bne quant_coeff_13_11
tst r1, #0x8
bne quant_coeff_12_7
tst r1, #0x40
bne quant_coeff_10_9
tst r1, #0x10
bne quant_coeff_8_3
tst r1, #0x2
bne quant_coeff_6_5
tst r1, #0x4
bne quant_coeff_4_2
b quant_coeff_1_0
quant_coeff_15_14
ldrh r2, [r0, #30] ; rc=15, i=15
mov lr, #16
cmp r2, #0
bne end
ldrh r3, [r0, #28] ; rc=14, i=14
mov lr, #15
cmp r3, #0
bne end
quant_coeff_13_11
ldrh r2, [r0, #22] ; rc=11, i=13
mov lr, #14
cmp r2, #0
bne end
quant_coeff_12_7
ldrh r3, [r0, #14] ; rc=7, i=12
mov lr, #13
cmp r3, #0
bne end
ldrh r2, [r0, #20] ; rc=10, i=11
mov lr, #12
cmp r2, #0
bne end
quant_coeff_10_9
ldrh r3, [r0, #26] ; rc=13, i=10
mov lr, #11
cmp r3, #0
bne end
ldrh r2, [r0, #24] ; rc=12, i=9
mov lr, #10
cmp r2, #0
bne end
quant_coeff_8_3
ldrh r3, [r0, #18] ; rc=9, i=8
mov lr, #9
cmp r3, #0
bne end
ldrh r2, [r0, #12] ; rc=6, i=7
mov lr, #8
cmp r2, #0
bne end
quant_coeff_6_5
ldrh r3, [r0, #6] ; rc=3, i=6
mov lr, #7
cmp r3, #0
bne end
ldrh r2, [r0, #4] ; rc=2, i=5
mov lr, #6
cmp r2, #0
bne end
quant_coeff_4_2
ldrh r3, [r0, #10] ; rc=5, i=4
mov lr, #5
cmp r3, #0
bne end
ldrh r2, [r0, #16] ; rc=8, i=3
mov lr, #4
cmp r2, #0
bne end
ldrh r3, [r0, #8] ; rc=4, i=2
mov lr, #3
cmp r3, #0
bne end
quant_coeff_1_0
ldrh r2, [r0, #2] ; rc=1, i=1
mov lr, #2
cmp r2, #0
bne end
mov lr, #1 ; rc=0, i=0
end
str lr, [r11, #vp8_blockd_eob]
ldmfd sp!, {r1, r4-r11, pc}
ENDP
loop_count
DCD 0x1000000
END
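
PART 1 is the usual branch-free sign fold: ssat16 builds a per-halfword sign mask sz, the coefficient is folded to its magnitude, quantized, and the sign restored. A scalar sketch of the per-coefficient math, mirroring vp8_fast_quantize_b_c (illustrative):

/* One coefficient of the fast quantizer: sz is 0 for z >= 0 and -1 for
 * z < 0, so (z ^ sz) - sz is |z| and the same fold restores the sign. */
static short quantize_one_sketch(short z, short round, short quant,
                                 short dequant, short *dqcoeff)
{
    short sz = z >> 15;                          /* sign mask: 0 or -1 */
    short x  = (short)((z ^ sz) - sz);           /* |z| */
    short y  = (short)(((x + round) * quant) >> 16);
    y = (short)((y ^ sz) - sz);                  /* reapply the sign */
    *dqcoeff = (short)(y * dequant);
    return y;                                    /* qcoeff */
}

PART 2 then converts the nonzero-pair flags accumulated in r1 into the end-of-block index by probing the quantized coefficients in reverse zig-zag order, as the rc=/i= comments indicate.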

View File

@@ -0,0 +1,133 @@
;
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_mse16x16_armv6|
ARM
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
;
;note: Based on vp8_variance16x16_armv6. In this function the sum is never
; used, so that part of the calculation has been removed.
|vp8_mse16x16_armv6| PROC
push {r4-r9, lr}
mov r12, #16 ; set loop counter to 16 (=block height)
mov r4, #0 ; initialize sse = 0
loop
; 1st 4 pixels
ldr r5, [r0, #0x0] ; load 4 src pixels
ldr r6, [r2, #0x0] ; load 4 ref pixels
mov lr, #0 ; constant zero
usub8 r8, r5, r6 ; calculate difference
sel r7, r8, lr ; select bytes with positive difference
usub8 r9, r6, r5 ; calculate difference with reversed operands
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r5, r7, lr ; calculate sum of positive differences
usad8 r6, r8, lr ; calculate sum of negative differences
orr r8, r8, r7 ; differences of all 4 pixels
ldr r5, [r0, #0x4] ; load 4 src pixels
; calculate sse
uxtb16 r6, r8 ; byte (two pixels) to halfwords
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
; 2nd 4 pixels
ldr r6, [r2, #0x4] ; load 4 ref pixels
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
usub8 r8, r5, r6 ; calculate difference
sel r7, r8, lr ; select bytes with positive difference
usub8 r9, r6, r5 ; calculate difference with reversed operands
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r5, r7, lr ; calculate sum of positive differences
usad8 r6, r8, lr ; calculate sum of negative differences
orr r8, r8, r7 ; differences of all 4 pixels
ldr r5, [r0, #0x8] ; load 4 src pixels
; calculate sse
uxtb16 r6, r8 ; byte (two pixels) to halfwords
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
; 3rd 4 pixels
ldr r6, [r2, #0x8] ; load 4 ref pixels
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
usub8 r8, r5, r6 ; calculate difference
sel r7, r8, lr ; select bytes with positive difference
usub8 r9, r6, r5 ; calculate difference with reversed operands
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r5, r7, lr ; calculate sum of positive differences
usad8 r6, r8, lr ; calculate sum of negative differences
orr r8, r8, r7 ; differences of all 4 pixels
ldr r5, [r0, #0xc] ; load 4 src pixels
; calculate sse
uxtb16 r6, r8 ; byte (two pixels) to halfwords
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
; 4th 4 pixels
ldr r6, [r2, #0xc] ; load 4 ref pixels
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
usub8 r8, r5, r6 ; calculate difference
add r0, r0, r1 ; set src_ptr to next row
sel r7, r8, lr ; select bytes with positive difference
usub8 r9, r6, r5 ; calculate difference with reversed operands
add r2, r2, r3 ; set dst_ptr to next row
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r5, r7, lr ; calculate sum of positive differences
usad8 r6, r8, lr ; calculate sum of negative differences
orr r8, r8, r7 ; differences of all 4 pixels
subs r12, r12, #1 ; next row
; calculate sse
uxtb16 r6, r8 ; byte (two pixels) to halfwords
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
bne loop
; return stuff
ldr r1, [sp, #28] ; get address of sse
mov r0, r4 ; return sse
str r4, [r1] ; store sse
pop {r4-r9, pc}
ENDP
END

View File

@@ -0,0 +1,95 @@
;
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_variance8x8_armv6|
ARM
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
|vp8_variance8x8_armv6| PROC
push {r4-r10, lr}
mov r12, #8 ; set loop counter to 8 (=block height)
mov r4, #0 ; initialize sum = 0
mov r5, #0 ; initialize sse = 0
loop
; 1st 4 pixels
ldr r6, [r0, #0x0] ; load 4 src pixels
ldr r7, [r2, #0x0] ; load 4 ref pixels
mov lr, #0 ; constant zero
usub8 r8, r6, r7 ; calculate difference
sel r10, r8, lr ; select bytes with positive difference
usub8 r9, r7, r6 ; calculate difference with reversed operands
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r6, r10, lr ; calculate sum of positive differences
usad8 r7, r8, lr ; calculate sum of negative differences
orr r8, r8, r10 ; differences of all 4 pixels
; calculate total sum
add r4, r4, r6 ; add positive differences to sum
sub r4, r4, r7 ; subtract negative differences from sum
; calculate sse
uxtb16 r7, r8 ; byte (two pixels) to halfwords
uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
; 2nd 4 pixels
ldr r6, [r0, #0x4] ; load 4 src pixels
ldr r7, [r2, #0x4] ; load 4 ref pixels
smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
usub8 r8, r6, r7 ; calculate difference
add r0, r0, r1 ; set src_ptr to next row
sel r10, r8, lr ; select bytes with positive difference
usub8 r9, r7, r6 ; calculate difference with reversed operands
add r2, r2, r3 ; set dst_ptr to next row
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r6, r10, lr ; calculate sum of positive differences
usad8 r7, r8, lr ; calculate sum of negative differences
orr r8, r8, r10 ; differences of all 4 pixels
; calculate total sum
add r4, r4, r6 ; add positive differences to sum
sub r4, r4, r7 ; subtract negative differences from sum
; calculate sse
uxtb16 r7, r8 ; byte (two pixels) to halfwords
uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
subs r12, r12, #1 ; next row
smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
bne loop
; return stuff
ldr r8, [sp, #32] ; get address of sse
mul r1, r4, r4 ; sum * sum
str r5, [r8] ; store sse
sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6))
pop {r4-r10, pc}
ENDP
END
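
The closing subtraction is the integer variance identity for the 64-pixel block, N*Var = sum(d*d) - (sum d)^2 / N, with the division by N = 64 done as an arithmetic shift. In sketch form (the 16x16 variants shift by 8 for 256 pixels):

/* Variance from accumulated sums; log2_pixels is 6 for 8x8, 8 for 16x16. */
static unsigned int variance_from_sums(unsigned int sse, int sum,
                                       int log2_pixels)
{
    return sse - (unsigned int)((sum * sum) >> log2_pixels);
}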

vp8/encoder/arm/dct_arm.c (new file, 24 lines)
View File

@@ -0,0 +1,24 @@
/*
* Copyright (c) 2011 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_config.h"
#include "vp8/encoder/dct.h"
#if HAVE_ARMV6
void vp8_fast_fdct8x4_armv6(short *input, short *output, int pitch)
{
vp8_fast_fdct4x4_armv6(input, output, pitch);
vp8_fast_fdct4x4_armv6(input + 4, output + 16, pitch);
}
#endif /* HAVE_ARMV6 */

View File

@@ -14,12 +14,21 @@
#if HAVE_ARMV6
extern prototype_fdct(vp8_short_walsh4x4_armv6);
extern prototype_fdct(vp8_fast_fdct4x4_armv6);
extern prototype_fdct(vp8_fast_fdct8x4_armv6);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_fdct_walsh_short4x4
#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_armv6
#undef vp8_fdct_fast4x4
#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_armv6
#undef vp8_fdct_fast8x4
#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_armv6
#endif
#endif
#endif /* HAVE_ARMV6 */
#if HAVE_ARMV7
extern prototype_fdct(vp8_short_fdct4x4_neon);

View File

@@ -12,6 +12,16 @@
#ifndef QUANTIZE_ARM_H
#define QUANTIZE_ARM_H
#if HAVE_ARMV6
extern prototype_quantize_block(vp8_fast_quantize_b_armv6);
#undef vp8_quantize_fastquantb
#define vp8_quantize_fastquantb vp8_fast_quantize_b_armv6
#endif /* HAVE_ARMV6 */
#if HAVE_ARMV7
extern prototype_quantize_block(vp8_fast_quantize_b_neon);

View File

@@ -15,6 +15,34 @@
#if HAVE_ARMV6
unsigned int vp8_sub_pixel_variance8x8_armv6
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
unsigned short first_pass[10*8];
unsigned char second_pass[8*8];
const short *HFilter, *VFilter;
HFilter = vp8_bilinear_filters[xoffset];
VFilter = vp8_bilinear_filters[yoffset];
vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
src_pixels_per_line,
9, 8, HFilter);
vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
8, 8, 8, VFilter);
return vp8_variance8x8_armv6(second_pass, 8, dst_ptr,
dst_pixels_per_line, sse);
}
unsigned int vp8_sub_pixel_variance16x16_armv6
(
const unsigned char *src_ptr,

View File

@@ -16,10 +16,13 @@
extern prototype_sad(vp8_sad16x16_armv6);
extern prototype_variance(vp8_variance16x16_armv6);
extern prototype_variance(vp8_variance8x8_armv6);
extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_armv6);
extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_armv6);
extern prototype_variance(vp8_variance_halfpixvar16x16_h_armv6);
extern prototype_variance(vp8_variance_halfpixvar16x16_v_armv6);
extern prototype_variance(vp8_variance_halfpixvar16x16_hv_armv6);
extern prototype_variance(vp8_mse16x16_armv6);
#if !CONFIG_RUNTIME_CPU_DETECT
@@ -29,9 +32,18 @@ extern prototype_variance(vp8_variance_halfpixvar16x16_hv_armv6);
#undef vp8_variance_subpixvar16x16
#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_armv6
#undef vp8_variance_subpixvar8x8
#define vp8_variance_subpixvar8x8 vp8_sub_pixel_variance8x8_armv6
#undef vp8_variance_var16x16
#define vp8_variance_var16x16 vp8_variance16x16_armv6
#undef vp8_variance_mse16x16
#define vp8_variance_mse16x16 vp8_mse16x16_armv6
#undef vp8_variance_var8x8
#define vp8_variance_var8x8 vp8_variance8x8_armv6
#undef vp8_variance_halfpixvar16x16_h
#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_armv6

View File

@@ -65,6 +65,17 @@ DEFINE(TOKENLIST_SZ, sizeof(TOKENLIST));
DEFINE(vp8_common_mb_rows, offsetof(VP8_COMMON, mb_rows));
// offsets from BLOCK structure
DEFINE(vp8_block_coeff, offsetof(BLOCK, coeff));
DEFINE(vp8_block_quant_fast, offsetof(BLOCK, quant_fast));
DEFINE(vp8_block_round, offsetof(BLOCK, round));
// offsets from BLOCKD structure
DEFINE(vp8_blockd_qcoeff, offsetof(BLOCKD, qcoeff));
DEFINE(vp8_blockd_dqcoeff, offsetof(BLOCKD, dqcoeff));
DEFINE(vp8_blockd_dequant, offsetof(BLOCKD, dequant));
DEFINE(vp8_blockd_eob, offsetof(BLOCKD, eob));
// These two sizes are used in vp8cx_pack_tokens. They are hard coded
// so if the size changes this will have to be adjusted.
#if HAVE_ARMV5TE
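
The new BLOCK/BLOCKD entries let the ARMv6 quantizer address struct fields symbolically; the generated constants appear as the immediate offsets in the loads at the top of vp8_fast_quantize_b_armv6 above. Illustratively:

/* Each DEFINE(sym, offsetof(type, field)) is scraped into an assembler
 * constant, so the assembly can write
 *     ldr r3, [r0, #vp8_block_coeff]      ; r0 holds BLOCK *b
 *     ldr r6, [r1, #vp8_blockd_qcoeff]    ; r1 holds BLOCKD *d
 * and keeps working if the C struct layout changes. */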

View File

@@ -1366,6 +1366,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size)
oh.show_frame = (int) pc->show_frame;
oh.type = (int)pc->frame_type;
oh.version = pc->version;
oh.first_partition_length_in_bytes = 0;
mb_feature_data_bits = vp8_mb_feature_data_bits;
cx_data += 3;
@@ -1634,6 +1635,21 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size)
vp8_stop_encode(bc);
oh.first_partition_length_in_bytes = cpi->bc.pos;
/* update frame tag */
{
int v = (oh.first_partition_length_in_bytes << 5) |
(oh.show_frame << 4) |
(oh.version << 1) |
oh.type;
dest[0] = v;
dest[1] = v >> 8;
dest[2] = v >> 16;
}
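The three bytes written here form the VP8 frame tag, stored little-endian: bit 0 is the frame type, bits 1-3 the version, bit 4 show_frame, and bits 5-23 the first partition length. A decoder-side sketch of the inverse of the packing above:

    /* Sketch: unpacking the 3-byte frame tag written by this block. */
    unsigned int v = dest[0] | (dest[1] << 8) | (dest[2] << 16);
    int          frame_type      = v & 1;        /* 0 = key frame */
    int          version         = (v >> 1) & 7;
    int          show_frame      = (v >> 4) & 1;
    unsigned int first_part_size = v >> 5;       /* in bytes      */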
*size = VP8_HEADER_SIZE + extra_bytes_packed + cpi->bc.pos;
if (pc->multi_token_partition != ONE_PARTITION)
{
@@ -1643,9 +1659,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size)
pack_tokens_into_partitions(cpi, cx_data + bc->pos, num_part, &asize);
oh.first_partition_length_in_bytes = cpi->bc.pos;
*size = cpi->bc.pos + VP8_HEADER_SIZE + asize + extra_bytes_packed;
*size += asize;
}
else
{
@@ -1659,19 +1673,8 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size)
pack_tokens(&cpi->bc2, cpi->tok, cpi->tok_count);
vp8_stop_encode(&cpi->bc2);
oh.first_partition_length_in_bytes = cpi->bc.pos ;
*size = cpi->bc2.pos + cpi->bc.pos + VP8_HEADER_SIZE + extra_bytes_packed;
}
{
int v = (oh.first_partition_length_in_bytes << 5) |
(oh.show_frame << 4) |
(oh.version << 1) |
oh.type;
dest[0] = v;
dest[1] = v >> 8;
dest[2] = v >> 16;
*size += cpi->bc2.pos;
}
}


@@ -808,7 +808,7 @@ void vp8_encode_frame(VP8_COMP *cpi)
vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei, 1, cpi->encoding_thread_count);
for (i = 0; i < cm->mb_rows; i++)
cpi->mt_current_mb_col[i] = 0;
cpi->mt_current_mb_col[i] = -1;
for (i = 0; i < cpi->encoding_thread_count; i++)
{
@@ -1184,7 +1184,8 @@ int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
int distortion2;
x->e_mbd.mode_info_context->mbmi.mode = mode;
vp8_build_intra_predictors_mby_ptr(&x->e_mbd);
RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby)
(&x->e_mbd);
distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, 0x7fffffff);
rate2 = x->mbmode_cost[x->e_mbd.frame_type][mode];
this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2);


@@ -25,19 +25,6 @@
#define intra4x4pbias_rate 256
void vp8_update_mode_context(int *abmode, int *lbmode, int i, int best_mode)
{
if (i < 12)
{
abmode[i+4] = best_mode;
}
if ((i & 3) != 3)
{
lbmode[i+1] = best_mode;
}
}
#if CONFIG_RUNTIME_CPU_DETECT
#define IF_RTCD(x) (x)
#else
@@ -80,7 +67,7 @@ void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
{
int b;
vp8_build_intra_predictors_mby_ptr(&x->e_mbd);
RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mby)(&x->e_mbd);
ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, x->src.y_stride);


@@ -17,7 +17,6 @@ void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *, MACROBLOCK *x);
void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *, MACROBLOCK *x);
void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *, MACROBLOCK *mb);
void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *, MACROBLOCK *x, BLOCK *be, BLOCKD *b, int best_mode);
void vp8_update_mode_context(int *abmode, int *lbmode, int i, int best_mode);
void vp8_encode_intra4x4block_rd(const VP8_ENCODER_RTCD *, MACROBLOCK *x, BLOCK *be, BLOCKD *b, int best_mode);
#endif


@@ -104,7 +104,7 @@ static void vp8_subtract_mb(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
}
void vp8_build_dcblock(MACROBLOCK *x)
static void build_dcblock(MACROBLOCK *x)
{
short *src_diff_ptr = &x->src_diff[384];
int i;
@@ -138,7 +138,7 @@ void vp8_transform_intra_mby(MACROBLOCK *x)
}
// build dc block from 16 y dc values
vp8_build_dcblock(x);
build_dcblock(x);
// do 2nd order transform on the dc block
x->short_walsh4x4(&x->block[24].src_diff[0],
@@ -147,7 +147,7 @@ void vp8_transform_intra_mby(MACROBLOCK *x)
}
void vp8_transform_mb(MACROBLOCK *x)
static void transform_mb(MACROBLOCK *x)
{
int i;
@@ -159,7 +159,7 @@ void vp8_transform_mb(MACROBLOCK *x)
// build dc block from 16 y dc values
if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
vp8_build_dcblock(x);
build_dcblock(x);
for (i = 16; i < 24; i += 2)
{
@@ -174,7 +174,8 @@ void vp8_transform_mb(MACROBLOCK *x)
}
void vp8_transform_mby(MACROBLOCK *x)
static void transform_mby(MACROBLOCK *x)
{
int i;
@@ -187,7 +188,7 @@ void vp8_transform_mby(MACROBLOCK *x)
// build dc block from 16 y dc values
if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
{
vp8_build_dcblock(x);
build_dcblock(x);
x->short_walsh4x4(&x->block[24].src_diff[0],
&x->block[24].coeff[0], 8);
}
@@ -255,9 +256,9 @@ static const int plane_rd_mult[4]=
Y1_RD_MULT
};
void vp8_optimize_b(MACROBLOCK *mb, int ib, int type,
ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
const VP8_ENCODER_RTCD *rtcd)
static void optimize_b(MACROBLOCK *mb, int ib, int type,
ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
const VP8_ENCODER_RTCD *rtcd)
{
BLOCK *b;
BLOCKD *d;
@@ -501,7 +502,7 @@ void vp8_optimize_b(MACROBLOCK *mb, int ib, int type,
*a = *l = (d->eob != !type);
}
void vp8_optimize_mb(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
static void optimize_mb(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
{
int b;
int type;
@@ -522,20 +523,20 @@ void vp8_optimize_mb(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
for (b = 0; b < 16; b++)
{
vp8_optimize_b(x, b, type,
optimize_b(x, b, type,
ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
}
for (b = 16; b < 24; b++)
{
vp8_optimize_b(x, b, PLANE_TYPE_UV,
optimize_b(x, b, PLANE_TYPE_UV,
ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
}
if (has_2nd_order)
{
b=24;
vp8_optimize_b(x, b, PLANE_TYPE_Y2,
optimize_b(x, b, PLANE_TYPE_Y2,
ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
}
}
@@ -569,7 +570,7 @@ void vp8_optimize_mby(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
for (b = 0; b < 16; b++)
{
vp8_optimize_b(x, b, type,
optimize_b(x, b, type,
ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
}
@@ -577,7 +578,7 @@ void vp8_optimize_mby(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
if (has_2nd_order)
{
b=24;
vp8_optimize_b(x, b, PLANE_TYPE_Y2,
optimize_b(x, b, PLANE_TYPE_Y2,
ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
}
}
@@ -603,7 +604,7 @@ void vp8_optimize_mbuv(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
for (b = 16; b < 24; b++)
{
vp8_optimize_b(x, b, PLANE_TYPE_UV,
optimize_b(x, b, PLANE_TYPE_UV,
ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
}
}
@@ -615,13 +616,13 @@ void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
vp8_subtract_mb(rtcd, x);
vp8_transform_mb(x);
transform_mb(x);
vp8_quantize_mb(x);
#if !(CONFIG_REALTIME_ONLY)
if (x->optimize)
vp8_optimize_mb(x, rtcd);
optimize_mb(x, rtcd);
#endif
vp8_inverse_transform_mb(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
@@ -638,7 +639,7 @@ void vp8_encode_inter16x16y(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, x->src.y_stride);
vp8_transform_mby(x);
transform_mby(x);
vp8_quantize_mby(x);
@@ -649,22 +650,6 @@ void vp8_encode_inter16x16y(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
}
void vp8_encode_inter16x16uv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
{
vp8_build_inter_predictors_mbuv(&x->e_mbd);
ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
vp8_transform_mbuv(x);
vp8_quantize_mbuv(x);
vp8_inverse_transform_mbuv(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
vp8_recon_intra_mbuv(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
}
void vp8_encode_inter16x16uvrd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
{
vp8_build_inter_predictors_mbuv(&x->e_mbd);


@@ -101,9 +101,6 @@ void vp8_build_dcblock(MACROBLOCK *b);
void vp8_transform_mb(MACROBLOCK *mb);
void vp8_transform_mbuv(MACROBLOCK *x);
void vp8_transform_intra_mby(MACROBLOCK *x);
void Encode16x16Y(MACROBLOCK *x);
void Encode16x16UV(MACROBLOCK *x);
void vp8_encode_inter16x16uv(const struct VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x);
void vp8_encode_inter16x16uvrd(const struct VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x);
void vp8_optimize_mby(MACROBLOCK *x, const struct VP8_ENCODER_RTCD *rtcd);
void vp8_optimize_mbuv(MACROBLOCK *x, const struct VP8_ENCODER_RTCD *rtcd);


@@ -24,6 +24,32 @@ extern void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x);
extern void vp8_build_block_offsets(MACROBLOCK *x);
extern void vp8_setup_block_ptrs(MACROBLOCK *x);
extern void loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm);
static THREAD_FUNCTION loopfilter_thread(void *p_data)
{
VP8_COMP *cpi = (VP8_COMP *)(((LPFTHREAD_DATA *)p_data)->ptr1);
VP8_COMMON *cm = &cpi->common;
while (1)
{
if (cpi->b_multi_threaded == 0)
break;
if (sem_wait(&cpi->h_event_start_lpf) == 0)
{
if (cpi->b_multi_threaded == FALSE) // we're shutting down
break;
loopfilter_frame(cpi, cm);
sem_post(&cpi->h_event_end_lpf);
}
}
return 0;
}
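The thread sleeps on h_event_start_lpf and signals completion through h_event_end_lpf, letting loop filtering overlap with other encoder work. The producer side of the handshake presumably looks like this (a sketch of the assumed usage, not quoted from the call site):

    /* Hedged sketch: main-thread side of the loopfilter handshake. */
    sem_post(&cpi->h_event_start_lpf);   /* kick the filter thread         */
    /* ... continue with work that does not need the filtered frame ...   */
    sem_wait(&cpi->h_event_end_lpf);     /* rendezvous before using it     */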
static
THREAD_FUNCTION thread_encoding_proc(void *p_data)
{
@@ -429,56 +455,70 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
void vp8cx_create_encoder_threads(VP8_COMP *cpi)
{
cpi->b_multi_threaded = 0;
const VP8_COMMON * cm = &cpi->common;
cpi->b_multi_threaded = 0;
cpi->encoding_thread_count = 0;
cpi->processor_core_count = 32; //vp8_get_proc_core_count();
if (cpi->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1)
{
int ithread;
int th_count = cpi->oxcf.multi_threaded - 1;
if (cpi->oxcf.multi_threaded > cpi->processor_core_count)
cpi->encoding_thread_count = cpi->processor_core_count - 1;
else
cpi->encoding_thread_count = cpi->oxcf.multi_threaded - 1;
th_count = cpi->processor_core_count - 1;
CHECK_MEM_ERROR(cpi->h_encoding_thread, vpx_malloc(sizeof(pthread_t) * cpi->encoding_thread_count));
CHECK_MEM_ERROR(cpi->h_event_start_encoding, vpx_malloc(sizeof(sem_t) * cpi->encoding_thread_count));
CHECK_MEM_ERROR(cpi->mb_row_ei, vpx_memalign(32, sizeof(MB_ROW_COMP) * cpi->encoding_thread_count));
vpx_memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * cpi->encoding_thread_count);
CHECK_MEM_ERROR(cpi->en_thread_data, vpx_malloc(sizeof(ENCODETHREAD_DATA) * cpi->encoding_thread_count));
CHECK_MEM_ERROR(cpi->mt_current_mb_col, vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cpi->common.mb_rows));
/* we have th_count + 1 (main) threads processing one row each */
/* no point to have more threads than the sync range allows */
if(th_count > ((cm->mb_cols / cpi->mt_sync_range) - 1))
{
th_count = (cm->mb_cols / cpi->mt_sync_range) - 1;
}
if(th_count == 0)
return;
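A quick numeric check of the cap (illustrative numbers, not from the source): a 640-pixel-wide frame has cm->mb_cols = 40, so with mt_sync_range = 4 at most 40/4 - 1 = 9 extra threads are kept; a tenth would spend its life waiting on the row above.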
CHECK_MEM_ERROR(cpi->h_encoding_thread, vpx_malloc(sizeof(pthread_t) * th_count));
CHECK_MEM_ERROR(cpi->h_event_start_encoding, vpx_malloc(sizeof(sem_t) * th_count));
CHECK_MEM_ERROR(cpi->mb_row_ei, vpx_memalign(32, sizeof(MB_ROW_COMP) * th_count));
vpx_memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * th_count);
CHECK_MEM_ERROR(cpi->en_thread_data,
vpx_malloc(sizeof(ENCODETHREAD_DATA) * th_count));
CHECK_MEM_ERROR(cpi->mt_current_mb_col,
vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows));
//cpi->h_event_main = CreateEvent(NULL, FALSE, FALSE, NULL);
sem_init(&cpi->h_event_end_encoding, 0, 0);
cpi->b_multi_threaded = 1;
cpi->encoding_thread_count = th_count;
//printf("[VP8:] multi_threaded encoding is enabled with %d threads\n\n", (cpi->encoding_thread_count +1));
/*
printf("[VP8:] multi_threaded encoding is enabled with %d threads\n\n",
(cpi->encoding_thread_count +1));
*/
for (ithread = 0; ithread < cpi->encoding_thread_count; ithread++)
for (ithread = 0; ithread < th_count; ithread++)
{
ENCODETHREAD_DATA * ethd = &cpi->en_thread_data[ithread];
//cpi->h_event_mbrencoding[ithread] = CreateEvent(NULL, FALSE, FALSE, NULL);
sem_init(&cpi->h_event_start_encoding[ithread], 0, 0);
ethd->ithread = ithread;
ethd->ptr1 = (void *)cpi;
ethd->ptr2 = (void *)&cpi->mb_row_ei[ithread];
//printf(" call begin thread %d \n", ithread);
//cpi->h_encoding_thread[ithread] = (HANDLE)_beginthreadex(
// NULL, // security
// 0, // stksize
// thread_encoding_proc,
// (&cpi->en_thread_data[ithread]), // Thread data
// 0,
// NULL);
pthread_create(&cpi->h_encoding_thread[ithread], 0, thread_encoding_proc, ethd);
}
{
LPFTHREAD_DATA * lpfthd = &cpi->lpf_thread_data;
sem_init(&cpi->h_event_start_lpf, 0, 0);
sem_init(&cpi->h_event_end_lpf, 0, 0);
lpfthd->ptr1 = (void *)cpi;
pthread_create(&cpi->h_filter_thread, 0, loopfilter_thread, lpfthd);
}
}
}
@@ -500,9 +540,14 @@ void vp8cx_remove_encoder_threads(VP8_COMP *cpi)
sem_destroy(&cpi->h_event_start_encoding[i]);
}
sem_post(&cpi->h_event_start_lpf);
pthread_join(cpi->h_filter_thread, 0);
}
sem_destroy(&cpi->h_event_end_encoding);
sem_destroy(&cpi->h_event_end_lpf);
sem_destroy(&cpi->h_event_start_lpf);
//free thread related resources
vpx_free(cpi->h_event_start_encoding);


@@ -67,7 +67,7 @@ static int vscale_lookup[7] = {0, 1, 1, 2, 2, 3, 3};
static int hscale_lookup[7] = {0, 0, 1, 1, 2, 2, 3};
const int cq_level[QINDEX_RANGE] =
static const int cq_level[QINDEX_RANGE] =
{
0,0,1,1,2,3,3,4,4,5,6,6,7,8,8,9,
9,10,11,11,12,13,13,14,15,15,16,17,17,18,19,20,
@@ -79,10 +79,9 @@ const int cq_level[QINDEX_RANGE] =
86,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100
};
void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame);
int vp8_input_stats(VP8_COMP *cpi, FIRSTPASS_STATS *fps);
static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame);
int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred)
static int encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred)
{
int i;
@@ -146,7 +145,7 @@ static double calculate_modified_err(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
/*start_pos = cpi->stats_in;
sum_iiratio = 0.0;
i = 0;
while ( (i < 1) && vp8_input_stats(cpi,&next_frame) != EOF )
while ( (i < 1) && input_stats(cpi,&next_frame) != EOF )
{
next_iiratio = next_frame.intra_error / DOUBLE_DIVIDE_CHECK(next_frame.coded_error);
@@ -212,7 +211,7 @@ static const double weight_table[256] = {
1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000
};
double vp8_simple_weight(YV12_BUFFER_CONFIG *source)
static double simple_weight(YV12_BUFFER_CONFIG *source)
{
int i, j;
@@ -240,7 +239,7 @@ double vp8_simple_weight(YV12_BUFFER_CONFIG *source)
// This function returns the current per frame maximum bitrate target
int frame_max_bits(VP8_COMP *cpi)
static int frame_max_bits(VP8_COMP *cpi)
{
// Max allocation for a single frame based on the max section guidelines passed in and how many bits are left
int max_bits;
@@ -281,38 +280,26 @@ int frame_max_bits(VP8_COMP *cpi)
}
extern size_t vp8_firstpass_stats_sz(unsigned int mb_count)
{
/* Calculate the size of a stats packet, which is dependent on the frame
* resolution. The FIRSTPASS_STATS struct has a single element array,
* motion_map, which is virtually expanded to have one element per
* macroblock.
*/
size_t stats_sz;
stats_sz = sizeof(FIRSTPASS_STATS) + mb_count;
stats_sz = (stats_sz + 7) & ~7;
return stats_sz;
}
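Worked example with illustrative numbers: if sizeof(FIRSTPASS_STATS) were 232 and mb_count 300, stats_sz would start at 532 and (stats_sz + 7) & ~7 would round it up to 536, the next multiple of 8. The hunks below retire this variable-size packet: with the motion map removed, a stats packet shrinks to a plain sizeof(FIRSTPASS_STATS).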
void vp8_output_stats(const VP8_COMP *cpi,
struct vpx_codec_pkt_list *pktlist,
FIRSTPASS_STATS *stats)
static void output_stats(const VP8_COMP *cpi,
struct vpx_codec_pkt_list *pktlist,
FIRSTPASS_STATS *stats)
{
struct vpx_codec_cx_pkt pkt;
pkt.kind = VPX_CODEC_STATS_PKT;
pkt.data.twopass_stats.buf = stats;
pkt.data.twopass_stats.sz = vp8_firstpass_stats_sz(cpi->common.MBs);
pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS);
vpx_codec_pkt_list_add(pktlist, &pkt);
// TEMP debug code
#if OUTPUT_FPF
{
FILE *fpfile;
fpfile = fopen("firstpass.stt", "a");
fprintf(fpfile, "%12.0f %12.0f %12.0f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.0f\n",
fprintf(fpfile, "%12.0f %12.0f %12.0f %12.4f %12.4f %12.4f %12.4f"
" %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f"
" %12.0f %12.4f\n",
stats->frame,
stats->intra_error,
stats->coded_error,
@@ -320,6 +307,7 @@ void vp8_output_stats(const VP8_COMP *cpi,
stats->pcnt_inter,
stats->pcnt_motion,
stats->pcnt_second_ref,
stats->pcnt_neutral,
stats->MVr,
stats->mvr_abs,
stats->MVc,
@@ -327,30 +315,24 @@ void vp8_output_stats(const VP8_COMP *cpi,
stats->MVrv,
stats->MVcv,
stats->mv_in_out_count,
stats->count);
fclose(fpfile);
fpfile = fopen("fpmotionmap.stt", "a");
if(fwrite(cpi->fp_motion_map, 1, cpi->common.MBs, fpfile));
stats->count,
stats->duration);
fclose(fpfile);
}
#endif
}
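On the application side, each first-pass frame surfaces as one VPX_CODEC_STATS_PKT, and the packet buffers are concatenated to build the stats input for pass two. A sketch using the public vpx API (codec and stats_file are assumed to be set up by the caller):

    #include <stdio.h>
    #include "vpx/vpx_encoder.h"

    /* Hedged sketch: collect first-pass stats packets after an encode call. */
    static void collect_stats(vpx_codec_ctx_t *codec, FILE *stats_file)
    {
        const vpx_codec_cx_pkt_t *pkt;
        vpx_codec_iter_t iter = NULL;

        while ((pkt = vpx_codec_get_cx_data(codec, &iter)) != NULL)
            if (pkt->kind == VPX_CODEC_STATS_PKT)
                fwrite(pkt->data.twopass_stats.buf, 1,
                       pkt->data.twopass_stats.sz, stats_file);
    }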
int vp8_input_stats(VP8_COMP *cpi, FIRSTPASS_STATS *fps)
static int input_stats(VP8_COMP *cpi, FIRSTPASS_STATS *fps)
{
size_t stats_sz = vp8_firstpass_stats_sz(cpi->common.MBs);
if (cpi->stats_in >= cpi->stats_in_end)
return EOF;
*fps = *cpi->stats_in;
cpi->stats_in = (void*)((char *)cpi->stats_in + stats_sz);
cpi->stats_in = (void*)((char *)cpi->stats_in + sizeof(FIRSTPASS_STATS));
return 1;
}
void vp8_zero_stats(FIRSTPASS_STATS *section)
static void zero_stats(FIRSTPASS_STATS *section)
{
section->frame = 0.0;
section->intra_error = 0.0;
@@ -359,6 +341,7 @@ void vp8_zero_stats(FIRSTPASS_STATS *section)
section->pcnt_inter = 0.0;
section->pcnt_motion = 0.0;
section->pcnt_second_ref = 0.0;
section->pcnt_neutral = 0.0;
section->MVr = 0.0;
section->mvr_abs = 0.0;
section->MVc = 0.0;
@@ -369,7 +352,7 @@ void vp8_zero_stats(FIRSTPASS_STATS *section)
section->count = 0.0;
section->duration = 1.0;
}
void vp8_accumulate_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame)
static void accumulate_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame)
{
section->frame += frame->frame;
section->intra_error += frame->intra_error;
@@ -378,6 +361,7 @@ void vp8_accumulate_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame)
section->pcnt_inter += frame->pcnt_inter;
section->pcnt_motion += frame->pcnt_motion;
section->pcnt_second_ref += frame->pcnt_second_ref;
section->pcnt_neutral += frame->pcnt_neutral;
section->MVr += frame->MVr;
section->mvr_abs += frame->mvr_abs;
section->MVc += frame->MVc;
@@ -388,7 +372,7 @@ void vp8_accumulate_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame)
section->count += frame->count;
section->duration += frame->duration;
}
void vp8_avg_stats(FIRSTPASS_STATS *section)
static void avg_stats(FIRSTPASS_STATS *section)
{
if (section->count < 1.0)
return;
@@ -398,6 +382,7 @@ void vp8_avg_stats(FIRSTPASS_STATS *section)
section->ssim_weighted_pred_err /= section->count;
section->pcnt_inter /= section->count;
section->pcnt_second_ref /= section->count;
section->pcnt_neutral /= section->count;
section->pcnt_motion /= section->count;
section->MVr /= section->count;
section->mvr_abs /= section->count;
@@ -409,65 +394,17 @@ void vp8_avg_stats(FIRSTPASS_STATS *section)
section->duration /= section->count;
}
unsigned char *vp8_fpmm_get_pos(VP8_COMP *cpi)
{
return cpi->fp_motion_map_stats;
}
void vp8_fpmm_reset_pos(VP8_COMP *cpi, unsigned char *target_pos)
{
cpi->fp_motion_map_stats = target_pos;
}
void vp8_advance_fpmm(VP8_COMP *cpi, int count)
{
cpi->fp_motion_map_stats = (void*)((char*)cpi->fp_motion_map_stats +
count * vp8_firstpass_stats_sz(cpi->common.MBs));
}
void vp8_input_fpmm(VP8_COMP *cpi)
{
unsigned char *fpmm = cpi->fp_motion_map;
int MBs = cpi->common.MBs;
int max_frames = cpi->active_arnr_frames;
int i;
for (i=0; i<max_frames; i++)
{
char *motion_map = (char*)cpi->fp_motion_map_stats
+ sizeof(FIRSTPASS_STATS);
memcpy(fpmm, motion_map, MBs);
fpmm += MBs;
vp8_advance_fpmm(cpi, 1);
}
// Flag the use of weights in the temporal filter
cpi->use_weighted_temporal_filter = 1;
}
void vp8_init_first_pass(VP8_COMP *cpi)
{
vp8_zero_stats(cpi->total_stats);
// TEMP debug code
#ifdef OUTPUT_FPF
{
FILE *fpfile;
fpfile = fopen("firstpass.stt", "w");
fclose(fpfile);
fpfile = fopen("fpmotionmap.stt", "wb");
fclose(fpfile);
}
#endif
zero_stats(cpi->total_stats);
}
void vp8_end_first_pass(VP8_COMP *cpi)
{
vp8_output_stats(cpi, cpi->output_pkt_list, cpi->total_stats);
output_stats(cpi, cpi->output_pkt_list, cpi->total_stats);
}
void vp8_zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x, YV12_BUFFER_CONFIG * recon_buffer, int * best_motion_err, int recon_yoffset )
static void zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x, YV12_BUFFER_CONFIG * recon_buffer, int * best_motion_err, int recon_yoffset )
{
MACROBLOCKD * const xd = & x->e_mbd;
BLOCK *b = &x->block[0];
@@ -486,7 +423,7 @@ void vp8_zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x, YV12_BUFFER_CONFIG * r
VARIANCE_INVOKE(IF_RTCD(&cpi->rtcd.variance), mse16x16) ( src_ptr, src_stride, ref_ptr, ref_stride, (unsigned int *)(best_motion_err));
}
void vp8_first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, MV *ref_mv, MV *best_mv, YV12_BUFFER_CONFIG *recon_buffer, int *best_motion_err, int recon_yoffset )
static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, MV *ref_mv, MV *best_mv, YV12_BUFFER_CONFIG *recon_buffer, int *best_motion_err, int recon_yoffset )
{
MACROBLOCKD *const xd = & x->e_mbd;
BLOCK *b = &x->block[0];
@@ -570,13 +507,12 @@ void vp8_first_pass(VP8_COMP *cpi)
int intercount = 0;
int second_ref_count = 0;
int intrapenalty = 256;
int neutral_count = 0;
int sum_in_vectors = 0;
MV zero_ref_mv = {0, 0};
unsigned char *fp_motion_map_ptr = cpi->fp_motion_map;
vp8_clear_system_state(); //__asm emms;
x->src = * cpi->Source;
@@ -628,7 +564,6 @@ void vp8_first_pass(VP8_COMP *cpi)
for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
{
int this_error;
int zero_error;
int zz_to_best_ratio;
int gf_motion_error = INT_MAX;
int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
@@ -639,7 +574,7 @@ void vp8_first_pass(VP8_COMP *cpi)
xd->left_available = (mb_col != 0);
// do intra 16x16 prediction
this_error = vp8_encode_intra(cpi, x, use_dc_pred);
this_error = encode_intra(cpi, x, use_dc_pred);
// "intrapenalty" below deals with situations where the intra and inter error scores are very low (eg a plain black frame)
// We do not have special cases in first pass for 0,0 and nearest etc so all inter modes carry an overhead cost estimate fot the mv.
@@ -650,9 +585,6 @@ void vp8_first_pass(VP8_COMP *cpi)
// Cumulative intra error total
intra_error += (long long)this_error;
// Indicate default assumption of intra in the motion map
*fp_motion_map_ptr = 0;
// Set up limit values for motion vectors to prevent them extending outside the UMV borders
x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16);
@@ -667,16 +599,13 @@ void vp8_first_pass(VP8_COMP *cpi)
int motion_error = INT_MAX;
// Simple 0,0 motion with no mv overhead
vp8_zz_motion_search( cpi, x, lst_yv12, &motion_error, recon_yoffset );
zz_motion_search( cpi, x, lst_yv12, &motion_error, recon_yoffset );
d->bmi.mv.as_mv.row = 0;
d->bmi.mv.as_mv.col = 0;
// Save (0,0) error for later use
zero_error = motion_error;
// Test last reference frame using the previous best mv as the
// starting point (best reference) for the search
vp8_first_pass_motion_search(cpi, x, &best_ref_mv.as_mv,
first_pass_motion_search(cpi, x, &best_ref_mv.as_mv,
&d->bmi.mv.as_mv, lst_yv12,
&motion_error, recon_yoffset);
@@ -684,7 +613,7 @@ void vp8_first_pass(VP8_COMP *cpi)
if (best_ref_mv.as_int)
{
tmp_err = INT_MAX;
vp8_first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv,
first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv,
lst_yv12, &tmp_err, recon_yoffset);
if ( tmp_err < motion_error )
@@ -698,7 +627,7 @@ void vp8_first_pass(VP8_COMP *cpi)
// Experimental search in a second reference frame ((0,0) based only)
if (cm->current_video_frame > 1)
{
vp8_first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv, gld_yv12, &gf_motion_error, recon_yoffset);
first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv, gld_yv12, &gf_motion_error, recon_yoffset);
if ((gf_motion_error < motion_error) && (gf_motion_error < this_error))
{
@@ -726,6 +655,17 @@ void vp8_first_pass(VP8_COMP *cpi)
if (motion_error <= this_error)
{
// Keep a count of cases where the inter and intra were
// very close and very low. This helps with scene cut
// detection for example in cropped clips with black bars
// at the sides or top and bottom.
if( (((this_error-intrapenalty) * 9) <=
(motion_error*10)) &&
(this_error < (2*intrapenalty)) )
{
neutral_count++;
}
d->bmi.mv.as_mv.row <<= 3;
d->bmi.mv.as_mv.col <<= 3;
this_error = motion_error;
@@ -777,25 +717,6 @@ void vp8_first_pass(VP8_COMP *cpi)
else if (d->bmi.mv.as_mv.col < 0)
sum_in_vectors--;
}
// Compute how close (0,0) predictor is to best
// predictor in terms of their prediction error
zz_to_best_ratio = (10*zero_error + this_error/2)
/ (this_error+!this_error);
if ((zero_error < 50000) &&
(zz_to_best_ratio <= 11) )
*fp_motion_map_ptr = 1;
else
*fp_motion_map_ptr = 0;
}
else
{
// 0,0 mv was best
if( zero_error<50000 )
*fp_motion_map_ptr = 2;
else
*fp_motion_map_ptr = 1;
}
}
}
@@ -809,9 +730,6 @@ void vp8_first_pass(VP8_COMP *cpi)
recon_yoffset += 16;
recon_uvoffset += 8;
// Update the motion map
fp_motion_map_ptr++;
}
// adjust to the next row of mbs
@@ -833,7 +751,7 @@ void vp8_first_pass(VP8_COMP *cpi)
fps.frame = cm->current_video_frame ;
fps.intra_error = intra_error >> 8;
fps.coded_error = coded_error >> 8;
weight = vp8_simple_weight(cpi->Source);
weight = simple_weight(cpi->Source);
if (weight < 0.1)
@@ -854,6 +772,7 @@ void vp8_first_pass(VP8_COMP *cpi)
fps.pcnt_inter = 1.0 * (double)intercount / cm->MBs;
fps.pcnt_second_ref = 1.0 * (double)second_ref_count / cm->MBs;
fps.pcnt_neutral = 1.0 * (double)neutral_count / cm->MBs;
if (mvcount > 0)
{
@@ -872,15 +791,12 @@ void vp8_first_pass(VP8_COMP *cpi)
// than the full time between subsequent cpi->source_time_stamp values.
fps.duration = cpi->source_end_time_stamp - cpi->source_time_stamp;
// don't want to do outputstats with a stack variable!
// don't want to do output stats with a stack variable!
memcpy(cpi->this_frame_stats,
&fps,
sizeof(FIRSTPASS_STATS));
memcpy((char*)cpi->this_frame_stats + sizeof(FIRSTPASS_STATS),
cpi->fp_motion_map,
sizeof(cpi->fp_motion_map[0]) * cpi->common.MBs);
vp8_output_stats(cpi, cpi->output_pkt_list, cpi->this_frame_stats);
vp8_accumulate_stats(cpi->total_stats, &fps);
output_stats(cpi, cpi->output_pkt_list, cpi->this_frame_stats);
accumulate_stats(cpi->total_stats, &fps);
}
// Copy the previous Last Frame into the GF buffer if specific conditions for doing so are met
@@ -924,10 +840,10 @@ void vp8_first_pass(VP8_COMP *cpi)
extern const int vp8_bits_per_mb[2][QINDEX_RANGE];
#define BASE_ERRPERMB 150
static int estimate_max_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, int Height, int Width)
static int estimate_max_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh)
{
int Q;
int num_mbs = ((Height * Width) / (16 * 16));
int num_mbs = cpi->common.MBs;
int target_norm_bits_per_mb;
double err_per_mb = section_err / num_mbs;
@@ -1024,10 +940,10 @@ static int estimate_max_q(VP8_COMP *cpi, double section_err, int section_target_
return Q;
}
static int estimate_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, int Height, int Width)
static int estimate_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh)
{
int Q;
int num_mbs = ((Height * Width) / (16 * 16));
int num_mbs = cpi->common.MBs;
int target_norm_bits_per_mb;
double err_per_mb = section_err / num_mbs;
@@ -1075,10 +991,10 @@ static int estimate_q(VP8_COMP *cpi, double section_err, int section_target_band
}
// Estimate a worst case Q for a KF group
static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, int Height, int Width, double group_iiratio)
static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, double group_iiratio)
{
int Q;
int num_mbs = ((Height * Width) / (16 * 16));
int num_mbs = cpi->common.MBs;
int target_norm_bits_per_mb = (512 * section_target_bandwitdh) / num_mbs;
int bits_per_mb_at_this_q;
@@ -1173,11 +1089,10 @@ static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, int section_ta
// For cq mode estimate a cq level that matches the observed
// complexity and data rate.
static int estimate_cq(VP8_COMP *cpi, double section_err,
int section_target_bandwitdh, int Height, int Width)
static int estimate_cq(VP8_COMP *cpi, double section_err, int section_target_bandwitdh)
{
int Q;
int num_mbs = ((Height * Width) / (16 * 16));
int num_mbs = cpi->common.MBs;
int target_norm_bits_per_mb;
double err_per_mb = section_err / num_mbs;
@@ -1252,7 +1167,7 @@ void vp8_init_second_pass(VP8_COMP *cpi)
double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
vp8_zero_stats(cpi->total_stats);
zero_stats(cpi->total_stats);
if (!cpi->stats_in_end)
return;
@@ -1286,7 +1201,7 @@ void vp8_init_second_pass(VP8_COMP *cpi)
cpi->kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs;
cpi->gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs;
vp8_avg_stats(cpi->total_stats);
avg_stats(cpi->total_stats);
// Scan the first pass file and calculate an average Intra / Inter error score ratio for the sequence
{
@@ -1295,7 +1210,7 @@ void vp8_init_second_pass(VP8_COMP *cpi)
start_pos = cpi->stats_in; // Note starting "file" position
while (vp8_input_stats(cpi, &this_frame) != EOF)
while (input_stats(cpi, &this_frame) != EOF)
{
IIRatio = this_frame.intra_error / DOUBLE_DIVIDE_CHECK(this_frame.coded_error);
IIRatio = (IIRatio < 1.0) ? 1.0 : (IIRatio > 20.0) ? 20.0 : IIRatio;
@@ -1316,7 +1231,7 @@ void vp8_init_second_pass(VP8_COMP *cpi)
cpi->modified_error_total = 0.0;
cpi->modified_error_used = 0.0;
while (vp8_input_stats(cpi, &this_frame) != EOF)
while (input_stats(cpi, &this_frame) != EOF)
{
cpi->modified_error_total += calculate_modified_err(cpi, &this_frame);
}
@@ -1331,8 +1246,6 @@ void vp8_init_second_pass(VP8_COMP *cpi)
cpi->clip_bpe = cpi->bits_left /
DOUBLE_DIVIDE_CHECK(cpi->modified_error_total);
cpi->observed_bpe = cpi->clip_bpe;
cpi->fp_motion_map_stats = (unsigned char *)cpi->stats_in;
}
void vp8_end_second_pass(VP8_COMP *cpi)
@@ -1340,8 +1253,8 @@ void vp8_end_second_pass(VP8_COMP *cpi)
}
// This function gives an estimate of how badly we believe
// the predicition quality is decaying from frame to frame.
double gf_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
// the prediction quality is decaying from frame to frame.
static double get_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
{
double prediction_decay_rate;
double motion_decay;
@@ -1376,6 +1289,52 @@ double gf_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
return prediction_decay_rate;
}
// Function to test for a condition where a complex transition is followed
// by a static section. For example in slide shows where there is a fade
// between slides. This is to help with more optimal kf and gf positioning.
static int detect_transition_to_still(
VP8_COMP *cpi,
int frame_interval,
int still_interval,
double loop_decay_rate,
double decay_accumulator )
{
BOOL trans_to_still = FALSE;
// Break clause to detect very still sections after motion
// For example a static image after a fade or other transition
// instead of a clean scene cut.
if ( (frame_interval > MIN_GF_INTERVAL) &&
(loop_decay_rate >= 0.999) &&
(decay_accumulator < 0.9) )
{
int j;
FIRSTPASS_STATS * position = cpi->stats_in;
FIRSTPASS_STATS tmp_next_frame;
double decay_rate;
// Look ahead a few frames to see if static condition
// persists...
for ( j = 0; j < still_interval; j++ )
{
if (EOF == input_stats(cpi, &tmp_next_frame))
break;
decay_rate = get_prediction_decay_rate(cpi, &tmp_next_frame);
if ( decay_rate < 0.999 )
break;
}
// Reset file position
reset_fpf_position(cpi, position);
// Only if it does do we signal a transition to still
if ( j == still_interval )
trans_to_still = TRUE;
}
return trans_to_still;
}
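Numerically (illustrative values): three fade frames decaying at 0.96 leave decay_accumulator at 0.96^3 ≈ 0.88, under the 0.9 threshold; if the current frame then predicts almost perfectly (loop_decay_rate >= 0.999) and the look-ahead finds that stillness holding for still_interval frames, the function reports a transition to still.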
// Analyse and define a gf/arf group.
static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
{
@@ -1406,8 +1365,6 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
int max_bits = frame_max_bits(cpi); // Max for a single frame
unsigned char *fpmm_pos;
unsigned int allow_alt_ref =
cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames;
@@ -1416,8 +1373,6 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
vp8_clear_system_state(); //__asm emms;
fpmm_pos = vp8_fpmm_get_pos(cpi);
start_pos = cpi->stats_in;
vpx_memset(&next_frame, 0, sizeof(next_frame)); // assure clean
@@ -1461,7 +1416,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
mod_err_per_mb_accumulator +=
mod_frame_err / DOUBLE_DIVIDE_CHECK((double)cpi->common.MBs);
if (EOF == vp8_input_stats(cpi, &next_frame))
if (EOF == input_stats(cpi, &next_frame))
break;
// Accumulate motion stats.
@@ -1528,7 +1483,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
if (r > GF_RMAX)
r = GF_RMAX;
loop_decay_rate = gf_prediction_decay_rate(cpi, &next_frame);
loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
// Cumulative effect of decay
decay_accumulator = decay_accumulator * loop_decay_rate;
@@ -1537,48 +1492,13 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
boost_score += (decay_accumulator * r);
// Break clause to detect very still sections after motion
// For example a static image after a fade or other transition
// instead of a clean key frame.
if ( (i > MIN_GF_INTERVAL) &&
(loop_decay_rate >= 0.999) &&
(decay_accumulator < 0.9) )
// For example a static image after a fade or other transition.
if ( detect_transition_to_still( cpi, i, 5,
loop_decay_rate, decay_accumulator ) )
{
int j;
FIRSTPASS_STATS * position = cpi->stats_in;
FIRSTPASS_STATS tmp_next_frame;
double decay_rate;
// Look ahead a few frames to see if static condition
// persists...
for ( j = 0; j < 4; j++ )
{
if (EOF == vp8_input_stats(cpi, &tmp_next_frame))
break;
decay_rate = gf_prediction_decay_rate(cpi, &tmp_next_frame);
if ( decay_rate < 0.999 )
break;
}
reset_fpf_position(cpi, position); // Reset file position
// Force GF not alt ref
if ( j == 4 )
{
if (0)
{
FILE *f = fopen("fadegf.stt", "a");
fprintf(f, " %8d %8d %10.4f %10.4f %10.4f\n",
cpi->common.current_video_frame+i, i,
loop_decay_rate, decay_accumulator,
boost_score );
fclose(f);
}
allow_alt_ref = FALSE;
boost_score = old_boost_score;
break;
}
allow_alt_ref = FALSE;
boost_score = old_boost_score;
break;
}
// Break out conditions.
@@ -1686,7 +1606,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
arf_frame_bits = (int)((double)Boost * (group_bits / (double)allocation_chunks));
// Estimate if there are enough bits available to make worthwhile use of an arf.
tmp_q = estimate_q(cpi, mod_frame_err, (int)arf_frame_bits, cpi->common.Height, cpi->common.Width);
tmp_q = estimate_q(cpi, mod_frame_err, (int)arf_frame_bits);
// Only use an arf if it is likely we will be able to code it at a lower Q than the surrounding frames.
if (tmp_q < cpi->worst_quality)
@@ -1749,20 +1669,6 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
}
cpi->active_arnr_frames = frames_bwd + 1 + frames_fwd;
{
// Advance to & read in the motion map for those frames
// to be considered for filtering based on the position
// of the ARF
vp8_fpmm_reset_pos(cpi, cpi->fp_motion_map_stats_save);
// Position at the 'earliest' frame to be filtered
vp8_advance_fpmm(cpi,
cpi->baseline_gf_interval - frames_bwd);
// Read / create a motion map for the region of interest
vp8_input_fpmm(cpi);
}
}
else
{
@@ -1784,7 +1690,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
{
while (cpi->baseline_gf_interval < cpi->frames_to_key)
{
if (EOF == vp8_input_stats(cpi, this_frame))
if (EOF == input_stats(cpi, this_frame))
break;
cpi->baseline_gf_interval++;
@@ -1963,16 +1869,16 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
FIRSTPASS_STATS sectionstats;
double Ratio;
vp8_zero_stats(&sectionstats);
zero_stats(&sectionstats);
reset_fpf_position(cpi, start_pos);
for (i = 0 ; i < cpi->baseline_gf_interval ; i++)
{
vp8_input_stats(cpi, &next_frame);
vp8_accumulate_stats(&sectionstats, &next_frame);
input_stats(cpi, &next_frame);
accumulate_stats(&sectionstats, &next_frame);
}
vp8_avg_stats(&sectionstats);
avg_stats(&sectionstats);
cpi->section_intra_rating =
sectionstats.intra_error /
@@ -1992,9 +1898,6 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
reset_fpf_position(cpi, start_pos);
}
// Reset the First pass motion map file position
vp8_fpmm_reset_pos(cpi, fpmm_pos);
}
// Allocate bits to a normal frame that is neither a gf an arf or a key frame.
@@ -2073,16 +1976,9 @@ void vp8_second_pass(VP8_COMP *cpi)
vp8_clear_system_state();
if (EOF == vp8_input_stats(cpi, &this_frame))
if (EOF == input_stats(cpi, &this_frame))
return;
vpx_memset(cpi->fp_motion_map, 0,
cpi->oxcf.arnr_max_frames*cpi->common.MBs);
cpi->fp_motion_map_stats_save = vp8_fpmm_get_pos(cpi);
// Step over this frame's first pass motion map
vp8_advance_fpmm(cpi, 1);
this_frame_error = this_frame.ssim_weighted_pred_err;
this_frame_intra_error = this_frame.intra_error;
this_frame_coded_error = this_frame.coded_error;
@@ -2101,7 +1997,7 @@ void vp8_second_pass(VP8_COMP *cpi)
{
// Define next KF group and assign bits to it
vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
vp8_find_next_key_frame(cpi, &this_frame_copy);
find_next_key_frame(cpi, &this_frame_copy);
// Special case: error_resilient_mode does not make much sense for two pass with its current meaning, but this code is designed to stop
// outlandish behaviour if someone does set it when using two pass. It effectively disables GF groups.
@@ -2214,8 +2110,7 @@ void vp8_second_pass(VP8_COMP *cpi)
est_cq =
estimate_cq( cpi,
(cpi->total_coded_error_left / frames_left),
(int)(cpi->bits_left / frames_left),
cpi->common.Height, cpi->common.Width);
(int)(cpi->bits_left / frames_left));
cpi->cq_target_quality = cpi->oxcf.cq_level;
if ( est_cq > cpi->cq_target_quality )
@@ -2227,9 +2122,7 @@ void vp8_second_pass(VP8_COMP *cpi)
cpi->maxq_min_limit = cpi->best_quality;
tmp_q = estimate_max_q( cpi,
(cpi->total_coded_error_left / frames_left),
(int)(cpi->bits_left / frames_left),
cpi->common.Height,
cpi->common.Width);
(int)(cpi->bits_left / frames_left));
// Limit the maxq value returned subsequently.
// This increases the risk of overspend or underspend if the initial
@@ -2257,7 +2150,7 @@ void vp8_second_pass(VP8_COMP *cpi)
if (frames_left < 1)
frames_left = 1;
tmp_q = estimate_max_q(cpi, (cpi->total_coded_error_left / frames_left), (int)(cpi->bits_left / frames_left), cpi->common.Height, cpi->common.Width);
tmp_q = estimate_max_q(cpi, (cpi->total_coded_error_left / frames_left), (int)(cpi->bits_left / frames_left));
// Move active_worst_quality but in a damped way
if (tmp_q > cpi->active_worst_quality)
@@ -2285,7 +2178,7 @@ static BOOL test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRST
(next_frame->pcnt_second_ref < 0.10) &&
((this_frame->pcnt_inter < 0.05) ||
(
(this_frame->pcnt_inter < .25) &&
((this_frame->pcnt_inter - this_frame->pcnt_neutral) < .25) &&
((this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) &&
((fabs(last_frame->coded_error - this_frame->coded_error) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > .40) ||
(fabs(last_frame->intra_error - this_frame->intra_error) / DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > .40) ||
@@ -2332,7 +2225,9 @@ static BOOL test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRST
// Test various breakout clauses
if ((local_next_frame.pcnt_inter < 0.05) ||
(next_iiratio < 1.5) ||
((local_next_frame.pcnt_inter < 0.20) && (next_iiratio < 3.0)) ||
(((local_next_frame.pcnt_inter -
local_next_frame.pcnt_neutral) < 0.20) &&
(next_iiratio < 3.0)) ||
((boost_score - old_boost_score) < 0.5) ||
(local_next_frame.intra_error < 200)
)
@@ -2343,7 +2238,7 @@ static BOOL test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRST
old_boost_score = boost_score;
// Get the next frame details
if (EOF == vp8_input_stats(cpi, &local_next_frame))
if (EOF == input_stats(cpi, &local_next_frame))
break;
}
@@ -2361,15 +2256,15 @@ static BOOL test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRST
return is_viable_kf;
}
void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
{
int i;
int i,j;
FIRSTPASS_STATS last_frame;
FIRSTPASS_STATS first_frame;
FIRSTPASS_STATS next_frame;
FIRSTPASS_STATS *start_position;
double decay_accumulator = 0;
double decay_accumulator = 1.0;
double boost_score = 0;
double old_boost_score = 0.0;
double loop_decay_rate;
@@ -2379,6 +2274,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
double kf_group_intra_err = 0.0;
double kf_group_coded_err = 0.0;
double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
double recent_loop_decay[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
vpx_memset(&next_frame, 0, sizeof(next_frame)); // assure clean
@@ -2407,6 +2303,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
kf_mod_err = calculate_modified_err(cpi, this_frame);
// find the next keyframe
i = 0;
while (cpi->stats_in < cpi->stats_in_end)
{
// Accumulate kf group error
@@ -2419,15 +2316,40 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
// load the next frame's stats
vpx_memcpy(&last_frame, this_frame, sizeof(*this_frame));
vp8_input_stats(cpi, this_frame);
input_stats(cpi, this_frame);
// Provided that we are not at the end of the file...
if (cpi->oxcf.auto_key
&& lookup_next_frame_stats(cpi, &next_frame) != EOF)
{
// Normal scene cut check
if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame))
break;
// How fast is prediction quality decaying
loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
// We want to know something about the recent past... rather than
// as used elsewhere where we are concerned with decay in prediction
// quality since the last GF or KF.
recent_loop_decay[i%8] = loop_decay_rate;
decay_accumulator = 1.0;
for (j = 0; j < 8; j++)
{
decay_accumulator = decay_accumulator * recent_loop_decay[j];
}
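The i%8 index makes recent_loop_decay an eight-entry ring buffer, so this decay_accumulator is the product over just the last eight frames rather than since the last key frame; e.g. eight frames decaying at 0.97 give 0.97^8 ≈ 0.78 (illustrative numbers).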
// Special check for a transition or high motion followed by a
// static scene.
if ( detect_transition_to_still( cpi, i,
(cpi->key_frame_frequency-i),
loop_decay_rate,
decay_accumulator ) )
{
break;
}
// Step on to the next frame
cpi->frames_to_key ++;
@@ -2437,6 +2359,8 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
break;
} else
cpi->frames_to_key ++;
i++;
}
// If there is a max kf interval set by the user we must obey it.
@@ -2470,7 +2394,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
kf_group_coded_err += tmp_frame.coded_error;
// Load the next frame's stats
vp8_input_stats(cpi, &tmp_frame);
input_stats(cpi, &tmp_frame);
}
// Reset to the start of the group
@@ -2575,7 +2499,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
double motion_decay;
double motion_pct;
if (EOF == vp8_input_stats(cpi, &next_frame))
if (EOF == input_stats(cpi, &next_frame))
break;
if (next_frame.intra_error > cpi->kf_intra_err_min)
@@ -2588,32 +2512,8 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
if (r > RMAX)
r = RMAX;
// Adjust loop decay rate
//if ( next_frame.pcnt_inter < loop_decay_rate )
loop_decay_rate = next_frame.pcnt_inter;
// High % motion -> somewhat higher decay rate
motion_pct = next_frame.pcnt_motion;
motion_decay = (1.0 - (motion_pct / 20.0));
if (motion_decay < loop_decay_rate)
loop_decay_rate = motion_decay;
// Adjustment to decay rate based on speed of motion
{
double this_mv_rabs;
double this_mv_cabs;
double distance_factor;
this_mv_rabs = fabs(next_frame.mvr_abs * motion_pct);
this_mv_cabs = fabs(next_frame.mvc_abs * motion_pct);
distance_factor = sqrt((this_mv_rabs * this_mv_rabs) +
(this_mv_cabs * this_mv_cabs)) / 250.0;
distance_factor = ((distance_factor > 1.0)
? 0.0 : (1.0 - distance_factor));
if (distance_factor < loop_decay_rate)
loop_decay_rate = distance_factor;
}
// How fast is prediction quality decaying
loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
decay_accumulator = decay_accumulator * loop_decay_rate;
decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
@@ -2634,16 +2534,16 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
FIRSTPASS_STATS sectionstats;
double Ratio;
vp8_zero_stats(&sectionstats);
zero_stats(&sectionstats);
reset_fpf_position(cpi, start_position);
for (i = 0 ; i < cpi->frames_to_key ; i++)
{
vp8_input_stats(cpi, &next_frame);
vp8_accumulate_stats(&sectionstats, &next_frame);
input_stats(cpi, &next_frame);
accumulate_stats(&sectionstats, &next_frame);
}
vp8_avg_stats(&sectionstats);
avg_stats(&sectionstats);
cpi->section_intra_rating = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
@@ -2859,7 +2759,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
bits_per_frame = (cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
// Work out if spatial resampling is necessary
kf_q = estimate_kf_group_q(cpi, err_per_frame, bits_per_frame, new_height, new_width, group_iiratio);
kf_q = estimate_kf_group_q(cpi, err_per_frame, bits_per_frame, group_iiratio);
// If we project a required Q higher than the maximum allowed Q then make a guess at the actual size of frames in this section
projected_bits_perframe = bits_per_frame;
@@ -2930,7 +2830,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
effective_size_ratio = (1.0 + (3.0 * effective_size_ratio)) / 4.0;
// Now try again and see what Q we get with the smaller image size
kf_q = estimate_kf_group_q(cpi, err_per_frame * effective_size_ratio, bits_per_frame, new_height, new_width, group_iiratio);
kf_q = estimate_kf_group_q(cpi, err_per_frame * effective_size_ratio, bits_per_frame, group_iiratio);
if (0)
{


@@ -17,8 +17,6 @@
void vp8_arch_x86_encoder_init(VP8_COMP *cpi);
void vp8_arch_arm_encoder_init(VP8_COMP *cpi);
void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d);
extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d);
void (*vp8_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
@@ -103,6 +101,10 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi)
// Pure C:
vp8_yv12_copy_partial_frame_ptr = vp8_yv12_copy_partial_frame;
#if CONFIG_PSNR
cpi->rtcd.variance.ssimpf_8x8 = ssim_parms_8x8_c;
cpi->rtcd.variance.ssimpf = ssim_parms_c;
#endif
#if ARCH_X86 || ARCH_X86_64
vp8_arch_x86_encoder_init(cpi);


@@ -43,7 +43,7 @@ int vp8_mv_bit_cost(MV *mv, MV *ref, int *mvcost[2], int Weight)
return ((mvcost[0][(mv->row - ref->row) >> 1] + mvcost[1][(mv->col - ref->col) >> 1]) * Weight) >> 7;
}
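The cost here is table-driven and fixed-point: each component's bit cost is looked up at half the component difference, the sum is scaled by Weight, and >> 7 divides out the 128-scale. With illustrative table values of 40 and 36 and Weight = 128, the result is ((40 + 36) * 128) >> 7 = 76.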
int vp8_mv_err_cost(MV *mv, MV *ref, int *mvcost[2], int error_per_bit)
static int mv_err_cost(MV *mv, MV *ref, int *mvcost[2], int error_per_bit)
{
//int i;
//return ((mvcost[0][(mv->row - ref->row)>>1] + mvcost[1][(mv->col - ref->col)>>1] + 128) * error_per_bit) >> 8;
@@ -221,7 +221,7 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
// calculate central point error
besterr = vfp->vf(y, d->pre_stride, z, b->src_stride, &sse);
besterr += vp8_mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
besterr += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
// TODO: Each subsequent iteration checks at least one point in common with the last iteration; it could be 2 (if diag selected).
while (--halfiters)
@@ -337,13 +337,13 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
// calculate central point error
bestmse = vfp->vf(y, d->pre_stride, z, b->src_stride, &sse);
bestmse += vp8_mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
bestmse += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
// go left then right and check error
this_mv.row = startmv.row;
this_mv.col = ((startmv.col - 8) | 4);
left = vfp->svf_halfpix_h(y - 1, d->pre_stride, z, b->src_stride, &sse);
left += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
left += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (left < bestmse)
{
@@ -353,7 +353,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
this_mv.col += 8;
right = vfp->svf_halfpix_h(y, d->pre_stride, z, b->src_stride, &sse);
right += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
right += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (right < bestmse)
{
@@ -365,7 +365,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
this_mv.col = startmv.col;
this_mv.row = ((startmv.row - 8) | 4);
up = vfp->svf_halfpix_v(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
up += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
up += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (up < bestmse)
{
@@ -375,7 +375,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
this_mv.row += 8;
down = vfp->svf_halfpix_v(y, d->pre_stride, z, b->src_stride, &sse);
down += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
down += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (down < bestmse)
{
@@ -415,7 +415,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
break;
}
diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
diag += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (diag < bestmse)
{
@@ -451,7 +451,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
left = vfp->svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);
}
left += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
left += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (left < bestmse)
{
@@ -461,7 +461,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
this_mv.col += 4;
right = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
right += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
right += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (right < bestmse)
{
@@ -483,7 +483,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
up = vfp->svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
}
up += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
up += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (up < bestmse)
{
@@ -493,7 +493,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
this_mv.row += 4;
down = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
down += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
down += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (down < bestmse)
{
@@ -582,7 +582,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
break;
}
diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
diag += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (diag < bestmse)
{
@@ -621,13 +621,13 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestm
// calculate central point error
bestmse = vfp->vf(y, d->pre_stride, z, b->src_stride, &sse);
bestmse += vp8_mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
bestmse += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
// go left then right and check error
this_mv.row = startmv.row;
this_mv.col = ((startmv.col - 8) | 4);
left = vfp->svf_halfpix_h(y - 1, d->pre_stride, z, b->src_stride, &sse);
left += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
left += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (left < bestmse)
{
@@ -637,7 +637,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestm
this_mv.col += 8;
right = vfp->svf_halfpix_h(y, d->pre_stride, z, b->src_stride, &sse);
right += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
right += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (right < bestmse)
{
@@ -649,7 +649,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestm
this_mv.col = startmv.col;
this_mv.row = ((startmv.row - 8) | 4);
up = vfp->svf_halfpix_v(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
up += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
up += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (up < bestmse)
{
@@ -659,7 +659,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestm
this_mv.row += 8;
down = vfp->svf_halfpix_v(y, d->pre_stride, z, b->src_stride, &sse);
down += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
down += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (down < bestmse)
{
@@ -697,7 +697,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestm
break;
}
diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
diag += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (diag < bestmse)
{
@@ -709,7 +709,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestm
this_mv.col = (this_mv.col - 8) | 4;
this_mv.row = (this_mv.row - 8) | 4;
diag = vfp->svf_halfpix_hv(y - 1 - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
diag += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (diag < bestmse)
{
@@ -719,7 +719,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestm
this_mv.col += 8;
diag = vfp->svf_halfpix_hv(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
diag += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (diag < bestmse)
{
@@ -730,7 +730,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestm
this_mv.col = (this_mv.col - 8) | 4;
this_mv.row = startmv.row + 4;
diag = vfp->svf_halfpix_hv(y - 1, d->pre_stride, z, b->src_stride, &sse);
diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
diag += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (diag < bestmse)
{
@@ -740,7 +740,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestm
this_mv.col += 8;
diag = vfp->svf_halfpix_hv(y, d->pre_stride, z, b->src_stride, &sse);
diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
diag += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (diag < bestmse)
{
@@ -894,7 +894,7 @@ cal_neighbors:
best_mv->row = br;
best_mv->col = bc;
return vfp->vf(src, src_stride, PRE(br, bc), d->pre_stride, &thiserr) + vp8_mv_err_cost(best_mv, center_mv, mvcost, error_per_bit) ;
return vfp->vf(src, src_stride, PRE(br, bc), d->pre_stride, &thiserr) + mv_err_cost(best_mv, center_mv, mvcost, error_per_bit) ;
}
#undef MVC
#undef PRE
@@ -955,7 +955,7 @@ int vp8_diamond_search_sad
(ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
{
// Check the starting position
bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
}
// search_param determines the length of the initial step and hence the number of iterations
@@ -986,7 +986,7 @@ int vp8_diamond_search_sad
{
this_mv.row = this_row_offset << 3;
this_mv.col = this_col_offset << 3;
thissad += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
thissad += mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
if (thissad < bestsad)
{
@@ -1017,7 +1017,7 @@ int vp8_diamond_search_sad
return INT_MAX;
return fn_ptr->vf(what, what_stride, best_address, in_what_stride, (unsigned int *)(&thissad))
+ vp8_mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
+ mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
}
int vp8_diamond_search_sadx4
@@ -1071,7 +1071,7 @@ int vp8_diamond_search_sadx4
(ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
{
// Check the starting position
bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
}
// search_param determines the length of the initial step and hence the number of iterations
@@ -1113,7 +1113,7 @@ int vp8_diamond_search_sadx4
{
this_mv.row = (best_mv->row + ss[i].mv.row) << 3;
this_mv.col = (best_mv->col + ss[i].mv.col) << 3;
sad_array[t] += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
sad_array[t] += mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
if (sad_array[t] < bestsad)
{
@@ -1142,7 +1142,7 @@ int vp8_diamond_search_sadx4
{
this_mv.row = this_row_offset << 3;
this_mv.col = this_col_offset << 3;
- thissad += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
+ thissad += mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
if (thissad < bestsad)
{
@@ -1173,7 +1173,7 @@ int vp8_diamond_search_sadx4
return INT_MAX;
return fn_ptr->vf(what, what_stride, best_address, in_what_stride, (unsigned int *)(&thissad))
-     + vp8_mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
+     + mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
}
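
The sadx4 variant relies on a SAD primitive that evaluates four candidate positions per call, so SIMD implementations can amortise source loads across candidates; the sad_array[t] lines above add the MV rate term to each survivor. A scalar stand-in for that contract (illustrative names, not the library's prototypes):

/* Four candidate 16x16 SADs per call; SIMD versions do this in one pass. */
static void sad16x16_x4(const unsigned char *src, int src_stride,
                        const unsigned char *const ref[4], int ref_stride,
                        unsigned int sad[4])
{
    int i, r, c;
    for (i = 0; i < 4; i++)
    {
        unsigned int acc = 0;
        for (r = 0; r < 16; r++)
            for (c = 0; c < 16; c++)
            {
                int d = src[r * src_stride + c] - ref[i][r * ref_stride + c];
                acc += d < 0 ? -d : d;
            }
        sad[i] = acc; /* caller then adds mv_err_cost() per candidate */
    }
}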
@@ -1215,8 +1215,8 @@ int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int erro
{
// Baseline value at the centre
- //bestsad = fn_ptr->sf( what,what_stride,bestaddress,in_what_stride) + (int)sqrt(vp8_mv_err_cost(ref_mv,ref_mv, mvcost,error_per_bit*14));
- bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
+ //bestsad = fn_ptr->sf( what,what_stride,bestaddress,in_what_stride) + (int)sqrt(mv_err_cost(ref_mv,ref_mv, mvcost,error_per_bit*14));
+ bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
}
// Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border
@@ -1242,9 +1242,9 @@ int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int erro
thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
this_mv.col = c << 3;
- //thissad += (int)sqrt(vp8_mv_err_cost(&this_mv,ref_mv, mvcost,error_per_bit*14));
+ //thissad += (int)sqrt(mv_err_cost(&this_mv,ref_mv, mvcost,error_per_bit*14));
//thissad += error_per_bit * mv_bits_sadcost[mv_bits(&this_mv, ref_mv, mvcost)];
- thissad += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit); //mv_bits(error_per_bit, &this_mv, ref_mv, mvsadcost);
+ thissad += mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit); //mv_bits(error_per_bit, &this_mv, ref_mv, mvsadcost);
if (thissad < bestsad)
{
@@ -1263,7 +1263,7 @@ int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int erro
if (bestsad < INT_MAX)
return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
-     + vp8_mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
+     + mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
else
return INT_MAX;
}
@@ -1306,7 +1306,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
(ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
{
// Baseline value at the centre
- bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
+ bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
}
// Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border
@@ -1341,7 +1341,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
if (thissad < bestsad)
{
this_mv.col = c << 3;
- thissad += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
+ thissad += mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
if (thissad < bestsad)
{
@@ -1364,7 +1364,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
if (thissad < bestsad)
{
this_mv.col = c << 3;
- thissad += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
+ thissad += mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
if (thissad < bestsad)
{
@@ -1386,7 +1386,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
if (bestsad < INT_MAX)
return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
-     + vp8_mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
+     + mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
else
return INT_MAX;
}
@@ -1415,7 +1415,7 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
int col_min = ref_col - distance;
int col_max = ref_col + distance;
- unsigned short sad_array8[8];
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, sad_array8, 8);
unsigned int sad_array[3];
// Work out the mid point for the search
@@ -1430,7 +1430,7 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
(ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
{
// Baseline value at the centre
- bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
+ bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
}
// Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border
@@ -1465,7 +1465,7 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
if (thissad < bestsad)
{
this_mv.col = c << 3;
- thissad += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
+ thissad += mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
if (thissad < bestsad)
{
@@ -1494,7 +1494,7 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
if (thissad < bestsad)
{
this_mv.col = c << 3;
- thissad += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
+ thissad += mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
if (thissad < bestsad)
{
@@ -1517,7 +1517,7 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
if (thissad < bestsad)
{
this_mv.col = c << 3;
- thissad += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
+ thissad += mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
if (thissad < bestsad)
{
@@ -1538,7 +1538,7 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
if (bestsad < INT_MAX)
return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
-     + vp8_mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
+     + mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
else
return INT_MAX;
}
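
The sad_array8 hunk above swaps a plain stack array for a 16-byte-aligned declaration: the sadx8 path stores eight 16-bit SADs with aligned SIMD stores, which fault on an unaligned buffer. A sketch of the usual shape of such a macro, with compiler-specific spellings assumed rather than quoted from libvpx:

#if defined(_MSC_VER)
#define DECLARE_ALIGNED16(type, name, n) __declspec(align(16)) type name[n]
#else
#define DECLARE_ALIGNED16(type, name, n) type name[n] __attribute__((aligned(16)))
#endif

DECLARE_ALIGNED16(unsigned short, sad_array8, 8); /* legal target for movdqa-style stores */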


@@ -70,7 +70,6 @@ extern void vp8_yv12_copy_src_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_
int vp8_estimate_entropy_savings(VP8_COMP *cpi);
int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd);
int vp8_calc_low_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd);
extern void vp8_temporal_filter_prepare_c(VP8_COMP *cpi);
@@ -86,9 +85,11 @@ extern double vp8_calc_ssim
YV12_BUFFER_CONFIG *source,
YV12_BUFFER_CONFIG *dest,
int lumamask,
- double *weight
+ double *weight,
+ const vp8_variance_rtcd_vtable_t *rtcd
);
extern double vp8_calc_ssimg
(
YV12_BUFFER_CONFIG *source,
@@ -259,7 +260,7 @@ static void setup_features(VP8_COMP *cpi)
}
- void vp8_dealloc_compressor_data(VP8_COMP *cpi)
+ static void dealloc_compressor_data(VP8_COMP *cpi)
{
vpx_free(cpi->tplist);
cpi->tplist = NULL;
@@ -281,12 +282,6 @@ void vp8_dealloc_compressor_data(VP8_COMP *cpi)
vpx_free(cpi->active_map);
cpi->active_map = 0;
#if !(CONFIG_REALTIME_ONLY)
// Delete first pass motion map
vpx_free(cpi->fp_motion_map);
cpi->fp_motion_map = 0;
#endif
vp8_de_alloc_frame_buffers(&cpi->common);
vp8_yv12_de_alloc_frame_buffer(&cpi->last_frame_uf);
@@ -1360,11 +1355,11 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi)
#if !(CONFIG_REALTIME_ONLY)
vpx_free(cpi->total_stats);
- cpi->total_stats = vpx_calloc(1, vp8_firstpass_stats_sz(cpi->common.MBs));
+ cpi->total_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
vpx_free(cpi->this_frame_stats);
- cpi->this_frame_stats = vpx_calloc(1, vp8_firstpass_stats_sz(cpi->common.MBs));
+ cpi->this_frame_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
if(!cpi->total_stats || !cpi->this_frame_stats)
vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
@@ -1457,13 +1452,12 @@ rescale(int val, int num, int denom)
}
- void vp8_init_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
+ static void init_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
{
VP8_COMP *cpi = (VP8_COMP *)(ptr);
VP8_COMMON *cm = &cpi->common;
if (!cpi)
return;
cpi->oxcf = *oxcf;
cpi->auto_gold = 1;
cpi->auto_adjust_gold_quantizer = 1;
@@ -1475,299 +1469,31 @@ void vp8_init_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
cm->version = oxcf->Version;
vp8_setup_version(cm);
if (oxcf == 0)
{
cpi->pass = 0;
// change includes all joint functionality
vp8_change_config(ptr, oxcf);
cpi->auto_worst_q = 0;
cpi->oxcf.best_allowed_q = MINQ;
cpi->oxcf.worst_allowed_q = MAXQ;
cpi->oxcf.cq_level = MINQ;
// Initialize active best and worst q and average q values.
cpi->active_worst_quality = cpi->oxcf.worst_allowed_q;
cpi->active_best_quality = cpi->oxcf.best_allowed_q;
cpi->avg_frame_qindex = cpi->oxcf.worst_allowed_q;
cpi->oxcf.end_usage = USAGE_STREAM_FROM_SERVER;
cpi->oxcf.starting_buffer_level = 4000;
cpi->oxcf.optimal_buffer_level = 5000;
cpi->oxcf.maximum_buffer_size = 6000;
cpi->oxcf.under_shoot_pct = 90;
cpi->oxcf.allow_df = 0;
cpi->oxcf.drop_frames_water_mark = 20;
cpi->oxcf.allow_spatial_resampling = 0;
cpi->oxcf.resample_down_water_mark = 40;
cpi->oxcf.resample_up_water_mark = 60;
cpi->oxcf.fixed_q = cpi->interquantizer;
cpi->filter_type = NORMAL_LOOPFILTER;
if (cm->simpler_lpf)
cpi->filter_type = SIMPLE_LOOPFILTER;
cpi->compressor_speed = 1;
cpi->horiz_scale = 0;
cpi->vert_scale = 0;
cpi->oxcf.two_pass_vbrbias = 50;
cpi->oxcf.two_pass_vbrmax_section = 400;
cpi->oxcf.two_pass_vbrmin_section = 0;
cpi->oxcf.Sharpness = 0;
cpi->oxcf.noise_sensitivity = 0;
}
else
cpi->oxcf = *oxcf;
switch (cpi->oxcf.Mode)
{
case MODE_REALTIME:
cpi->pass = 0;
cpi->compressor_speed = 2;
if (cpi->oxcf.cpu_used < -16)
{
cpi->oxcf.cpu_used = -16;
}
if (cpi->oxcf.cpu_used > 16)
cpi->oxcf.cpu_used = 16;
break;
#if !(CONFIG_REALTIME_ONLY)
case MODE_GOODQUALITY:
cpi->pass = 0;
cpi->compressor_speed = 1;
if (cpi->oxcf.cpu_used < -5)
{
cpi->oxcf.cpu_used = -5;
}
if (cpi->oxcf.cpu_used > 5)
cpi->oxcf.cpu_used = 5;
break;
case MODE_BESTQUALITY:
cpi->pass = 0;
cpi->compressor_speed = 0;
break;
case MODE_FIRSTPASS:
cpi->pass = 1;
cpi->compressor_speed = 1;
break;
case MODE_SECONDPASS:
cpi->pass = 2;
cpi->compressor_speed = 1;
if (cpi->oxcf.cpu_used < -5)
{
cpi->oxcf.cpu_used = -5;
}
if (cpi->oxcf.cpu_used > 5)
cpi->oxcf.cpu_used = 5;
break;
case MODE_SECONDPASS_BEST:
cpi->pass = 2;
cpi->compressor_speed = 0;
break;
#endif
}
if (cpi->pass == 0)
cpi->auto_worst_q = 1;
cpi->oxcf.worst_allowed_q = q_trans[oxcf->worst_allowed_q];
cpi->oxcf.best_allowed_q = q_trans[oxcf->best_allowed_q];
cpi->oxcf.cq_level = q_trans[cpi->oxcf.cq_level];
if (oxcf->fixed_q >= 0)
{
if (oxcf->worst_allowed_q < 0)
cpi->oxcf.fixed_q = q_trans[0];
else
cpi->oxcf.fixed_q = q_trans[oxcf->worst_allowed_q];
if (oxcf->alt_q < 0)
cpi->oxcf.alt_q = q_trans[0];
else
cpi->oxcf.alt_q = q_trans[oxcf->alt_q];
if (oxcf->key_q < 0)
cpi->oxcf.key_q = q_trans[0];
else
cpi->oxcf.key_q = q_trans[oxcf->key_q];
if (oxcf->gold_q < 0)
cpi->oxcf.gold_q = q_trans[0];
else
cpi->oxcf.gold_q = q_trans[oxcf->gold_q];
}
cpi->baseline_gf_interval = cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL;
cpi->ref_frame_flags = VP8_ALT_FLAG | VP8_GOLD_FLAG | VP8_LAST_FLAG;
//cpi->use_golden_frame_only = 0;
//cpi->use_last_frame_only = 0;
cm->refresh_golden_frame = 0;
cm->refresh_last_frame = 1;
cm->refresh_entropy_probs = 1;
if (cpi->oxcf.token_partitions >= 0 && cpi->oxcf.token_partitions <= 3)
cm->multi_token_partition = (TOKEN_PARTITION) cpi->oxcf.token_partitions;
setup_features(cpi);
{
int i;
for (i = 0; i < MAX_MB_SEGMENTS; i++)
cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout;
}
// At the moment the first order values may not be > MAXQ
if (cpi->oxcf.fixed_q > MAXQ)
cpi->oxcf.fixed_q = MAXQ;
// local file playback mode == really big buffer
if (cpi->oxcf.end_usage == USAGE_LOCAL_FILE_PLAYBACK)
{
cpi->oxcf.starting_buffer_level = 60000;
cpi->oxcf.optimal_buffer_level = 60000;
cpi->oxcf.maximum_buffer_size = 240000;
}
// Convert target bandwidth from Kbit/s to Bit/s
cpi->oxcf.target_bandwidth *= 1000;
// Initialise the starting buffer levels
cpi->oxcf.starting_buffer_level =
rescale(cpi->oxcf.starting_buffer_level,
cpi->oxcf.target_bandwidth, 1000);
if (cpi->oxcf.optimal_buffer_level == 0)
cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
else
cpi->oxcf.optimal_buffer_level =
rescale(cpi->oxcf.optimal_buffer_level,
cpi->oxcf.target_bandwidth, 1000);
if (cpi->oxcf.maximum_buffer_size == 0)
cpi->oxcf.maximum_buffer_size = cpi->oxcf.target_bandwidth / 8;
else
cpi->oxcf.maximum_buffer_size =
rescale(cpi->oxcf.maximum_buffer_size,
cpi->oxcf.target_bandwidth, 1000);
cpi->buffer_level = cpi->oxcf.starting_buffer_level;
cpi->buffer_level = cpi->oxcf.starting_buffer_level;
cpi->bits_off_target = cpi->oxcf.starting_buffer_level;
vp8_new_frame_rate(cpi, cpi->oxcf.frame_rate);
cpi->worst_quality = cpi->oxcf.worst_allowed_q;
cpi->active_worst_quality = cpi->oxcf.worst_allowed_q;
cpi->avg_frame_qindex = cpi->oxcf.worst_allowed_q;
cpi->best_quality = cpi->oxcf.best_allowed_q;
cpi->active_best_quality = cpi->oxcf.best_allowed_q;
cpi->cq_target_quality = cpi->oxcf.cq_level;
cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? TRUE : FALSE;
cpi->rolling_target_bits = cpi->av_per_frame_bandwidth;
cpi->rolling_actual_bits = cpi->av_per_frame_bandwidth;
cpi->long_rolling_target_bits = cpi->av_per_frame_bandwidth;
cpi->long_rolling_actual_bits = cpi->av_per_frame_bandwidth;
cpi->long_rolling_target_bits = cpi->av_per_frame_bandwidth;
cpi->long_rolling_actual_bits = cpi->av_per_frame_bandwidth;
cpi->total_actual_bits = 0;
cpi->total_target_vs_actual = 0;
// Only allow dropped frames in buffered mode
cpi->drop_frames_allowed = cpi->oxcf.allow_df && cpi->buffered_mode;
cm->filter_type = (LOOPFILTERTYPE) cpi->filter_type;
if (!cm->use_bilinear_mc_filter)
cm->mcomp_filter_type = SIXTAP;
else
cm->mcomp_filter_type = BILINEAR;
cpi->target_bandwidth = cpi->oxcf.target_bandwidth;
cm->Width = cpi->oxcf.Width ;
cm->Height = cpi->oxcf.Height ;
cpi->intra_frame_target = (4 * (cm->Width + cm->Height) / 15) * 1000; // As per VP8
cm->horiz_scale = cpi->horiz_scale;
cm->vert_scale = cpi->vert_scale ;
// VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs)
if (cpi->oxcf.Sharpness > 7)
cpi->oxcf.Sharpness = 7;
cm->sharpness_level = cpi->oxcf.Sharpness;
if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL)
{
int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs);
int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs);
Scale2Ratio(cm->horiz_scale, &hr, &hs);
Scale2Ratio(cm->vert_scale, &vr, &vs);
// always go to the next whole number
cm->Width = (hs - 1 + cpi->oxcf.Width * hr) / hs;
cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs;
}
if (((cm->Width + 15) & 0xfffffff0) != cm->yv12_fb[cm->lst_fb_idx].y_width ||
((cm->Height + 15) & 0xfffffff0) != cm->yv12_fb[cm->lst_fb_idx].y_height ||
cm->yv12_fb[cm->lst_fb_idx].y_width == 0)
{
alloc_raw_frame_buffers(cpi);
vp8_alloc_compressor_data(cpi);
}
// Clamp KF frame size to quarter of data rate
if (cpi->intra_frame_target > cpi->target_bandwidth >> 2)
cpi->intra_frame_target = cpi->target_bandwidth >> 2;
if (cpi->oxcf.fixed_q >= 0)
{
cpi->last_q[0] = cpi->oxcf.fixed_q;
cpi->last_q[1] = cpi->oxcf.fixed_q;
}
cpi->Speed = cpi->oxcf.cpu_used;
// force to allowlag to 0 if lag_in_frames is 0;
if (cpi->oxcf.lag_in_frames == 0)
{
cpi->oxcf.allow_lag = 0;
}
// Limit on lag buffers as these are not currently dynamically allocated
else if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS)
cpi->oxcf.lag_in_frames = MAX_LAG_BUFFERS;
// YX Temp
cpi->last_alt_ref_sei = -1;
cpi->is_src_frame_alt_ref = 0;
cpi->is_next_src_alt_ref = 0;
#if 0
// Experimental RD Code
cpi->frame_distortion = 0;
cpi->last_frame_distortion = 0;
#endif
cpi->total_target_vs_actual = 0;
#if VP8_TEMPORAL_ALT_REF
cpi->use_weighted_temporal_filter = 0;
{
int i;
@@ -1779,12 +1505,6 @@ void vp8_init_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
#endif
}
/*
* This function needs more clean up, i.e. be more tuned torwards
* change_config rather than init_config !!!!!!!!!!!!!!!!
* YX - 5/28/2009
*
*/
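
Both init_config above and vp8_change_config below funnel the user-supplied buffer levels through rescale(), converting milliseconds of buffering into bits at the (kbit-to-bit scaled) target bandwidth. A sketch consistent with the call sites; the 64-bit intermediate is an assumption to guard against overflow at high bitrates:

static int rescale(int val, int num, int denom)
{
    long long llnum = num;   /* widen before multiplying: val * num can */
    long long llden = denom; /* exceed 32 bits for high bitrates        */
    long long llval = val;
    return (int)(llval * llnum / llden);
}

// e.g. 4000 ms at 500000 bit/s: rescale(4000, 500000, 1000) == 2000000 bits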
void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
{
@@ -1897,7 +1617,8 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
}
- cpi->baseline_gf_interval = cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL;
+ cpi->baseline_gf_interval =
+ cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL;
cpi->ref_frame_flags = VP8_ALT_FLAG | VP8_GOLD_FLAG | VP8_LAST_FLAG;
@@ -1908,7 +1629,8 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
cm->refresh_entropy_probs = 1;
if (cpi->oxcf.token_partitions >= 0 && cpi->oxcf.token_partitions <= 3)
- cm->multi_token_partition = (TOKEN_PARTITION) cpi->oxcf.token_partitions;
+ cm->multi_token_partition =
+ (TOKEN_PARTITION) cpi->oxcf.token_partitions;
setup_features(cpi);
@@ -1929,16 +1651,12 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
cpi->oxcf.starting_buffer_level = 60000;
cpi->oxcf.optimal_buffer_level = 60000;
cpi->oxcf.maximum_buffer_size = 240000;
}
// Convert target bandwidth from Kbit/s to Bit/s
cpi->oxcf.target_bandwidth *= 1000;
cpi->oxcf.starting_buffer_level =
rescale(cpi->oxcf.starting_buffer_level,
cpi->oxcf.target_bandwidth, 1000);
// Set or reset optimal and maximum buffer levels.
if (cpi->oxcf.optimal_buffer_level == 0)
cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
else
@@ -1953,31 +1671,41 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
rescale(cpi->oxcf.maximum_buffer_size,
cpi->oxcf.target_bandwidth, 1000);
cpi->buffer_level = cpi->oxcf.starting_buffer_level;
cpi->bits_off_target = cpi->oxcf.starting_buffer_level;
// Set up frame rate and related parameters rate control values.
vp8_new_frame_rate(cpi, cpi->oxcf.frame_rate);
// Set absolute upper and lower quality limits
cpi->worst_quality = cpi->oxcf.worst_allowed_q;
- cpi->active_worst_quality = cpi->oxcf.worst_allowed_q;
- cpi->avg_frame_qindex = cpi->oxcf.worst_allowed_q;
cpi->best_quality = cpi->oxcf.best_allowed_q;
- cpi->active_best_quality = cpi->oxcf.best_allowed_q;
+ // active values should only be modified if out of new range
+ if (cpi->active_worst_quality > cpi->oxcf.worst_allowed_q)
+ {
+ cpi->active_worst_quality = cpi->oxcf.worst_allowed_q;
+ }
+ // less likely
+ else if (cpi->active_worst_quality < cpi->oxcf.best_allowed_q)
+ {
+ cpi->active_worst_quality = cpi->oxcf.best_allowed_q;
+ }
+ if (cpi->active_best_quality < cpi->oxcf.best_allowed_q)
+ {
+ cpi->active_best_quality = cpi->oxcf.best_allowed_q;
+ }
+ // less likely
+ else if (cpi->active_best_quality > cpi->oxcf.worst_allowed_q)
+ {
+ cpi->active_best_quality = cpi->oxcf.worst_allowed_q;
+ }
cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? TRUE : FALSE;
cpi->cq_target_quality = cpi->oxcf.cq_level;
cpi->rolling_target_bits = cpi->av_per_frame_bandwidth;
cpi->rolling_actual_bits = cpi->av_per_frame_bandwidth;
cpi->long_rolling_target_bits = cpi->av_per_frame_bandwidth;
cpi->long_rolling_actual_bits = cpi->av_per_frame_bandwidth;
cpi->total_actual_bits = 0;
cpi->total_target_vs_actual = 0;
// Only allow dropped frames in buffered mode
- cpi->drop_frames_allowed = cpi->oxcf.allow_df && cpi->buffered_mode;
+ cpi->drop_frames_allowed = cpi->oxcf.allow_df && cpi->buffered_mode;
- cm->filter_type = (LOOPFILTERTYPE) cpi->filter_type;
+ cm->filter_type = (LOOPFILTERTYPE) cpi->filter_type;
if (!cm->use_bilinear_mc_filter)
cm->mcomp_filter_type = SIXTAP;
@@ -1992,7 +1720,8 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
cm->horiz_scale = cpi->horiz_scale;
cm->vert_scale = cpi->vert_scale ;
- cpi->intra_frame_target = (4 * (cm->Width + cm->Height) / 15) * 1000; // As per VP8
+ // As per VP8
+ cpi->intra_frame_target = (4 * (cm->Width + cm->Height) / 15) * 1000;
// VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs)
if (cpi->oxcf.Sharpness > 7)
@@ -2013,8 +1742,10 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs;
}
- if (((cm->Width + 15) & 0xfffffff0) != cm->yv12_fb[cm->lst_fb_idx].y_width ||
- ((cm->Height + 15) & 0xfffffff0) != cm->yv12_fb[cm->lst_fb_idx].y_height ||
+ if (((cm->Width + 15) & 0xfffffff0) !=
+ cm->yv12_fb[cm->lst_fb_idx].y_width ||
+ ((cm->Height + 15) & 0xfffffff0) !=
+ cm->yv12_fb[cm->lst_fb_idx].y_height ||
cm->yv12_fb[cm->lst_fb_idx].y_width == 0)
{
alloc_raw_frame_buffers(cpi);
@@ -2112,7 +1843,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
vp8_create_common(&cpi->common);
vp8_cmachine_specific_config(cpi);
- vp8_init_config((VP8_PTR)cpi, oxcf);
+ init_config((VP8_PTR)cpi, oxcf);
memcpy(cpi->base_skip_false_prob, vp8cx_base_skip_false_prob, sizeof(vp8cx_base_skip_false_prob));
cpi->common.current_video_frame = 0;
@@ -2153,12 +1884,6 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
vpx_memset(cpi->active_map , 1, (cpi->common.mb_rows * cpi->common.mb_cols));
cpi->active_map_enabled = 0;
#if !(CONFIG_REALTIME_ONLY)
// Create the first pass motion map structure and set to 0
// Allocate space for maximum of 15 buffers
CHECK_MEM_ERROR(cpi->fp_motion_map, vpx_calloc(15*cpi->common.MBs, 1));
#endif
#if 0
// Experimental code for lagged and one pass
// Initialise one_pass GF frames stats
@@ -2308,7 +2033,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
}
else if (cpi->pass == 2)
{
- size_t packet_sz = vp8_firstpass_stats_sz(cpi->common.MBs);
+ size_t packet_sz = sizeof(FIRSTPASS_STATS);
int packets = oxcf->two_pass_stats_in.sz / packet_sz;
cpi->stats_in = oxcf->two_pass_stats_in.buf;
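
With vp8_firstpass_stats_sz() gone, a stats packet is exactly sizeof(FIRSTPASS_STATS), so the two-pass input buffer is a plain array that can be indexed directly. A sketch of the resulting access pattern, using a stub struct (the real one carries the first-pass fields shown in the header diff further down):

#include <stddef.h>

typedef struct { double frame; double intra_error; /* ... */ } FIRSTPASS_STATS;

/* Fixed-size packets mean position i is just pointer arithmetic. */
static const FIRSTPASS_STATS *stats_at(const void *buf, size_t sz, int i)
{
    const FIRSTPASS_STATS *base = (const FIRSTPASS_STATS *)buf;
    int packets = (int)(sz / sizeof(FIRSTPASS_STATS));
    return (i >= 0 && i < packets) ? base + i : NULL;
}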
@@ -2619,7 +2344,7 @@ void vp8_remove_compressor(VP8_PTR *ptr)
vp8cx_remove_encoder_threads(cpi);
#endif
- vp8_dealloc_compressor_data(cpi);
+ dealloc_compressor_data(cpi);
vpx_free(cpi->mb.ss);
vpx_free(cpi->tok);
vpx_free(cpi->cyclic_refresh_map);
@@ -3509,6 +3234,89 @@ static BOOL recode_loop_test( VP8_COMP *cpi,
return force_recode;
}
void loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm)
{
if (cm->no_lpf)
{
cm->filter_level = 0;
}
else
{
struct vpx_usec_timer timer;
vp8_clear_system_state();
vpx_usec_timer_start(&timer);
if (cpi->sf.auto_filter == 0)
vp8cx_pick_filter_level_fast(cpi->Source, cpi);
else
vp8cx_pick_filter_level(cpi->Source, cpi);
vpx_usec_timer_mark(&timer);
cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer);
}
#if CONFIG_MULTITHREAD
sem_post(&cpi->h_event_end_lpf); /* signal that we have set filter_level */
#endif
if (cm->filter_level > 0)
{
vp8cx_set_alt_lf_level(cpi, cm->filter_level);
vp8_loop_filter_frame(cm, &cpi->mb.e_mbd, cm->filter_level);
cm->last_filter_type = cm->filter_type;
cm->last_sharpness_level = cm->sharpness_level;
}
vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
{
YV12_BUFFER_CONFIG *lst_yv12 = &cm->yv12_fb[cm->lst_fb_idx];
YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
YV12_BUFFER_CONFIG *gld_yv12 = &cm->yv12_fb[cm->gld_fb_idx];
YV12_BUFFER_CONFIG *alt_yv12 = &cm->yv12_fb[cm->alt_fb_idx];
// At this point the new frame has been encoded.
// If any buffer copy / swapping is signaled it should be done here.
if (cm->frame_type == KEY_FRAME)
{
vp8_yv12_copy_frame_ptr(cm->frame_to_show, gld_yv12);
vp8_yv12_copy_frame_ptr(cm->frame_to_show, alt_yv12);
}
else // For non key frames
{
// Code to copy between reference buffers
if (cm->copy_buffer_to_arf)
{
if (cm->copy_buffer_to_arf == 1)
{
if (cm->refresh_last_frame)
// We copy new_frame here because last and new buffers will already have been swapped if cm->refresh_last_frame is set.
vp8_yv12_copy_frame_ptr(new_yv12, alt_yv12);
else
vp8_yv12_copy_frame_ptr(lst_yv12, alt_yv12);
}
else if (cm->copy_buffer_to_arf == 2)
vp8_yv12_copy_frame_ptr(gld_yv12, alt_yv12);
}
if (cm->copy_buffer_to_gf)
{
if (cm->copy_buffer_to_gf == 1)
{
if (cm->refresh_last_frame)
// We copy new_frame here because last and new buffers will already have been swapped if cm->refresh_last_frame is set.
vp8_yv12_copy_frame_ptr(new_yv12, gld_yv12);
else
vp8_yv12_copy_frame_ptr(lst_yv12, gld_yv12);
}
else if (cm->copy_buffer_to_gf == 2)
vp8_yv12_copy_frame_ptr(alt_yv12, gld_yv12);
}
}
}
}
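
Note the ordering inside the new loopfilter_frame: h_event_end_lpf is posted as soon as filter_level is chosen, so bitstream packing (which only needs the level) can run while the filter itself is still being applied and the reference-buffer copies complete. A minimal sketch of the worker side of that handshake, assuming the two semaphores added to VP8_COMP below; the real thread creation and teardown live in the encoder's threading code:

#include <semaphore.h>

struct lpf_sync { sem_t start_lpf, end_lpf; };

static void *filter_worker(void *arg)
{
    struct lpf_sync *s = (struct lpf_sync *)arg;
    for (;;)
    {
        sem_wait(&s->start_lpf);  /* encoder: frame reconstructed         */
        /* pick filter_level ... */
        sem_post(&s->end_lpf);    /* packing may read filter_level now    */
        /* ... apply loop filter and buffer copies concurrently           */
    }
    return 0;
}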
static void encode_frame_to_data_rate
(
VP8_COMP *cpi,
@@ -3542,6 +3350,7 @@ static void encode_frame_to_data_rate
int drop_mark50 = drop_mark / 4;
int drop_mark25 = drop_mark / 8;
// Clear down mmx registers to allow floating point in what follows
vp8_clear_system_state();
@@ -3862,11 +3671,12 @@ static void encode_frame_to_data_rate
}
}
- // If CBR and the buffer is as full then it is reasonable to allow higher quality on the frames
- // to prevent bits just going to waste.
+ // If CBR and the buffer is as full then it is reasonable to allow
+ // higher quality on the frames to prevent bits just going to waste.
if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
{
- // Note that the use of >= here elliminates the risk of a devide by 0 error in the else if clause
+ // Note that the use of >= here elliminates the risk of a devide
+ // by 0 error in the else if clause
if (cpi->buffer_level >= cpi->oxcf.maximum_buffer_size)
cpi->active_best_quality = cpi->best_quality;
@@ -3879,6 +3689,20 @@ static void encode_frame_to_data_rate
}
}
}
// Make sure constrained quality mode limits are adhered to for the first
// few frames of one pass encodes
else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY)
{
if ( (cm->frame_type == KEY_FRAME) ||
cm->refresh_golden_frame || cpi->common.refresh_alt_ref_frame )
{
cpi->active_best_quality = cpi->best_quality;
}
else if (cpi->active_best_quality < cpi->cq_target_quality)
{
cpi->active_best_quality = cpi->cq_target_quality;
}
}
// Clip the active best and worst quality values to limits
if (cpi->active_worst_quality > cpi->worst_quality)
@@ -4058,8 +3882,8 @@ static void encode_frame_to_data_rate
vp8_setup_key_frame(cpi);
// transform / motion compensation build reconstruction frame
vp8_encode_frame(cpi);
cpi->projected_frame_size -= vp8_estimate_entropy_savings(cpi);
cpi->projected_frame_size = (cpi->projected_frame_size > 0) ? cpi->projected_frame_size : 0;
@@ -4408,92 +4232,43 @@ static void encode_frame_to_data_rate
else
cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
if (cm->no_lpf)
#if CONFIG_MULTITHREAD
if (cpi->b_multi_threaded)
{
cm->filter_level = 0;
sem_post(&cpi->h_event_start_lpf); /* start loopfilter in separate thread */
}
else
#endif
{
struct vpx_usec_timer timer;
vpx_usec_timer_start(&timer);
if (cpi->sf.auto_filter == 0)
vp8cx_pick_filter_level_fast(cpi->Source, cpi);
else
vp8cx_pick_filter_level(cpi->Source, cpi);
vpx_usec_timer_mark(&timer);
cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer);
loopfilter_frame(cpi, cm);
}
if (cm->filter_level > 0)
{
vp8cx_set_alt_lf_level(cpi, cm->filter_level);
vp8_loop_filter_frame(cm, &cpi->mb.e_mbd, cm->filter_level);
cm->last_filter_type = cm->filter_type;
cm->last_sharpness_level = cm->sharpness_level;
}
/* Move storing frame_type out of the above loop since it is also
* needed in motion search besides loopfilter */
cm->last_frame_type = cm->frame_type;
vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
if (cpi->oxcf.error_resilient_mode == 1)
{
cm->refresh_entropy_probs = 0;
}
#if CONFIG_MULTITHREAD
/* wait that filter_level is picked so that we can continue with stream packing */
if (cpi->b_multi_threaded)
sem_wait(&cpi->h_event_end_lpf);
#endif
// build the bitstream
vp8_pack_bitstream(cpi, dest, size);
#if CONFIG_MULTITHREAD
/* wait for loopfilter thread done */
if (cpi->b_multi_threaded)
{
YV12_BUFFER_CONFIG *lst_yv12 = &cm->yv12_fb[cm->lst_fb_idx];
YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
YV12_BUFFER_CONFIG *gld_yv12 = &cm->yv12_fb[cm->gld_fb_idx];
YV12_BUFFER_CONFIG *alt_yv12 = &cm->yv12_fb[cm->alt_fb_idx];
// At this point the new frame has been encoded coded.
// If any buffer copy / swaping is signalled it should be done here.
if (cm->frame_type == KEY_FRAME)
{
vp8_yv12_copy_frame_ptr(cm->frame_to_show, gld_yv12);
vp8_yv12_copy_frame_ptr(cm->frame_to_show, alt_yv12);
}
else // For non key frames
{
// Code to copy between reference buffers
if (cm->copy_buffer_to_arf)
{
if (cm->copy_buffer_to_arf == 1)
{
if (cm->refresh_last_frame)
// We copy new_frame here because last and new buffers will already have been swapped if cm->refresh_last_frame is set.
vp8_yv12_copy_frame_ptr(new_yv12, alt_yv12);
else
vp8_yv12_copy_frame_ptr(lst_yv12, alt_yv12);
}
else if (cm->copy_buffer_to_arf == 2)
vp8_yv12_copy_frame_ptr(gld_yv12, alt_yv12);
}
if (cm->copy_buffer_to_gf)
{
if (cm->copy_buffer_to_gf == 1)
{
if (cm->refresh_last_frame)
// We copy new_frame here because last and new buffers will already have been swapped if cm->refresh_last_frame is set.
vp8_yv12_copy_frame_ptr(new_yv12, gld_yv12);
else
vp8_yv12_copy_frame_ptr(lst_yv12, gld_yv12);
}
else if (cm->copy_buffer_to_gf == 2)
vp8_yv12_copy_frame_ptr(alt_yv12, gld_yv12);
}
}
sem_wait(&cpi->h_event_end_lpf);
}
#endif
/* Move storing frame_type out of the above loop since it is also
* needed in motion search besides loopfilter */
cm->last_frame_type = cm->frame_type;
// Update rate control heuristics
cpi->total_byte_count += (*size);
@@ -4817,18 +4592,8 @@ static void encode_frame_to_data_rate
}
int vp8_is_gf_update_needed(VP8_PTR ptr)
{
VP8_COMP *cpi = (VP8_COMP *) ptr;
int ret_val;
ret_val = cpi->gf_update_recommended;
cpi->gf_update_recommended = 0;
return ret_val;
}
- void vp8_check_gf_quality(VP8_COMP *cpi)
+ static void check_gf_quality(VP8_COMP *cpi)
{
VP8_COMMON *cm = &cpi->common;
int gf_active_pct = (100 * cpi->gf_active_count) / (cm->mb_rows * cm->mb_cols);
@@ -5077,7 +4842,7 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
if (start_frame < 0)
start_frame += cpi->oxcf.lag_in_frames;
- besterr = vp8_calc_low_ss_err(&cpi->src_buffer[cpi->last_alt_ref_sei].source_buffer,
+ besterr = calc_low_ss_err(&cpi->src_buffer[cpi->last_alt_ref_sei].source_buffer,
&cpi->src_buffer[start_frame].source_buffer, IF_RTCD(&cpi->rtcd.variance));
for (i = 0; i < 7; i++)
@@ -5086,7 +4851,7 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
cpi->oxcf.arnr_strength = i;
vp8_temporal_filter_prepare_c(cpi);
- thiserr = vp8_calc_low_ss_err(&cpi->alt_ref_buffer.source_buffer,
+ thiserr = calc_low_ss_err(&cpi->alt_ref_buffer.source_buffer,
&cpi->src_buffer[start_frame].source_buffer, IF_RTCD(&cpi->rtcd.variance));
if (10 * thiserr < besterr * 8)
@@ -5229,7 +4994,7 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
if (cpi->compressor_speed == 2)
{
- vp8_check_gf_quality(cpi);
+ check_gf_quality(cpi);
vpx_usec_timer_start(&tsctimer);
vpx_usec_timer_start(&ticktimer);
}
@@ -5328,7 +5093,9 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame)
{
generate_psnr_packet(cpi);
}
#if CONFIG_PSNR
@@ -5344,12 +5111,35 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
if (cpi->b_calculate_psnr)
{
- double y, u, v;
- double sq_error;
- double frame_psnr = vp8_calc_psnr(cpi->Source, cm->frame_to_show, &y, &u, &v, &sq_error);
+ double ye,ue,ve;
+ double frame_psnr;
+ YV12_BUFFER_CONFIG *orig = cpi->Source;
+ YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show;
+ YV12_BUFFER_CONFIG *pp = &cm->post_proc_buffer;
+ int y_samples = orig->y_height * orig->y_width ;
+ int uv_samples = orig->uv_height * orig->uv_width ;
+ int t_samples = y_samples + 2 * uv_samples;
+ long long sq_error;
- cpi->total_y += y;
- cpi->total_u += u;
- cpi->total_v += v;
+ ye = calc_plane_error(orig->y_buffer, orig->y_stride,
+ recon->y_buffer, recon->y_stride, orig->y_width, orig->y_height,
+ IF_RTCD(&cpi->rtcd.variance));
+ ue = calc_plane_error(orig->u_buffer, orig->uv_stride,
+ recon->u_buffer, recon->uv_stride, orig->uv_width, orig->uv_height,
+ IF_RTCD(&cpi->rtcd.variance));
+ ve = calc_plane_error(orig->v_buffer, orig->uv_stride,
+ recon->v_buffer, recon->uv_stride, orig->uv_width, orig->uv_height,
+ IF_RTCD(&cpi->rtcd.variance));
+ sq_error = ye + ue + ve;
+ frame_psnr = vp8_mse2psnr(t_samples, 255.0, sq_error);
+ cpi->total_y += vp8_mse2psnr(y_samples, 255.0, ye);
+ cpi->total_u += vp8_mse2psnr(uv_samples, 255.0, ue);
+ cpi->total_v += vp8_mse2psnr(uv_samples, 255.0, ve);
+ cpi->total_sq_error += sq_error;
cpi->total += frame_psnr;
{
@@ -5358,18 +5148,36 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
vp8_deblock(cm->frame_to_show, &cm->post_proc_buffer, cm->filter_level * 10 / 6, 1, 0, IF_RTCD(&cm->rtcd.postproc));
vp8_clear_system_state();
- frame_psnr2 = vp8_calc_psnr(cpi->Source, &cm->post_proc_buffer, &y2, &u2, &v2, &sq_error);
- frame_ssim2 = vp8_calc_ssim(cpi->Source, &cm->post_proc_buffer, 1, &weight);
+ ye = calc_plane_error(orig->y_buffer, orig->y_stride,
+ pp->y_buffer, pp->y_stride, orig->y_width, orig->y_height,
+ IF_RTCD(&cpi->rtcd.variance));
+ ue = calc_plane_error(orig->u_buffer, orig->uv_stride,
+ pp->u_buffer, pp->uv_stride, orig->uv_width, orig->uv_height,
+ IF_RTCD(&cpi->rtcd.variance));
+ ve = calc_plane_error(orig->v_buffer, orig->uv_stride,
+ pp->v_buffer, pp->uv_stride, orig->uv_width, orig->uv_height,
+ IF_RTCD(&cpi->rtcd.variance));
+ sq_error = ye + ue + ve;
+ frame_psnr2 = vp8_mse2psnr(t_samples, 255.0, sq_error);
+ cpi->totalp_y += vp8_mse2psnr(y_samples, 255.0, ye);
+ cpi->totalp_u += vp8_mse2psnr(uv_samples, 255.0, ue);
+ cpi->totalp_v += vp8_mse2psnr(uv_samples, 255.0, ve);
+ cpi->total_sq_error2 += sq_error;
+ cpi->totalp += frame_psnr2;
+ frame_ssim2 = vp8_calc_ssim(cpi->Source,
+ &cm->post_proc_buffer, 1, &weight,
+ IF_RTCD(&cpi->rtcd.variance));
cpi->summed_quality += frame_ssim2 * weight;
cpi->summed_weights += weight;
- cpi->totalp_y += y2;
- cpi->totalp_u += u2;
- cpi->totalp_v += v2;
- cpi->totalp += frame_psnr2;
- cpi->total_sq_error2 += sq_error;
}
}
@@ -5565,7 +5373,9 @@ int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const
return Total;
}
- int vp8_calc_low_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd)
+ static int calc_low_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd)
{
int i, j;
int Total = 0;
@@ -5593,11 +5403,7 @@ int vp8_calc_low_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, co
return Total;
}
int vp8_get_speed(VP8_PTR c)
{
VP8_COMP *cpi = (VP8_COMP *) c;
return cpi->Speed;
}
int vp8_get_quantizer(VP8_PTR c)
{
VP8_COMP *cpi = (VP8_COMP *) c;


@@ -99,6 +99,7 @@ typedef struct
double pcnt_inter;
double pcnt_motion;
double pcnt_second_ref;
double pcnt_neutral;
double MVr;
double mvr_abs;
double MVc;
@@ -495,11 +496,6 @@ typedef struct
struct vpx_codec_pkt_list *output_pkt_list;
int first_pass_done;
#if !(CONFIG_REALTIME_ONLY)
unsigned char *fp_motion_map;
unsigned char *fp_motion_map_stats, *fp_motion_map_stats_save;
#endif
#if 0
// Experimental code for lagged and one pass
ONEPASS_FRAMESTATS one_pass_frame_stats[MAX_LAG_BUFFERS];
@@ -603,12 +599,17 @@ typedef struct
int encoding_thread_count;
pthread_t *h_encoding_thread;
pthread_t h_filter_thread;
MB_ROW_COMP *mb_row_ei;
ENCODETHREAD_DATA *en_thread_data;
LPFTHREAD_DATA lpf_thread_data;
//events
sem_t *h_event_start_encoding;
sem_t h_event_end_encoding;
sem_t h_event_start_lpf;
sem_t h_event_end_lpf;
#endif
TOKENLIST *tplist;
@@ -641,8 +642,6 @@ typedef struct
YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS];
int fixed_divide[512];
#endif
// Flag to indicate temporal filter method
int use_weighted_temporal_filter;
#if CONFIG_PSNR
int count;


@@ -664,7 +664,8 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
case V_PRED:
case H_PRED:
case TM_PRED:
- vp8_build_intra_predictors_mby_ptr(&x->e_mbd);
+ RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby)
+ (&x->e_mbd);
distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, 0x7fffffff);
rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode];
this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2);
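
vp8_build_intra_predictors_mby_ptr was a bare global function pointer; RECON_INVOKE routes the call through the per-context RTCD (run-time CPU detection) vtable instead. The macro pattern, sketched from the usual libvpx convention rather than quoted:

#if CONFIG_RUNTIME_CPU_DETECT
#define RECON_INVOKE(ctx, fn) (ctx)->fn   /* vtable entry filled at init  */
#else
#define RECON_INVOKE(ctx, fn) vp8_##fn    /* compile-time direct binding  */
#endif

At the call site this yields either an indirect call through cpi->common.rtcd.recon or a direct call to the C (or platform-specific) implementation chosen at build time.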


@@ -29,89 +29,3 @@ double vp8_mse2psnr(double Samples, double Peak, double Mse)
return psnr;
}
double vp8_calc_psnr(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, double *YPsnr, double *UPsnr, double *VPsnr, double *sq_error)
{
int i, j;
int Diff;
double frame_psnr;
double Total;
double grand_total;
unsigned char *src = source->y_buffer;
unsigned char *dst = dest->y_buffer;
Total = 0.0;
grand_total = 0.0;
// Loop throught the Y plane raw and reconstruction data summing (square differences)
for (i = 0; i < source->y_height; i++)
{
for (j = 0; j < source->y_width; j++)
{
Diff = (int)(src[j]) - (int)(dst[j]);
Total += Diff * Diff;
}
src += source->y_stride;
dst += dest->y_stride;
}
// Work out Y PSNR
*YPsnr = vp8_mse2psnr(source->y_height * source->y_width, 255.0, Total);
grand_total += Total;
Total = 0;
// Loop through the U plane
src = source->u_buffer;
dst = dest->u_buffer;
for (i = 0; i < source->uv_height; i++)
{
for (j = 0; j < source->uv_width; j++)
{
Diff = (int)(src[j]) - (int)(dst[j]);
Total += Diff * Diff;
}
src += source->uv_stride;
dst += dest->uv_stride;
}
// Work out U PSNR
*UPsnr = vp8_mse2psnr(source->uv_height * source->uv_width, 255.0, Total);
grand_total += Total;
Total = 0;
// V PSNR
src = source->v_buffer;
dst = dest->v_buffer;
for (i = 0; i < source->uv_height; i++)
{
for (j = 0; j < source->uv_width; j++)
{
Diff = (int)(src[j]) - (int)(dst[j]);
Total += Diff * Diff;
}
src += source->uv_stride;
dst += dest->uv_stride;
}
// Work out UV PSNR
*VPsnr = vp8_mse2psnr(source->uv_height * source->uv_width, 255.0, Total);
grand_total += Total;
Total = 0;
// Work out total PSNR
frame_psnr = vp8_mse2psnr(source->y_height * source->y_width * 3 / 2 , 255.0, grand_total);
*sq_error = 1.0 * grand_total;
return frame_psnr;
}
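
The deleted vp8_calc_psnr looped over all three planes itself; callers now sum per-plane errors from calc_plane_error and convert with vp8_mse2psnr, which (despite its name) takes the total squared error. The conversion is the textbook relation, sketched here; the 60 dB cap for a (near-)zero-error frame is an assumption consistent with common practice, not verified against this revision:

#include <math.h>

static double mse2psnr(double samples, double peak, double sse)
{
    double psnr = 60.0;                 /* assumed cap when sse ~ 0 */
    if (sse > 0.0)
        psnr = 10.0 * log10(peak * peak * samples / sse);
    return psnr > 60.0 ? 60.0 : psnr;
}

// frame PSNR: mse2psnr(w*h*3/2, 255.0, ye + ue + ve), as in the new caller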


@@ -13,6 +13,5 @@
#define __INC_PSNR_H
extern double vp8_mse2psnr(double Samples, double Peak, double Mse);
extern double vp8_calc_psnr(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, double *YPsnr, double *UPsnr, double *VPsnr, double *sq_error);
#endif


@@ -90,7 +90,7 @@ const int vp8_bits_per_mb[2][QINDEX_RANGE] =
}
};
- const int vp8_kf_boost_qadjustment[QINDEX_RANGE] =
+ static const int kf_boost_qadjustment[QINDEX_RANGE] =
{
128, 129, 130, 131, 132, 133, 134, 135,
136, 137, 138, 139, 140, 141, 142, 143,
@@ -154,7 +154,7 @@ const int vp8_gf_boost_qadjustment[QINDEX_RANGE] =
};
*/
- const int vp8_kf_gf_boost_qlimits[QINDEX_RANGE] =
+ static const int kf_gf_boost_qlimits[QINDEX_RANGE] =
{
150, 155, 160, 165, 170, 175, 180, 185,
190, 195, 200, 205, 210, 215, 220, 225,
@@ -175,14 +175,14 @@ const int vp8_kf_gf_boost_qlimits[QINDEX_RANGE] =
};
// % adjustment to target kf size based on seperation from previous frame
- const int vp8_kf_boost_seperationt_adjustment[16] =
+ static const int kf_boost_seperation_adjustment[16] =
{
30, 40, 50, 55, 60, 65, 70, 75,
80, 85, 90, 95, 100, 100, 100, 100,
};
- const int vp8_gf_adjust_table[101] =
+ static const int gf_adjust_table[101] =
{
100,
115, 130, 145, 160, 175, 190, 200, 210, 220, 230,
@@ -197,13 +197,13 @@ const int vp8_gf_adjust_table[101] =
400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
};
- const int vp8_gf_intra_useage_adjustment[20] =
+ static const int gf_intra_usage_adjustment[20] =
{
125, 120, 115, 110, 105, 100, 95, 85, 80, 75,
70, 65, 60, 55, 50, 50, 50, 50, 50, 50,
};
- const int vp8_gf_interval_table[101] =
+ static const int gf_interval_table[101] =
{
7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
@@ -353,7 +353,7 @@ void vp8_calc_auto_iframe_target_size(VP8_COMP *cpi)
kf_boost = (int)(2 * cpi->output_frame_rate - 16);
// adjustment up based on q
- kf_boost = kf_boost * vp8_kf_boost_qadjustment[cpi->ni_av_qi] / 100;
+ kf_boost = kf_boost * kf_boost_qadjustment[cpi->ni_av_qi] / 100;
// frame separation adjustment ( down)
if (cpi->frames_since_key < cpi->output_frame_rate / 2)
@@ -488,10 +488,10 @@ static void calc_gf_params(VP8_COMP *cpi)
Boost = GFQ_ADJUSTMENT;
// Adjust based upon most recently measure intra useage
- Boost = Boost * vp8_gf_intra_useage_adjustment[(cpi->this_frame_percent_intra < 15) ? cpi->this_frame_percent_intra : 14] / 100;
+ Boost = Boost * gf_intra_usage_adjustment[(cpi->this_frame_percent_intra < 15) ? cpi->this_frame_percent_intra : 14] / 100;
// Adjust gf boost based upon GF usage since last GF
- Boost = Boost * vp8_gf_adjust_table[gf_frame_useage] / 100;
+ Boost = Boost * gf_adjust_table[gf_frame_useage] / 100;
#endif
}
@@ -503,8 +503,8 @@ static void calc_gf_params(VP8_COMP *cpi)
}
// Apply an upper limit based on Q for 1 pass encodes
- if (Boost > vp8_kf_gf_boost_qlimits[Q] && (cpi->pass == 0))
- Boost = vp8_kf_gf_boost_qlimits[Q];
+ if (Boost > kf_gf_boost_qlimits[Q] && (cpi->pass == 0))
+ Boost = kf_gf_boost_qlimits[Q];
// Apply lower limits to boost.
else if (Boost < 110)
@@ -539,8 +539,8 @@ static void calc_gf_params(VP8_COMP *cpi)
if (cpi->last_boost >= 1500)
cpi->frames_till_gf_update_due ++;
- if (vp8_gf_interval_table[gf_frame_useage] > cpi->frames_till_gf_update_due)
- cpi->frames_till_gf_update_due = vp8_gf_interval_table[gf_frame_useage];
+ if (gf_interval_table[gf_frame_useage] > cpi->frames_till_gf_update_due)
+ cpi->frames_till_gf_update_due = gf_interval_table[gf_frame_useage];
if (cpi->frames_till_gf_update_due > cpi->max_gf_interval)
cpi->frames_till_gf_update_due = cpi->max_gf_interval;
@@ -594,17 +594,17 @@ void vp8_calc_iframe_target_size(VP8_COMP *cpi)
// between key frames.
// Adjust boost based upon ambient Q
- Boost = vp8_kf_boost_qadjustment[Q];
+ Boost = kf_boost_qadjustment[Q];
// Make the Key frame boost less if the seperation from the previous key frame is small
if (cpi->frames_since_key < 16)
- Boost = Boost * vp8_kf_boost_seperationt_adjustment[cpi->frames_since_key] / 100;
+ Boost = Boost * kf_boost_seperation_adjustment[cpi->frames_since_key] / 100;
else
- Boost = Boost * vp8_kf_boost_seperationt_adjustment[15] / 100;
+ Boost = Boost * kf_boost_seperation_adjustment[15] / 100;
// Apply limits on boost
- if (Boost > vp8_kf_gf_boost_qlimits[Q])
- Boost = vp8_kf_gf_boost_qlimits[Q];
+ if (Boost > kf_gf_boost_qlimits[Q])
+ Boost = kf_gf_boost_qlimits[Q];
else if (Boost < 120)
Boost = 120;
}
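
To make the boost pipeline concrete, a worked instance with illustrative table reads (the arrays above are the source of truth):

/* Suppose kf_boost_qadjustment[Q] == 160 at the ambient Q, and the key
 * frame arrives 8 frames after the previous one: */
int Boost = 160;
Boost = Boost * 80 / 100;  /* kf_boost_seperation_adjustment[8] == 80 */
/* Boost == 128; then clamp into [120, kf_gf_boost_qlimits[Q]] as above */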
@@ -842,7 +842,8 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi)
{
int one_percent_bits = 1 + cpi->oxcf.optimal_buffer_level / 100;
- if ((cpi->buffer_level < cpi->oxcf.optimal_buffer_level) || (cpi->bits_off_target < cpi->oxcf.optimal_buffer_level))
+ if ((cpi->buffer_level < cpi->oxcf.optimal_buffer_level) ||
+ (cpi->bits_off_target < cpi->oxcf.optimal_buffer_level))
{
int percent_low = 0;
@@ -851,9 +852,12 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi)
// If we are are below the optimal buffer fullness level and adherence
// to buffering contraints is important to the end useage then adjust
// the per frame target.
- if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) && (cpi->buffer_level < cpi->oxcf.optimal_buffer_level))
+ if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) &&
+ (cpi->buffer_level < cpi->oxcf.optimal_buffer_level))
{
- percent_low = (cpi->oxcf.optimal_buffer_level - cpi->buffer_level) / one_percent_bits;
+ percent_low =
+ (cpi->oxcf.optimal_buffer_level - cpi->buffer_level) /
+ one_percent_bits;
if (percent_low > 100)
percent_low = 100;
@@ -864,7 +868,8 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi)
else if (cpi->bits_off_target < 0)
{
// Adjust per frame data target downwards to compensate.
- percent_low = (int)(100 * -cpi->bits_off_target / (cpi->total_byte_count * 8));
+ percent_low = (int)(100 * -cpi->bits_off_target /
+ (cpi->total_byte_count * 8));
if (percent_low > 100)
percent_low = 100;
@@ -873,39 +878,60 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi)
}
// lower the target bandwidth for this frame.
- cpi->this_frame_target = (cpi->this_frame_target * (100 - (percent_low / 2))) / 100;
+ cpi->this_frame_target =
+ (cpi->this_frame_target * (100 - (percent_low / 2))) / 100;
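
A worked instance of the halved-percentage cut just above (illustrative numbers):

int this_frame_target = 20000;  /* bits, illustrative            */
int percent_low = 30;           /* buffer is 30 % short          */
this_frame_target =
    (this_frame_target * (100 - (percent_low / 2))) / 100;
/* == 17000: a 30 % shortfall trims the frame target by 15 % */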
- // Are we using allowing control of active_worst_allowed_q according to buffer level.
+ // Are we using allowing control of active_worst_allowed_q
+ // according to buffer level.
if (cpi->auto_worst_q)
{
int critical_buffer_level;
- // For streaming applications the most important factor is cpi->buffer_level as this takes
- // into account the specified short term buffering constraints. However, hitting the long
- // term clip data rate target is also important.
+ // For streaming applications the most important factor is
+ // cpi->buffer_level as this takes into account the
+ // specified short term buffering constraints. However,
+ // hitting the long term clip data rate target is also
+ // important.
if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
{
- // Take the smaller of cpi->buffer_level and cpi->bits_off_target
- critical_buffer_level = (cpi->buffer_level < cpi->bits_off_target) ? cpi->buffer_level : cpi->bits_off_target;
+ // Take the smaller of cpi->buffer_level and
+ // cpi->bits_off_target
+ critical_buffer_level =
+ (cpi->buffer_level < cpi->bits_off_target)
+ ? cpi->buffer_level : cpi->bits_off_target;
}
- // For local file playback short term buffering contraints are less of an issue
+ // For local file playback short term buffering contraints
+ // are less of an issue
else
{
- // Consider only how we are doing for the clip as a whole
+ // Consider only how we are doing for the clip as a
+ // whole
critical_buffer_level = cpi->bits_off_target;
}
- // Set the active worst quality based upon the selected buffer fullness number.
+ // Set the active worst quality based upon the selected
+ // buffer fullness number.
if (critical_buffer_level < cpi->oxcf.optimal_buffer_level)
{
- if (critical_buffer_level > (cpi->oxcf.optimal_buffer_level / 4))
+ if ( critical_buffer_level >
+ (cpi->oxcf.optimal_buffer_level >> 2) )
{
- int qadjustment_range = cpi->worst_quality - cpi->ni_av_qi;
- int above_base = (critical_buffer_level - (cpi->oxcf.optimal_buffer_level / 4));
+ INT64 qadjustment_range =
+ cpi->worst_quality - cpi->ni_av_qi;
+ INT64 above_base =
+ (critical_buffer_level -
+ (cpi->oxcf.optimal_buffer_level >> 2));
- // Step active worst quality down from cpi->ni_av_qi when (critical_buffer_level == cpi->optimal_buffer_level)
- // to cpi->oxcf.worst_allowed_q when (critical_buffer_level == cpi->optimal_buffer_level/4)
- cpi->active_worst_quality = cpi->worst_quality - ((qadjustment_range * above_base) / (cpi->oxcf.optimal_buffer_level * 3 / 4));
+ // Step active worst quality down from
+ // cpi->ni_av_qi when (critical_buffer_level ==
+ // cpi->optimal_buffer_level) to
+ // cpi->worst_quality when
+ // (critical_buffer_level ==
+ // cpi->optimal_buffer_level >> 2)
+ cpi->active_worst_quality =
+ cpi->worst_quality -
+ ((qadjustment_range * above_base) /
+ (cpi->oxcf.optimal_buffer_level*3>>2));
}
else
{
@@ -965,6 +991,15 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi)
// Set the active worst quality
cpi->active_worst_quality = cpi->worst_quality;
}
// Special trap for constrained quality mode
// "active_worst_quality" may never drop below cq level
// for any frame type.
if ( cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY &&
cpi->active_worst_quality < cpi->cq_target_quality)
{
cpi->active_worst_quality = cpi->cq_target_quality;
}
}
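
Two things happen in this region. First, as the buffer rises from one quarter of the optimal level to the full optimal level, active_worst_quality is interpolated linearly, matching the comment in the code:

$$Q_{\text{active\_worst}} = Q_{\text{worst}} - \left(Q_{\text{worst}} - Q_{\text{ni\_av}}\right)\cdot\frac{B_{\text{crit}} - B_{\text{opt}}/4}{\tfrac{3}{4}\,B_{\text{opt}}}$$

so a buffer at $B_{\text{opt}}$ yields $Q_{\text{ni\_av}}$ and a buffer at $B_{\text{opt}}/4$ yields $Q_{\text{worst}}$. Second, the new constrained-quality trap then floors active_worst_quality at cq_target_quality for every frame type, pairing with the active_best_quality floor added to encode_frame_to_data_rate earlier in this diff.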
// Test to see if we have to drop a frame


@@ -53,7 +53,7 @@ extern void vp8_update_zbin_extra(VP8_COMP *cpi, MACROBLOCK *x);
- const int vp8_auto_speed_thresh[17] =
+ static const int auto_speed_thresh[17] =
{
1000,
200,
@@ -353,7 +353,7 @@ void vp8_auto_select_speed(VP8_COMP *cpi)
}
}
- if (milliseconds_for_compress * 100 > cpi->avg_encode_time * vp8_auto_speed_thresh[cpi->Speed])
+ if (milliseconds_for_compress * 100 > cpi->avg_encode_time * auto_speed_thresh[cpi->Speed])
{
cpi->Speed -= 1;
cpi->avg_pick_mode_time = 0;
@@ -745,7 +745,8 @@ int vp8_rd_pick_intra16x16mby_mode(VP8_COMP *cpi,
{
x->e_mbd.mode_info_context->mbmi.mode = mode;
- vp8_build_intra_predictors_mby_ptr(&x->e_mbd);
+ RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby)
+ (&x->e_mbd);
macro_block_yrd(x, &ratey, &distortion, IF_RTCD(&cpi->rtcd.encodemb));
rate = ratey + x->mbmode_cost[x->e_mbd.frame_type]
@@ -999,13 +1000,6 @@ static unsigned int vp8_encode_inter_mb_segment(MACROBLOCK *x, int const *labels
return distortion;
}
unsigned char vp8_mbsplit_offset2[4][16] = {
{ 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{ 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{ 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
};
static const unsigned int segmentation_to_sseshift[4] = {3, 3, 2, 0};
@@ -1033,8 +1027,8 @@ typedef struct
} BEST_SEG_INFO;
- void vp8_rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi,
- unsigned int segmentation)
+ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x,
+ BEST_SEG_INFO *bsi, unsigned int segmentation)
{
int i;
int const *labels;
@@ -1152,7 +1146,7 @@ void vp8_rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi,
int sadpb = x->sadperbit4;
// find first label
- n = vp8_mbsplit_offset2[segmentation][i];
+ n = vp8_mbsplit_offset[segmentation][i];
c = &x->block[n];
e = &x->e_mbd.block[n];
@@ -1331,16 +1325,16 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x,
{
/* for now, we will keep the original segmentation order
when in best quality mode */
- vp8_rd_check_segment(cpi, x, &bsi, BLOCK_16X8);
- vp8_rd_check_segment(cpi, x, &bsi, BLOCK_8X16);
- vp8_rd_check_segment(cpi, x, &bsi, BLOCK_8X8);
- vp8_rd_check_segment(cpi, x, &bsi, BLOCK_4X4);
+ rd_check_segment(cpi, x, &bsi, BLOCK_16X8);
+ rd_check_segment(cpi, x, &bsi, BLOCK_8X16);
+ rd_check_segment(cpi, x, &bsi, BLOCK_8X8);
+ rd_check_segment(cpi, x, &bsi, BLOCK_4X4);
}
else
{
int sr;
- vp8_rd_check_segment(cpi, x, &bsi, BLOCK_8X8);
+ rd_check_segment(cpi, x, &bsi, BLOCK_8X8);
if (bsi.segment_rd < best_rd)
{
@@ -1379,7 +1373,7 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x,
sr = MAXF((abs(bsi.sv_mvp[1].row - bsi.sv_mvp[3].row))>>3, (abs(bsi.sv_mvp[1].col - bsi.sv_mvp[3].col))>>3);
vp8_cal_step_param(sr, &bsi.sv_istep[1]);
- vp8_rd_check_segment(cpi, x, &bsi, BLOCK_8X16);
+ rd_check_segment(cpi, x, &bsi, BLOCK_8X16);
}
/* block 16X8 */
@@ -1390,7 +1384,7 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x,
sr = MAXF((abs(bsi.sv_mvp[2].row - bsi.sv_mvp[3].row))>>3, (abs(bsi.sv_mvp[2].col - bsi.sv_mvp[3].col))>>3);
vp8_cal_step_param(sr, &bsi.sv_istep[1]);
- vp8_rd_check_segment(cpi, x, &bsi, BLOCK_16X8);
+ rd_check_segment(cpi, x, &bsi, BLOCK_16X8);
}
/* If 8x8 is better than 16x8/8x16, then do 4x4 search */
@@ -1398,7 +1392,7 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x,
if (cpi->sf.no_skip_block4x4_search || bsi.segment_num == BLOCK_8X8) /* || (sv_segment_rd8x8-bsi.segment_rd) < sv_segment_rd8x8>>5) */
{
bsi.mvp = &bsi.sv_mvp[0];
- vp8_rd_check_segment(cpi, x, &bsi, BLOCK_4X4);
+ rd_check_segment(cpi, x, &bsi, BLOCK_4X4);
}
/* restore UMV window */
@@ -1431,7 +1425,7 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x,
{
int j;
- j = vp8_mbsplit_offset2[bsi.segment_num][i];
+ j = vp8_mbsplit_offset[bsi.segment_num][i];
x->partition_info->bmi[i].mode = x->e_mbd.block[j].bmi.mode;
x->partition_info->bmi[i].mv.as_mv = x->e_mbd.block[j].bmi.mv.as_mv;
@@ -1968,7 +1962,7 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
else
cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
}
- else if (vp8_ref_frame_order[mode_index] == SPLITMV)
+ else if (vp8_mode_order[mode_index] == SPLITMV)
cpi->zbin_mode_boost = 0;
else
cpi->zbin_mode_boost = MV_ZBIN_BOOST;
@@ -2038,7 +2032,8 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
case H_PRED:
case TM_PRED:
x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
- vp8_build_intra_predictors_mby_ptr(&x->e_mbd);
+ RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby)
+ (&x->e_mbd);
macro_block_yrd(x, &rate_y, &distortion, IF_RTCD(&cpi->rtcd.encodemb)) ;
rate2 += rate_y;
distortion2 += distortion;


@@ -11,298 +11,13 @@
#include "vpx_scale/yv12config.h"
#include "math.h"
#include "onyx_int.h"
#define C1 (float)(64 * 64 * 0.01*255*0.01*255)
#define C2 (float)(64 * 64 * 0.03*255*0.03*255)
static int width_y;
static int height_y;
static int height_uv;
static int width_uv;
static int stride_uv;
static int stride;
static int lumimask;
static int luminance;
static double plane_summed_weights = 0;
static short img12_sum_block[8*4096*4096*2] ;
static short img1_sum[8*4096*2];
static short img2_sum[8*4096*2];
static int img1_sq_sum[8*4096*2];
static int img2_sq_sum[8*4096*2];
static int img12_mul_sum[8*4096*2];
double vp8_similarity
(
int mu_x,
int mu_y,
int pre_mu_x2,
int pre_mu_y2,
int pre_mu_xy2
)
{
int mu_x2, mu_y2, mu_xy, theta_x2, theta_y2, theta_xy;
mu_x2 = mu_x * mu_x;
mu_y2 = mu_y * mu_y;
mu_xy = mu_x * mu_y;
theta_x2 = 64 * pre_mu_x2 - mu_x2;
theta_y2 = 64 * pre_mu_y2 - mu_y2;
theta_xy = 64 * pre_mu_xy2 - mu_xy;
return (2 * mu_xy + C1) * (2 * theta_xy + C2) / ((mu_x2 + mu_y2 + C1) * (theta_x2 + theta_y2 + C2));
}
double vp8_ssim
(
const unsigned char *img1,
const unsigned char *img2,
int stride_img1,
int stride_img2,
int width,
int height
)
{
int x, y, x2, y2, img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block, temp;
double plane_quality, weight, mean;
short *img1_sum_ptr1, *img1_sum_ptr2;
short *img2_sum_ptr1, *img2_sum_ptr2;
int *img1_sq_sum_ptr1, *img1_sq_sum_ptr2;
int *img2_sq_sum_ptr1, *img2_sq_sum_ptr2;
int *img12_mul_sum_ptr1, *img12_mul_sum_ptr2;
plane_quality = 0;
if (lumimask)
plane_summed_weights = 0.0f;
else
plane_summed_weights = (height - 7) * (width - 7);
//some prologue for the main loop
temp = 8 * width;
img1_sum_ptr1 = img1_sum + temp;
img2_sum_ptr1 = img2_sum + temp;
img1_sq_sum_ptr1 = img1_sq_sum + temp;
img2_sq_sum_ptr1 = img2_sq_sum + temp;
img12_mul_sum_ptr1 = img12_mul_sum + temp;
for (x = 0; x < width; x++)
{
img1_sum[x] = img1[x];
img2_sum[x] = img2[x];
img1_sq_sum[x] = img1[x] * img1[x];
img2_sq_sum[x] = img2[x] * img2[x];
img12_mul_sum[x] = img1[x] * img2[x];
img1_sum_ptr1[x] = 0;
img2_sum_ptr1[x] = 0;
img1_sq_sum_ptr1[x] = 0;
img2_sq_sum_ptr1[x] = 0;
img12_mul_sum_ptr1[x] = 0;
}
//the main loop
for (y = 1; y < height; y++)
{
img1 += stride_img1;
img2 += stride_img2;
temp = (y - 1) % 9 * width;
img1_sum_ptr1 = img1_sum + temp;
img2_sum_ptr1 = img2_sum + temp;
img1_sq_sum_ptr1 = img1_sq_sum + temp;
img2_sq_sum_ptr1 = img2_sq_sum + temp;
img12_mul_sum_ptr1 = img12_mul_sum + temp;
temp = y % 9 * width;
img1_sum_ptr2 = img1_sum + temp;
img2_sum_ptr2 = img2_sum + temp;
img1_sq_sum_ptr2 = img1_sq_sum + temp;
img2_sq_sum_ptr2 = img2_sq_sum + temp;
img12_mul_sum_ptr2 = img12_mul_sum + temp;
for (x = 0; x < width; x++)
{
img1_sum_ptr2[x] = img1_sum_ptr1[x] + img1[x];
img2_sum_ptr2[x] = img2_sum_ptr1[x] + img2[x];
img1_sq_sum_ptr2[x] = img1_sq_sum_ptr1[x] + img1[x] * img1[x];
img2_sq_sum_ptr2[x] = img2_sq_sum_ptr1[x] + img2[x] * img2[x];
img12_mul_sum_ptr2[x] = img12_mul_sum_ptr1[x] + img1[x] * img2[x];
}
if (y > 6)
{
//calculate the sum of the last 8 lines by subtracting the total sum of 8 lines back from the present sum
temp = (y + 1) % 9 * width;
img1_sum_ptr1 = img1_sum + temp;
img2_sum_ptr1 = img2_sum + temp;
img1_sq_sum_ptr1 = img1_sq_sum + temp;
img2_sq_sum_ptr1 = img2_sq_sum + temp;
img12_mul_sum_ptr1 = img12_mul_sum + temp;
for (x = 0; x < width; x++)
{
img1_sum_ptr1[x] = img1_sum_ptr2[x] - img1_sum_ptr1[x];
img2_sum_ptr1[x] = img2_sum_ptr2[x] - img2_sum_ptr1[x];
img1_sq_sum_ptr1[x] = img1_sq_sum_ptr2[x] - img1_sq_sum_ptr1[x];
img2_sq_sum_ptr1[x] = img2_sq_sum_ptr2[x] - img2_sq_sum_ptr1[x];
img12_mul_sum_ptr1[x] = img12_mul_sum_ptr2[x] - img12_mul_sum_ptr1[x];
}
//here we calculate the sum over the 8x8 block of pixels
//this is done by sliding a window across the column sums for the last 8 lines
//each time adding the new column sum, and subtracting the one which fell out of the window
img1_block = 0;
img2_block = 0;
img1_sq_block = 0;
img2_sq_block = 0;
img12_mul_block = 0;
//prologue, and calculation of simularity measure from the first 8 column sums
for (x = 0; x < 8; x++)
{
img1_block += img1_sum_ptr1[x];
img2_block += img2_sum_ptr1[x];
img1_sq_block += img1_sq_sum_ptr1[x];
img2_sq_block += img2_sq_sum_ptr1[x];
img12_mul_block += img12_mul_sum_ptr1[x];
}
if (lumimask)
{
y2 = y - 7;
x2 = 0;
if (luminance)
{
mean = (img2_block + img1_block) / 128.0f;
if (!(y2 % 2 || x2 % 2))
*(img12_sum_block + y2 / 2 * width_uv + x2 / 2) = img2_block + img1_block;
}
else
{
mean = *(img12_sum_block + y2 * width_uv + x2);
mean += *(img12_sum_block + y2 * width_uv + x2 + 4);
mean += *(img12_sum_block + (y2 + 4) * width_uv + x2);
mean += *(img12_sum_block + (y2 + 4) * width_uv + x2 + 4);
mean /= 512.0f;
}
weight = mean < 40 ? 0.0f :
(mean < 50 ? (mean - 40.0f) / 10.0f : 1.0f);
plane_summed_weights += weight;
plane_quality += weight * vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block);
}
else
plane_quality += vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block);
//and for the rest
for (x = 8; x < width; x++)
{
img1_block = img1_block + img1_sum_ptr1[x] - img1_sum_ptr1[x - 8];
img2_block = img2_block + img2_sum_ptr1[x] - img2_sum_ptr1[x - 8];
img1_sq_block = img1_sq_block + img1_sq_sum_ptr1[x] - img1_sq_sum_ptr1[x - 8];
img2_sq_block = img2_sq_block + img2_sq_sum_ptr1[x] - img2_sq_sum_ptr1[x - 8];
img12_mul_block = img12_mul_block + img12_mul_sum_ptr1[x] - img12_mul_sum_ptr1[x - 8];
if (lumimask)
{
y2 = y - 7;
x2 = x - 7;
if (luminance)
{
mean = (img2_block + img1_block) / 128.0f;
if (!(y2 % 2 || x2 % 2))
*(img12_sum_block + y2 / 2 * width_uv + x2 / 2) = img2_block + img1_block;
}
else
{
mean = *(img12_sum_block + y2 * width_uv + x2);
mean += *(img12_sum_block + y2 * width_uv + x2 + 4);
mean += *(img12_sum_block + (y2 + 4) * width_uv + x2);
mean += *(img12_sum_block + (y2 + 4) * width_uv + x2 + 4);
mean /= 512.0f;
}
weight = mean < 40 ? 0.0f :
(mean < 50 ? (mean - 40.0f) / 10.0f : 1.0f);
plane_summed_weights += weight;
plane_quality += weight * vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block);
}
else
plane_quality += vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block);
}
}
}
if (plane_summed_weights == 0)
return 1.0f;
else
return plane_quality / plane_summed_weights;
}
double vp8_calc_ssim
(
YV12_BUFFER_CONFIG *source,
YV12_BUFFER_CONFIG *dest,
int lumamask,
double *weight
)
{
double a, b, c;
double frame_weight;
double ssimv;
width_y = source->y_width;
height_y = source->y_height;
height_uv = source->uv_height;
width_uv = source->uv_width;
stride_uv = dest->uv_stride;
stride = dest->y_stride;
lumimask = lumamask;
luminance = 1;
a = vp8_ssim(source->y_buffer, dest->y_buffer,
source->y_stride, dest->y_stride, source->y_width, source->y_height);
luminance = 0;
frame_weight = plane_summed_weights / ((width_y - 7) * (height_y - 7));
if (frame_weight == 0)
a = b = c = 1.0f;
else
{
b = vp8_ssim(source->u_buffer, dest->u_buffer,
source->uv_stride, dest->uv_stride, source->uv_width, source->uv_height);
c = vp8_ssim(source->v_buffer, dest->v_buffer,
source->uv_stride, dest->uv_stride, source->uv_width, source->uv_height);
}
ssimv = a * .8 + .1 * (b + c);
*weight = frame_weight;
return ssimv;
}
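For reference, the composite score weights the per-plane SSIM values with a fixed 0.8/0.1/0.1 split, biased heavily toward luma:

    ssimv = 0.8 * ssim(Y) + 0.1 * (ssim(U) + ssim(V))

The weights sum to 1.0, so a perfect match on all three planes still scores 1.0.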
#if CONFIG_RUNTIME_CPU_DETECT
#define IF_RTCD(x) (x)
#else
#define IF_RTCD(x) NULL
#endif
// Google version of SSIM
// SSIM
#define KERNEL 3
@@ -520,3 +235,174 @@ double vp8_calc_ssimg
*ssim_v /= uvsize;
return ssim_all;
}
void ssim_parms_c
(
unsigned char *s,
int sp,
unsigned char *r,
int rp,
unsigned long *sum_s,
unsigned long *sum_r,
unsigned long *sum_sq_s,
unsigned long *sum_sq_r,
unsigned long *sum_sxr
)
{
int i,j;
for(i=0;i<16;i++,s+=sp,r+=rp)
{
for(j=0;j<16;j++)
{
*sum_s += s[j];
*sum_r += r[j];
*sum_sq_s += s[j] * s[j];
*sum_sq_r += r[j] * r[j];
*sum_sxr += s[j] * r[j];
}
}
}
void ssim_parms_8x8_c
(
unsigned char *s,
int sp,
unsigned char *r,
int rp,
unsigned long *sum_s,
unsigned long *sum_r,
unsigned long *sum_sq_s,
unsigned long *sum_sq_r,
unsigned long *sum_sxr
)
{
int i,j;
for(i=0;i<8;i++,s+=sp,r+=rp)
{
for(j=0;j<8;j++)
{
*sum_s += s[j];
*sum_r += r[j];
*sum_sq_s += s[j] * s[j];
*sum_sq_r += r[j] * r[j];
*sum_sxr += s[j] * r[j];
}
}
}
static const long long c1 = 426148; // 256^2 * (.01*255)^2
static const long long c2 = 3835331; // 256^2 * (.03*255)^2
static double similarity
(
unsigned long sum_s,
unsigned long sum_r,
unsigned long sum_sq_s,
unsigned long sum_sq_r,
unsigned long sum_sxr,
int count
)
{
long long ssim_n = (2 * sum_s * sum_r + c1) * (2 * count * sum_sxr - 2 * sum_s * sum_r + c2);
long long ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) *
                   (count * sum_sq_s - sum_s * sum_s + count * sum_sq_r - sum_r * sum_r + c2);
return ssim_n * 1.0 / ssim_d;
}
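The two constants are the usual SSIM stabilizers C1 = (K1*L)^2 and C2 = (K2*L)^2 with K1 = 0.01, K2 = 0.03, L = 255, pre-multiplied by 256^2 so they sit on the same scale as the fixed-point sums of a 16x16 (count = 256) window:

    c1 = 256^2 * (0.01 * 255)^2 ~ 426148
    c2 = 256^2 * (0.03 * 255)^2 ~ 3835331

With S = sum_s and R = sum_r, similarity() is the standard SSIM expression with the means and (co)variances multiplied through by count^2:

    ssim = ((2*S*R + c1) * (2*count*sum_sxr - 2*S*R + c2))
         / ((S*S + R*R + c1) * (count*sum_sq_s - S*S + count*sum_sq_r - R*R + c2))

Note the pre-scale assumes count = 256; as written, the 8x8 path reuses the same constants.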
static double ssim_16x16(unsigned char *s,int sp, unsigned char *r,int rp,
const vp8_variance_rtcd_vtable_t *rtcd)
{
unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;
rtcd->ssimpf(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 256);
}
static double ssim_8x8(unsigned char *s,int sp, unsigned char *r,int rp,
const vp8_variance_rtcd_vtable_t *rtcd)
{
unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;
rtcd->ssimpf_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64);
}
// TODO: (jbb) tried to scale this function so that it might be usable as a
// distortion metric in the mode selection code (provided we do a reconstruction)
long dssim(unsigned char *s,int sp, unsigned char *r,int rp,
const vp8_variance_rtcd_vtable_t *rtcd)
{
unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;
double ssim3;
long long ssim_n;
long long ssim_d;
rtcd->ssimpf(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
ssim_n = (2 * sum_s * sum_r + c1) * (2 * 256 * sum_sxr - 2 * sum_s * sum_r + c2);
ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) *
         (256 * sum_sq_s - sum_s * sum_s + 256 * sum_sq_r - sum_r * sum_r + c2);
ssim3 = 256 * (ssim_d - ssim_n) / ssim_d;
return (long)(256 * ssim3 * ssim3);
}
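Reading the arithmetic through: ssim_n / ssim_d is SSIM on the count = 256 scale above, so ssim3 = 256 * (1 - SSIM) and the returned value is

    dssim = 256 * (256 * (1 - SSIM))^2

i.e. zero for a perfect match and growing quadratically as similarity drops, which is the shape wanted from a distortion term.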
// TODO: (jbb) this 8x8 window might be too big; we may also want to pick pixels
// such that the window regions overlap block boundaries, to penalize blocking
// artifacts.
double vp8_ssim2
(
unsigned char *img1,
unsigned char *img2,
int stride_img1,
int stride_img2,
int width,
int height,
const vp8_variance_rtcd_vtable_t *rtcd
)
{
int i,j;
double ssim_total=0;
// we can sample points as frequently as we like; start with 1 per 8x8
for(i=0; i < height; i+=8, img1 += stride_img1*8, img2 += stride_img2*8)
{
for(j=0; j < width; j+=8 )
{
ssim_total += ssim_8x8(img1, stride_img1, img2, stride_img2, rtcd);
}
}
ssim_total /= (width / 8 * height / 8);
return ssim_total;
}
double vp8_calc_ssim
(
YV12_BUFFER_CONFIG *source,
YV12_BUFFER_CONFIG *dest,
int lumamask,
double *weight,
const vp8_variance_rtcd_vtable_t *rtcd
)
{
double a, b, c;
double ssimv;
a = vp8_ssim2(source->y_buffer, dest->y_buffer,
source->y_stride, dest->y_stride, source->y_width,
source->y_height, rtcd);
b = vp8_ssim2(source->u_buffer, dest->u_buffer,
source->uv_stride, dest->uv_stride, source->uv_width,
source->uv_height, rtcd);
c = vp8_ssim2(source->v_buffer, dest->v_buffer,
source->uv_stride, dest->uv_stride, source->uv_width,
source->uv_height, rtcd);
ssimv = a * .8 + .1 * (b + c);
*weight = 1;
return ssimv;
}

@@ -287,8 +287,7 @@ static void vp8_temporal_filter_iterate_c
int byte;
int frame;
int mb_col, mb_row;
unsigned int filter_weight[MAX_LAG_BUFFERS];
unsigned char *mm_ptr = cpi->fp_motion_map;
unsigned int filter_weight;
int mb_cols = cpi->common.mb_cols;
int mb_rows = cpi->common.mb_rows;
int MBs = cpi->common.MBs;
@@ -306,13 +305,6 @@ static void vp8_temporal_filter_iterate_c
unsigned char *u_buffer = mbd->pre.u_buffer;
unsigned char *v_buffer = mbd->pre.v_buffer;
if (!cpi->use_weighted_temporal_filter)
{
// Temporal filtering is unweighted
for (frame = 0; frame < frame_count; frame++)
filter_weight[frame] = 1;
}
for (mb_row = 0; mb_row < mb_rows; mb_row++)
{
#if ALT_REF_MC_ENABLED
@@ -338,34 +330,9 @@ static void vp8_temporal_filter_iterate_c
+ (VP8BORDERINPIXELS - 19);
#endif
// Read & process macroblock weights from motion map
if (cpi->use_weighted_temporal_filter)
{
weight_cap = 2;
for (frame = alt_ref_index-1; frame >= 0; frame--)
{
w = *(mm_ptr + (frame+1)*MBs);
filter_weight[frame] = w < weight_cap ? w : weight_cap;
weight_cap = w;
}
filter_weight[alt_ref_index] = 2;
weight_cap = 2;
for (frame = alt_ref_index+1; frame < frame_count; frame++)
{
w = *(mm_ptr + frame*MBs);
filter_weight[frame] = w < weight_cap ? w : weight_cap;
weight_cap = w;
}
}
for (frame = 0; frame < frame_count; frame++)
{
int err;
int err = 0;
if (cpi->frames[frame] == NULL)
continue;
@@ -374,28 +341,25 @@ static void vp8_temporal_filter_iterate_c
mbd->block[0].bmi.mv.as_mv.col = 0;
#if ALT_REF_MC_ENABLED
//if (filter_weight[frame] == 0)
{
#define THRESH_LOW 10000
#define THRESH_HIGH 20000
// Correlation has been lost; try MC
err = vp8_temporal_filter_find_matching_mb_c
(cpi,
cpi->frames[alt_ref_index],
cpi->frames[frame],
mb_y_offset,
THRESH_LOW);
// Find best match in this frame by MC
err = vp8_temporal_filter_find_matching_mb_c
(cpi,
cpi->frames[alt_ref_index],
cpi->frames[frame],
mb_y_offset,
THRESH_LOW);
if (filter_weight[frame] < 2)
{
// Set weight depending on error
filter_weight[frame] = err<THRESH_LOW
? 2 : err<THRESH_HIGH ? 1 : 0;
}
}
#endif
if (filter_weight[frame] != 0)
// Assign a higher weight to the matching MB if its error
// score is lower. If MC is not applied, the default behavior
// is to weight all MBs equally.
filter_weight = err<THRESH_LOW
? 2 : err<THRESH_HIGH ? 1 : 0;
if (filter_weight != 0)
{
// Construct the predictors
vp8_temporal_filter_predictors_mb_c
@@ -415,7 +379,7 @@ static void vp8_temporal_filter_iterate_c
predictor,
16,
strength,
filter_weight[frame],
filter_weight,
accumulator,
count);
@@ -425,7 +389,7 @@ static void vp8_temporal_filter_iterate_c
predictor + 256,
8,
strength,
filter_weight[frame],
filter_weight,
accumulator + 256,
count + 256);
@@ -435,7 +399,7 @@ static void vp8_temporal_filter_iterate_c
predictor + 320,
8,
strength,
filter_weight[frame],
filter_weight,
accumulator + 320,
count + 320);
}
@@ -491,7 +455,6 @@ static void vp8_temporal_filter_iterate_c
byte += stride - 8;
}
mm_ptr++;
mb_y_offset += 16;
mb_uv_offset += 8;
}

@@ -26,9 +26,9 @@ _int64 context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef
void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) ;
void vp8_fix_contexts(MACROBLOCKD *x);
TOKENVALUE vp8_dct_value_tokens[DCT_MAX_VALUE*2];
static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE*2];
const TOKENVALUE *vp8_dct_value_tokens_ptr;
int vp8_dct_value_cost[DCT_MAX_VALUE*2];
static int dct_value_cost[DCT_MAX_VALUE*2];
const int *vp8_dct_value_cost_ptr;
#if 0
int skip_true_count = 0;
@@ -37,7 +37,7 @@ int skip_false_count = 0;
static void fill_value_tokens()
{
TOKENVALUE *const t = vp8_dct_value_tokens + DCT_MAX_VALUE;
TOKENVALUE *const t = dct_value_tokens + DCT_MAX_VALUE;
vp8_extra_bit_struct *const e = vp8_extra_bits;
int i = -DCT_MAX_VALUE;
@@ -81,7 +81,7 @@ static void fill_value_tokens()
cost += vp8_treed_cost(p->tree, p->prob, extra >> 1, Length);
cost += vp8_cost_bit(vp8_prob_half, extra & 1); /* sign */
vp8_dct_value_cost[i + DCT_MAX_VALUE] = cost;
dct_value_cost[i + DCT_MAX_VALUE] = cost;
}
}
@@ -89,8 +89,8 @@ static void fill_value_tokens()
}
while (++i < DCT_MAX_VALUE);
vp8_dct_value_tokens_ptr = vp8_dct_value_tokens + DCT_MAX_VALUE;
vp8_dct_value_cost_ptr = vp8_dct_value_cost + DCT_MAX_VALUE;
vp8_dct_value_tokens_ptr = dct_value_tokens + DCT_MAX_VALUE;
vp8_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE;
}
static void tokenize2nd_order_b

@@ -85,6 +85,19 @@
unsigned int *sse \
);
#define prototype_ssimpf(sym) \
void (sym) \
( \
unsigned char *s, \
int sp, \
unsigned char *r, \
int rp, \
unsigned long *sum_s, \
unsigned long *sum_r, \
unsigned long *sum_sq_s, \
unsigned long *sum_sq_r, \
unsigned long *sum_sxr \
);
#define prototype_getmbss(sym) unsigned int (sym)(const short *)
@@ -306,6 +319,15 @@ extern prototype_variance2(vp8_variance_get16x16var);
#endif
extern prototype_sad(vp8_variance_get4x4sse_cs);
#ifndef vp8_ssimpf
#define vp8_ssimpf ssim_parms_c
#endif
extern prototype_ssimpf(vp8_ssimpf)
#ifndef vp8_ssimpf_8x8
#define vp8_ssimpf_8x8 ssim_parms_8x8_c
#endif
extern prototype_ssimpf(vp8_ssimpf_8x8)
typedef prototype_sad(*vp8_sad_fn_t);
typedef prototype_sad_multi_same_address(*vp8_sad_multi_fn_t);
@@ -315,6 +337,10 @@ typedef prototype_variance(*vp8_variance_fn_t);
typedef prototype_variance2(*vp8_variance2_fn_t);
typedef prototype_subpixvariance(*vp8_subpixvariance_fn_t);
typedef prototype_getmbss(*vp8_getmbss_fn_t);
typedef prototype_ssimpf(*vp8_ssimpf_fn_t)
typedef struct
{
vp8_sad_fn_t sad4x4;
@@ -365,6 +391,11 @@ typedef struct
vp8_sad_multi_d_fn_t sad8x8x4d;
vp8_sad_multi_d_fn_t sad4x4x4d;
#if CONFIG_PSNR
vp8_ssimpf_fn_t ssimpf_8x8;
vp8_ssimpf_fn_t ssimpf;
#endif
} vp8_variance_rtcd_vtable_t;
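With ssimpf and ssimpf_8x8 in the vtable (under CONFIG_PSNR), a SIMD implementation can be swapped in at runtime. A minimal, illustrative hook-up; the init location and capability check are assumptions, while the field and function names come from this change:

    #if CONFIG_RUNTIME_CPU_DETECT
    /* e.g. in the x86-specific encoder init, after CPU detection */
    if (flags & HAS_SSE3)  /* 'flags' from the port's CPU-caps query */
    {
        cpi->rtcd.variance.ssimpf     = vp8_ssim_parms_16x16_sse3;
        cpi->rtcd.variance.ssimpf_8x8 = vp8_ssim_parms_8x8_sse3;
    }
    #endif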
typedef struct
@@ -378,6 +409,7 @@ typedef struct
vp8_sad_multi_fn_t sdx3f;
vp8_sad_multi1_fn_t sdx8f;
vp8_sad_multi_d_fn_t sdx4df;
} vp8_variance_fn_ptr_t;
#if CONFIG_RUNTIME_CPU_DETECT

@@ -10,33 +10,8 @@
#include "variance.h"
#include "vp8/common/filter.h"
const int vp8_six_tap[8][6] =
{
{ 0, 0, 128, 0, 0, 0 }, // note that 1/8 pel positions are just as per alpha -0.5 bicubic
{ 0, -6, 123, 12, -1, 0 },
{ 2, -11, 108, 36, -8, 1 }, // New 1/4 pel 6 tap filter
{ 0, -9, 93, 50, -6, 0 },
{ 3, -16, 77, 77, -16, 3 }, // New 1/2 pel 6 tap filter
{ 0, -6, 50, 93, -9, 0 },
{ 1, -8, 36, 108, -11, 2 }, // New 1/4 pel 6 tap filter
{ 0, -1, 12, 123, -6, 0 }
};
const int VP8_FILTER_WEIGHT = 128;
const int VP8_FILTER_SHIFT = 7;
const int vp8_bilinear_taps[8][2] =
{
{ 128, 0 },
{ 112, 16 },
{ 96, 32 },
{ 80, 48 },
{ 64, 64 },
{ 48, 80 },
{ 32, 96 },
{ 16, 112 }
};
unsigned int vp8_get_mb_ss_c
(
@@ -56,7 +31,7 @@ unsigned int vp8_get_mb_ss_c
}
void vp8_variance(
static void variance(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
@@ -98,7 +73,7 @@ vp8_get8x8var_c
)
{
vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, SSE, Sum);
variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, SSE, Sum);
return (*SSE - (((*Sum) * (*Sum)) >> 6));
}
@@ -114,7 +89,7 @@ vp8_get16x16var_c
)
{
vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, SSE, Sum);
variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, SSE, Sum);
return (*SSE - (((*Sum) * (*Sum)) >> 8));
}
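All of these wrappers compute the same identity: for an N-pixel block,

    variance = SSE - (Sum * Sum) / N

where Sum is the sum of differences and SSE the sum of squared differences, and the division by N is done as a right shift by log2(N). That is where the different shifts come from: >> 8 for 16x16 (N = 256), >> 7 for 16x8 and 8x16 (N = 128), >> 6 for 8x8 (N = 64), and >> 4 for 4x4 (N = 16).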
@@ -132,7 +107,7 @@ unsigned int vp8_variance16x16_c(
int avg;
vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
*sse = var;
return (var - ((avg * avg) >> 8));
}
@@ -148,7 +123,7 @@ unsigned int vp8_variance8x16_c(
int avg;
vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
*sse = var;
return (var - ((avg * avg) >> 7));
}
@@ -164,7 +139,7 @@ unsigned int vp8_variance16x8_c(
int avg;
vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
*sse = var;
return (var - ((avg * avg) >> 7));
}
@@ -181,7 +156,7 @@ unsigned int vp8_variance8x8_c(
int avg;
vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
*sse = var;
return (var - ((avg * avg) >> 6));
}
@@ -197,7 +172,7 @@ unsigned int vp8_variance4x4_c(
int avg;
vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg);
variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg);
*sse = var;
return (var - ((avg * avg) >> 4));
}
@@ -213,7 +188,7 @@ unsigned int vp8_mse16x16_c(
unsigned int var;
int avg;
vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
*sse = var;
return var;
}
@@ -247,7 +222,7 @@ unsigned int vp8_mse16x16_c(
* to the next.
*
****************************************************************************/
void vp8e_filter_block2d_bil_first_pass
static void var_filter_block2d_bil_first_pass
(
const unsigned char *src_ptr,
unsigned short *output_ptr,
@@ -255,7 +230,7 @@ void vp8e_filter_block2d_bil_first_pass
int pixel_step,
unsigned int output_height,
unsigned int output_width,
const int *vp8_filter
const short *vp8_filter
)
{
unsigned int i, j;
@@ -305,7 +280,7 @@ void vp8e_filter_block2d_bil_first_pass
* to the next.
*
****************************************************************************/
void vp8e_filter_block2d_bil_second_pass
static void var_filter_block2d_bil_second_pass
(
const unsigned short *src_ptr,
unsigned char *output_ptr,
@@ -313,7 +288,7 @@ void vp8e_filter_block2d_bil_second_pass
unsigned int pixel_step,
unsigned int output_height,
unsigned int output_width,
const int *vp8_filter
const short *vp8_filter
)
{
unsigned int i, j;
@@ -338,52 +313,6 @@ void vp8e_filter_block2d_bil_second_pass
}
/****************************************************************************
*
* ROUTINE : filter_block2d_bil
*
* INPUTS : UINT8 *src_ptr : Pointer to source block.
* UINT32 src_pixels_per_line : Stride of input block.
* INT32 *HFilter : Array of 2 horizontal filter taps.
* INT32 *VFilter : Array of 2 vertical filter taps.
*
* OUTPUTS : UINT16 *output_ptr : Pointer to filtered block.
*
* RETURNS : void
*
* FUNCTION : 2-D filters an 8x8 input block by applying a 2-tap
* bi-linear filter horizontally followed by a 2-tap
* bi-linear filter vertically on the result.
*
* SPECIAL NOTES : The intermediate horizontally filtered block must produce
* 1 more point than the input block in each column. This
* is to ensure that the 2-tap filter has one extra data-point
* at the top of each column so filter taps do not extend
* beyond data. Thus the output of the first stage filter
* is an 8x9 (hx_v) block.
*
****************************************************************************/
void vp8e_filter_block2d_bil
(
const unsigned char *src_ptr,
unsigned char *output_ptr,
unsigned int src_pixels_per_line,
int *HFilter,
int *VFilter
)
{
unsigned short FData[20*16]; // Temp data buffer used in filtering
// First filter 1-D horizontally...
vp8e_filter_block2d_bil_first_pass(src_ptr, FData, src_pixels_per_line, 1, 9, 8, HFilter);
// then 1-D vertically...
vp8e_filter_block2d_bil_second_pass(FData, output_ptr, 8, 8, 8, 8, VFilter);
}
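As the removed notes spell out, the first pass emits one extra row so the vertical pass always has a sample below each output row (an 8x8 output needs an 8x9 intermediate). Both passes apply the same 2-tap kernel; a sketch consistent with the pass functions above:

    /* Taps sum to VP8_FILTER_WEIGHT (128); pixel_step selects horizontal (1)
     * or vertical (row stride) filtering. */
    out[j] = (src[j] * vp8_filter[0] + src[j + pixel_step] * vp8_filter[1] +
              (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;  /* round, then >> 7 */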
unsigned int vp8_sub_pixel_variance4x4_c
(
const unsigned char *src_ptr,
@@ -396,17 +325,17 @@ unsigned int vp8_sub_pixel_variance4x4_c
)
{
unsigned char temp2[20*16];
const int *HFilter, *VFilter;
const short *HFilter, *VFilter;
unsigned short FData3[5*4]; // Temp data buffer used in filtering
HFilter = vp8_bilinear_taps[xoffset];
VFilter = vp8_bilinear_taps[yoffset];
HFilter = vp8_bilinear_filters[xoffset];
VFilter = vp8_bilinear_filters[yoffset];
// First filter 1-D horizontally
vp8e_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 5, 4, HFilter);
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 5, 4, HFilter);
// Now filter vertically
vp8e_filter_block2d_bil_second_pass(FData3, temp2, 4, 4, 4, 4, VFilter);
var_filter_block2d_bil_second_pass(FData3, temp2, 4, 4, 4, 4, VFilter);
return vp8_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
}
@@ -425,13 +354,13 @@ unsigned int vp8_sub_pixel_variance8x8_c
{
unsigned short FData3[9*8]; // Temp data buffer used in filtering
unsigned char temp2[20*16];
const int *HFilter, *VFilter;
const short *HFilter, *VFilter;
HFilter = vp8_bilinear_taps[xoffset];
VFilter = vp8_bilinear_taps[yoffset];
HFilter = vp8_bilinear_filters[xoffset];
VFilter = vp8_bilinear_filters[yoffset];
vp8e_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);
vp8e_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);
var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);
return vp8_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
}
@@ -449,13 +378,13 @@ unsigned int vp8_sub_pixel_variance16x16_c
{
unsigned short FData3[17*16]; // Temp data buffer used in filtering
unsigned char temp2[20*16];
const int *HFilter, *VFilter;
const short *HFilter, *VFilter;
HFilter = vp8_bilinear_taps[xoffset];
VFilter = vp8_bilinear_taps[yoffset];
HFilter = vp8_bilinear_filters[xoffset];
VFilter = vp8_bilinear_filters[yoffset];
vp8e_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter);
vp8e_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter);
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter);
var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter);
return vp8_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
}
@@ -525,13 +454,13 @@ unsigned int vp8_sub_pixel_variance16x8_c
{
unsigned short FData3[16*9]; // Temp data buffer used in filtering
unsigned char temp2[20*16];
const int *HFilter, *VFilter;
const short *HFilter, *VFilter;
HFilter = vp8_bilinear_taps[xoffset];
VFilter = vp8_bilinear_taps[yoffset];
HFilter = vp8_bilinear_filters[xoffset];
VFilter = vp8_bilinear_filters[yoffset];
vp8e_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter);
vp8e_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter);
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter);
var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter);
return vp8_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
}
@@ -549,15 +478,15 @@ unsigned int vp8_sub_pixel_variance8x16_c
{
unsigned short FData3[9*16]; // Temp data buffer used in filtering
unsigned char temp2[20*16];
const int *HFilter, *VFilter;
const short *HFilter, *VFilter;
HFilter = vp8_bilinear_taps[xoffset];
VFilter = vp8_bilinear_taps[yoffset];
HFilter = vp8_bilinear_filters[xoffset];
VFilter = vp8_bilinear_filters[yoffset];
vp8e_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 8, HFilter);
vp8e_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter);
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 8, HFilter);
var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter);
return vp8_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
}

@@ -186,7 +186,7 @@ sym(vp8_sad16x16x8_sse4):
PROCESS_16X2X8 0
mov rdi, arg(4) ;Results
movdqu XMMWORD PTR [rdi], xmm1
movdqa XMMWORD PTR [rdi], xmm1
; begin epilog
pop rdi
@@ -224,7 +224,7 @@ sym(vp8_sad16x8x8_sse4):
PROCESS_16X2X8 0
mov rdi, arg(4) ;Results
movdqu XMMWORD PTR [rdi], xmm1
movdqa XMMWORD PTR [rdi], xmm1
; begin epilog
pop rdi
@@ -262,7 +262,7 @@ sym(vp8_sad8x8x8_sse4):
PROCESS_8X2X8 0
mov rdi, arg(4) ;Results
movdqu XMMWORD PTR [rdi], xmm1
movdqa XMMWORD PTR [rdi], xmm1
; begin epilog
pop rdi
@@ -303,7 +303,7 @@ sym(vp8_sad8x16x8_sse4):
PROCESS_8X2X8 0
PROCESS_8X2X8 0
mov rdi, arg(4) ;Results
movdqu XMMWORD PTR [rdi], xmm1
movdqa XMMWORD PTR [rdi], xmm1
; begin epilog
pop rdi
@@ -339,7 +339,7 @@ sym(vp8_sad4x4x8_sse4):
PROCESS_4X2X8 0
mov rdi, arg(4) ;Results
movdqu XMMWORD PTR [rdi], xmm1
movdqa XMMWORD PTR [rdi], xmm1
; begin epilog
pop rdi

@@ -0,0 +1,215 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr
%macro TABULATE_SSIM 0
paddusw xmm15, xmm3 ; sum_s
paddusw xmm14, xmm4 ; sum_r
movdqa xmm1, xmm3
pmaddwd xmm1, xmm1
paddq xmm13, xmm1 ; sum_sq_s
movdqa xmm2, xmm4
pmaddwd xmm2, xmm2
paddq xmm12, xmm2 ; sum_sq_r
pmaddwd xmm3, xmm4
paddq xmm11, xmm3 ; sum_sxr
%endmacro
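In C terms, each TABULATE_SSIM invocation folds eight already-widened source/reference words (xmm3/xmm4) into the five running parameters, roughly:

    /* s[], r[]: 8 samples widened from bytes to 16-bit words */
    for (k = 0; k < 8; k++)
    {
        sum_s    += s[k];         /* paddusw xmm15 */
        sum_r    += r[k];         /* paddusw xmm14 */
        sum_sq_s += s[k] * s[k];  /* pmaddwd + paddq xmm13 */
        sum_sq_r += r[k] * r[k];  /* pmaddwd + paddq xmm12 */
        sum_sxr  += s[k] * r[k];  /* pmaddwd + paddq xmm11 */
    }

The per-lane partial sums are only reduced to scalars at the end, by the SUM_ACROSS_* macros below.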
; Sum across the register %1 starting with q words
%macro SUM_ACROSS_Q 1
movdqa xmm2,%1
punpckldq %1,xmm0
punpckhdq xmm2,xmm0
paddq %1,xmm2
movdqa xmm2,%1
punpcklqdq %1,xmm0
punpckhqdq xmm2,xmm0
paddq %1,xmm2
%endmacro
; Sum across the register %1 starting with words
%macro SUM_ACROSS_W 1
movdqa xmm1, %1
punpcklwd %1,xmm0
punpckhwd xmm1,xmm0
paddd %1, xmm1
SUM_ACROSS_Q %1
%endmacro
;void vp8_ssim_parms_16x16_sse3(
; unsigned char *s,
; int sp,
; unsigned char *r,
; int rp,
; unsigned long *sum_s,
; unsigned long *sum_r,
; unsigned long *sum_sq_s,
; unsigned long *sum_sq_r,
; unsigned long *sum_sxr);
;
; TODO: Use parm passing through a structure; we probably don't need the pxors
; (the calling app will initialize to 0). This could easily fit in sse2
; without too much hassle, and we can probably do better estimates with psadw
; or pavgb. At this point this is just meant to be a first pass for calculating
; all the parms needed for 16x16 ssim so we can play with dssim as a distortion
; metric in the mode selection code.
global sym(vp8_ssim_parms_16x16_sse3)
sym(vp8_ssim_parms_16x16_sse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 9
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;s
mov rcx, arg(1) ;sp
mov rdi, arg(2) ;r
mov rax, arg(3) ;rp
pxor xmm0, xmm0
pxor xmm15,xmm15 ;sum_s
pxor xmm14,xmm14 ;sum_r
pxor xmm13,xmm13 ;sum_sq_s
pxor xmm12,xmm12 ;sum_sq_r
pxor xmm11,xmm11 ;sum_sxr
mov rdx, 16 ;row counter
NextRow:
;grab source and reference pixels
movdqu xmm5, [rsi]
movdqu xmm6, [rdi]
movdqa xmm3, xmm5
movdqa xmm4, xmm6
punpckhbw xmm3, xmm0 ; high_s
punpckhbw xmm4, xmm0 ; high_r
TABULATE_SSIM
movdqa xmm3, xmm5
movdqa xmm4, xmm6
punpcklbw xmm3, xmm0 ; low_s
punpcklbw xmm4, xmm0 ; low_r
TABULATE_SSIM
add rsi, rcx ; next s row
add rdi, rax ; next r row
dec rdx ; counter
jnz NextRow
SUM_ACROSS_W xmm15
SUM_ACROSS_W xmm14
SUM_ACROSS_Q xmm13
SUM_ACROSS_Q xmm12
SUM_ACROSS_Q xmm11
mov rdi,arg(4)
movq [rdi], xmm15;
mov rdi,arg(5)
movq [rdi], xmm14;
mov rdi,arg(6)
movq [rdi], xmm13;
mov rdi,arg(7)
movq [rdi], xmm12;
mov rdi,arg(8)
movq [rdi], xmm11;
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;void vp8_ssim_parms_8x8_sse3(
; unsigned char *s,
; int sp,
; unsigned char *r,
; int rp,
; unsigned long *sum_s,
; unsigned long *sum_r,
; unsigned long *sum_sq_s,
; unsigned long *sum_sq_r,
; unsigned long *sum_sxr);
;
; TODO: Use parm passing through a structure; we probably don't need the pxors
; (the calling app will initialize to 0). This could easily fit in sse2
; without too much hassle, and we can probably do better estimates with psadw
; or pavgb. At this point this is just meant to be a first pass for calculating
; all the parms needed for 16x16 ssim so we can play with dssim as a distortion
; metric in the mode selection code.
global sym(vp8_ssim_parms_8x8_sse3)
sym(vp8_ssim_parms_8x8_sse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 9
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;s
mov rcx, arg(1) ;sp
mov rdi, arg(2) ;r
mov rax, arg(3) ;rp
pxor xmm0, xmm0
pxor xmm15,xmm15 ;sum_s
pxor xmm14,xmm14 ;sum_r
pxor xmm13,xmm13 ;sum_sq_s
pxor xmm12,xmm12 ;sum_sq_r
pxor xmm11,xmm11 ;sum_sxr
mov rdx, 8 ;row counter
NextRow2:
;grab source and reference pixels
movq xmm5, [rsi]
movq xmm6, [rdi]
movdqa xmm3, xmm5
movdqa xmm4, xmm6
punpcklbw xmm3, xmm0 ; low_s
punpcklbw xmm4, xmm0 ; low_r
TABULATE_SSIM
add rsi, rcx ; next s row
add rdi, rax ; next r row
dec rdx ; counter
jnz NextRow2
SUM_ACROSS_W xmm15
SUM_ACROSS_W xmm14
SUM_ACROSS_Q xmm13
SUM_ACROSS_Q xmm12
SUM_ACROSS_Q xmm11
mov rdi,arg(4)
movq [rdi], xmm15;
mov rdi,arg(5)
movq [rdi], xmm14;
mov rdi,arg(6)
movq [rdi], xmm13;
mov rdi,arg(7)
movq [rdi], xmm12;
mov rdi,arg(8)
movq [rdi], xmm11;
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret

@@ -627,6 +627,10 @@ filter_block2d_bil_var_sse2_loop:
filter_block2d_bil_var_sse2_sp_only:
movsxd rdx, dword ptr arg(6) ; yoffset
cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0
je filter_block2d_bil_var_sse2_full_pixel
shl rdx, 5
lea rdx, [rdx + rcx] ; VFilter
@@ -671,6 +675,35 @@ filter_block2d_bil_sp_only_loop:
jmp filter_block2d_bil_variance
filter_block2d_bil_var_sse2_full_pixel:
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
pxor xmm0, xmm0 ;
filter_block2d_bil_full_pixel_loop:
movq xmm1, QWORD PTR [rsi] ;
punpcklbw xmm1, xmm0 ;
movq xmm2, QWORD PTR [rdi] ;
punpcklbw xmm2, xmm0 ;
psubw xmm1, xmm2 ;
paddw xmm6, xmm1 ;
pmaddwd xmm1, xmm1 ;
paddd xmm7, xmm1 ;
lea rsi, [rsi + rax] ;ref_pixels_per_line
lea rdi, [rdi + rbx] ;src_pixels_per_line
sub rcx, 1 ;
jnz filter_block2d_bil_full_pixel_loop ;
jmp filter_block2d_bil_variance
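The new full-pixel path skips the bilinear filter entirely when both offsets are zero; per 8-pixel row it accumulates the raw difference and squared difference. A C sketch of the loop:

    for (row = 0; row < height; row++)
    {
        for (j = 0; j < 8; j++)
        {
            int d = ref[j] - src[j];  /* psubw after byte unpack */
            sum += d;                 /* paddw xmm6 */
            sse += d * d;             /* pmaddwd + paddd xmm7 */
        }
        ref += ref_pixels_per_line;
        src += src_pixels_per_line;
    }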
filter_block2d_bil_var_sse2_fp_only:
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
@@ -757,7 +790,7 @@ filter_block2d_bil_variance:
ret
;void vp8_half_horiz_vert_variance16x_h_sse2
;void vp8_half_horiz_vert_variance8x_h_sse2
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
@@ -767,8 +800,8 @@ filter_block2d_bil_variance:
; int *sum,
; unsigned int *sumsquared
;)
global sym(vp8_half_horiz_vert_variance16x_h_sse2)
sym(vp8_half_horiz_vert_variance16x_h_sse2):
global sym(vp8_half_horiz_vert_variance8x_h_sse2)
sym(vp8_half_horiz_vert_variance8x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
@@ -802,7 +835,7 @@ sym(vp8_half_horiz_vert_variance16x_h_sse2):
add rsi, r8
%endif
vp8_half_horiz_vert_variance16x_h_1:
vp8_half_horiz_vert_variance8x_h_1:
movq xmm1, QWORD PTR [rsi] ;
movq xmm2, QWORD PTR [rsi+1] ;
@@ -830,7 +863,7 @@ vp8_half_horiz_vert_variance16x_h_1:
%endif
sub rcx, 1 ;
jnz vp8_half_horiz_vert_variance16x_h_1 ;
jnz vp8_half_horiz_vert_variance8x_h_1 ;
movdq2q mm6, xmm6 ;
movdq2q mm7, xmm7 ;
@@ -877,8 +910,7 @@ vp8_half_horiz_vert_variance16x_h_1:
pop rbp
ret
;void vp8_half_vert_variance16x_h_sse2
;void vp8_half_horiz_vert_variance16x_h_sse2
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
@@ -888,8 +920,124 @@ vp8_half_horiz_vert_variance16x_h_1:
; int *sum,
; unsigned int *sumsquared
;)
global sym(vp8_half_vert_variance16x_h_sse2)
sym(vp8_half_vert_variance16x_h_sse2):
global sym(vp8_half_horiz_vert_variance16x_h_sse2)
sym(vp8_half_horiz_vert_variance16x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM
GET_GOT rbx
push rsi
push rdi
; end prolog
pxor xmm6, xmm6 ; error accumulator
pxor xmm7, xmm7 ; sse accumulator
mov rsi, arg(0) ;ref_ptr ;
mov rdi, arg(2) ;src_ptr ;
movsxd rcx, dword ptr arg(4) ;Height ;
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
pxor xmm0, xmm0 ;
movdqu xmm5, XMMWORD PTR [rsi]
movdqu xmm3, XMMWORD PTR [rsi+1]
pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3) horizontal line 1
lea rsi, [rsi + rax]
vp8_half_horiz_vert_variance16x_h_1:
movdqu xmm1, XMMWORD PTR [rsi] ;
movdqu xmm2, XMMWORD PTR [rsi+1] ;
pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm2) horizontal line i+1
pavgb xmm5, xmm1 ; xmm = vertical average of the above
movdqa xmm4, xmm5
punpcklbw xmm5, xmm0 ; xmm5 = words of above
punpckhbw xmm4, xmm0
movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
punpcklbw xmm3, xmm0 ; xmm3 = words of above
psubw xmm5, xmm3 ; xmm5 -= xmm3
movq xmm3, QWORD PTR [rdi+8]
punpcklbw xmm3, xmm0
psubw xmm4, xmm3
paddw xmm6, xmm5 ; xmm6 += accumulated column differences
paddw xmm6, xmm4
pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
pmaddwd xmm4, xmm4
paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
paddd xmm7, xmm4
movdqa xmm5, xmm1 ; save xmm1 for use on the next row
lea rsi, [rsi + rax]
lea rdi, [rdi + rdx]
sub rcx, 1 ;
jnz vp8_half_horiz_vert_variance16x_h_1 ;
pxor xmm1, xmm1
pxor xmm5, xmm5
punpcklwd xmm0, xmm6
punpckhwd xmm1, xmm6
psrad xmm0, 16
psrad xmm1, 16
paddd xmm0, xmm1
movdqa xmm1, xmm0
movdqa xmm6, xmm7
punpckldq xmm6, xmm5
punpckhdq xmm7, xmm5
paddd xmm6, xmm7
punpckldq xmm0, xmm5
punpckhdq xmm1, xmm5
paddd xmm0, xmm1
movdqa xmm7, xmm6
movdqa xmm1, xmm0
psrldq xmm7, 8
psrldq xmm1, 8
paddd xmm6, xmm7
paddd xmm0, xmm1
mov rsi, arg(5) ;[Sum]
mov rdi, arg(6) ;[SSE]
movd [rsi], xmm0
movd [rdi], xmm6
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp8_half_vert_variance8x_h_sse2
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; int *sum,
; unsigned int *sumsquared
;)
global sym(vp8_half_vert_variance8x_h_sse2)
sym(vp8_half_vert_variance8x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
@@ -912,7 +1060,7 @@ sym(vp8_half_vert_variance16x_h_sse2):
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
pxor xmm0, xmm0 ;
vp8_half_vert_variance16x_h_1:
vp8_half_vert_variance8x_h_1:
movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
movq xmm3, QWORD PTR [rsi+rax] ; xmm3 = s1,s2,s3..s9
@@ -936,7 +1084,7 @@ vp8_half_vert_variance16x_h_1:
%endif
sub rcx, 1 ;
jnz vp8_half_vert_variance16x_h_1 ;
jnz vp8_half_vert_variance8x_h_1 ;
movdq2q mm6, xmm6 ;
movdq2q mm7, xmm7 ;
@@ -983,8 +1131,7 @@ vp8_half_vert_variance16x_h_1:
pop rbp
ret
;void vp8_half_horiz_variance16x_h_sse2
;void vp8_half_vert_variance16x_h_sse2
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
@@ -994,8 +1141,116 @@ vp8_half_vert_variance16x_h_1:
; int *sum,
; unsigned int *sumsquared
;)
global sym(vp8_half_horiz_variance16x_h_sse2)
sym(vp8_half_horiz_variance16x_h_sse2):
global sym(vp8_half_vert_variance16x_h_sse2)
sym(vp8_half_vert_variance16x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM
GET_GOT rbx
push rsi
push rdi
; end prolog
pxor xmm6, xmm6 ; error accumulator
pxor xmm7, xmm7 ; sse accumulator
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
movdqu xmm5, XMMWORD PTR [rsi]
lea rsi, [rsi + rax ]
pxor xmm0, xmm0
vp8_half_vert_variance16x_h_1:
movdqu xmm3, XMMWORD PTR [rsi]
pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3)
movdqa xmm4, xmm5
punpcklbw xmm5, xmm0
punpckhbw xmm4, xmm0
movq xmm2, QWORD PTR [rdi]
punpcklbw xmm2, xmm0
psubw xmm5, xmm2
movq xmm2, QWORD PTR [rdi+8]
punpcklbw xmm2, xmm0
psubw xmm4, xmm2
paddw xmm6, xmm5 ; xmm6 += accumulated column differences
paddw xmm6, xmm4
pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
pmaddwd xmm4, xmm4
paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
paddd xmm7, xmm4
movdqa xmm5, xmm3
lea rsi, [rsi + rax]
lea rdi, [rdi + rdx]
sub rcx, 1
jnz vp8_half_vert_variance16x_h_1
pxor xmm1, xmm1
pxor xmm5, xmm5
punpcklwd xmm0, xmm6
punpckhwd xmm1, xmm6
psrad xmm0, 16
psrad xmm1, 16
paddd xmm0, xmm1
movdqa xmm1, xmm0
movdqa xmm6, xmm7
punpckldq xmm6, xmm5
punpckhdq xmm7, xmm5
paddd xmm6, xmm7
punpckldq xmm0, xmm5
punpckhdq xmm1, xmm5
paddd xmm0, xmm1
movdqa xmm7, xmm6
movdqa xmm1, xmm0
psrldq xmm7, 8
psrldq xmm1, 8
paddd xmm6, xmm7
paddd xmm0, xmm1
mov rsi, arg(5) ;[Sum]
mov rdi, arg(6) ;[SSE]
movd [rsi], xmm0
movd [rdi], xmm6
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp8_half_horiz_variance8x_h_sse2
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; int *sum,
; unsigned int *sumsquared
;)
global sym(vp8_half_horiz_variance8x_h_sse2)
sym(vp8_half_horiz_variance8x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
@@ -1017,7 +1272,7 @@ sym(vp8_half_horiz_variance16x_h_sse2):
movsxd rcx, dword ptr arg(4) ;Height ;
pxor xmm0, xmm0 ;
vp8_half_horiz_variance16x16_1:
vp8_half_horiz_variance8x_h_1:
movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9
@@ -1040,7 +1295,7 @@ vp8_half_horiz_variance16x16_1:
add rdi, r9
%endif
sub rcx, 1 ;
jnz vp8_half_horiz_variance16x16_1 ;
jnz vp8_half_horiz_variance8x_h_1 ;
movdq2q mm6, xmm6 ;
movdq2q mm7, xmm7 ;
@@ -1087,6 +1342,109 @@ vp8_half_horiz_variance16x16_1:
pop rbp
ret
;void vp8_half_horiz_variance16x_h_sse2
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; int *sum,
; unsigned int *sumsquared
;)
global sym(vp8_half_horiz_variance16x_h_sse2)
sym(vp8_half_horiz_variance16x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM
GET_GOT rbx
push rsi
push rdi
; end prolog
pxor xmm6, xmm6 ; error accumulator
pxor xmm7, xmm7 ; sse accumulator
mov rsi, arg(0) ;ref_ptr ;
mov rdi, arg(2) ;src_ptr ;
movsxd rcx, dword ptr arg(4) ;Height ;
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
pxor xmm0, xmm0 ;
vp8_half_horiz_variance16x_h_1:
movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15
movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16
pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3)
movdqa xmm1, xmm5
punpcklbw xmm5, xmm0 ; xmm5 = words of above
punpckhbw xmm1, xmm0
movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
punpcklbw xmm3, xmm0 ; xmm3 = words of above
movq xmm2, QWORD PTR [rdi+8]
punpcklbw xmm2, xmm0
psubw xmm5, xmm3 ; xmm5 -= xmm3
psubw xmm1, xmm2
paddw xmm6, xmm5 ; xmm6 += accumulated column differences
paddw xmm6, xmm1
pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
pmaddwd xmm1, xmm1
paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
paddd xmm7, xmm1
lea rsi, [rsi + rax]
lea rdi, [rdi + rdx]
sub rcx, 1 ;
jnz vp8_half_horiz_variance16x_h_1 ;
pxor xmm1, xmm1
pxor xmm5, xmm5
punpcklwd xmm0, xmm6
punpckhwd xmm1, xmm6
psrad xmm0, 16
psrad xmm1, 16
paddd xmm0, xmm1
movdqa xmm1, xmm0
movdqa xmm6, xmm7
punpckldq xmm6, xmm5
punpckhdq xmm7, xmm5
paddd xmm6, xmm7
punpckldq xmm0, xmm5
punpckhdq xmm1, xmm5
paddd xmm0, xmm1
movdqa xmm7, xmm6
movdqa xmm1, xmm0
psrldq xmm7, 8
psrldq xmm1, 8
paddd xmm6, xmm7
paddd xmm0, xmm1
mov rsi, arg(5) ;[Sum]
mov rdi, arg(6) ;[SSE]
movd [rsi], xmm0
movd [rdi], xmm6
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
; short xmm_bi_rd[8] = { 64, 64, 64, 64, 64, 64, 64, 64 };

@@ -0,0 +1,348 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
%define xmm_filter_shift 7
;void vp8_filter_block2d_bil_var_ssse3
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; int xoffset,
; int yoffset,
; int *sum,
; unsigned int *sumsquared;;
;
;)
;Note: The filter coefficient at offset=0 is 128. Since the second register
;for pmaddubsw holds signed bytes, we must handle the zero offset separately.
global sym(vp8_filter_block2d_bil_var_ssse3)
sym(vp8_filter_block2d_bil_var_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 9
SAVE_XMM
GET_GOT rbx
push rsi
push rdi
push rbx
; end prolog
pxor xmm6, xmm6
pxor xmm7, xmm7
lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
movsxd rax, dword ptr arg(5) ; xoffset
cmp rax, 0 ; skip first_pass filter if xoffset=0
je filter_block2d_bil_var_ssse3_sp_only
shl rax, 4 ; point to filter coeff with xoffset
lea rax, [rax + rcx] ; HFilter
movsxd rdx, dword ptr arg(6) ; yoffset
cmp rdx, 0 ; skip second_pass filter if yoffset=0
je filter_block2d_bil_var_ssse3_fp_only
shl rdx, 4
lea rdx, [rdx + rcx] ; VFilter
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movdqu xmm0, XMMWORD PTR [rsi]
movdqu xmm1, XMMWORD PTR [rsi+1]
movdqa xmm2, xmm0
punpcklbw xmm0, xmm1
punpckhbw xmm2, xmm1
pmaddubsw xmm0, [rax]
pmaddubsw xmm2, [rax]
paddw xmm0, [GLOBAL(xmm_bi_rd)]
paddw xmm2, [GLOBAL(xmm_bi_rd)]
psraw xmm0, xmm_filter_shift
psraw xmm2, xmm_filter_shift
packuswb xmm0, xmm2
movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line
lea rsi, [rsi + rbx]
%if ABI_IS_32BIT=0
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
%endif
filter_block2d_bil_var_ssse3_loop:
movdqu xmm1, XMMWORD PTR [rsi]
movdqu xmm2, XMMWORD PTR [rsi+1]
movdqa xmm3, xmm1
punpcklbw xmm1, xmm2
punpckhbw xmm3, xmm2
pmaddubsw xmm1, [rax]
pmaddubsw xmm3, [rax]
paddw xmm1, [GLOBAL(xmm_bi_rd)]
paddw xmm3, [GLOBAL(xmm_bi_rd)]
psraw xmm1, xmm_filter_shift
psraw xmm3, xmm_filter_shift
packuswb xmm1, xmm3
movdqa xmm2, xmm0
movdqa xmm0, xmm1
movdqa xmm3, xmm2
punpcklbw xmm2, xmm1
punpckhbw xmm3, xmm1
pmaddubsw xmm2, [rdx]
pmaddubsw xmm3, [rdx]
paddw xmm2, [GLOBAL(xmm_bi_rd)]
paddw xmm3, [GLOBAL(xmm_bi_rd)]
psraw xmm2, xmm_filter_shift
psraw xmm3, xmm_filter_shift
movq xmm1, QWORD PTR [rdi]
pxor xmm4, xmm4
punpcklbw xmm1, xmm4
movq xmm5, QWORD PTR [rdi+8]
punpcklbw xmm5, xmm4
psubw xmm2, xmm1
psubw xmm3, xmm5
paddw xmm6, xmm2
paddw xmm6, xmm3
pmaddwd xmm2, xmm2
pmaddwd xmm3, xmm3
paddd xmm7, xmm2
paddd xmm7, xmm3
lea rsi, [rsi + rbx] ;ref_pixels_per_line
%if ABI_IS_32BIT
add rdi, dword ptr arg(3) ;src_pixels_per_line
%else
lea rdi, [rdi + r9]
%endif
sub rcx, 1
jnz filter_block2d_bil_var_ssse3_loop
jmp filter_block2d_bil_variance
filter_block2d_bil_var_ssse3_sp_only:
movsxd rdx, dword ptr arg(6) ; yoffset
cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0
je filter_block2d_bil_var_ssse3_full_pixel
shl rdx, 4
lea rdx, [rdx + rcx] ; VFilter
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
movdqu xmm1, XMMWORD PTR [rsi]
movdqa xmm0, xmm1
movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
lea rsi, [rsi + rax]
filter_block2d_bil_sp_only_loop:
movdqu xmm3, XMMWORD PTR [rsi]
movdqa xmm2, xmm1
movdqa xmm0, xmm3
punpcklbw xmm1, xmm3
punpckhbw xmm2, xmm3
pmaddubsw xmm1, [rdx]
pmaddubsw xmm2, [rdx]
paddw xmm1, [GLOBAL(xmm_bi_rd)]
paddw xmm2, [GLOBAL(xmm_bi_rd)]
psraw xmm1, xmm_filter_shift
psraw xmm2, xmm_filter_shift
movq xmm3, QWORD PTR [rdi]
pxor xmm4, xmm4
punpcklbw xmm3, xmm4
movq xmm5, QWORD PTR [rdi+8]
punpcklbw xmm5, xmm4
psubw xmm1, xmm3
psubw xmm2, xmm5
paddw xmm6, xmm1
paddw xmm6, xmm2
pmaddwd xmm1, xmm1
pmaddwd xmm2, xmm2
paddd xmm7, xmm1
paddd xmm7, xmm2
movdqa xmm1, xmm0
lea rsi, [rsi + rax] ;ref_pixels_per_line
lea rdi, [rdi + rbx] ;src_pixels_per_line
sub rcx, 1
jnz filter_block2d_bil_sp_only_loop
jmp filter_block2d_bil_variance
filter_block2d_bil_var_ssse3_full_pixel:
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
pxor xmm0, xmm0
filter_block2d_bil_full_pixel_loop:
movq xmm1, QWORD PTR [rsi]
punpcklbw xmm1, xmm0
movq xmm2, QWORD PTR [rsi+8]
punpcklbw xmm2, xmm0
movq xmm3, QWORD PTR [rdi]
punpcklbw xmm3, xmm0
movq xmm4, QWORD PTR [rdi+8]
punpcklbw xmm4, xmm0
psubw xmm1, xmm3
psubw xmm2, xmm4
paddw xmm6, xmm1
paddw xmm6, xmm2
pmaddwd xmm1, xmm1
pmaddwd xmm2, xmm2
paddd xmm7, xmm1
paddd xmm7, xmm2
lea rsi, [rsi + rax] ;ref_pixels_per_line
lea rdi, [rdi + rbx] ;src_pixels_per_line
sub rcx, 1
jnz filter_block2d_bil_full_pixel_loop
jmp filter_block2d_bil_variance
filter_block2d_bil_var_ssse3_fp_only:
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
pxor xmm0, xmm0
movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
filter_block2d_bil_fp_only_loop:
movdqu xmm1, XMMWORD PTR [rsi]
movdqu xmm2, XMMWORD PTR [rsi+1]
movdqa xmm3, xmm1
punpcklbw xmm1, xmm2
punpckhbw xmm3, xmm2
pmaddubsw xmm1, [rax]
pmaddubsw xmm3, [rax]
paddw xmm1, [GLOBAL(xmm_bi_rd)]
paddw xmm3, [GLOBAL(xmm_bi_rd)]
psraw xmm1, xmm_filter_shift
psraw xmm3, xmm_filter_shift
movq xmm2, QWORD PTR [rdi]
pxor xmm4, xmm4
punpcklbw xmm2, xmm4
movq xmm5, QWORD PTR [rdi+8]
punpcklbw xmm5, xmm4
psubw xmm1, xmm2
psubw xmm3, xmm5
paddw xmm6, xmm1
paddw xmm6, xmm3
pmaddwd xmm1, xmm1
pmaddwd xmm3, xmm3
paddd xmm7, xmm1
paddd xmm7, xmm3
lea rsi, [rsi + rdx]
lea rdi, [rdi + rbx] ;src_pixels_per_line
sub rcx, 1
jnz filter_block2d_bil_fp_only_loop
jmp filter_block2d_bil_variance
filter_block2d_bil_variance:
pxor xmm0, xmm0
pxor xmm1, xmm1
pxor xmm5, xmm5
punpcklwd xmm0, xmm6
punpckhwd xmm1, xmm6
psrad xmm0, 16
psrad xmm1, 16
paddd xmm0, xmm1
movdqa xmm1, xmm0
movdqa xmm6, xmm7
punpckldq xmm6, xmm5
punpckhdq xmm7, xmm5
paddd xmm6, xmm7
punpckldq xmm0, xmm5
punpckhdq xmm1, xmm5
paddd xmm0, xmm1
movdqa xmm7, xmm6
movdqa xmm1, xmm0
psrldq xmm7, 8
psrldq xmm1, 8
paddd xmm6, xmm7
paddd xmm0, xmm1
mov rsi, arg(7) ;[Sum]
mov rdi, arg(8) ;[SSE]
movd [rsi], xmm0
movd [rdi], xmm6
; begin epilog
pop rbx
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
align 16
xmm_bi_rd:
times 8 dw 64
align 16
vp8_bilinear_filters_ssse3:
times 8 db 128, 0
times 8 db 112, 16
times 8 db 96, 32
times 8 db 80, 48
times 8 db 64, 64
times 8 db 48, 80
times 8 db 32, 96
times 8 db 16, 112
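Each table row is one (a, b) tap pair with a + b = 128, repeated eight times so that pmaddubsw (unsigned bytes times signed bytes, adjacent products summed) filters eight pixel pairs per instruction; a pair of pixels therefore filters as

    out = (a * p0 + b * p1 + 64) >> 7   /* 64 is the xmm_bi_rd rounding term */

The offset-0 tap would have to be 128, which does not fit in a signed byte, which is why the zero-offset cases branch to the unfiltered paths as noted at the top of the file.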

@@ -53,13 +53,6 @@ extern unsigned int vp8_get4x4var_mmx
unsigned int *SSE,
int *Sum
);
extern unsigned int vp8_get4x4sse_cs_mmx
(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride
);
extern void vp8_filter_block2d_bil4x4_var_mmx
(
const unsigned char *ref_ptr,
@@ -92,39 +85,6 @@ extern unsigned int vp8_get16x16pred_error_mmx
);
void vp8_test_get_mb_ss(void)
{
short zz[] =
{
-4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
-2, -2, -2, -2, 2, 2, 2, 2, -2, -2, -2, -2, 2, 2, 2, 2,
-4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
-4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
-4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
-4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
-4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
-4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
-4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
-4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
-4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
-4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
-4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
-3, -3, -3, -3, 3, 3, 3, 3, -3, -3, -3, -3, 3, 3, 3, 3,
-4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
-4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
};
int s = 0, x = vp8_get_mb_ss_mmx(zz);
{
int y;
for (y = 0; y < 256; y++)
s += (zz[y] * zz[y]);
}
x += 0;
}
unsigned int vp8_get16x16var_mmx(
const unsigned char *src_ptr,
int source_stride,
@@ -456,146 +416,6 @@ unsigned int vp8_sub_pixel_variance8x16_mmx
return (xxsum - ((xsum * xsum) >> 7));
}
unsigned int vp8_i_variance16x16_mmx(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int sse0, sse1, sse2, sse3, var;
int sum0, sum1, sum2, sum3, avg;
vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
vp8_get8x8var_mmx(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse2, &sum2) ;
vp8_get8x8var_mmx(src_ptr + (source_stride >> 1) + 8, source_stride, ref_ptr + (recon_stride >> 1) + 8, recon_stride, &sse3, &sum3);
var = sse0 + sse1 + sse2 + sse3;
avg = sum0 + sum1 + sum2 + sum3;
*sse = var;
return (var - ((avg * avg) >> 8));
}
unsigned int vp8_i_variance8x16_mmx(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int sse0, sse1, var;
int sum0, sum1, avg;
vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
vp8_get8x8var_mmx(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse1, &sum1) ;
var = sse0 + sse1;
avg = sum0 + sum1;
*sse = var;
return (var - ((avg * avg) >> 7));
}
unsigned int vp8_i_sub_pixel_variance16x16_mmx
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
int xsum0, xsum1;
unsigned int xxsum0, xxsum1;
int f2soffset = (src_pixels_per_line >> 1);
int f2doffset = (dst_pixels_per_line >> 1);
vp8_filter_block2d_bil_var_mmx(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
&xsum0, &xxsum0
);
vp8_filter_block2d_bil_var_mmx(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 8,
vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
&xsum1, &xxsum1
);
xsum0 += xsum1;
xxsum0 += xxsum1;
vp8_filter_block2d_bil_var_mmx(
src_ptr + f2soffset, src_pixels_per_line,
dst_ptr + f2doffset, dst_pixels_per_line, 8,
vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
&xsum1, &xxsum1
);
xsum0 += xsum1;
xxsum0 += xxsum1;
vp8_filter_block2d_bil_var_mmx(
src_ptr + f2soffset + 8, src_pixels_per_line,
dst_ptr + f2doffset + 8, dst_pixels_per_line, 8,
vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
&xsum1, &xxsum1
);
xsum0 += xsum1;
xxsum0 += xxsum1;
*sse = xxsum0;
return (xxsum0 - ((xsum0 * xsum0) >> 8));
}
unsigned int vp8_i_sub_pixel_variance8x16_mmx
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
int xsum0, xsum1;
unsigned int xxsum0, xxsum1;
int f2soffset = (src_pixels_per_line >> 1);
int f2doffset = (dst_pixels_per_line >> 1);
vp8_filter_block2d_bil_var_mmx(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
&xsum0, &xxsum0
);
vp8_filter_block2d_bil_var_mmx(
src_ptr + f2soffset, src_pixels_per_line,
dst_ptr + f2doffset, dst_pixels_per_line, 8,
vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
&xsum1, &xxsum1
);
xsum0 += xsum1;
xxsum0 += xxsum1;
*sse = xxsum0;
return (xxsum0 - ((xsum0 * xsum0) >> 7));
}
unsigned int vp8_variance_halfpixvar16x16_h_mmx(
const unsigned char *src_ptr,

@@ -81,6 +81,16 @@ void vp8_filter_block2d_bil_var_sse2
int *sum,
unsigned int *sumsquared
);
void vp8_half_horiz_vert_variance8x_h_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared
);
void vp8_half_horiz_vert_variance16x_h_sse2
(
const unsigned char *ref_ptr,
@@ -91,6 +101,16 @@ void vp8_half_horiz_vert_variance16x_h_sse2
int *sum,
unsigned int *sumsquared
);
void vp8_half_horiz_variance8x_h_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared
);
void vp8_half_horiz_variance16x_h_sse2
(
const unsigned char *ref_ptr,
@@ -101,6 +121,16 @@ void vp8_half_horiz_variance16x_h_sse2
int *sum,
unsigned int *sumsquared
);
void vp8_half_vert_variance8x_h_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared
);
void vp8_half_vert_variance16x_h_sse2
(
const unsigned char *ref_ptr,
@@ -262,21 +292,21 @@ unsigned int vp8_sub_pixel_variance8x8_wmt
if (xoffset == 4 && yoffset == 0)
{
vp8_half_horiz_variance16x_h_sse2(
vp8_half_horiz_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum, &xxsum);
}
else if (xoffset == 0 && yoffset == 4)
{
vp8_half_vert_variance16x_h_sse2(
vp8_half_vert_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum, &xxsum);
}
else if (xoffset == 4 && yoffset == 4)
{
vp8_half_horiz_vert_variance16x_h_sse2(
vp8_half_horiz_vert_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum, &xxsum);
@@ -317,11 +347,6 @@ unsigned int vp8_sub_pixel_variance16x16_wmt
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
vp8_half_horiz_variance16x_h_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 16,
&xsum1, &xxsum1);
}
else if (xoffset == 0 && yoffset == 4)
{
@@ -329,11 +354,6 @@ unsigned int vp8_sub_pixel_variance16x16_wmt
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
vp8_half_vert_variance16x_h_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 16,
&xsum1, &xxsum1);
}
else if (xoffset == 4 && yoffset == 4)
{
@@ -341,11 +361,6 @@ unsigned int vp8_sub_pixel_variance16x16_wmt
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
vp8_half_horiz_vert_variance16x_h_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 16,
&xsum1, &xxsum1);
}
else
{
@@ -356,17 +371,16 @@ unsigned int vp8_sub_pixel_variance16x16_wmt
&xsum0, &xxsum0
);
vp8_filter_block2d_bil_var_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 16,
xoffset, yoffset,
&xsum1, &xxsum1
);
xsum0 += xsum1;
xxsum0 += xxsum1;
}
xsum0 += xsum1;
xxsum0 += xxsum1;
*sse = xxsum0;
return (xxsum0 - ((xsum0 * xsum0) >> 8));
}
@@ -406,11 +420,6 @@ unsigned int vp8_sub_pixel_variance16x8_wmt
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
vp8_half_horiz_variance16x_h_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 8,
&xsum1, &xxsum1);
}
else if (xoffset == 0 && yoffset == 4)
{
@@ -418,11 +427,6 @@ unsigned int vp8_sub_pixel_variance16x8_wmt
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
vp8_half_vert_variance16x_h_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 8,
&xsum1, &xxsum1);
}
else if (xoffset == 4 && yoffset == 4)
{
@@ -430,11 +434,6 @@ unsigned int vp8_sub_pixel_variance16x8_wmt
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
vp8_half_horiz_vert_variance16x_h_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 8,
&xsum1, &xxsum1);
}
else
{
@@ -449,11 +448,10 @@ unsigned int vp8_sub_pixel_variance16x8_wmt
dst_ptr + 8, dst_pixels_per_line, 8,
xoffset, yoffset,
&xsum1, &xxsum1);
xsum0 += xsum1;
xxsum0 += xxsum1;
}
xsum0 += xsum1;
xxsum0 += xxsum1;
*sse = xxsum0;
return (xxsum0 - ((xsum0 * xsum0) >> 7));
}
@@ -474,21 +472,21 @@ unsigned int vp8_sub_pixel_variance8x16_wmt
if (xoffset == 4 && yoffset == 0)
{
vp8_half_horiz_variance16x_h_sse2(
vp8_half_horiz_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum, &xxsum);
}
else if (xoffset == 0 && yoffset == 4)
{
vp8_half_vert_variance16x_h_sse2(
vp8_half_vert_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum, &xxsum);
}
else if (xoffset == 4 && yoffset == 4)
{
vp8_half_horiz_vert_variance16x_h_sse2(
vp8_half_horiz_vert_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum, &xxsum);
@@ -506,81 +504,6 @@ unsigned int vp8_sub_pixel_variance8x16_wmt
return (xxsum - ((xsum * xsum) >> 7));
}
unsigned int vp8_i_variance16x16_wmt(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int sse0, sse1, sse2, sse3, var;
int sum0, sum1, sum2, sum3, avg;
vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
vp8_get8x8var_sse2(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse2, &sum2) ;
vp8_get8x8var_sse2(src_ptr + (source_stride >> 1) + 8, source_stride, ref_ptr + (recon_stride >> 1) + 8, recon_stride, &sse3, &sum3);
var = sse0 + sse1 + sse2 + sse3;
avg = sum0 + sum1 + sum2 + sum3;
*sse = var;
return (var - ((avg * avg) >> 8));
}
unsigned int vp8_i_variance8x16_wmt(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int sse0, sse1, var;
int sum0, sum1, avg;
vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
vp8_get8x8var_sse2(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse1, &sum1) ;
var = sse0 + sse1;
avg = sum0 + sum1;
*sse = var;
return (var - ((avg * avg) >> 7));
}
unsigned int vp8_i_sub_pixel_variance16x16_wmt
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
return vp8_sub_pixel_variance16x16_wmt(src_ptr, (src_pixels_per_line >> 1), xoffset, yoffset, dst_ptr, (dst_pixels_per_line >> 1), sse);
}
unsigned int vp8_i_sub_pixel_variance8x16_wmt
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
return vp8_sub_pixel_variance8x16_wmt(src_ptr, (src_pixels_per_line >> 1), xoffset, yoffset, dst_ptr, (dst_pixels_per_line >> 1), sse);
}
unsigned int vp8_variance_halfpixvar16x16_h_wmt(
const unsigned char *src_ptr,
@@ -589,21 +512,14 @@ unsigned int vp8_variance_halfpixvar16x16_h_wmt(
int dst_pixels_per_line,
unsigned int *sse)
{
int xsum0, xsum1;
unsigned int xxsum0, xxsum1;
int xsum0;
unsigned int xxsum0;
vp8_half_horiz_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
vp8_half_horiz_variance16x_h_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 16,
&xsum1, &xxsum1);
xsum0 += xsum1;
xxsum0 += xxsum1;
*sse = xxsum0;
return (xxsum0 - ((xsum0 * xsum0) >> 8));
}
@@ -616,21 +532,13 @@ unsigned int vp8_variance_halfpixvar16x16_v_wmt(
int dst_pixels_per_line,
unsigned int *sse)
{
int xsum0, xsum1;
unsigned int xxsum0, xxsum1;
int xsum0;
unsigned int xxsum0;
vp8_half_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
vp8_half_vert_variance16x_h_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 16,
&xsum1, &xxsum1);
xsum0 += xsum1;
xxsum0 += xxsum1;
*sse = xxsum0;
return (xxsum0 - ((xsum0 * xsum0) >> 8));
}
@@ -643,21 +551,14 @@ unsigned int vp8_variance_halfpixvar16x16_hv_wmt(
int dst_pixels_per_line,
unsigned int *sse)
{
int xsum0, xsum1;
unsigned int xxsum0, xxsum1;
int xsum0;
unsigned int xxsum0;
vp8_half_horiz_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
vp8_half_horiz_vert_variance16x_h_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 16,
&xsum1, &xxsum1);
xsum0 += xsum1;
xxsum0 += xxsum1;
*sse = xxsum0;
return (xxsum0 - ((xsum0 * xsum0) >> 8));
}


@@ -0,0 +1,165 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vp8/encoder/variance.h"
#include "vp8/common/pragmas.h"
#include "vpx_ports/mem.h"
extern unsigned int vp8_get16x16var_sse2
(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *SSE,
int *Sum
);
extern void vp8_half_horiz_vert_variance16x_h_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared
);
extern void vp8_half_horiz_variance16x_h_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared
);
extern void vp8_half_vert_variance16x_h_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared
);
extern void vp8_filter_block2d_bil_var_ssse3
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int xoffset,
int yoffset,
int *sum,
unsigned int *sumsquared
);
unsigned int vp8_sub_pixel_variance16x16_ssse3
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
int xsum0;
unsigned int xxsum0;
// note we could avoid these if statements if the calling function
// just called the appropriate functions inside.
if (xoffset == 4 && yoffset == 0)
{
vp8_half_horiz_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
}
else if (xoffset == 0 && yoffset == 4)
{
vp8_half_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
}
else if (xoffset == 4 && yoffset == 4)
{
vp8_half_horiz_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
}
else
{
vp8_filter_block2d_bil_var_ssse3(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
xoffset, yoffset,
&xsum0, &xxsum0);
}
*sse = xxsum0;
return (xxsum0 - ((xsum0 * xsum0) >> 8));
}
unsigned int vp8_sub_pixel_variance16x8_ssse3
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
int xsum0;
unsigned int xxsum0;
if (xoffset == 4 && yoffset == 0)
{
vp8_half_horiz_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
}
else if (xoffset == 0 && yoffset == 4)
{
vp8_half_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
}
else if (xoffset == 4 && yoffset == 4)
{
vp8_half_horiz_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
}
else
{
vp8_filter_block2d_bil_var_ssse3(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
xoffset, yoffset,
&xsum0, &xxsum0);
}
*sse = xxsum0;
return (xxsum0 - ((xsum0 * xsum0) >> 7));
}
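
The three special cases above exist because the sub-pixel offsets are in eighth-pel units, and at offset 4 the two bilinear taps become equal, collapsing the filter to a rounded average (a single pavgb in the SSE2 half-pixel helpers). A scalar sketch of that equivalence, assuming VP8's (128 - 16*x, 16*x) bilinear taps:

/* General first-pass bilinear tap for an eighth-pel offset x in 0..7. */
static unsigned char bilinear_tap(unsigned char a, unsigned char b, int x)
{
    return (unsigned char)(((128 - 16 * x) * a + 16 * x * b + 64) >> 7);
}

/* At x == 4 both taps are 64, so the filter reduces to the rounding
 * average computed by the vp8_half_*_variance16x_h_sse2 helpers.    */
static unsigned char half_pel(unsigned char a, unsigned char b)
{
    return (unsigned char)((a + b + 1) >> 1);
}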


@@ -286,6 +286,8 @@ extern prototype_sad_multi_dif_address(vp8_sad4x4x4d_sse3);
#if HAVE_SSSE3
extern prototype_sad_multi_same_address(vp8_sad16x16x3_ssse3);
extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3);
extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_ssse3);
extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_ssse3);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_variance_sad16x16x3
@@ -294,6 +296,12 @@ extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3);
#undef vp8_variance_sad16x8x3
#define vp8_variance_sad16x8x3 vp8_sad16x8x3_ssse3
#undef vp8_variance_subpixvar16x8
#define vp8_variance_subpixvar16x8 vp8_sub_pixel_variance16x8_ssse3
#undef vp8_variance_subpixvar16x16
#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_ssse3
#endif
#endif


@@ -16,7 +16,7 @@
#if HAVE_MMX
void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch)
static void short_fdct8x4_mmx(short *input, short *output, int pitch)
{
vp8_short_fdct4x4_mmx(input, output, pitch);
vp8_short_fdct4x4_mmx(input + 4, output + 16, pitch);
@@ -26,7 +26,7 @@ int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
short *qcoeff_ptr, short *dequant_ptr,
short *scan_mask, short *round_ptr,
short *quant_ptr, short *dqcoeff_ptr);
void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d)
static void fast_quantize_b_mmx(BLOCK *b, BLOCKD *d)
{
short *scan_mask = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr;
short *coeff_ptr = b->coeff;
@@ -51,7 +51,7 @@ void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d)
}
int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
int vp8_mbblock_error_mmx(MACROBLOCK *mb, int dc)
static int mbblock_error_mmx(MACROBLOCK *mb, int dc)
{
short *coeff_ptr = mb->block[0].coeff;
short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff;
@@ -59,7 +59,7 @@ int vp8_mbblock_error_mmx(MACROBLOCK *mb, int dc)
}
int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
int vp8_mbuverror_mmx(MACROBLOCK *mb)
static int mbuverror_mmx(MACROBLOCK *mb)
{
short *s_ptr = &mb->coeff[256];
short *d_ptr = &mb->e_mbd.dqcoeff[256];
@@ -69,7 +69,7 @@ int vp8_mbuverror_mmx(MACROBLOCK *mb)
void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride,
short *diff, unsigned char *predictor,
int pitch);
void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
static void subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
{
unsigned char *z = *(be->base_src) + be->src;
unsigned int src_stride = be->src_stride;
@@ -85,7 +85,7 @@ int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
short *qcoeff_ptr, short *dequant_ptr,
const short *inv_scan_order, short *round_ptr,
short *quant_ptr, short *dqcoeff_ptr);
void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
static void fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
{
short *scan_mask = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr;
short *coeff_ptr = b->coeff;
@@ -115,7 +115,7 @@ int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr,
short *zbin_boost_ptr,
short *quant_shift_ptr);
void vp8_regular_quantize_b_sse2(BLOCK *b,BLOCKD *d)
static void regular_quantize_b_sse2(BLOCK *b,BLOCKD *d)
{
d->eob = vp8_regular_quantize_b_impl_sse2(b->coeff,
b->zbin,
@@ -131,7 +131,7 @@ void vp8_regular_quantize_b_sse2(BLOCK *b,BLOCKD *d)
}
int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
int vp8_mbblock_error_xmm(MACROBLOCK *mb, int dc)
static int mbblock_error_xmm(MACROBLOCK *mb, int dc)
{
short *coeff_ptr = mb->block[0].coeff;
short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff;
@@ -139,7 +139,7 @@ int vp8_mbblock_error_xmm(MACROBLOCK *mb, int dc)
}
int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
int vp8_mbuverror_xmm(MACROBLOCK *mb)
static int mbuverror_xmm(MACROBLOCK *mb)
{
short *s_ptr = &mb->coeff[256];
short *d_ptr = &mb->e_mbd.dqcoeff[256];
@@ -149,7 +149,7 @@ int vp8_mbuverror_xmm(MACROBLOCK *mb)
void vp8_subtract_b_sse2_impl(unsigned char *z, int src_stride,
short *diff, unsigned char *predictor,
int pitch);
void vp8_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch)
static void subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch)
{
unsigned char *z = *(be->base_src) + be->src;
unsigned int src_stride = be->src_stride;
@@ -165,7 +165,7 @@ int vp8_fast_quantize_b_impl_ssse3(short *coeff_ptr,
short *qcoeff_ptr, short *dequant_ptr,
short *round_ptr,
short *quant_ptr, short *dqcoeff_ptr);
void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d)
static void fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d)
{
d->eob = vp8_fast_quantize_b_impl_ssse3(
b->coeff,
@@ -176,6 +176,25 @@ void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d)
d->dqcoeff
);
}
#if CONFIG_PSNR
#if ARCH_X86_64
typedef void ssimpf
(
unsigned char *s,
int sp,
unsigned char *r,
int rp,
unsigned long *sum_s,
unsigned long *sum_r,
unsigned long *sum_sq_s,
unsigned long *sum_sq_r,
unsigned long *sum_sxr
);
extern ssimpf vp8_ssim_parms_16x16_sse3;
extern ssimpf vp8_ssim_parms_8x8_sse3;
#endif
#endif
#endif
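
The five accumulators in the ssimpf signature are exactly the sufficient statistics for SSIM over a window: means come from sum_s and sum_r, variances from the sums of squares, and the covariance from sum_sxr. A hedged sketch of how such sums combine, using the standard 8-bit SSIM constants (illustrative only, not the assembly's arithmetic):

/* SSIM over an n-pixel window from the accumulated sums.
 * C1 = (0.01*255)^2 and C2 = (0.03*255)^2 for 8-bit samples. */
static double ssim_from_sums(unsigned long sum_s, unsigned long sum_r,
                             unsigned long sum_sq_s, unsigned long sum_sq_r,
                             unsigned long sum_sxr, int n)
{
    const double c1 = 6.5025, c2 = 58.5225;
    double mean_s = (double)sum_s / n, mean_r = (double)sum_r / n;
    double var_s  = (double)sum_sq_s / n - mean_s * mean_s;
    double var_r  = (double)sum_sq_r / n - mean_r * mean_r;
    double cov    = (double)sum_sxr  / n - mean_s * mean_r;

    return ((2 * mean_s * mean_r + c1) * (2 * cov + c2)) /
           ((mean_s * mean_s + mean_r * mean_r + c1) * (var_s + var_r + c2));
}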
@@ -232,20 +251,20 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_mmx;
cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_mmx;
cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_mmx;
cpi->rtcd.fdct.short8x4 = short_fdct8x4_mmx;
cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_mmx;
cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_mmx;
cpi->rtcd.fdct.fast8x4 = short_fdct8x4_mmx;
cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_c;
cpi->rtcd.encodemb.berr = vp8_block_error_mmx;
cpi->rtcd.encodemb.mberr = vp8_mbblock_error_mmx;
cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_mmx;
cpi->rtcd.encodemb.subb = vp8_subtract_b_mmx;
cpi->rtcd.encodemb.mberr = mbblock_error_mmx;
cpi->rtcd.encodemb.mbuverr = mbuverror_mmx;
cpi->rtcd.encodemb.subb = subtract_b_mmx;
cpi->rtcd.encodemb.submby = vp8_subtract_mby_mmx;
cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_mmx;
/*cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_mmx;*/
/*cpi->rtcd.quantize.fastquantb = fast_quantize_b_mmx;*/
}
#endif
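
The assignments in this function follow libvpx's runtime-CPU-detection pattern: a table of function pointers is filled once at startup with the best kernel the host supports, and the encoder calls through the table thereafter. A minimal sketch of the idea with hypothetical names (the real table is the rtcd member of VP8_COMP):

typedef int (*sad_fn)(const unsigned char *a, const unsigned char *b, int n);

static int sad_c(const unsigned char *a, const unsigned char *b, int n)
{
    int i, s = 0;
    for (i = 0; i < n; i++)
        s += a[i] > b[i] ? a[i] - b[i] : b[i] - a[i];
    return s;
}

/* Stand-in for an assembly kernel; falls back to C here. */
static int sad_simd(const unsigned char *a, const unsigned char *b, int n)
{
    return sad_c(a, b, n);
}

struct rtcd_table { sad_fn sad; };

static void rtcd_init(struct rtcd_table *t, int cpu_has_simd)
{
    t->sad = sad_c;          /* portable default              */
    if (cpu_has_simd)
        t->sad = sad_simd;   /* more specific checks override */
}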
@@ -280,6 +299,8 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_sse2;
cpi->rtcd.variance.get8x8var = vp8_get8x8var_sse2;
cpi->rtcd.variance.get16x16var = vp8_get16x16var_sse2;
/* cpi->rtcd.variance.get4x4sse_cs not implemented for wmt */;
cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_sse2;
@@ -290,16 +311,16 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_sse2 ;
cpi->rtcd.encodemb.berr = vp8_block_error_xmm;
cpi->rtcd.encodemb.mberr = vp8_mbblock_error_xmm;
cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_xmm;
cpi->rtcd.encodemb.subb = vp8_subtract_b_sse2;
cpi->rtcd.encodemb.mberr = mbblock_error_xmm;
cpi->rtcd.encodemb.mbuverr = mbuverror_xmm;
cpi->rtcd.encodemb.subb = subtract_b_sse2;
cpi->rtcd.encodemb.submby = vp8_subtract_mby_sse2;
cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_sse2;
#if ARCH_X86
cpi->rtcd.quantize.quantb = vp8_regular_quantize_b_sse2;
cpi->rtcd.quantize.quantb = regular_quantize_b_sse2;
#endif
cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_sse2;
cpi->rtcd.quantize.fastquantb = fast_quantize_b_sse2;
#if !(CONFIG_REALTIME_ONLY)
cpi->rtcd.temporal.apply = vp8_temporal_filter_apply_sse2;
@@ -334,11 +355,23 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_ssse3;
cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_ssse3;
cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_ssse3;
cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_ssse3;
cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_ssse3;
cpi->rtcd.quantize.fastquantb = fast_quantize_b_ssse3;
#if CONFIG_PSNR
#if ARCH_X86_64
cpi->rtcd.variance.ssimpf_8x8 = vp8_ssim_parms_8x8_sse3;
cpi->rtcd.variance.ssimpf = vp8_ssim_parms_16x16_sse3;
#endif
#endif
}
#endif
#if HAVE_SSE4_1
if (SSE4_1Enabled)
{


@@ -24,6 +24,7 @@ VP8_COMMON_SRCS-yes += common/entropymode.c
VP8_COMMON_SRCS-yes += common/entropymv.c
VP8_COMMON_SRCS-yes += common/extend.c
VP8_COMMON_SRCS-yes += common/filter.c
VP8_COMMON_SRCS-yes += common/filter.h
VP8_COMMON_SRCS-yes += common/findnearmv.c
VP8_COMMON_SRCS-yes += common/generic/systemdependent.c
VP8_COMMON_SRCS-yes += common/idctllm.c
@@ -68,7 +69,7 @@ VP8_COMMON_SRCS-yes += common/reconintra.c
VP8_COMMON_SRCS-yes += common/reconintra4x4.c
VP8_COMMON_SRCS-yes += common/setupintrarecon.c
VP8_COMMON_SRCS-yes += common/swapyv12buffer.c
VP8_COMMON_SRCS-yes += common/textblit.c
VP8_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/textblit.c
VP8_COMMON_SRCS-yes += common/treecoder.c
VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/idct_x86.h


@@ -199,7 +199,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
{
int mb_r = (cfg->g_h + 15) / 16;
int mb_c = (cfg->g_w + 15) / 16;
size_t packet_sz = vp8_firstpass_stats_sz(mb_r * mb_c);
size_t packet_sz = sizeof(FIRSTPASS_STATS);
int n_packets = cfg->rc_twopass_stats_in.sz / packet_sz;
FIRSTPASS_STATS *stats;
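
With the packet size pinned to sizeof(FIRSTPASS_STATS), validating the supplied two-pass stats buffer reduces to a divisibility check. Roughly (a sketch with a hypothetical helper, not the full validate_config logic):

static vpx_codec_err_t check_stats_buf(const vpx_fixed_buf_t *stats)
{
    size_t packet_sz = sizeof(FIRSTPASS_STATS);

    if (stats->sz % packet_sz)      /* truncated packet        */
        return VPX_CODEC_INVALID_PARAM;
    if (stats->sz / packet_sz < 2)  /* frame packets + totals  */
        return VPX_CODEC_INVALID_PARAM;
    return VPX_CODEC_OK;
}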


@@ -168,7 +168,7 @@ static void *mmap_lkup(vpx_codec_alg_priv_t *ctx, unsigned int id)
{
int i;
for (i = 0; i < NELEMENTS(vp8_mem_req_segs); i++)
for (i = 0; i < NELEMENTS(ctx->mmaps); i++)
if (ctx->mmaps[i].id == id)
return ctx->mmaps[i].base;
@@ -176,25 +176,7 @@ static void *mmap_lkup(vpx_codec_alg_priv_t *ctx, unsigned int id)
}
static void vp8_finalize_mmaps(vpx_codec_alg_priv_t *ctx)
{
/*
ctx->pbi = mmap_lkup(ctx, VP6_SEG_PB_INSTANCE);
ctx->pbi->mbi.block_dx_info[0].idct_output_ptr = mmap_lkup(ctx, VP6_SEG_IDCT_BUFFER);
ctx->pbi->loop_filtered_block = mmap_lkup(ctx, VP6_SEG_LF_BLOCK);
ctx->pbi->huff = mmap_lkup(ctx, VP6_SEG_HUFF);
ctx->pbi->mbi.coeffs_base_ptr = mmap_lkup(ctx, VP6_SEG_COEFFS);
ctx->pbi->fc.above_y = mmap_lkup(ctx, VP6_SEG_ABOVEY);
ctx->pbi->fc.above_u = mmap_lkup(ctx, VP6_SEG_ABOVEU);
ctx->pbi->fc.above_v = mmap_lkup(ctx, VP6_SEG_ABOVEV);
ctx->pbi->prediction_mode = mmap_lkup(ctx, VP6_SEG_PRED_MODES);
ctx->pbi->mbmotion_vector = mmap_lkup(ctx, VP6_SEG_MV_FIELD);
ctx->pbi->fb_storage_ptr[0] = mmap_lkup(ctx, VP6_SEG_IMG0_STRG);
ctx->pbi->fb_storage_ptr[1] = mmap_lkup(ctx, VP6_SEG_IMG1_STRG);
ctx->pbi->fb_storage_ptr[2] = mmap_lkup(ctx, VP6_SEG_IMG2_STRG);
#if CONFIG_POSTPROC
ctx->pbi->postproc.deblock.fragment_variances = mmap_lkup(ctx, VP6_SEG_DEBLOCKER);
ctx->pbi->fb_storage_ptr[3] = mmap_lkup(ctx, VP6_SEG_PP_IMG_STRG);
#endif
*/
/* nothing to clean up */
}
static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx)
@@ -543,7 +525,7 @@ static vpx_codec_err_t vp8_xma_set_mmap(vpx_codec_ctx_t *ctx,
if (!res && ctx->priv->alg_priv)
{
for (i = 0; i < NELEMENTS(vp8_mem_req_segs); i++)
for (i = 0; i < NELEMENTS(ctx->priv->alg_priv->mmaps); i++)
{
if (ctx->priv->alg_priv->mmaps[i].id == mmap->id)
if (!ctx->priv->alg_priv->mmaps[i].base)


@@ -110,10 +110,13 @@ VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm
VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/sad_ssse3.asm
VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_ssse3.c
VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_impl_ssse3.asm
VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm
VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/sad_sse4.asm
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm
VP8_CX_SRCS-$(ARCH_X86_64) += encoder/x86/ssim_opt.asm
ifeq ($(CONFIG_REALTIME_ONLY),yes)
VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm


@@ -19,6 +19,7 @@ VP8_CX_SRCS-$(ARCH_ARM) += encoder/asm_enc_offsets.c
VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/encodemb_arm.c
VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/quantize_arm.c
VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/picklpf_arm.c
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/dct_arm.c
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/variance_arm.c
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/variance_arm.h
VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/boolhuff_arm.c
@@ -34,8 +35,12 @@ VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/armv5te/vp8_packtokens_partitions_ar
#File list for armv6
# encoder
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_fast_fdct4x4_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_fast_quantize_b_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_sad16x16_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_variance16x16_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_mse16x16_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_variance8x8_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/walsh_v6$(ASM)
#File list for neon


@@ -332,7 +332,7 @@ typedef struct vpx_codec_priv_cb_pair
* extended in one of two ways. First, a second, algorithm specific structure
* can be allocated and the priv member pointed to it. Alternatively, this
* structure can be made the first member of the algorithm specific structure,
* and the pointer casted to the proper type.
* and the pointer cast to the proper type.
*/
struct vpx_codec_priv
{


@@ -36,6 +36,8 @@ vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx,
res = VPX_CODEC_INCAPABLE;
else if ((flags & VPX_CODEC_USE_POSTPROC) && !(iface->caps & VPX_CODEC_CAP_POSTPROC))
res = VPX_CODEC_INCAPABLE;
else if (!(iface->caps & VPX_CODEC_CAP_DECODER))
res = VPX_CODEC_INCAPABLE;
else
{
memset(ctx, 0, sizeof(*ctx));
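
With this check in place, handing an encoder interface to the decoder initializer fails cleanly instead of producing a half-valid context. A hedged usage sketch, assuming the vp8 encoder interface name vpx_codec_vp8_cx_algo from vp8cx.h:

#include "vpx/vpx_decoder.h"
#include "vpx/vp8cx.h"  /* encoder interface -- deliberately the wrong one */

void demo(void)
{
    vpx_codec_ctx_t ctx;

    /* An encoder iface lacks VPX_CODEC_CAP_DECODER, so this now
     * returns VPX_CODEC_INCAPABLE rather than proceeding.       */
    if (vpx_codec_dec_init(&ctx, &vpx_codec_vp8_cx_algo, NULL, 0)
        == VPX_CODEC_INCAPABLE)
    {
        /* report and bail out */
    }
}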


@@ -498,7 +498,7 @@ extern "C" {
* Iterates over a list of the segments to allocate. The iterator storage
* should be initialized to NULL to start the iteration. Iteration is complete
* when this function returns VPX_CODEC_LIST_END. The amount of memory needed to
* allocate is dependant upon the size of the encoded stream. In cases where the
* allocate is dependent upon the size of the encoded stream. In cases where the
* stream is not available at allocation time, a fixed size must be requested.
* The codec will not be able to operate on streams larger than the size used at
* allocation time.


@@ -527,7 +527,7 @@ extern "C" {
* Iterates over a list of the segments to allocate. The iterator storage
* should be initialized to NULL to start the iteration. Iteration is complete
* when this function returns VPX_DEC_LIST_END. The amount of memory needed to
* allocate is dependant upon the size of the encoded stream. This means that
* allocate is dependent upon the size of the encoded stream. This means that
* the stream info structure must be known at allocation time. It can be
* populated with the vpx_dec_peek_stream_info() function. In cases where the
* stream to be decoded is not available at allocation time, a fixed size must


@@ -168,15 +168,10 @@
%macro GET_GOT 1
push %1
call %%get_got
%%sub_offset:
jmp %%exitGG
%%get_got:
mov %1, [esp]
add %1, fake_got - %%sub_offset
ret
%%exitGG:
pop %1
%undef GLOBAL
%define GLOBAL(x) x + %1 - fake_got
%define GLOBAL(x) x + %1 - %%get_got
%undef RESTORE_GOT
%define RESTORE_GOT pop %1
%endmacro
@@ -289,7 +284,6 @@
%elifidn __OUTPUT_FORMAT__,macho32
%macro SECTION_RODATA 0
section .text
fake_got:
%endmacro
%else
%define SECTION_RODATA section .rodata


@@ -88,24 +88,3 @@ vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int
return 0;
}
/****************************************************************************
*
****************************************************************************/
int
vp8_yv12_black_frame_buffer(YV12_BUFFER_CONFIG *ybf)
{
if (ybf)
{
if (ybf->buffer_alloc)
{
duck_memset(ybf->y_buffer, 0x0, ybf->y_stride * ybf->y_height);
duck_memset(ybf->u_buffer, 0x80, ybf->uv_stride * ybf->uv_height);
duck_memset(ybf->v_buffer, 0x80, ybf->uv_stride * ybf->uv_height);
}
return 0;
}
return -1;
}


@@ -145,8 +145,8 @@ vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf)
}
void
vp8_yv12_extend_frame_borders_yonly(YV12_BUFFER_CONFIG *ybf)
static void
extend_frame_borders_yonly(YV12_BUFFER_CONFIG *ybf)
{
int i;
unsigned char *src_ptr1, *src_ptr2;
@@ -276,5 +276,5 @@ vp8_yv12_copy_frame_yonly(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_y
dest += dst_ybc->y_stride;
}
vp8_yv12_extend_frame_borders_yonly(dst_ybc);
extend_frame_borders_yonly(dst_ybc);
}


@@ -6,13 +6,13 @@ SCALE_SRCS-yes += vpxscale.h
SCALE_SRCS-yes += generic/vpxscale.c
SCALE_SRCS-yes += generic/yv12config.c
SCALE_SRCS-yes += generic/yv12extend.c
SCALE_SRCS-yes += generic/scalesystemdependant.c
SCALE_SRCS-yes += generic/scalesystemdependent.c
SCALE_SRCS-$(CONFIG_SPATIAL_RESAMPLING) += generic/gen_scalers.c
#arm
SCALE_SRCS-$(HAVE_ARMV7) += arm/scalesystemdependant.c
SCALE_SRCS-$(HAVE_ARMV7) += arm/scalesystemdependent.c
SCALE_SRCS-$(HAVE_ARMV7) += arm/yv12extend_arm.c
SCALE_SRCS_REMOVE-$(HAVE_ARMV7) += generic/scalesystemdependant.c
SCALE_SRCS_REMOVE-$(HAVE_ARMV7) += generic/scalesystemdependent.c
#neon
SCALE_SRCS-$(HAVE_ARMV7) += arm/neon/vp8_vpxyv12_copyframe_func_neon$(ASM)


@@ -11,9 +11,9 @@
/****************************************************************************
*
* Module Title : system_dependant.c
* Module Title : system_dependent.c
*
* Description : Miscellaneous system dependant functions
* Description : Miscellaneous system dependent functions
*
****************************************************************************/


@@ -63,7 +63,6 @@ extern "C"
int vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int border);
int vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf);
int vp8_yv12_black_frame_buffer(YV12_BUFFER_CONFIG *ybf);
#ifdef __cplusplus
}


@@ -163,8 +163,8 @@ int stats_open_file(stats_io_t *stats, const char *fpf, int pass)
if (!stats->buf.buf)
{
fprintf(stderr, "Failed to allocate first-pass stats buffer (%d bytes)\n",
stats->buf_alloc_sz);
fprintf(stderr, "Failed to allocate first-pass stats buffer (%lu bytes)\n",
(unsigned long)stats->buf_alloc_sz);
exit(EXIT_FAILURE);
}
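
The cast-to-unsigned-long idiom is the portable pre-C99 fix: size_t has no dedicated printf conversion until C99's %zu, and passing it to %d is undefined on LP64 targets where size_t is 64-bit. In short:

#include <stdio.h>
#include <stddef.h>

int main(void)
{
    size_t sz = 4096;
    printf("%lu bytes\n", (unsigned long)sz); /* portable C89 idiom, as above */
    printf("%zu bytes\n", sz);                /* requires C99 or later        */
    return 0;
}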
@@ -924,8 +924,14 @@ static const arg_def_t resize_up_thresh = ARG_DEF(NULL, "resize-up", 1,
"Upscale threshold (buf %)");
static const arg_def_t resize_down_thresh = ARG_DEF(NULL, "resize-down", 1,
"Downscale threshold (buf %)");
static const arg_def_t end_usage = ARG_DEF(NULL, "end-usage", 1,
"VBR=0 | CBR=1 | CQ=2");
static const struct arg_enum_list end_usage_enum[] = {
{"vbr", VPX_VBR},
{"cbr", VPX_CBR},
{"cq", VPX_CQ},
{NULL, 0}
};
static const arg_def_t end_usage = ARG_DEF_ENUM(NULL, "end-usage", 1,
"Rate control mode", end_usage_enum);
static const arg_def_t target_bitrate = ARG_DEF(NULL, "target-bitrate", 1,
"Bitrate (kbps)");
static const arg_def_t min_quantizer = ARG_DEF(NULL, "min-q", 1,
@@ -1256,7 +1262,7 @@ int main(int argc, const char **argv_)
else if (arg_match(&arg, &resize_down_thresh, argi))
cfg.rc_resize_down_thresh = arg_parse_uint(&arg);
else if (arg_match(&arg, &end_usage, argi))
cfg.rc_end_usage = arg_parse_uint(&arg);
cfg.rc_end_usage = arg_parse_enum_or_int(&arg);
else if (arg_match(&arg, &target_bitrate, argi))
cfg.rc_target_bitrate = arg_parse_uint(&arg);
else if (arg_match(&arg, &min_quantizer, argi))
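
arg_parse_enum_or_int accepts either the symbolic name or the historical number, so --end-usage=cq and --end-usage=2 both resolve to VPX_CQ. A sketch of that name-or-number lookup (illustrative only, not vpxenc's actual args.c):

#include <stdlib.h>
#include <string.h>

struct arg_enum_list { const char *name; int val; };

/* Resolve "cq" or "2" against an enum table; returns -1 if unknown. */
static int parse_enum_or_int(const struct arg_enum_list *l, const char *s)
{
    char *end;
    long v = strtol(s, &end, 10);

    if (*s && !*end)
        return (int)v;     /* plain number, e.g. "2"   */
    for (; l->name; l++)
        if (!strcmp(l->name, s))
            return l->val; /* symbolic name, e.g. "cq" */
    return -1;
}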