Initial OpenCL implementation of the VP8 decoder.

Change-Id: I74c334af09f13473ce07bbac74b0f9ea57573347
Note: very slow, but functional. Encoder is untested, but should still work.
Merge "store quant_shift as an unsigned char"
2011-04-18 13:50:23 -04:00 · 2011-04-18 10:03:40 -07:00 · 2011-04-18 09:53:26 -07:00 · 2011-04-18 07:44:34 -07:00 · 2011-04-18 09:09:57 -04:00 · 2011-04-18 06:07:57 -07:00
140 changed files with 10144 additions and 3067 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -60,3 +60,9 @@
 /vpx_config.h
 /vpx_version.h
 TAGS
+vpxdec
+vpxenc
+.project
+.cproject
+*.csv
+*.oclpj
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -331,11 +331,8 @@ ifneq ($(call enabled,DIST-SRCS),)
    DIST-SRCS-$(CONFIG_MSVS)  += build/make/gen_msvs_sln.sh
    DIST-SRCS-$(CONFIG_MSVS)  += build/x86-msvs/yasm.rules
    DIST-SRCS-$(CONFIG_RVCT) += build/make/armlink_adapter.sh
-    #
-    # This isn't really ARCH_ARM dependent, it's dependent on whether we're
-    # using assembly code or not (CONFIG_OPTIMIZATIONS maybe). Just use
-    # this for now.
-    DIST-SRCS-$(ARCH_ARM)    += build/make/obj_int_extract.c
+    # Include obj_int_extract if we use offsets from asm_*_offsets
+    DIST-SRCS-$(ARCH_ARM)$(ARCH_X86)$(ARCH_X86_64)    += build/make/obj_int_extract.c
    DIST-SRCS-$(ARCH_ARM)    += build/make/ads2gas.pl
    DIST-SRCS-yes            += $(target:-$(TOOLCHAIN)=).mk
 endif
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -729,19 +729,18 @@ process_common_toolchain() {
            add_cflags -arch ${tgt_isa}
            add_ldflags -arch_only ${tgt_isa}

-            add_cflags  "-isysroot /Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS4.2.sdk"
+            add_cflags  "-isysroot ${SDK_PATH}/SDKs/iPhoneOS4.3.sdk"

            # This should be overridable
-            alt_libc=${SDK_PATH}/SDKs/iPhoneOS4.2.sdk
+            alt_libc=${SDK_PATH}/SDKs/iPhoneOS4.3.sdk

            # Add the paths for the alternate libc
-#            for d in usr/include usr/include/gcc/darwin/4.0/; do
-            for d in usr/include usr/include/gcc/darwin/4.0/ usr/lib/gcc/arm-apple-darwin10/4.2.1/include/; do
+            for d in usr/include usr/include/gcc/darwin/4.2/ usr/lib/gcc/arm-apple-darwin10/4.2.1/include/; do
                try_dir="${alt_libc}/${d}"
                [ -d "${try_dir}" ] && add_cflags -I"${try_dir}"
            done

-            for d in lib usr/lib; do
+            for d in lib usr/lib usr/lib/system; do
                try_dir="${alt_libc}/${d}"
                [ -d "${try_dir}" ] && add_ldflags -L"${try_dir}"
            done
@@ -885,6 +884,8 @@ process_common_toolchain() {
                link_with_cc=gcc
                tune_cflags="-march="
            setup_gnu_toolchain
+                #for 32 bit x86 builds, -O3 did not turn on this flag
+                enabled optimizations && check_add_cflags -fomit-frame-pointer
                ;;
        esac

@@ -956,6 +957,38 @@ process_common_toolchain() {
        enabled rvct && check_add_cflags -Otime
        enabled small && check_add_cflags -O2 || check_add_cflags -O3
    fi
+    
+    if enabled opencl; then
+        disable multithread
+        echo "  disabling multithread"
+        soft_enable opencl #Provide output to make user comfortable
+        enable runtime_cpu_detect
+	
+        #Use dlopen() to load OpenCL when possible.
+        case ${toolchain} in
+            *darwin10*)
+                check_add_cflags -D__APPLE__
+                add_extralibs -framework OpenCL
+                ;;
+            *-win32-gcc)
+                if check_header dlfcn.h; then
+                    add_extralibs -ldl 
+                    enable dlopen
+                else
+                    #This shouldn't be a hard-coded path in the long term
+                    add_extralibs -L/cygdrive/c/Windows/System32 -lOpenCL
+                fi
+                ;;
+            *)
+                if check_header dlfcn.h; then
+                    add_extralibs -ldl 
+                    enable dlopen
+                else
+                    add_extralibs -lOpenCL
+                fi
+                ;;
+        esac
+    fi

    # Position Independent Code (PIC) support, for building relocatable
    # shared objects
--- a/build/make/obj_int_extract.c
+++ b/build/make/obj_int_extract.c
@@ -9,25 +9,13 @@
 */


+#include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>

 #include "vpx_config.h"
-
-#if defined(_MSC_VER) || defined(__MINGW32__)
-#include <io.h>
-#include <share.h>
 #include "vpx/vpx_integer.h"
-#else
-#include <stdint.h>
-#include <unistd.h>
-#endif
-
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <stdarg.h>

 typedef enum
 {
@@ -47,7 +35,6 @@ int log_msg(const char *fmt, ...)
 }

 #if defined(__GNUC__) && __GNUC__
-
 #if defined(__MACH__)

 #include <mach-o/loader.h>
@@ -225,73 +212,6 @@ bail:

 }

-int main(int argc, char **argv)
-{
-    int fd;
-    char *f;
-    struct stat stat_buf;
-    uint8_t *file_buf;
-    int res;
-
-    if (argc < 2 || argc > 3)
-    {
-        fprintf(stderr, "Usage: %s [output format] <obj file>\n\n", argv[0]);
-        fprintf(stderr, "  <obj file>\tMachO format object file to parse\n");
-        fprintf(stderr, "Output Formats:\n");
-        fprintf(stderr, "  gas  - compatible with GNU assembler\n");
-        fprintf(stderr, "  rvds - compatible with armasm\n");
-        goto bail;
-    }
-
-    f = argv[2];
-
-    if (!((!strcmp(argv[1], "rvds")) || (!strcmp(argv[1], "gas"))))
-        f = argv[1];
-
-    fd = open(f, O_RDONLY);
-
-    if (fd < 0)
-    {
-        perror("Unable to open file");
-        goto bail;
-    }
-
-    if (fstat(fd, &stat_buf))
-    {
-        perror("stat");
-        goto bail;
-    }
-
-    file_buf = malloc(stat_buf.st_size);
-
-    if (!file_buf)
-    {
-        perror("malloc");
-        goto bail;
-    }
-
-    if (read(fd, file_buf, stat_buf.st_size) != stat_buf.st_size)
-    {
-        perror("read");
-        goto bail;
-    }
-
-    if (close(fd))
-    {
-        perror("close");
-        goto bail;
-    }
-
-    res = parse_macho(file_buf, stat_buf.st_size);
-    free(file_buf);
-
-    if (!res)
-        return EXIT_SUCCESS;
-
-bail:
-    return EXIT_FAILURE;
-}
-
 #elif defined(__ELF__)
 #include "elf.h"

@@ -740,96 +660,24 @@ bail:
    return 1;
 }

-int main(int argc, char **argv)
-{
-    int fd;
-    output_fmt_t mode;
-    char *f;
-    struct stat stat_buf;
-    uint8_t *file_buf;
-    int res;
-
-    if (argc < 2 || argc > 3)
-    {
-        fprintf(stderr, "Usage: %s [output format] <obj file>\n\n", argv[0]);
-        fprintf(stderr, "  <obj file>\tELF format object file to parse\n");
-        fprintf(stderr, "Output Formats:\n");
-        fprintf(stderr, "  gas  - compatible with GNU assembler\n");
-        fprintf(stderr, "  rvds - compatible with armasm\n");
-        goto bail;
-    }
-
-    f = argv[2];
-
-    if (!strcmp(argv[1], "rvds"))
-        mode = OUTPUT_FMT_RVDS;
-    else if (!strcmp(argv[1], "gas"))
-        mode = OUTPUT_FMT_GAS;
-    else
-        f = argv[1];
-
-
-    fd = open(f, O_RDONLY);
-
-    if (fd < 0)
-    {
-        perror("Unable to open file");
-        goto bail;
-    }
-
-    if (fstat(fd, &stat_buf))
-    {
-        perror("stat");
-        goto bail;
-    }
-
-    file_buf = malloc(stat_buf.st_size);
-
-    if (!file_buf)
-    {
-        perror("malloc");
-        goto bail;
-    }
-
-    if (read(fd, file_buf, stat_buf.st_size) != stat_buf.st_size)
-    {
-        perror("read");
-        goto bail;
-    }
-
-    if (close(fd))
-    {
-        perror("close");
-        goto bail;
-    }
-
-    res = parse_elf(file_buf, stat_buf.st_size, mode);
-    free(file_buf);
-
-    if (!res)
-        return EXIT_SUCCESS;
-
-bail:
-    return EXIT_FAILURE;
-}
-#endif
 #endif
+#endif /* defined(__GNUC__) && __GNUC__ */


-#if defined(_MSC_VER) || defined(__MINGW32__)
+#if defined(_MSC_VER) || defined(__MINGW32__) || defined(__CYGWIN__)
 /*  See "Microsoft Portable Executable and Common Object File Format Specification"
    for reference.
 */
 #define get_le32(x) ((*(x)) | (*(x+1)) << 8 |(*(x+2)) << 16 | (*(x+3)) << 24 )
 #define get_le16(x) ((*(x)) | (*(x+1)) << 8)

-int parse_coff(unsigned __int8 *buf, size_t sz)
+int parse_coff(uint8_t *buf, size_t sz)
 {
    unsigned int nsections, symtab_ptr, symtab_sz, strtab_ptr;
    unsigned int sectionrawdata_ptr;
    unsigned int i;
-    unsigned __int8 *ptr;
-    unsigned __int32 symoffset;
+    uint8_t *ptr;
+    uint32_t symoffset;

    char **sectionlist;  //this array holds all section names in their correct order.
    //it is used to check if the symbol is in .bss or .data section.
@@ -907,7 +755,7 @@ int parse_coff(unsigned __int8 *buf, size_t sz)

    for (i = 0; i < symtab_sz; i++)
    {
-        __int16 section = get_le16(ptr + 12); //section number
+        int16_t section = get_le16(ptr + 12); //section number

        if (section > 0 && ptr[16] == 2)
        {
@@ -978,20 +826,21 @@ bail:

    return 1;
 }
+#endif /* defined(_MSC_VER) || defined(__MINGW32__) || defined(__CYGWIN__) */

 int main(int argc, char **argv)
 {
-    int fd;
-    output_fmt_t mode;
+    output_fmt_t mode = OUTPUT_FMT_PLAIN;
    const char *f;
-    struct _stat stat_buf;
-    unsigned __int8 *file_buf;
+    uint8_t *file_buf;
    int res;
+    FILE *fp;
+    long int file_size;

    if (argc < 2 || argc > 3)
    {
        fprintf(stderr, "Usage: %s [output format] <obj file>\n\n", argv[0]);
-        fprintf(stderr, "  <obj file>\tELF format object file to parse\n");
+        fprintf(stderr, "  <obj file>\tobject file to parse\n");
        fprintf(stderr, "Output Formats:\n");
        fprintf(stderr, "  gas  - compatible with GNU assembler\n");
        fprintf(stderr, "  rvds - compatible with armasm\n");
@@ -1007,15 +856,22 @@ int main(int argc, char **argv)
    else
        f = argv[1];

-    fd = _sopen(f, _O_BINARY, _SH_DENYNO, _S_IREAD | _S_IWRITE);
+    fp = fopen(f, "rb");

-    if (_fstat(fd, &stat_buf))
+    if (!fp)
+    {
+        perror("Unable to open file");
+        goto bail;
+    }
+
+    if (fseek(fp, 0, SEEK_END))
    {
        perror("stat");
        goto bail;
    }

-    file_buf = malloc(stat_buf.st_size);
+    file_size = ftell(fp);
+    file_buf = malloc(file_size);

    if (!file_buf)
    {
@@ -1023,19 +879,30 @@ int main(int argc, char **argv)
        goto bail;
    }

-    if (_read(fd, file_buf, stat_buf.st_size) != stat_buf.st_size)
+    rewind(fp);
+
+    if (fread(file_buf, sizeof(char), file_size, fp) != file_size)
    {
        perror("read");
        goto bail;
    }

-    if (_close(fd))
+    if (fclose(fp))
    {
        perror("close");
        goto bail;
    }

-    res = parse_coff(file_buf, stat_buf.st_size);
+#if defined(__GNUC__) && __GNUC__
+#if defined(__MACH__)
+    res = parse_macho(file_buf, file_size);
+#elif defined(__ELF__)
+    res = parse_elf(file_buf, file_size, mode);
+#endif
+#endif
+#if defined(_MSC_VER) || defined(__MINGW32__) || defined(__CYGWIN__)
+    res = parse_coff(file_buf, file_size);
+#endif

    free(file_buf);

@@ -1045,4 +912,3 @@ int main(int argc, char **argv)
 bail:
    return EXIT_FAILURE;
 }
-#endif
--- a/configure
+++ b/configure
@@ -40,6 +40,7 @@ Advanced options:
  ${toggle_runtime_cpu_detect}    runtime cpu detection
  ${toggle_shared}                shared library support
  ${toggle_small}                 favor smaller size over speed
+  ${toggle_opencl}                support for OpenCL-assisted VP8 decoding (experimental)
  ${toggle_postproc_visualizer}   macro block / block level visualizers

 Codecs:
@@ -105,6 +106,7 @@ all_platforms="${all_platforms} x86-darwin8-gcc"
 all_platforms="${all_platforms} x86-darwin8-icc"
 all_platforms="${all_platforms} x86-darwin9-gcc"
 all_platforms="${all_platforms} x86-darwin9-icc"
+all_platforms="${all_platforms} x86-darwin10-gcc"
 all_platforms="${all_platforms} x86-linux-gcc"
 all_platforms="${all_platforms} x86-linux-icc"
 all_platforms="${all_platforms} x86-solaris-gcc"
@@ -211,6 +213,7 @@ HAVE_LIST="
    alt_tree_layout
    pthread_h
    sys_mman_h
+    dlopen
 "
 CONFIG_LIST="
    external_build
@@ -250,6 +253,7 @@ CONFIG_LIST="
    realtime_only
    shared
    small
+    opencl
    postproc_visualizer
    os_support
 "
@@ -290,6 +294,7 @@ CMDLINE_SELECT="
    realtime_only
    shared
    small
+    opencl
    postproc_visualizer
 "

@@ -377,6 +382,7 @@ process_targets() {
    if [ -f "${source_path}/build/make/version.sh" ]; then
        local ver=`"$source_path/build/make/version.sh" --bare $source_path`
        DIST_DIR="${DIST_DIR}-${ver}"
+        VERSION_STRING=${ver}
        ver=${ver%%-*}
        VERSION_PATCH=${ver##*.}
        ver=${ver%.*}
@@ -385,6 +391,8 @@ process_targets() {
        VERSION_MAJOR=${ver%.*}
    fi
    enabled child || cat <<EOF >> config.mk
+
+PREFIX=${prefix}
 ifeq (\$(MAKECMDGOALS),dist)
 DIST_DIR?=${DIST_DIR}
 else
@@ -392,6 +400,8 @@ DIST_DIR?=\$(DESTDIR)${prefix}
 endif
 LIBSUBDIR=${libdir##${prefix}/}

+VERSION_STRING=${VERSION_STRING}
+
 VERSION_MAJOR=${VERSION_MAJOR}
 VERSION_MINOR=${VERSION_MINOR}
 VERSION_PATCH=${VERSION_PATCH}
@@ -486,7 +496,7 @@ process_toolchain() {
        check_add_cflags -Wpointer-arith
        check_add_cflags -Wtype-limits
        check_add_cflags -Wcast-qual
-        enabled extra_warnings || check_add_cflags -Wno-unused
+        enabled extra_warnings || check_add_cflags -Wno-unused-function
    fi

    if enabled icc; then
@@ -551,4 +561,6 @@ process "$@"
 cat <<EOF > ${BUILD_PFX}vpx_config.c
 static const char* const cfg = "$CONFIGURE_ARGS";
 const char *vpx_codec_build_config(void) {return cfg;}
+static const char* const libdir = "$libdir";
+const char *vpx_codec_lib_dir(void) {return libdir;}
 EOF
--- a/libs.mk
+++ b/libs.mk
@@ -123,6 +123,18 @@ endif
 else
 INSTALL-LIBS-yes += $(LIBSUBDIR)/libvpx.a
 INSTALL-LIBS-$(CONFIG_DEBUG_LIBS) += $(LIBSUBDIR)/libvpx_g.a
+
+#Install the OpenCL kernels if CL enabled.
+ifeq ($(CONFIG_OPENCL),yes)
+INSTALL-LIBS-yes += $(LIBSUBDIR)/vp8/common/opencl/filter_cl.cl
+INSTALL-LIBS-yes += $(LIBSUBDIR)/vp8/common/opencl/idctllm_cl.cl
+INSTALL-LIBS-yes += $(LIBSUBDIR)/vp8/common/opencl/loopfilter.cl
+#only install decoder CL files if VP8 decoder enabled
+ifeq ($(CONFIG_VP8_DECODER),yes)
+INSTALL-LIBS-yes += $(LIBSUBDIR)/vp8/decoder/opencl/dequantize_cl.cl
+endif
+endif #CONFIG_OPENCL=yes
+
 endif

 CODEC_SRCS=$(call enabled,CODEC_SRCS)
@@ -204,6 +216,26 @@ $(addprefix $(DIST_DIR)/,$(LIBVPX_SO_SYMLINKS)):

 INSTALL-LIBS-$(CONFIG_SHARED) += $(LIBVPX_SO_SYMLINKS)
 INSTALL-LIBS-$(CONFIG_SHARED) += $(LIBSUBDIR)/$(LIBVPX_SO)
+
+LIBS-$(BUILD_LIBVPX) += vpx.pc
+vpx.pc: config.mk libs.mk
+	@echo "    [CREATE] $@"
+	$(qexec)echo '# pkg-config file from libvpx $(VERSION_STRING)' > $@
+	$(qexec)echo 'prefix=$(PREFIX)' >> $@
+	$(qexec)echo 'exec_prefix=$${prefix}' >> $@
+	$(qexec)echo 'libdir=$${prefix}/lib' >> $@
+	$(qexec)echo 'includedir=$${prefix}/include' >> $@
+	$(qexec)echo '' >> $@
+	$(qexec)echo 'Name: vpx' >> $@
+	$(qexec)echo 'Description: WebM Project VPx codec implementation' >> $@
+	$(qexec)echo 'Version: $(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_PATCH)' >> $@
+	$(qexec)echo 'Requires:' >> $@
+	$(qexec)echo 'Conflicts:' >> $@
+	$(qexec)echo 'Libs: -L$${libdir} -lvpx' >> $@
+	$(qexec)echo 'Cflags: -I$${includedir}' >> $@
+INSTALL-LIBS-yes += $(LIBSUBDIR)/pkgconfig/vpx.pc
+INSTALL_MAPS += $(LIBSUBDIR)/pkgconfig/%.pc %.pc
+CLEAN-OBJS += vpx.pc
 endif

 LIBS-$(LIPO_LIBVPX) += libvpx.a
@@ -245,7 +277,9 @@ ifeq ($(CONFIG_EXTERNAL_BUILD),) # Visual Studio uses obj_int_extract.bat
    OBJS-yes += $(VP8_PREFIX)common/asm_com_offsets.c.o
    CLEAN-OBJS += asm_com_offsets.asm
    $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_com_offsets.asm
+  endif

+  ifeq ($(ARCH_ARM)$(ARCH_X86)$(ARCH_X86_64), yes)
    ifeq ($(CONFIG_VP8_ENCODER), yes)
      asm_enc_offsets.asm: obj_int_extract
      asm_enc_offsets.asm: $(VP8_PREFIX)encoder/asm_enc_offsets.c.o
@@ -254,7 +288,9 @@ ifeq ($(CONFIG_EXTERNAL_BUILD),) # Visual Studio uses obj_int_extract.bat
      CLEAN-OBJS += asm_enc_offsets.asm
      $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_enc_offsets.asm
    endif
+  endif

+  ifeq ($(ARCH_ARM), yes)
    ifeq ($(CONFIG_VP8_DECODER), yes)
      asm_dec_offsets.asm: obj_int_extract
      asm_dec_offsets.asm: $(VP8_PREFIX)decoder/asm_dec_offsets.c.o
--- a/vp8/common/alloccommon.c
+++ b/vp8/common/alloccommon.c
@@ -20,7 +20,7 @@

 extern  void vp8_init_scan_order_mask();

-void vp8_update_mode_info_border(MODE_INFO *mi, int rows, int cols)
+static void update_mode_info_border(MODE_INFO *mi, int rows, int cols)
 {
    int i;
    vpx_memset(mi - cols - 2, 0, sizeof(MODE_INFO) * (cols + 1));
@@ -119,7 +119,7 @@ int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height)
        return 1;
    }

-    vp8_update_mode_info_border(oci->mi, oci->mb_rows, oci->mb_cols);
+    update_mode_info_border(oci->mi, oci->mb_rows, oci->mb_cols);

    return 0;
 }
@@ -130,32 +130,32 @@ void vp8_setup_version(VP8_COMMON *cm)
    case 0:
        cm->no_lpf = 0;
        cm->simpler_lpf = 0;
-        cm->use_bilinear_mc_filter = 0;
+        cm->mcomp_filter_type = SIXTAP;
        cm->full_pixel = 0;
        break;
    case 1:
        cm->no_lpf = 0;
        cm->simpler_lpf = 1;
-        cm->use_bilinear_mc_filter = 1;
+        cm->mcomp_filter_type = BILINEAR;
        cm->full_pixel = 0;
        break;
    case 2:
        cm->no_lpf = 1;
        cm->simpler_lpf = 0;
-        cm->use_bilinear_mc_filter = 1;
+        cm->mcomp_filter_type = BILINEAR;
        cm->full_pixel = 0;
        break;
    case 3:
        cm->no_lpf = 1;
        cm->simpler_lpf = 1;
-        cm->use_bilinear_mc_filter = 1;
+        cm->mcomp_filter_type = BILINEAR;
        cm->full_pixel = 1;
        break;
    default:
        /*4,5,6,7 are reserved for future use*/
        cm->no_lpf = 0;
        cm->simpler_lpf = 0;
-        cm->use_bilinear_mc_filter = 0;
+        cm->mcomp_filter_type = SIXTAP;
        cm->full_pixel = 0;
        break;
    }
@@ -170,7 +170,7 @@ void vp8_create_common(VP8_COMMON *oci)
    oci->mb_no_coeff_skip = 1;
    oci->no_lpf = 0;
    oci->simpler_lpf = 0;
-    oci->use_bilinear_mc_filter = 0;
+    oci->mcomp_filter_type = SIXTAP;
    oci->full_pixel = 0;
    oci->multi_token_partition = ONE_PARTITION;
    oci->clr_type = REG_YUV;
--- a/vp8/common/blockd.h
+++ b/vp8/common/blockd.h
@@ -14,12 +14,17 @@

 void vpx_log(const char *format, ...);

-#include "vpx_ports/config.h"
-#include "vpx_scale/yv12config.h"
+#include "../../vpx_ports/config.h"
+#include "../../vpx_scale/yv12config.h"
 #include "mv.h"
 #include "treecoder.h"
 #include "subpixel.h"
-#include "vpx_ports/mem.h"
+#include "../../vpx_ports/mem.h"
+
+#include "../../vpx_config.h"
+#if CONFIG_OPENCL
+#include "opencl/vp8_opencl.h"
+#endif

 #define TRUE    1
 #define FALSE   0
@@ -73,19 +78,19 @@ typedef enum

 typedef enum
 {
-    DC_PRED,            /* average of above and left pixels */
-    V_PRED,             /* vertical prediction */
-    H_PRED,             /* horizontal prediction */
-    TM_PRED,            /* Truemotion prediction */
-    B_PRED,             /* block based prediction, each block has its own prediction mode */
+    DC_PRED = 0,            /* average of above and left pixels */
+    V_PRED = 1,             /* vertical prediction */
+    H_PRED = 2,             /* horizontal prediction */
+    TM_PRED = 3,            /* Truemotion prediction */
+    B_PRED = 4,             /* block based prediction, each block has its own prediction mode */

-    NEARESTMV,
-    NEARMV,
-    ZEROMV,
-    NEWMV,
-    SPLITMV,
+    NEARESTMV = 5,
+    NEARMV = 6,
+    ZEROMV = 7,
+    NEWMV = 8,
+    SPLITMV = 9,

-    MB_MODE_COUNT
+    MB_MODE_COUNT = 10
 } MB_PREDICTION_MODE;

 /* Macroblock level features */
@@ -187,24 +192,47 @@ typedef struct

 typedef struct
 {
-    short *qcoeff;
-    short *dqcoeff;
-    unsigned char  *predictor;
-    short *diff;
-    short *reference;
+    short *qcoeff_base;
+    int qcoeff_offset;
+
+    short *dqcoeff_base;
+    int dqcoeff_offset;
+
+    unsigned char *predictor_base;
+    int predictor_offset;
+
+    short *diff_base;
+    int diff_offset;

    short *dequant;

+#if CONFIG_OPENCL
+    cl_command_queue cl_commands; //pointer to macroblock CL command queue
+
+    cl_mem cl_diff_mem;
+    cl_mem cl_predictor_mem;
+    cl_mem cl_qcoeff_mem;
+    cl_mem cl_dqcoeff_mem;
+    cl_mem cl_eobs_mem;
+
+    cl_mem cl_dequant_mem; //Block-specific, not shared
+
+    cl_bool sixtap_filter; //Subpixel Prediction type (true=sixtap, false=bilinear)
+
+#endif
+
    /* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */
-    unsigned char **base_pre;
+    unsigned char **base_pre; //previous frame, same Macroblock, base pointer
    int pre;
    int pre_stride;

-    unsigned char **base_dst;
+    unsigned char **base_dst; //destination base pointer
    int dst;
    int dst_stride;

-    int eob;
+    int eob; //only used in encoder? Decoder uses MBD.eobs
+
+    char *eobs_base; //beginning of MB.eobs

    B_MODE_INFO bmi;

@@ -214,16 +242,26 @@ typedef struct
 {
    DECLARE_ALIGNED(16, short, diff[400]);      /* from idct diff */
    DECLARE_ALIGNED(16, unsigned char,  predictor[384]);
-/* not used    DECLARE_ALIGNED(16, short, reference[384]); */
    DECLARE_ALIGNED(16, short, qcoeff[400]);
    DECLARE_ALIGNED(16, short, dqcoeff[400]);
    DECLARE_ALIGNED(16, char,  eobs[25]);

+#if CONFIG_OPENCL
+    cl_command_queue cl_commands; //Each macroblock gets its own command queue.
+    cl_mem cl_diff_mem;
+    cl_mem cl_predictor_mem;
+    cl_mem cl_qcoeff_mem;
+    cl_mem cl_dqcoeff_mem;
+    cl_mem cl_eobs_mem;
+
+    cl_bool sixtap_filter;
+#endif
+
    /* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */
    BLOCKD block[25];

    YV12_BUFFER_CONFIG pre; /* Filtered copy of previous frame reconstruction */
-    YV12_BUFFER_CONFIG dst;
+    YV12_BUFFER_CONFIG dst; /* Destination buffer for current frame */

    MODE_INFO *mode_info_context;
    int mode_info_stride;
@@ -273,6 +311,7 @@ typedef struct

    unsigned int frames_since_golden;
    unsigned int frames_till_alt_ref_frame;
+
    vp8_subpix_fn_t  subpixel_predict;
    vp8_subpix_fn_t  subpixel_predict8x4;
    vp8_subpix_fn_t  subpixel_predict8x8;
--- a/vp8/common/entropymv.h
+++ b/vp8/common/entropymv.h
@@ -18,6 +18,8 @@ enum
 {
    mv_max  = 1023,              /* max absolute value of a MV component */
    MVvals = (2 * mv_max) + 1,   /* # possible values "" */
+    mvfp_max  = 255,              /* max absolute value of a full pixel MV component */
+    MVfpvals = (2 * mvfp_max) +1, /* # possible full pixel MV values */

    mvlong_width = 10,       /* Large MVs have 9 bit magnitudes */
    mvnum_short = 8,         /* magnitudes 0 through 7 */
--- a/vp8/common/extend.c
+++ b/vp8/common/extend.c
@@ -13,10 +13,12 @@
 #include "vpx_mem/vpx_mem.h"


-static void extend_plane_borders
+static void copy_and_extend_plane
 (
    unsigned char *s, /* source */
-    int sp,           /* pitch */
+    int sp,           /* source pitch */
+    unsigned char *d, /* destination */
+    int dp,           /* destination pitch */
    int h,            /* height */
    int w,            /* width */
    int et,           /* extend top border */
@@ -25,7 +27,6 @@ static void extend_plane_borders
    int er            /* extend right border */
 )
 {
-
    int i;
    unsigned char *src_ptr1, *src_ptr2;
    unsigned char *dest_ptr1, *dest_ptr2;
@@ -34,68 +35,73 @@ static void extend_plane_borders
    /* copy the left and right most columns out */
    src_ptr1 = s;
    src_ptr2 = s + w - 1;
-    dest_ptr1 = s - el;
-    dest_ptr2 = s + w;
+    dest_ptr1 = d - el;
+    dest_ptr2 = d + w;

-    for (i = 0; i < h - 0 + 1; i++)
+    for (i = 0; i < h; i++)
    {
-        /* Some linkers will complain if we call vpx_memset with el set to a
-         * constant 0.
-         */
-        if (el)
-            vpx_memset(dest_ptr1, src_ptr1[0], el);
+        vpx_memset(dest_ptr1, src_ptr1[0], el);
+        vpx_memcpy(dest_ptr1 + el, src_ptr1, w);
        vpx_memset(dest_ptr2, src_ptr2[0], er);
        src_ptr1  += sp;
        src_ptr2  += sp;
-        dest_ptr1 += sp;
-        dest_ptr2 += sp;
+        dest_ptr1 += dp;
+        dest_ptr2 += dp;
    }

-    /* Now copy the top and bottom source lines into each line of the respective borders */
-    src_ptr1 = s - el;
-    src_ptr2 = s + sp * (h - 1) - el;
-    dest_ptr1 = s + sp * (-et) - el;
-    dest_ptr2 = s + sp * (h) - el;
-    linesize = el + er + w + 1;
+    /* Now copy the top and bottom lines into each line of the respective
+     * borders
+     */
+    src_ptr1 = d - el;
+    src_ptr2 = d + dp * (h - 1) - el;
+    dest_ptr1 = d + dp * (-et) - el;
+    dest_ptr2 = d + dp * (h) - el;
+    linesize = el + er + w;

-    for (i = 0; i < (int)et; i++)
+    for (i = 0; i < et; i++)
    {
        vpx_memcpy(dest_ptr1, src_ptr1, linesize);
-        dest_ptr1 += sp;
+        dest_ptr1 += dp;
    }

-    for (i = 0; i < (int)eb; i++)
+    for (i = 0; i < eb; i++)
    {
        vpx_memcpy(dest_ptr2, src_ptr2, linesize);
-        dest_ptr2 += sp;
+        dest_ptr2 += dp;
    }
 }


-void vp8_extend_to_multiple_of16(YV12_BUFFER_CONFIG *ybf, int width, int height)
+void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
+                               YV12_BUFFER_CONFIG *dst)
 {
-    int er = 0xf & (16 - (width & 0xf));
-    int eb = 0xf & (16 - (height & 0xf));
+    int et = dst->border;
+    int el = dst->border;
+    int eb = dst->border + dst->y_height - src->y_height;
+    int er = dst->border + dst->y_width - src->y_width;

-    /* check for non multiples of 16 */
-    if (er != 0 || eb != 0)
-    {
-        extend_plane_borders(ybf->y_buffer, ybf->y_stride, height, width, 0, 0, eb, er);
+    copy_and_extend_plane(src->y_buffer, src->y_stride,
+                          dst->y_buffer, dst->y_stride,
+                          src->y_height, src->y_width,
+                          et, el, eb, er);

-        /* adjust for uv */
-        height = (height + 1) >> 1;
-        width  = (width  + 1) >> 1;
-        er = 0x7 & (8 - (width  & 0x7));
-        eb = 0x7 & (8 - (height & 0x7));
+    et = (et + 1) >> 1;
+    el = (el + 1) >> 1;
+    eb = (eb + 1) >> 1;
+    er = (er + 1) >> 1;

-        if (er || eb)
-        {
-            extend_plane_borders(ybf->u_buffer, ybf->uv_stride, height, width, 0, 0, eb, er);
-            extend_plane_borders(ybf->v_buffer, ybf->uv_stride, height, width, 0, 0, eb, er);
-        }
-    }
+    copy_and_extend_plane(src->u_buffer, src->uv_stride,
+                          dst->u_buffer, dst->uv_stride,
+                          src->uv_height, src->uv_width,
+                          et, el, eb, er);
+
+    copy_and_extend_plane(src->v_buffer, src->uv_stride,
+                          dst->v_buffer, dst->uv_stride,
+                          src->uv_height, src->uv_width,
+                          et, el, eb, er);
 }

+
 /* note the extension is only for the last row, for intra prediction purpose */
 void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, unsigned char *UPtr, unsigned char *VPtr)
 {
--- a/vp8/common/extend.h
+++ b/vp8/common/extend.h
@@ -14,8 +14,8 @@

 #include "vpx_scale/yv12config.h"

-void Extend(YV12_BUFFER_CONFIG *ybf);
 void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, unsigned char *UPtr, unsigned char *VPtr);
-void vp8_extend_to_multiple_of16(YV12_BUFFER_CONFIG *ybf, int width, int height);
+void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
+                               YV12_BUFFER_CONFIG *dst);

 #endif
--- a/vp8/common/filter.c
+++ b/vp8/common/filter.c
@@ -10,6 +10,29 @@


 #include <stdlib.h>
+#include <stdio.h>
+
+#define REGISTER_FILTER 1
+#define CLAMP(x,min,max) if (x < min) x = min; else if ( x > max ) x = max;
+
+#if REGISTER_FILTER
+#define FILTER0 filter0
+#define FILTER1 filter1
+#define FILTER2 filter2
+#define FILTER3 filter3
+#define FILTER4 filter4
+#define FILTER5 filter5
+#else
+#define FILTER0 vp8_filter[0]
+#define FILTER1 vp8_filter[1]
+#define FILTER2 vp8_filter[2]
+#define FILTER3 vp8_filter[3]
+#define FILTER4 vp8_filter[4]
+#define FILTER5 vp8_filter[5]
+#endif
+
+#define SRC_INCREMENT src_increment
+
 #include "filter.h"
 #include "vpx_ports/mem.h"

@@ -27,7 +50,6 @@ DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]) =

 DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]) =
 {
-
    { 0,  0,  128,    0,   0,  0 },         /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */
    { 0, -6,  123,   12,  -1,  0 },
    { 2, -11, 108,   36,  -8,  1 },         /* New 1/4 pel 6 tap filter */
@@ -38,7 +60,7 @@ DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]) =
    { 0, -1,   12,  123,  -6,  0 },
 };

-void vp8_filter_block2d_first_pass
+static void filter_block2d_first_pass
 (
    unsigned char *src_ptr,
    int *output_ptr,
@@ -49,40 +71,50 @@ void vp8_filter_block2d_first_pass
    const short *vp8_filter
 )
 {
-    unsigned int i, j;
-    int  Temp;

+    unsigned int i, j;
+    int Temp;
+
+#if REGISTER_FILTER
+    short filter0 = vp8_filter[0];
+    short filter1 = vp8_filter[1];
+    short filter2 = vp8_filter[2];
+    short filter3 = vp8_filter[3];
+    short filter4 = vp8_filter[4];
+    short filter5 = vp8_filter[5];
+#endif
+
+    int ps2 = 2*(int)pixel_step;
+    int ps3 = 3*(int)pixel_step;
+
+    unsigned int src_increment = src_pixels_per_line - output_width;
    for (i = 0; i < output_height; i++)
    {
        for (j = 0; j < output_width; j++)
        {
-            Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) +
-                   ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) +
-                   ((int)src_ptr[0]                 * vp8_filter[2]) +
-                   ((int)src_ptr[pixel_step]         * vp8_filter[3]) +
-                   ((int)src_ptr[2*pixel_step]       * vp8_filter[4]) +
-                   ((int)src_ptr[3*pixel_step]       * vp8_filter[5]) +
-                   (VP8_FILTER_WEIGHT >> 1);      /* Rounding */
+            Temp = ((int)src_ptr[-1*ps2]         * FILTER0);
+            Temp += ((int)src_ptr[-1*(int)pixel_step] * FILTER1) +
+               ((int)src_ptr[0]                * FILTER2) +
+               ((int)src_ptr[pixel_step]       * FILTER3) +
+               ((int)src_ptr[ps2]              * FILTER4) +
+               ((int)src_ptr[ps3]              * FILTER5) +
+               (VP8_FILTER_WEIGHT >> 1);      /* Rounding */

            /* Normalize back to 0-255 */
            Temp = Temp >> VP8_FILTER_SHIFT;
-
-            if (Temp < 0)
-                Temp = 0;
-            else if (Temp > 255)
-                Temp = 255;
+            CLAMP(Temp, 0, 255);

            output_ptr[j] = Temp;
            src_ptr++;
        }

        /* Next row... */
-        src_ptr    += src_pixels_per_line - output_width;
+        src_ptr    += SRC_INCREMENT;
        output_ptr += output_width;
    }
 }

-void vp8_filter_block2d_second_pass
+static void filter_block2d_second_pass
 (
    int *src_ptr,
    unsigned char *output_ptr,
@@ -94,42 +126,51 @@ void vp8_filter_block2d_second_pass
    const short *vp8_filter
 )
 {
-    unsigned int i, j;
-    int  Temp;
+	unsigned int i, j;
+	int  Temp;
+
+#if REGISTER_FILTER
+    short filter0 = vp8_filter[0];
+    short filter1 = vp8_filter[1];
+    short filter2 = vp8_filter[2];
+    short filter3 = vp8_filter[3];
+    short filter4 = vp8_filter[4];
+    short filter5 = vp8_filter[5];
+#endif
+
+    int ps2 = ((int)pixel_step) << 1;
+    int ps3 = ps2 + (int)pixel_step;
+    unsigned int src_increment = src_pixels_per_line - output_width;

    for (i = 0; i < output_height; i++)
    {
        for (j = 0; j < output_width; j++)
        {
            /* Apply filter */
-            Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) +
-                   ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) +
-                   ((int)src_ptr[0]                 * vp8_filter[2]) +
-                   ((int)src_ptr[pixel_step]         * vp8_filter[3]) +
-                   ((int)src_ptr[2*pixel_step]       * vp8_filter[4]) +
-                   ((int)src_ptr[3*pixel_step]       * vp8_filter[5]) +
+            Temp = ((int)src_ptr[-1*ps2] * FILTER0) +
+                   ((int)src_ptr[-1*(int)pixel_step] * FILTER1) +
+                   ((int)src_ptr[0]                  * FILTER2) +
+                   ((int)src_ptr[pixel_step]         * FILTER3) +
+                   ((int)src_ptr[ps2]       * FILTER4) +
+                   ((int)src_ptr[ps3]       * FILTER5) +
                   (VP8_FILTER_WEIGHT >> 1);   /* Rounding */

            /* Normalize back to 0-255 */
            Temp = Temp >> VP8_FILTER_SHIFT;
-
-            if (Temp < 0)
-                Temp = 0;
-            else if (Temp > 255)
-                Temp = 255;
+            CLAMP(Temp, 0, 255);

            output_ptr[j] = (unsigned char)Temp;
            src_ptr++;
        }

        /* Start next row */
-        src_ptr    += src_pixels_per_line - output_width;
+        src_ptr    += src_increment;
        output_ptr += output_pitch;
    }
 }


-void vp8_filter_block2d
+static void filter_block2d
 (
    unsigned char  *src_ptr,
    unsigned char  *output_ptr,
@@ -142,39 +183,13 @@ void vp8_filter_block2d
    int FData[9*4]; /* Temp data buffer used in filtering */

    /* First filter 1-D horizontally... */
-    vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 4, HFilter);
+    filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 4, HFilter);

    /* then filter verticaly... */
-    vp8_filter_block2d_second_pass(FData + 8, output_ptr, output_pitch, 4, 4, 4, 4, VFilter);
+    filter_block2d_second_pass(FData + 8, output_ptr, output_pitch, 4, 4, 4, 4, VFilter);
 }


-void vp8_block_variation_c
-(
-    unsigned char  *src_ptr,
-    int   src_pixels_per_line,
-    int *HVar,
-    int *VVar
-)
-{
-    int i, j;
-    unsigned char *Ptr = src_ptr;
-
-    for (i = 0; i < 4; i++)
-    {
-        for (j = 0; j < 4; j++)
-        {
-            *HVar += abs((int)Ptr[j] - (int)Ptr[j+1]);
-            *VVar += abs((int)Ptr[j] - (int)Ptr[j+src_pixels_per_line]);
-        }
-
-        Ptr += src_pixels_per_line;
-    }
-}
-
-
-
-
 void vp8_sixtap_predict_c
 (
    unsigned char  *src_ptr,
@@ -191,8 +206,9 @@ void vp8_sixtap_predict_c
    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */

-    vp8_filter_block2d(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter);
+    filter_block2d(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter);
 }
+
 void vp8_sixtap_predict8x8_c
 (
    unsigned char  *src_ptr,
@@ -211,11 +227,11 @@ void vp8_sixtap_predict8x8_c
    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */

    /* First filter 1-D horizontally... */
-    vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 13, 8, HFilter);
+    filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 13, 8, HFilter);


    /* then filter verticaly... */
-    vp8_filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter);
+    filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter);

 }

@@ -237,11 +253,11 @@ void vp8_sixtap_predict8x4_c
    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */

    /* First filter 1-D horizontally... */
-    vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 8, HFilter);
+    filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 8, HFilter);


    /* then filter verticaly... */
-    vp8_filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 4, 8, VFilter);
+    filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 4, 8, VFilter);

 }

@@ -264,10 +280,10 @@ void vp8_sixtap_predict16x16_c
    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */

    /* First filter 1-D horizontally... */
-    vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 21, 16, HFilter);
+    filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 21, 16, HFilter);

    /* then filter verticaly... */
-    vp8_filter_block2d_second_pass(FData + 32, dst_ptr, dst_pitch, 16, 16, 16, 16, VFilter);
+    filter_block2d_second_pass(FData + 32, dst_ptr, dst_pitch, 16, 16, 16, 16, VFilter);

 }

@@ -294,7 +310,7 @@ void vp8_sixtap_predict16x16_c
 *                  Two filter taps should sum to VP8_FILTER_WEIGHT.
 *
 ****************************************************************************/
-void vp8_filter_block2d_bil_first_pass
+static void filter_block2d_bil_first_pass
 (
    unsigned char  *src_ptr,
    unsigned short *dst_ptr,
@@ -345,7 +361,7 @@ void vp8_filter_block2d_bil_first_pass
 *                  Two filter taps should sum to VP8_FILTER_WEIGHT.
 *
 ****************************************************************************/
-void vp8_filter_block2d_bil_second_pass
+static void filter_block2d_bil_second_pass
 (
    unsigned short *src_ptr,
    unsigned char  *dst_ptr,
@@ -399,7 +415,7 @@ void vp8_filter_block2d_bil_second_pass
 *  SPECIAL NOTES : The largest block size can be handled here is 16x16
 *
 ****************************************************************************/
-void vp8_filter_block2d_bil
+static void filter_block2d_bil
 (
    unsigned char *src_ptr,
    unsigned char *dst_ptr,
@@ -415,10 +431,10 @@ void vp8_filter_block2d_bil
    unsigned short FData[17*16];    /* Temp data buffer used in filtering */

    /* First filter 1-D horizontally... */
-    vp8_filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
+    filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);

    /* then 1-D vertically... */
-    vp8_filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
+    filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
 }


@@ -444,19 +460,19 @@ void vp8_bilinear_predict4x4_c
        unsigned char temp2[16];

        bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4);
-        vp8_filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);
+        filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);

        for (i = 0; i < 16; i++)
        {
            if (temp1[i] != temp2[i])
            {
                bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4);
-                vp8_filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);
+                filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);
            }
        }
    }
 #endif
-    vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
+    filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);

 }

@@ -476,7 +492,7 @@ void vp8_bilinear_predict8x8_c
    HFilter = vp8_bilinear_filters[xoffset];
    VFilter = vp8_bilinear_filters[yoffset];

-    vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
+    filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);

 }

@@ -496,7 +512,7 @@ void vp8_bilinear_predict8x4_c
    HFilter = vp8_bilinear_filters[xoffset];
    VFilter = vp8_bilinear_filters[yoffset];

-    vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
+    filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);

 }

@@ -516,5 +532,5 @@ void vp8_bilinear_predict16x16_c
    HFilter = vp8_bilinear_filters[xoffset];
    VFilter = vp8_bilinear_filters[yoffset];

-    vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
+    filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
 }
--- a/vp8/common/findnearmv.c
+++ b/vp8/common/findnearmv.c
@@ -11,6 +11,13 @@

 #include "findnearmv.h"

+const unsigned char vp8_mbsplit_offset[4][16] = { /* NOTE(review): presumably one row per split mode, listing the first block of each partition -- confirm */
+    { 0,  8,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0}, /* entries after index 1 are all zero */
+    { 0,  2,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0}, /* entries after index 1 are all zero */
+    { 0,  2,  8, 10,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0}, /* entries after index 3 are all zero */
+    { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15}  /* identity: entry i is i */
+};
+
 /* Predict motion vectors using those from already-decoded nearby blocks.
   Note that we only consider one 4x4 subblock from each candidate 16x16
   macroblock.   */
--- a/vp8/common/findnearmv.h
+++ b/vp8/common/findnearmv.h
@@ -70,4 +70,6 @@ const B_MODE_INFO *vp8_left_bmi(const MODE_INFO *cur_mb, int b);

 const B_MODE_INFO *vp8_above_bmi(const MODE_INFO *cur_mb, int b, int mi_stride);

+extern const unsigned char vp8_mbsplit_offset[4][16];
+
 #endif
--- a/vp8/common/generic/systemdependent.c
+++ b/vp8/common/generic/systemdependent.c
@@ -19,6 +19,7 @@

 extern void vp8_arch_x86_common_init(VP8_COMMON *ctx);
 extern void vp8_arch_arm_common_init(VP8_COMMON *ctx);
+extern void vp8_arch_opencl_common_init(VP8_COMMON *ctx);

 void vp8_machine_specific_config(VP8_COMMON *ctx)
 {
@@ -82,4 +83,8 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
    vp8_arch_arm_common_init(ctx);
 #endif

+#if CONFIG_OPENCL && (ENABLE_CL_IDCT_DEQUANT || ENABLE_CL_SUBPIXEL || ENABLE_CL_LOOPFILTER)
+    vp8_arch_opencl_common_init(ctx);
+#endif
+
 }
--- a/vp8/common/idct.h
+++ b/vp8/common/idct.h
@@ -31,6 +31,10 @@
 #include "arm/idct_arm.h"
 #endif

+#if CONFIG_OPENCL
+#include "opencl/idct_cl.h"
+#endif
+
 #ifndef vp8_idct_idct1
 #define vp8_idct_idct1 vp8_short_idct4x4llm_1_c
 #endif
--- a/vp8/common/loopfilter.c
+++ b/vp8/common/loopfilter.c
@@ -13,6 +13,10 @@
 #include "loopfilter.h"
 #include "onyxc_int.h"

+#if CONFIG_OPENCL
+#include "opencl/loopfilter_cl.h"
+#endif
+
 typedef unsigned char uc;


@@ -312,6 +316,13 @@ void vp8_loop_filter_frame
    int i;
    unsigned char *y_ptr, *u_ptr, *v_ptr;

+#if CONFIG_OPENCL && ENABLE_CL_LOOPFILTER
+    if ( cl_initialized == CL_SUCCESS ){
+        vp8_loop_filter_frame_cl(cm,mbd,default_filt_lvl);
+        return;
+    }
+#endif
+
    mbd->mode_info_context = cm->mi;          /* Point at base of Mb MODE_INFO list */

    /* Note the baseline filter values for each segment */
@@ -394,6 +405,7 @@ void vp8_loop_filter_frame
 }


+/* Encoder only... */
 void vp8_loop_filter_frame_yonly
 (
    VP8_COMMON *cm,
@@ -489,7 +501,7 @@ void vp8_loop_filter_frame_yonly

 }

-
+/* Encoder only... */
 void vp8_loop_filter_partial_frame
 (
    VP8_COMMON *cm,
--- a/vp8/common/loopfilter_filters.c
+++ b/vp8/common/loopfilter_filters.c
@@ -49,7 +49,6 @@ static __inline signed char vp8_hevmask(signed char thresh, uc p1, uc p0, uc q0,
 }

 static __inline void vp8_filter(signed char mask, signed char hev, uc *op1, uc *op0, uc *oq0, uc *oq1)
-
 {
    signed char ps0, qs0;
    signed char ps1, qs1;
@@ -94,6 +93,7 @@ static __inline void vp8_filter(signed char mask, signed char hev, uc *op1, uc *
    *op1 = u ^ 0x80;

 }
+
 void vp8_loop_filter_horizontal_edge_c
 (
    unsigned char *s,
--- a/vp8/common/mbpitch.c
+++ b/vp8/common/mbpitch.c
@@ -11,16 +11,21 @@

 #include "blockd.h"

+#include "stdio.h"
+#include "vpx_config.h"
+#if CONFIG_OPENCL
+#include "opencl/vp8_opencl.h"
+#endif
+
 typedef enum
 {
    PRED = 0,
    DEST = 1
 } BLOCKSET;

-void vp8_setup_block
+static void setup_block
 (
    BLOCKD *b,
-    int mv_stride,
    unsigned char **base,
    int Stride,
    int offset,
@@ -43,87 +48,183 @@ void vp8_setup_block

 }

-void vp8_setup_macroblock(MACROBLOCKD *x, BLOCKSET bs)
+
+/*
+ * Bind the 24 blocks x->block[0..23] to their planes.
+ *
+ * Each block is passed to setup_block() with a pointer to a plane pointer, a
+ * stride and an offset.
+ *
+ *   x   macroblock whose blocks are set up
+ *   bs  DEST selects x->dst, any other value selects x->pre
+ *
+ * For DEST the chroma blocks are addressed from x->dst.buffer_alloc, with the
+ * distance to the u/v plane folded into the offset.  Luma blocks, and all three
+ * planes for PRED, use the plane pointers directly with no extra offset.
+ * Strides are always taken from x->dst, whichever set is selected.
+ *
+ * NOTE(review): the chroma offsets are pointer differences stored in an int;
+ * this presumably relies on the frame allocation being far smaller than
+ * INT_MAX bytes -- confirm.
+ */
+static void setup_macroblock(MACROBLOCKD *x, BLOCKSET bs)
 {
    int block;

    unsigned char **y, **u, **v;
+    int y_off = 0, u_off = 0, v_off = 0;   /* extra per-plane offset; y_off stays 0 in both modes */

    if (bs == DEST)
    {
        y = &x->dst.y_buffer;
-        u = &x->dst.u_buffer;
-        v = &x->dst.v_buffer;
+
+        /* Chroma is addressed from the start of the frame allocation. */
+        u = &x->dst.buffer_alloc;
+        v = &x->dst.buffer_alloc;
+        u_off = x->dst.u_buffer - x->dst.buffer_alloc;
+        v_off = x->dst.v_buffer - x->dst.buffer_alloc;
    }
    else
    {
        y = &x->pre.y_buffer;
        u = &x->pre.u_buffer;
        v = &x->pre.v_buffer;
    }

    for (block = 0; block < 16; block++) /* y blocks */
    {
-        vp8_setup_block(&x->block[block], x->dst.y_stride, y, x->dst.y_stride,
-                        (block >> 2) * 4 * x->dst.y_stride + (block & 3) * 4, bs);
+        /* 4x4 grid of blocks: row is block >> 2, column is block & 3. */
+        setup_block(&x->block[block], y, x->dst.y_stride,
+                        y_off + ((block >> 2) * 4 * x->dst.y_stride + (block & 3) * 4), bs);
    }

    for (block = 16; block < 20; block++) /* U and V blocks */
    {
-        vp8_setup_block(&x->block[block], x->dst.uv_stride, u, x->dst.uv_stride,
-                        ((block - 16) >> 1) * 4 * x->dst.uv_stride + (block & 1) * 4, bs);
+        /* 2x2 grid per plane; U and V share the same in-plane offset. */
+        int block_off = ((block - 16) >> 1) * 4 * x->dst.uv_stride + (block & 1) * 4;

-        vp8_setup_block(&x->block[block+4], x->dst.uv_stride, v, x->dst.uv_stride,
-                        ((block - 16) >> 1) * 4 * x->dst.uv_stride + (block & 1) * 4, bs);
+        setup_block(&x->block[block], u, x->dst.uv_stride,
+                        u_off + block_off, bs);
+
+        /* The V block sits four entries after its U counterpart. */
+        setup_block(&x->block[block+4], v, x->dst.uv_stride,
+                        v_off + block_off, bs);
    }
 }

+/* Fill in, for all 25 blocks of x, the *_base pointers (the macroblock-wide arrays) and the
+ * *_offset values that go with them.  Under CONFIG_OPENCL each block also gets a command
+ * queue and the macroblock's cl_mem handles (only while cl_initialized == CL_SUCCESS). */
 void vp8_setup_block_dptrs(MACROBLOCKD *x)
 {
    int r, c;
+    unsigned int offset;

+#if CONFIG_OPENCL && !ONE_CQ_PER_MB
+    cl_command_queue y_cq, u_cq, v_cq;
+    int err;
+    if (cl_initialized == CL_SUCCESS){
+        /* One command queue per plane (Y, U, V), created on every call. NOTE(review): nothing in this function releases them -- confirm who owns them. */
+        y_cq = clCreateCommandQueue(cl_data.context, cl_data.device_id, 0, &err);
+        if (!y_cq || err != CL_SUCCESS) {
+            printf("Error: Failed to create a command queue!\n");
+            cl_destroy(NULL, VP8_CL_TRIED_BUT_FAILED); /* NOTE(review): execution continues after this; later uses are guarded by cl_initialized, which cl_destroy() presumably clears -- confirm */
+        }
+        u_cq = clCreateCommandQueue(cl_data.context, cl_data.device_id, 0, &err);
+        if (!u_cq || err != CL_SUCCESS) {
+            printf("Error: Failed to create a command queue!\n");
+            cl_destroy(NULL, VP8_CL_TRIED_BUT_FAILED);
+        }
+        v_cq = clCreateCommandQueue(cl_data.context, cl_data.device_id, 0, &err);
+        if (!v_cq || err != CL_SUCCESS) {
+            printf("Error: Failed to create a command queue!\n");
+            cl_destroy(NULL, VP8_CL_TRIED_BUT_FAILED);
+        }
+    }
+#endif
+
+    /* 16 Y blocks: offsets step by 4 per column and by 4 * 16 per row */
    for (r = 0; r < 4; r++)
    {
        for (c = 0; c < 4; c++)
        {
-            x->block[r*4+c].diff      = &x->diff[r * 4 * 16 + c * 4];
-            x->block[r*4+c].predictor = x->predictor + r * 4 * 16 + c * 4;
+            offset = r * 4 * 16 + c * 4;
+            x->block[r*4+c].diff_offset      = offset;
+            x->block[r*4+c].predictor_offset = offset;
+#if CONFIG_OPENCL && !ONE_CQ_PER_MB
+            if (cl_initialized == CL_SUCCESS)
+                x->block[r*4+c].cl_commands = y_cq;
+#endif
        }
    }

+    /* 4 U Blocks: offsets start at 256 and step by 4 * 8 per row */
    for (r = 0; r < 2; r++)
    {
        for (c = 0; c < 2; c++)
        {
-            x->block[16+r*2+c].diff      = &x->diff[256 + r * 4 * 8 + c * 4];
-            x->block[16+r*2+c].predictor = x->predictor + 256 + r * 4 * 8 + c * 4;
+            offset = 256 + r * 4 * 8 + c * 4;
+            x->block[16+r*2+c].diff_offset      = offset;
+            x->block[16+r*2+c].predictor_offset = offset;

+#if CONFIG_OPENCL && !ONE_CQ_PER_MB
+            if (cl_initialized == CL_SUCCESS)
+                x->block[16+r*2+c].cl_commands = u_cq;
+#endif
        }
    }

+    /* 4 V Blocks: offsets start at 320 and step by 4 * 8 per row */
    for (r = 0; r < 2; r++)
    {
        for (c = 0; c < 2; c++)
        {
-            x->block[20+r*2+c].diff      = &x->diff[320+ r * 4 * 8 + c * 4];
-            x->block[20+r*2+c].predictor = x->predictor + 320 + r * 4 * 8 + c * 4;
+            offset = 320+ r * 4 * 8 + c * 4;
+            x->block[20+r*2+c].diff_offset      = offset;
+            x->block[20+r*2+c].predictor_offset = offset;

+#if CONFIG_OPENCL && !ONE_CQ_PER_MB
+            if (cl_initialized == CL_SUCCESS)
+                x->block[20+r*2+c].cl_commands = v_cq;
+#endif
        }
    }

-    x->block[24].diff = &x->diff[384];
+    x->block[24].diff_offset = 384; /* NOTE(review): block 24 gets no predictor_offset and, in the !ONE_CQ_PER_MB build, no cl_commands here */

    for (r = 0; r < 25; r++)
    {
-        x->block[r].qcoeff  = x->qcoeff  + r * 16;
-        x->block[r].dqcoeff = x->dqcoeff + r * 16;
+    	x->block[r].qcoeff_base = x->qcoeff;
+    	x->block[r].qcoeff_offset = r * 16;
+        x->block[r].dqcoeff_base = x->dqcoeff;
+        x->block[r].dqcoeff_offset = r * 16;
+        /* Every block shares the macroblock-wide arrays as its base pointers. */
+        x->block[r].predictor_base = x->predictor;
+        x->block[r].diff_base = x->diff;
+        x->block[r].eobs_base = x->eobs;
+#if CONFIG_OPENCL
+        if (cl_initialized == CL_SUCCESS){
+            /* Copy command queue reference from macroblock */
+#if ONE_CQ_PER_MB
+            x->block[r].cl_commands = x->cl_commands;
+#endif
+
+            /* Every block shares the macroblock's cl_mem handles */
+            x->block[r].cl_diff_mem = x->cl_diff_mem;
+            x->block[r].cl_dqcoeff_mem = x->cl_dqcoeff_mem;
+            x->block[r].cl_eobs_mem = x->cl_eobs_mem;
+            x->block[r].cl_predictor_mem = x->cl_predictor_mem;
+            x->block[r].cl_qcoeff_mem = x->cl_qcoeff_mem;
+        }
+        /* Copy filter type to block. NOTE(review): this sits inside #if CONFIG_OPENCL, so it is skipped in non-OpenCL builds. */
+        x->block[r].sixtap_filter = x->sixtap_filter;
+#endif
    }
 }

 void vp8_build_block_doffsets(MACROBLOCKD *x)
 {
-
    /* handle the destination pitch features */
-    vp8_setup_macroblock(x, DEST);
-    vp8_setup_macroblock(x, PRED);
+    setup_macroblock(x, DEST);  /* blocks bound through x->dst */
+    setup_macroblock(x, PRED);  /* blocks bound through x->pre */
 }
--- a/vp8/common/onyxc_int.h
+++ b/vp8/common/onyxc_int.h
@@ -120,7 +120,6 @@ typedef struct VP8Common
    int mb_no_coeff_skip;
    int no_lpf;
    int simpler_lpf;
-    int use_bilinear_mc_filter;
    int full_pixel;

    int base_qindex;
--- a/vp8/common/opencl/blockd_cl.c
+++ b/vp8/common/opencl/blockd_cl.c
@@ -0,0 +1,233 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "../../decoder/onyxd_int.h"
+#include "../../../vpx_ports/config.h"
+#include "../../common/idct.h"
+#include "blockd_cl.h"
+#include "../../decoder/opencl/dequantize_cl.h"
+
+
+/*
+ * Hand each macroblock buffer selected in flags to VP8_CL_SET_BUF together
+ * with x->cl_commands, its cl_mem handle, its size in bytes and the host
+ * pointer.  NOTE(review): presumably this queues a host-to-device copy, with
+ * err as the value returned on failure -- confirm in vp8_opencl.h.
+ *
+ * Returns cl_initialized when OpenCL is unavailable, CL_SUCCESS at the end.
+ */
+int vp8_cl_mb_prep(MACROBLOCKD *x, int flags){
+    int err;
+
+    if (cl_initialized != CL_SUCCESS){
+        return cl_initialized;
+    }
+
+    /* Braced on purpose: VP8_CL_SET_BUF is a macro that may expand to several statements. */
+    if (flags & DIFF){
+        VP8_CL_SET_BUF(x->cl_commands, x->cl_diff_mem, sizeof(cl_short)*400, x->diff,
+            ,err);
+    }
+    if (flags & PREDICTOR){
+        VP8_CL_SET_BUF(x->cl_commands, x->cl_predictor_mem, sizeof(cl_uchar)*384, x->predictor,
+            ,err);
+    }
+    if (flags & QCOEFF){
+        VP8_CL_SET_BUF(x->cl_commands, x->cl_qcoeff_mem, sizeof(cl_short)*400, x->qcoeff,
+            ,err);
+    }
+    if (flags & DQCOEFF){
+        VP8_CL_SET_BUF(x->cl_commands, x->cl_dqcoeff_mem, sizeof(cl_short)*400, x->dqcoeff,
+            ,err);
+    }
+    if (flags & EOBS){
+        VP8_CL_SET_BUF(x->cl_commands, x->cl_eobs_mem, sizeof(cl_char)*25, x->eobs,
+            ,err);
+    }
+    /* The x->pre / x->dst transfers cover buffer_size bytes starting at buffer_alloc. */
+    if (flags & PRE_BUF){
+        VP8_CL_SET_BUF(x->cl_commands, x->pre.buffer_mem, x->pre.buffer_size, x->pre.buffer_alloc,
+            ,err);
+    }
+    if (flags & DST_BUF){
+        VP8_CL_SET_BUF(x->cl_commands, x->dst.buffer_mem, x->dst.buffer_size, x->dst.buffer_alloc,
+            ,err);
+    }
+
+    return CL_SUCCESS;
+}
+
+/*
+ * Enqueue device-to-host reads for the macroblock buffers selected in flags.
+ *
+ * Every read goes to x->cl_commands with blocking_read = CL_FALSE, so a host
+ * array is only known to be filled once that read has completed.
+ * NOTE(review): no explicit wait is visible in this function; presumably the
+ * caller synchronizes the queue -- confirm.
+ *
+ * Returns cl_initialized when OpenCL is unavailable, CL_SUCCESS at the end.
+ */
+int vp8_cl_mb_finish(MACROBLOCKD *x, int flags){
+    int err;
+
+    if (cl_initialized != CL_SUCCESS){
+        return cl_initialized;
+    }
+
+    if (flags & DIFF){
+        err = clEnqueueReadBuffer(x->cl_commands, x->cl_diff_mem, CL_FALSE, 0,
+                sizeof(cl_short)*400, x->diff, 0, NULL, NULL);
+        VP8_CL_CHECK_SUCCESS(x->cl_commands, err != CL_SUCCESS,
+            "Error: Failed to read from GPU!\n", , err);
+    }
+
+    if (flags & PREDICTOR){
+        err = clEnqueueReadBuffer(x->cl_commands, x->cl_predictor_mem, CL_FALSE, 0,
+                sizeof(cl_uchar)*384, x->predictor, 0, NULL, NULL);
+        VP8_CL_CHECK_SUCCESS(x->cl_commands, err != CL_SUCCESS,
+            "Error: Failed to read from GPU!\n", , err);
+    }
+
+    if (flags & QCOEFF){
+        err = clEnqueueReadBuffer(x->cl_commands, x->cl_qcoeff_mem, CL_FALSE, 0,
+                sizeof(cl_short)*400, x->qcoeff, 0, NULL, NULL);
+        VP8_CL_CHECK_SUCCESS(x->cl_commands, err != CL_SUCCESS,
+            "Error: Failed to read from GPU!\n", , err);
+    }
+
+    if (flags & DQCOEFF){
+        err = clEnqueueReadBuffer(x->cl_commands, x->cl_dqcoeff_mem, CL_FALSE, 0,
+                sizeof(cl_short)*400, x->dqcoeff, 0, NULL, NULL);
+        VP8_CL_CHECK_SUCCESS(x->cl_commands, err != CL_SUCCESS,
+            "Error: Failed to read from GPU!\n", , err);
+    }
+
+    if (flags & EOBS){
+        err = clEnqueueReadBuffer(x->cl_commands, x->cl_eobs_mem, CL_FALSE, 0,
+                sizeof(cl_char)*25, x->eobs, 0, NULL, NULL);
+        VP8_CL_CHECK_SUCCESS(x->cl_commands, err != CL_SUCCESS,
+            "Error: Failed to read from GPU!\n", , err);
+    }
+
+    if (flags & PRE_BUF){
+        err = clEnqueueReadBuffer(x->cl_commands, x->pre.buffer_mem, CL_FALSE, 0,
+                x->pre.buffer_size, x->pre.buffer_alloc, 0, NULL, NULL);
+        VP8_CL_CHECK_SUCCESS(x->cl_commands, err != CL_SUCCESS,
+            "Error: Failed to read from GPU!\n", , err);
+    }
+
+    if (flags & DST_BUF){
+        err = clEnqueueReadBuffer(x->cl_commands, x->dst.buffer_mem, CL_FALSE, 0,
+                x->dst.buffer_size, x->dst.buffer_alloc, 0, NULL, NULL);
+        VP8_CL_CHECK_SUCCESS(x->cl_commands, err != CL_SUCCESS,
+            "Error: Failed to read from GPU!\n", , err);
+    }
+
+    return CL_SUCCESS;
+}
+
+/*
+ * Block-level counterpart of vp8_cl_mb_prep: hands each buffer selected in
+ * flags to VP8_CL_SET_BUF using the queue, cl_mem handles and host pointers
+ * stored in b.  Returns cl_initialized when OpenCL is unavailable.
+ */
+int vp8_cl_block_prep(BLOCKD *b, int flags){
+    int err;
+
+    if (cl_initialized != CL_SUCCESS){
+        return cl_initialized;
+    }
+
+    /* Braced on purpose: VP8_CL_SET_BUF is a macro that may expand to several statements. */
+    if (flags & DIFF){
+        VP8_CL_SET_BUF(b->cl_commands, b->cl_diff_mem, sizeof(cl_short)*400, b->diff_base,
+            ,err);
+    }
+    if (flags & PREDICTOR){
+        VP8_CL_SET_BUF(b->cl_commands, b->cl_predictor_mem, sizeof(cl_uchar)*384, b->predictor_base,
+            ,err);
+    }
+    if (flags & QCOEFF){
+        VP8_CL_SET_BUF(b->cl_commands, b->cl_qcoeff_mem, sizeof(cl_short)*400, b->qcoeff_base,
+            ,err);
+    }
+    if (flags & DQCOEFF){
+        VP8_CL_SET_BUF(b->cl_commands, b->cl_dqcoeff_mem, sizeof(cl_short)*400, b->dqcoeff_base,
+            ,err);
+    }
+    if (flags & EOBS){
+        VP8_CL_SET_BUF(b->cl_commands, b->cl_eobs_mem, sizeof(cl_char)*25, b->eobs_base,
+            ,err);
+    }
+    if (flags & DEQUANT){
+        VP8_CL_SET_BUF(b->cl_commands, b->cl_dequant_mem, sizeof(cl_short)*16, b->dequant,
+            ,err);
+    }
+
+    return CL_SUCCESS;
+}
+
+/*
+ * Enqueue device-to-host reads on b->cl_commands for the block buffers
+ * selected in flags.  The reads use blocking_read = CL_FALSE.
+ * NOTE(review): no explicit wait is visible here -- confirm the caller syncs.
+ * Returns cl_initialized when OpenCL is unavailable, CL_SUCCESS at the end.
+ */
+int vp8_cl_block_finish(BLOCKD *b, int flags){
+    int err;
+
+    if (cl_initialized != CL_SUCCESS){
+        return cl_initialized;
+    }
+
+    if (flags & DIFF){
+        err = clEnqueueReadBuffer(b->cl_commands, b->cl_diff_mem, CL_FALSE, 0,
+                sizeof(cl_short)*400, b->diff_base, 0, NULL, NULL);
+        VP8_CL_CHECK_SUCCESS(b->cl_commands, err != CL_SUCCESS,
+            "Error: Failed to read from GPU!\n", , err);
+    }
+
+    if (flags & PREDICTOR){
+        err = clEnqueueReadBuffer(b->cl_commands, b->cl_predictor_mem, CL_FALSE, 0,
+                sizeof(cl_uchar)*384, b->predictor_base, 0, NULL, NULL);
+        VP8_CL_CHECK_SUCCESS(b->cl_commands, err != CL_SUCCESS,
+            "Error: Failed to read from GPU!\n", , err);
+    }
+
+    if (flags & QCOEFF){
+        err = clEnqueueReadBuffer(b->cl_commands, b->cl_qcoeff_mem, CL_FALSE, 0,
+                sizeof(cl_short)*400, b->qcoeff_base, 0, NULL, NULL);
+        VP8_CL_CHECK_SUCCESS(b->cl_commands, err != CL_SUCCESS,
+            "Error: Failed to read from GPU!\n", , err);
+    }
+
+    if (flags & DQCOEFF){
+        err = clEnqueueReadBuffer(b->cl_commands, b->cl_dqcoeff_mem, CL_FALSE, 0,
+                sizeof(cl_short)*400, b->dqcoeff_base, 0, NULL, NULL);
+        VP8_CL_CHECK_SUCCESS(b->cl_commands, err != CL_SUCCESS,
+            "Error: Failed to read from GPU!\n", , err);
+    }
+
+    if (flags & EOBS){
+        err = clEnqueueReadBuffer(b->cl_commands, b->cl_eobs_mem, CL_FALSE, 0,
+                sizeof(cl_char)*25, b->eobs_base, 0, NULL, NULL);
+        VP8_CL_CHECK_SUCCESS(b->cl_commands, err != CL_SUCCESS,
+            "Error: Failed to read from GPU!\n", , err);
+    }
+
+    if (flags & DEQUANT){
+        err = clEnqueueReadBuffer(b->cl_commands, b->cl_dequant_mem, CL_FALSE, 0,
+                sizeof(cl_short)*16, b->dequant, 0, NULL, NULL);
+        VP8_CL_CHECK_SUCCESS(b->cl_commands, err != CL_SUCCESS,
+            "Error: Failed to read from GPU!\n", , err);
+    }
+
+    return CL_SUCCESS;
+}
--- a/vp8/common/opencl/blockd_cl.h
+++ b/vp8/common/opencl/blockd_cl.h
@@ -0,0 +1,64 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef BLOCKD_OPENCL_H
+#define BLOCKD_OPENCL_H
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#include "vp8_opencl.h"
+#include "../blockd.h"
+
+/*
+ * Buffer selection bits for the flags argument of the functions declared
+ * below; OR them together to transfer several buffers in one call.  The
+ * sizes are the ones used in blockd_cl.c.
+ * NOTE(review): these short, unprefixed names are visible to every file that
+ * includes this header.
+ */
+#define DIFF 0x0001       /* diff: 400 cl_short */
+#define PREDICTOR 0x0002  /* predictor: 384 cl_uchar */
+#define QCOEFF 0x0004     /* qcoeff: 400 cl_short */
+#define DQCOEFF 0x0008    /* dqcoeff: 400 cl_short */
+#define EOBS 0x0010       /* eobs: 25 cl_char */
+#define DEQUANT 0x0020    /* dequant: 16 cl_short; block functions only */
+#define PRE_BUF 0x0040    /* x->pre buffer_alloc; macroblock functions only */
+#define DST_BUF 0x0080    /* x->dst buffer_alloc; macroblock functions only */
+
+/* All 16 low bits set, i.e. every flag above plus the eight unassigned bits. */
+#define BLOCK_COPY_ALL 0xffff
+
+/*
+ * Per-block transfers, using the command queue and cl_mem handles stored in b.
+ * prep passes the selected host buffers to VP8_CL_SET_BUF; finish enqueues
+ * non-blocking reads of the selected buffers back into host memory.
+ * NOTE(review): the finish reads are non-blocking, so the caller presumably
+ * has to synchronize the queue before using the data -- confirm.
+ * Both return cl_initialized when OpenCL is unavailable and CL_SUCCESS when
+ * they run to the end.
+ */
+extern int vp8_cl_block_finish(BLOCKD *b, int flags);
+extern int vp8_cl_block_prep(BLOCKD *b, int flags);
+
+/*
+ * Macroblock-level counterparts working on the buffers of x, with the same
+ * return values.  They also handle PRE_BUF and DST_BUF but ignore DEQUANT.
+ */
+extern int vp8_cl_mb_prep(MACROBLOCKD *x, int flags);
+extern int vp8_cl_mb_finish(MACROBLOCKD *x, int flags);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif
--- a/vp8/common/opencl/dynamic_cl.c
+++ b/vp8/common/opencl/dynamic_cl.c
@@ -0,0 +1,106 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp8_opencl.h"
+
+#include <stdio.h>
+
+CL_FUNCTIONS cl; /* OpenCL entry points; presumably filled in by the CL_LOAD_FN calls in load_cl() -- confirm */
+void *dll = NULL; /* dlopen() handle of the OpenCL library; NULL before load_cl() has opened it */
+int cl_loaded = VP8_CL_NOT_INITIALIZED; /* NOTE(review): not written anywhere in the visible part of this file; presumably updated by the caller of load_cl() */
+
+int close_cl(){ /* Unloads the OpenCL library; returns 0 on success or when none is loaded, else dlclose()'s error code. */
+    int ret = (dll != NULL) ? dlclose(dll) : 0; /* never hand a NULL handle to dlclose() */
+    if (ret != 0)
+        fprintf(stderr, "Error closing OpenCL library: %s\n", dlerror());
+    else
+        dll = NULL; /* closed: a second close_cl() becomes a no-op instead of a double close */
+    return ret;
+}
+
+/* Open the OpenCL library lib_name with dlopen() and run CL_LOAD_FN on each entry point listed below.
+ * Returns VP8_CL_TRIED_BUT_FAILED when dlopen() fails, CL_SUCCESS when the end of the function is reached. */
+int load_cl(char *lib_name){
+    dll = dlopen(lib_name, RTLD_NOW|RTLD_LOCAL);
+    if (dll != NULL){
+        /* Library found: continue with the symbol lookups. */
+    } else {
+        /* NOTE(review): the dlerror() text is not reported on this path. */
+        return VP8_CL_TRIED_BUT_FAILED;
+    }
+
+    CL_LOAD_FN("clGetPlatformIDs", cl.getPlatformIDs); /* NOTE(review): CL_LOAD_FN is defined elsewhere; presumably it resolves the name in dll and stores it in the given member -- confirm */
+    CL_LOAD_FN("clGetPlatformInfo", cl.getPlatformInfo);
+    CL_LOAD_FN("clGetDeviceIDs", cl.getDeviceIDs);
+    CL_LOAD_FN("clGetDeviceInfo", cl.getDeviceInfo);
+    CL_LOAD_FN("clCreateContext", cl.createContext);
+//    CL_LOAD_FN("clCreateContextFromType", cl.createContextFromType);
+//    CL_LOAD_FN("clRetainContext", cl.retainContext);
+    CL_LOAD_FN("clReleaseContext", cl.releaseContext);
+//    CL_LOAD_FN("clGetContextInfo", cl.getContextInfo);
+    CL_LOAD_FN("clCreateCommandQueue", cl.createCommandQueue);
+//    CL_LOAD_FN("clRetainCommandQueue", cl.retainCommandQueue);
+    CL_LOAD_FN("clReleaseCommandQueue", cl.releaseCommandQueue);
+//    CL_LOAD_FN("clGetCommandQueueInfo", cl.getCommandQueue);
+    CL_LOAD_FN("clCreateBuffer", cl.createBuffer);
+//    CL_LOAD_FN("clCreateImage2D", cl.createImage2D);
+//    CL_LOAD_FN("clCreateImage3D", cl.createImage3D);
+//    CL_LOAD_FN("clRetainMemObject", cl.retainMemObject);
+    CL_LOAD_FN("clReleaseMemObject", cl.releaseMemObject);
+//    CL_LOAD_FN("clGetSupportedImageFormats", cl.getSupportedImageFormats);
+//    CL_LOAD_FN("clGetMemObjectInfo", cl.getMemObjectInfo);
+//    CL_LOAD_FN("clGetImageInfo", cl.getImageInfo);
+//    CL_LOAD_FN("clCreateSampler", cl.createSampler);
+//    CL_LOAD_FN("clRetainSampler", cl.retainSampler);
+//    CL_LOAD_FN("clReleaseSampler", cl.releaseSampler);
+//    CL_LOAD_FN("clGetSamplerInfo", cl.getSamplerInfo);
+    CL_LOAD_FN("clCreateProgramWithSource", cl.createProgramWithSource);
+//    CL_LOAD_FN("clCreateProgramWithBinary", cl.createProgramWithBinary);
+//    CL_LOAD_FN("clRetainProgram", cl.retainProgram);
+    CL_LOAD_FN("clReleaseProgram", cl.releaseProgram);
+    CL_LOAD_FN("clBuildProgram", cl.buildProgram);
+//    CL_LOAD_FN("clUnloadCompiler", cl.unloadCompiler);
+    CL_LOAD_FN("clGetProgramInfo", cl.getProgramInfo);
+    CL_LOAD_FN("clGetProgramBuildInfo", cl.getProgramBuildInfo);
+    CL_LOAD_FN("clCreateKernel", cl.createKernel);
+//    CL_LOAD_FN("clCreateKernelsInProgram", cl.createKernelsInProgram);
+//    CL_LOAD_FN("clRetainKernel", cl.retainKernel);
+    CL_LOAD_FN("clReleaseKernel", cl.releaseKernel);
+    CL_LOAD_FN("clSetKernelArg", cl.setKernelArg);
+//    CL_LOAD_FN("clGetKernelInfo", cl.getKernelInfo);
+    CL_LOAD_FN("clGetKernelWorkGroupInfo", cl.getKernelWorkGroupInfo);
+//    CL_LOAD_FN("clWaitForEvents", cl.waitForEvents);
+//    CL_LOAD_FN("clGetEventInfo", cl.getEventInfo);
+//    CL_LOAD_FN("clRetainEvent", cl.retainEvent);
+//    CL_LOAD_FN("clReleaseEvent", cl.releaseEvent);
+//    CL_LOAD_FN("clGetEventProfilingInfo", cl.getEventProfilingInfo);
+    CL_LOAD_FN("clFlush", cl.flush);
+    CL_LOAD_FN("clFinish", cl.finish);
+    CL_LOAD_FN("clEnqueueReadBuffer", cl.enqueueReadBuffer);
+    CL_LOAD_FN("clEnqueueWriteBuffer", cl.enqueueWriteBuffer);
+    CL_LOAD_FN("clEnqueueCopyBuffer", cl.enqueueCopyBuffer);
+//    CL_LOAD_FN("clEnqueueReadImage", cl.enqueueReadImage);
+//    CL_LOAD_FN("clEnqueueWriteImage", cl.enqueueWriteImage);
+//    CL_LOAD_FN("clEnqueueCopyImage", cl.enqueueCopyImage);
+//    CL_LOAD_FN("clEnqueueCopyImageToBuffer", cl.enqueueCopyImageToBuffer);
+//    CL_LOAD_FN("clEnqueueCopyBufferToImage", cl.enqueueCopyBufferToImage);
+//    CL_LOAD_FN("clEnqueueMapBuffer", cl.enqueueMapBuffer);
+//    CL_LOAD_FN("clEnqueueMapImage", cl.enqueueMapImage);
+//    CL_LOAD_FN("clEnqueueUnmapMemObject", cl.enqueueUnmapMemObject);
+    CL_LOAD_FN("clEnqueueNDRangeKernel", cl.enqueueNDRAngeKernel);
+//    CL_LOAD_FN("clEnqueueTask", cl.enqueueTask);
+//    CL_LOAD_FN("clEnqueueNativeKernel", cl.enqueueNativeKernel);
+//    CL_LOAD_FN("clEnqueueMarker", cl.enqueueMarker);
+//    CL_LOAD_FN("clEnqueueWaitForEvents", cl.enqueueWaitForEvents);
+    CL_LOAD_FN("clEnqueueBarrier", cl.enqueueBarrier);
+//    CL_LOAD_FN("clGetExtensionFunctionAddress", cl.getExtensionFunctionAddress);
+
+    return CL_SUCCESS;
+}
--- a/vp8/common/opencl/dynamic_cl.h
+++ b/vp8/common/opencl/dynamic_cl.h
@@ -0,0 +1,253 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef DYNAMIC_CL_H
+#define	DYNAMIC_CL_H
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+    
+#include <dlfcn.h>
+
+int load_cl(char *lib_name); /* NOTE(review): defined in another file; presumably opens the named library and fills in `cl` - confirm */
+int close_cl(); /* NOTE(review): declared without a prototype; `(void)` would let the compiler reject stray arguments */
+
+extern int cl_loaded; /* defined elsewhere; presumably non-zero once load_cl() has succeeded - confirm */
+
+typedef cl_int(*fn_clGetPlatformIDs_t)(cl_uint, cl_platform_id *, cl_uint *); /* One function-pointer type per OpenCL entry
+ * point, named fn_<clFunctionName>_t. These are the member types of the
+ * CL_FUNCTIONS table declared further down in this header. */
+typedef cl_int(*fn_clGetPlatformInfo_t)(cl_platform_id, cl_platform_info, size_t, void *, size_t *);
+typedef cl_int(*fn_clGetDeviceIDs_t)(cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *);
+typedef cl_int(*fn_clGetDeviceInfo_t)(cl_device_id, cl_device_info, size_t, void *, size_t *);
+typedef cl_context(*fn_clCreateContext_t)(const cl_context_properties *, cl_uint, const cl_device_id *, void (*pfn_notify)(const char *, const void *, size_t, void *), void *, cl_int *);
+typedef cl_context(*fn_clCreateContextFromType_t)(const cl_context_properties *, cl_device_type, void (*pfn_notify)(const char *, const void *, size_t, void *), void *, cl_int *);
+typedef cl_int(*fn_clRetainContext_t)(cl_context);
+typedef cl_int(*fn_clReleaseContext_t)(cl_context);
+typedef cl_int(*fn_clGetContextInfo_t)(cl_context, cl_context_info, size_t, void *, size_t *);
+typedef cl_command_queue(*fn_clCreateCommandQueue_t)(cl_context, cl_device_id, cl_command_queue_properties, cl_int *);
+typedef cl_int(*fn_clRetainCommandQueue_t)(cl_command_queue);
+typedef cl_int(*fn_clReleaseCommandQueue_t)(cl_command_queue);
+typedef cl_int(*fn_clGetCommandQueueInfo_t)(cl_command_queue, cl_command_queue_info, size_t, void *, size_t *);
+typedef cl_mem(*fn_clCreateBuffer_t)(cl_context, cl_mem_flags, size_t, void *, cl_int *);
+typedef cl_mem(*fn_clCreateImage2D_t)(cl_context, cl_mem_flags, const cl_image_format *, size_t, size_t, size_t, void *, cl_int *);
+typedef cl_mem(*fn_clCreateImage3D_t)(cl_context, cl_mem_flags, const cl_image_format *, size_t, size_t, size_t, size_t, size_t, void *, cl_int *);
+typedef cl_int(*fn_clRetainMemObject_t)(cl_mem);
+typedef cl_int(*fn_clReleaseMemObject_t)(cl_mem);
+typedef cl_int(*fn_clGetSupportedImageFormats_t)(cl_context, cl_mem_flags, cl_mem_object_type, cl_uint, cl_image_format *, cl_uint *);
+typedef cl_int(*fn_clGetMemObjectInfo_t)(cl_mem, cl_mem_info, size_t, void *, size_t *);
+typedef cl_int(*fn_clGetImageInfo_t)(cl_mem, cl_image_info, size_t, void *, size_t *);
+typedef cl_sampler(*fn_clCreateSampler_t)(cl_context, cl_bool, cl_addressing_mode, cl_filter_mode, cl_int *);
+typedef cl_int(*fn_clRetainSampler_t)(cl_sampler);
+typedef cl_int(*fn_clReleaseSampler_t)(cl_sampler);
+typedef cl_int(*fn_clGetSamplerInfo_t)(cl_sampler, cl_sampler_info, size_t, void *, size_t *);
+typedef cl_program(*fn_clCreateProgramWithSource_t)(cl_context, cl_uint, const char **, const size_t *, cl_int *);
+typedef cl_program(*fn_clCreateProgramWithBinary_t)(cl_context, cl_uint, const cl_device_id *, const size_t *, const unsigned char **, cl_int *, cl_int *);
+typedef cl_int(*fn_clRetainProgram_t)(cl_program);
+typedef cl_int(*fn_clReleaseProgram_t)(cl_program);
+typedef cl_int(*fn_clBuildProgram_t)(cl_program, cl_uint, const cl_device_id *, const char *,  void (*pfn_notify)(cl_program,void*), void *);
+typedef cl_int(*fn_clUnloadCompiler_t)(void);
+typedef cl_int(*fn_clGetProgramInfo_t)(cl_program, cl_program_info, size_t, void *, size_t *);
+typedef cl_int(*fn_clGetProgramBuildInfo_t)(cl_program, cl_device_id, cl_program_build_info, size_t, void *, size_t *);
+typedef cl_kernel(*fn_clCreateKernel_t)(cl_program, const char *, cl_int *);
+typedef cl_int(*fn_clCreateKernelsInProgram_t)(cl_program, cl_uint, cl_kernel *, cl_uint *);
+typedef cl_int(*fn_clRetainKernel_t)(cl_kernel);
+typedef cl_int(*fn_clReleaseKernel_t)(cl_kernel);
+typedef cl_int(*fn_clSetKernelArg_t)(cl_kernel, cl_uint, size_t, const void *);
+typedef cl_int(*fn_clGetKernelInfo_t)(cl_kernel, cl_kernel_info, size_t, void *, size_t *);
+typedef cl_int(*fn_clGetKernelWorkGroupInfo_t)(cl_kernel, cl_device_id, cl_kernel_work_group_info, size_t, void *, size_t *);
+typedef cl_int(*fn_clWaitForEvents_t)(cl_uint, const cl_event *);
+typedef cl_int(*fn_clGetEventInfo_t)(cl_event, cl_event_info, size_t, void *, size_t *);
+typedef cl_int(*fn_clRetainEvent_t)(cl_event);
+typedef cl_int(*fn_clReleaseEvent_t)(cl_event);
+typedef cl_int(*fn_clGetEventProfilingInfo_t)(cl_event, cl_profiling_info, size_t, void *, size_t *);
+typedef cl_int(*fn_clFlush_t)(cl_command_queue);
+typedef cl_int(*fn_clFinish_t)(cl_command_queue);
+typedef cl_int(*fn_clEnqueueReadBuffer_t)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, void *, cl_uint, const cl_event *, cl_event *);
+typedef cl_int(*fn_clEnqueueWriteBuffer_t)(cl_command_queue,  cl_mem,  cl_bool,  size_t,  size_t,  const void *,  cl_uint,  const cl_event *,  cl_event *);
+typedef cl_int(*fn_clEnqueueCopyBuffer_t)(cl_command_queue,  cl_mem, cl_mem, size_t, size_t, size_t, cl_uint, const cl_event *, cl_event *);
+typedef cl_int(*fn_clEnqueueReadImage_t)(cl_command_queue, cl_mem, cl_bool, const size_t *, const size_t *, size_t, size_t, void *, cl_uint, const cl_event *, cl_event *);
+typedef cl_int(*fn_clEnqueueWriteImage_t)(cl_command_queue, cl_mem, cl_bool, const size_t *, const size_t *, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *);
+typedef cl_int(*fn_clEnqueueCopyImage_t)(cl_command_queue, cl_mem, cl_mem, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *);
+typedef cl_int(*fn_clEnqueueCopyImageToBuffer_t)(cl_command_queue, cl_mem, cl_mem, const size_t *, const size_t *, size_t, cl_uint, const cl_event *, cl_event *);
+typedef cl_int(*fn_clEnqueueCopyBufferToImage_t)(cl_command_queue, cl_mem, cl_mem, size_t, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *);
+typedef void*(*fn_clEnqueueMapBuffer_t)(cl_command_queue, cl_mem, cl_bool, cl_map_flags, size_t, size_t, cl_uint, const cl_event *, cl_event *, cl_int *);
+typedef void*(*fn_clEnqueueMapImage_t)(cl_command_queue, cl_mem, cl_bool, cl_map_flags, const size_t *, const size_t *, size_t *, size_t *, cl_uint, const cl_event *, cl_event *, cl_int *);
+typedef cl_int(*fn_clEnqueueUnmapMemObject_t)(cl_command_queue, cl_mem, void *, cl_uint, const cl_event *, cl_event *);
+typedef cl_int(*fn_clEnqueueNDRangeKernel_t)(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *);
+typedef cl_int(*fn_clEnqueueTask_t)(cl_command_queue, cl_kernel, cl_uint, const cl_event *, cl_event *);
+typedef cl_int(*fn_clEnqueueNativeKernel_t)(cl_command_queue,					 void (*user_func)(void *), void *, size_t, cl_uint, const cl_mem *, const void **, cl_uint, const cl_event *, cl_event *);
+typedef cl_int(*fn_clEnqueueMarker_t)(cl_command_queue, cl_event *);
+typedef cl_int(*fn_clEnqueueWaitForEvents_t)(cl_command_queue, cl_uint, const cl_event *);
+typedef cl_int(*fn_clEnqueueBarrier_t)(cl_command_queue);
+typedef void*(*fn_clGetExtensionFunctionAddress_t)(const char *);
+
+typedef struct CL_FUNCTIONS { /* Table of OpenCL entry points resolved at run time.
+     * Each member holds the address of the OpenCL function it is named after
+     * (camelCase, without the leading "cl"). The cl* alias macros later in this
+     * header redirect ordinary OpenCL calls through the global instance `cl`.
+     * NOTE(review): the visible tail of the loader fills only some members;
+     * the CL_LOAD_FN lines for the others are commented out there. */
+    fn_clGetPlatformIDs_t getPlatformIDs;
+    fn_clGetPlatformInfo_t getPlatformInfo;
+    fn_clGetDeviceIDs_t getDeviceIDs;
+    fn_clGetDeviceInfo_t getDeviceInfo;
+    fn_clCreateContext_t createContext;
+    fn_clCreateContextFromType_t createContextFromType;
+    fn_clRetainContext_t retainContext;
+    fn_clReleaseContext_t releaseContext;
+    fn_clGetContextInfo_t getContextInfo;
+    fn_clCreateCommandQueue_t createCommandQueue;
+    fn_clRetainCommandQueue_t retainCommandQueue;
+    fn_clReleaseCommandQueue_t releaseCommandQueue;
+    fn_clGetCommandQueueInfo_t getCommandQueue; /* holds clGetCommandQueueInfo despite the shortened member name */
+    fn_clCreateBuffer_t createBuffer;
+    fn_clCreateImage2D_t createImage2D;
+    fn_clCreateImage3D_t createImage3D;
+    fn_clRetainMemObject_t retainMemObject;
+    fn_clReleaseMemObject_t releaseMemObject;
+    fn_clGetSupportedImageFormats_t getSupportedImageFormats;
+    fn_clGetMemObjectInfo_t getMemObjectInfo;
+    fn_clGetImageInfo_t getImageInfo;
+    fn_clCreateSampler_t createSampler;
+    fn_clRetainSampler_t retainSampler;
+    fn_clReleaseSampler_t releaseSampler;
+    fn_clGetSamplerInfo_t getSamplerInfo;
+    fn_clCreateProgramWithSource_t createProgramWithSource;
+    fn_clCreateProgramWithBinary_t createProgramWithBinary;
+    fn_clRetainProgram_t retainProgram;
+    fn_clReleaseProgram_t releaseProgram;
+    fn_clBuildProgram_t buildProgram;
+    fn_clUnloadCompiler_t unloadCompiler;
+    fn_clGetProgramInfo_t getProgramInfo;
+    fn_clGetProgramBuildInfo_t getProgramBuildInfo;
+    fn_clCreateKernel_t createKernel;
+    fn_clCreateKernelsInProgram_t createKernelsInProgram;
+    fn_clRetainKernel_t retainKernel;
+    fn_clReleaseKernel_t releaseKernel;
+    fn_clSetKernelArg_t setKernelArg;
+    fn_clGetKernelInfo_t getKernelInfo;
+    fn_clGetKernelWorkGroupInfo_t getKernelWorkGroupInfo;
+    fn_clWaitForEvents_t waitForEvents;
+    fn_clGetEventInfo_t getEventInfo;
+    fn_clRetainEvent_t retainEvent;
+    fn_clReleaseEvent_t releaseEvent;
+    fn_clGetEventProfilingInfo_t getEventProfilingInfo;
+    fn_clFlush_t flush;
+    fn_clFinish_t finish;
+    fn_clEnqueueReadBuffer_t enqueueReadBuffer;
+    fn_clEnqueueWriteBuffer_t enqueueWriteBuffer;
+    fn_clEnqueueCopyBuffer_t enqueueCopyBuffer;
+    fn_clEnqueueReadImage_t enqueueReadImage;
+    fn_clEnqueueWriteImage_t enqueueWriteImage;
+    fn_clEnqueueCopyImage_t enqueueCopyImage;
+    fn_clEnqueueCopyImageToBuffer_t enqueueCopyImageToBuffer;
+    fn_clEnqueueCopyBufferToImage_t enqueueCopyBufferToImage;
+    fn_clEnqueueMapBuffer_t enqueueMapBuffer;
+    fn_clEnqueueMapImage_t enqueueMapImage;
+    fn_clEnqueueUnmapMemObject_t enqueueUnmapMemObject;
+    fn_clEnqueueNDRangeKernel_t enqueueNDRAngeKernel; /* spelling ("NDRAnge") is relied on by the loader and by the clEnqueueNDRangeKernel alias */
+    fn_clEnqueueTask_t enqueueTask;
+    fn_clEnqueueNativeKernel_t enqueueNativeKernel;
+    fn_clEnqueueMarker_t enqueueMarker;
+    fn_clEnqueueWaitForEvents_t enqueueWaitForEvents;
+    fn_clEnqueueBarrier_t enqueueBarrier;
+    fn_clGetExtensionFunctionAddress_t getExtensionFunctionAddress;
+} CL_FUNCTIONS;
+
+extern CL_FUNCTIONS cl; /* the single global function table; defined in another file */
+
+#define clGetPlatformIDs cl.getPlatformIDs /* From here on every clXxx name used by code that includes this header expands to the matching member of `cl`, so calls go through the run-time table instead of a link-time symbol. */
+#define clGetPlatformInfo cl.getPlatformInfo
+#define clGetDeviceIDs cl.getDeviceIDs
+#define clGetDeviceInfo cl.getDeviceInfo
+#define clCreateContext cl.createContext
+#define clCreateContextFromType cl.createContextFromType
+#define clRetainContext cl.retainContext
+#define clReleaseContext cl.releaseContext
+#define clGetContextInfo cl.getContextInfo
+#define clCreateCommandQueue cl.createCommandQueue
+#define clRetainCommandQueue cl.retainCommandQueue
+#define clReleaseCommandQueue cl.releaseCommandQueue
+#define clGetCommandQueueInfo cl.getCommandQueue
+#define clCreateBuffer cl.createBuffer
+#define clCreateSubBuffer cl.createSubBuffer /* NOTE(review): CL_FUNCTIONS declares no createSubBuffer member; any use fails to compile */
+#define clCreateImage2D cl.createImage2D
+#define clCreateImage3D cl.createImage3D
+#define clRetainMemObject cl.retainMemObject
+#define clReleaseMemObject cl.releaseMemObject
+#define clGetSupportedImageFormats cl.getSupportedImageFormats
+#define clGetMemObjectInfo cl.getMemObjectInfo
+#define clGetImageInfo cl.getImageInfo
+#define clSetMemObjectDestructorCallback cl.setMemObjectDestructorCallback /* NOTE(review): no such member in CL_FUNCTIONS */
+#define clCreateSampler cl.createSampler
+#define clRetainSampler cl.retainSampler
+#define clReleaseSampler cl.releaseSampler
+#define clGetSamplerInfo cl.getSamplerInfo
+#define clCreateProgramWithSource cl.createProgramWithSource
+#define clCreateProgramWithBinary cl.createProgramWithBinary
+#define clRetainProgram cl.retainProgram
+#define clReleaseProgram cl.releaseProgram
+#define clBuildProgram cl.buildProgram
+#define clUnloadCompiler cl.unloadCompiler
+#define clGetProgramInfo cl.getProgramInfo
+#define clGetProgramBuildInfo cl.getProgramBuildInfo
+#define clCreateKernel cl.createKernel
+#define clCreateKernelsInProgram cl.createKernelsInProgram
+#define clRetainKernel cl.retainKernel
+#define clReleaseKernel cl.releaseKernel
+#define clSetKernelArg cl.setKernelArg
+#define clGetKernelInfo cl.getKernelInfo
+#define clGetKernelWorkGroupInfo cl.getKernelWorkGroupInfo
+#define clWaitForEvents cl.waitForEvents
+#define clGetEventInfo cl.getEventInfo
+#define clCreateUserEvent cl.createUserEvent /* NOTE(review): no such member in CL_FUNCTIONS */
+#define clRetainEvent cl.retainEvent
+#define clReleaseEvent cl.releaseEvent
+#define clSetUserEventStatus cl.setUserEventStatus /* NOTE(review): no such member in CL_FUNCTIONS */
+#define clSetEventCallback cl.setEventCallback /* NOTE(review): no such member in CL_FUNCTIONS */
+#define clGetEventProfilingInfo cl.getEventProfilingInfo
+#define clFlush cl.flush
+#define clFinish cl.finish
+#define clEnqueueReadBuffer cl.enqueueReadBuffer
+#define clEnqueueReadBufferRect cl.enqueueReadBufferRect /* NOTE(review): no such member in CL_FUNCTIONS */
+#define clEnqueueWriteBuffer cl.enqueueWriteBuffer
+#define clEnqueueWriteBufferRect cl.enqueueWriteBufferRect /* NOTE(review): no such member in CL_FUNCTIONS */
+#define clEnqueueCopyBuffer cl.enqueueCopyBuffer
+#define clEnqueueCopyBufferRect cl.enqueueCopyBufferRect /* NOTE(review): no such member in CL_FUNCTIONS */
+#define clEnqueueReadImage cl.enqueueReadImage
+#define clEnqueueWriteImage cl.enqueueWriteImage
+#define clEnqueueCopyImage cl.enqueueCopyImage
+#define clEnqueueCopyImageToBuffer cl.enqueueCopyImageToBuffer
+#define clEnqueueCopyBufferToImage cl.enqueueCopyBufferToImage
+#define clEnqueueMapBuffer cl.enqueueMapBuffer
+#define clEnqueueMapImage cl.enqueueMapImage
+#define clEnqueueUnmapMemObject cl.enqueueUnmapMemObject
+#define clEnqueueNDRangeKernel cl.enqueueNDRAngeKernel
+#define clEnqueueTask cl.enqueueTask
+#define clEnqueueNativeKernel cl.enqueueNativeKernel
+#define clEnqueueMarker cl.enqueueMarker
+#define clEnqueueWaitForEvents cl.enqueueWaitForEvents
+#define clEnqueueBarrier cl.enqueueBarrier
+#define clGetExtensionFunctionAddress cl.getExtensionFunctionAddress
+
+#define CL_LOAD_FN(name, ref) \
+    ref = dlsym(dll,name); \
+    if (ref == NULL){ \
+        dlclose(dll); \
+        return CL_INVALID_PLATFORM; \
+    }
+
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* DYNAMIC_CL_H */
--- a/vp8/common/opencl/filter_cl.c
+++ b/vp8/common/opencl/filter_cl.c
@@ -0,0 +1,824 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+
+//ACW: Remove me after debugging.
+#include <stdio.h>
+#include <string.h>
+
+#include "vp8_opencl.h"
+#include "filter_cl.h"
+#include "../blockd.h"
+
+#define SIXTAP_FILTER_LEN 6 /* tap count of the six-tap filter */
+
+const char *filterCompileOptions = "-Ivp8/common/opencl -DVP8_FILTER_WEIGHT=128 -DVP8_FILTER_SHIFT=7 -DFILTER_OFFSET"; /* options string handed to cl_load_program() by cl_init_filter() */
+const char *filter_cl_file_name = "vp8/common/opencl/filter_cl.cl"; /* kernel source path handed to cl_load_program(); relative, so it depends on the working directory */
+
+#define STATIC_MEM 1 /* 1: one file-scope intermediate buffer is created in cl_init_filter() and reused; 0: vp8_sixtap_run_cl() creates and releases its own buffer per call */
+#if STATIC_MEM
+static cl_mem int_mem = NULL; /* shared intermediate buffer; released and reset in cl_destroy_filter() */
+#endif
+
+void cl_destroy_filter(){ /* Release what cl_init_filter() created: the filter
+     * program, the kernels selected by the build flags and, when STATIC_MEM is
+     * set, the shared intermediate buffer.
+     * NOTE(review): the program is released before its kernels; per the
+     * OpenCL specification a program object is only deleted once its kernels
+     * are gone as well, so this order should be harmless - confirm. */
+
+    if (cl_data.filter_program)
+        clReleaseProgram(cl_data.filter_program);
+
+    //VP8_CL_RELEASE_KERNEL(cl_data.vp8_block_variation_kernel);
+#if !TWO_PASS_SIXTAP /* same kernel selection as in cl_init_filter() */
+    VP8_CL_RELEASE_KERNEL(cl_data.vp8_sixtap_predict_kernel);
+    VP8_CL_RELEASE_KERNEL(cl_data.vp8_sixtap_predict8x8_kernel);
+    VP8_CL_RELEASE_KERNEL(cl_data.vp8_sixtap_predict8x4_kernel);
+    VP8_CL_RELEASE_KERNEL(cl_data.vp8_sixtap_predict16x16_kernel);
+#else
+    VP8_CL_RELEASE_KERNEL(cl_data.vp8_filter_block2d_first_pass_kernel);
+    VP8_CL_RELEASE_KERNEL(cl_data.vp8_filter_block2d_second_pass_kernel);
+#endif
+    //VP8_CL_RELEASE_KERNEL(cl_data.vp8_bilinear_predict4x4_kernel);
+    //VP8_CL_RELEASE_KERNEL(cl_data.vp8_bilinear_predict8x4_kernel);
+    //VP8_CL_RELEASE_KERNEL(cl_data.vp8_bilinear_predict8x8_kernel);
+    //VP8_CL_RELEASE_KERNEL(cl_data.vp8_bilinear_predict16x16_kernel);
+
+#if MEM_COPY_KERNEL
+    VP8_CL_RELEASE_KERNEL(cl_data.vp8_memcpy_kernel);
+#endif
+
+    VP8_CL_RELEASE_KERNEL(cl_data.vp8_filter_block2d_bil_first_pass_kernel);
+    VP8_CL_RELEASE_KERNEL(cl_data.vp8_filter_block2d_bil_second_pass_kernel);
+
+#if STATIC_MEM
+    if (int_mem != NULL)
+        clReleaseMemObject(int_mem);
+    int_mem = NULL; /* reset so a repeated call does not release the buffer again */
+#endif
+
+    cl_data.filter_program = NULL; /* the handle was released at the top of this function */
+}
+
+int cl_init_filter() { /* Build the filter program from filter_cl_file_name and
+     * create the kernels this file launches (six-tap variant chosen by
+     * TWO_PASS_SIXTAP, the two bilinear passes, optionally the memcpy kernel).
+     * Returns VP8_CL_TRIED_BUT_FAILED when cl_load_program() does not return
+     * CL_SUCCESS, and CL_SUCCESS when the end of the function is reached.
+     * NOTE(review): the VP8_CL_* macros are defined outside this file and
+     * may return early on failure - confirm. */
+    int err; /* passed as the last argument of VP8_CL_CREATE_BUF below */
+
+
+    // Create the filter compute program from the file-defined source code
+    if ( cl_load_program(&cl_data.filter_program, filter_cl_file_name,
+            filterCompileOptions) != CL_SUCCESS )
+        return VP8_CL_TRIED_BUT_FAILED;
+
+    // Create the compute kernel in the program we wish to run
+#if TWO_PASS_SIXTAP
+    VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_filter_block2d_first_pass_kernel,"vp8_filter_block2d_first_pass_kernel");
+    VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_filter_block2d_second_pass_kernel,"vp8_filter_block2d_second_pass_kernel");
+    VP8_CL_CALC_LOCAL_SIZE(vp8_filter_block2d_first_pass_kernel,vp8_filter_block2d_first_pass_kernel_size);
+    VP8_CL_CALC_LOCAL_SIZE(vp8_filter_block2d_second_pass_kernel,vp8_filter_block2d_second_pass_kernel_size);
+#else
+    VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_sixtap_predict_kernel,"vp8_sixtap_predict_kernel");
+    VP8_CL_CALC_LOCAL_SIZE(vp8_sixtap_predict_kernel,vp8_sixtap_predict_kernel_size);
+    VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_sixtap_predict8x8_kernel,"vp8_sixtap_predict8x8_kernel");
+    VP8_CL_CALC_LOCAL_SIZE(vp8_sixtap_predict8x8_kernel,vp8_sixtap_predict8x8_kernel_size);
+    VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_sixtap_predict8x4_kernel,"vp8_sixtap_predict8x4_kernel");
+    VP8_CL_CALC_LOCAL_SIZE(vp8_sixtap_predict8x4_kernel,vp8_sixtap_predict8x4_kernel_size);
+    VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_sixtap_predict16x16_kernel,"vp8_sixtap_predict16x16_kernel");
+    VP8_CL_CALC_LOCAL_SIZE(vp8_sixtap_predict16x16_kernel,vp8_sixtap_predict16x16_kernel_size);
+#endif
+    
+    //VP8_CL_CALC_LOCAL_SIZE(vp8_filter_block2d_bil_first_pass_kernel,vp8_filter_block2d_bil_first_pass_kernel_size);
+    //VP8_CL_CALC_LOCAL_SIZE(vp8_filter_block2d_bil_second_pass_kernel,vp8_filter_block2d_bil_second_pass_kernel_size);
+    VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_filter_block2d_bil_first_pass_kernel,"vp8_filter_block2d_bil_first_pass_kernel");
+    VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_filter_block2d_bil_second_pass_kernel,"vp8_filter_block2d_bil_second_pass_kernel");
+
+
+    //VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_bilinear_predict4x4_kernel,"vp8_bilinear_predict4x4_kernel");
+    //VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_bilinear_predict8x4_kernel,"vp8_bilinear_predict8x4_kernel");
+    //VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_bilinear_predict8x8_kernel,"vp8_bilinear_predict8x8_kernel");
+    //VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_bilinear_predict16x16_kernel,"vp8_bilinear_predict16x16_kernel");
+
+#if MEM_COPY_KERNEL
+    VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_memcpy_kernel,"vp8_memcpy_kernel");
+    VP8_CL_CALC_LOCAL_SIZE(vp8_memcpy_kernel,vp8_memcpy_kernel_size);
+#endif
+
+#if STATIC_MEM
+    VP8_CL_CREATE_BUF(NULL, int_mem, NULL, sizeof(cl_int)*21*16, NULL, ,err); /* 21*16 cl_ints: the largest FData_height*FData_width used in this file (the 16x16 predictor) */
+#endif
+
+    return CL_SUCCESS;
+}
+
+void vp8_filter_block2d_first_pass_cl(
+    cl_command_queue cq,
+    cl_mem src_mem,
+    int src_offset,
+    cl_mem int_mem,
+    unsigned int src_pixels_per_line,
+    unsigned int int_height,
+    unsigned int int_width,
+    int xoffset
+){ /* Bind the seven arguments of vp8_filter_block2d_first_pass_kernel (source
+     * buffer and offset, intermediate buffer, source pitch, intermediate
+     * height and width, xoffset) and enqueue it on cq as a 1-D range of
+     * int_width*int_height work-items. The int_mem parameter shadows the
+     * file-scope int_mem. Nothing is returned; failures are handled by
+     * VP8_CL_CHECK_SUCCESS, which is defined outside this file. */
+    int err;
+    size_t global = int_width*int_height; /* total work-items in the 1-D range */
+    size_t local = cl_data.vp8_filter_block2d_first_pass_kernel_size;
+    if (local > global)
+        local = global; /* work-group size may not exceed the range size */
+
+    err =  clSetKernelArg(cl_data.vp8_filter_block2d_first_pass_kernel, 0, sizeof (cl_mem), &src_mem);
+    err |= clSetKernelArg(cl_data.vp8_filter_block2d_first_pass_kernel, 1, sizeof (int), &src_offset);
+    err |= clSetKernelArg(cl_data.vp8_filter_block2d_first_pass_kernel, 2, sizeof (cl_mem), &int_mem);
+    err |= clSetKernelArg(cl_data.vp8_filter_block2d_first_pass_kernel, 3, sizeof (cl_uint), &src_pixels_per_line);
+    err |= clSetKernelArg(cl_data.vp8_filter_block2d_first_pass_kernel, 4, sizeof (cl_uint), &int_height);
+    err |= clSetKernelArg(cl_data.vp8_filter_block2d_first_pass_kernel, 5, sizeof (cl_int), &int_width);
+    err |= clSetKernelArg(cl_data.vp8_filter_block2d_first_pass_kernel, 6, sizeof (int), &xoffset);
+    VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+        "Error: Failed to set kernel arguments!\n",
+        ,
+    ); /* the error codes were OR-ed together, so only success/failure is meaningful here */
+
+    /* Execute the kernel */
+    err = clEnqueueNDRangeKernel( cq, cl_data.vp8_filter_block2d_first_pass_kernel, 1, NULL, &global, &local , 0, NULL, NULL);
+    VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+        "Error: Failed to execute kernel!\n",
+        printf("err = %d\n",err);,
+    );
+}
+
+void vp8_filter_block2d_second_pass_cl(
+    cl_command_queue cq,
+    cl_mem int_mem,
+    int int_offset,
+    cl_mem dst_mem,
+    int dst_offset,
+    int dst_pitch,
+    unsigned int output_height,
+    unsigned int output_width,
+    int yoffset
+){ /* Bind the ten arguments of vp8_filter_block2d_second_pass_kernel and
+     * enqueue it on cq as a 1-D range of output_width*output_height
+     * work-items. The int_mem parameter shadows the file-scope int_mem.
+     * Nothing is returned; failures are handled by VP8_CL_CHECK_SUCCESS,
+     * which is defined outside this file. */
+    int err;
+    size_t global = output_width*output_height; /* total work-items in the 1-D range */
+    size_t local = cl_data.vp8_filter_block2d_second_pass_kernel_size;
+    if (local > global){
+        //printf("Local is now %ld\n",global);
+        local = global; /* work-group size may not exceed the range size */
+    }
+
+    /* Set kernel arguments */
+    err =  clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 0, sizeof (cl_mem), &int_mem);
+    err |= clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 1, sizeof (int), &int_offset);
+    err |= clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 2, sizeof (cl_mem), &dst_mem);
+    err |= clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 3, sizeof (int), &dst_offset);
+    err |= clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 4, sizeof (int), &dst_pitch);
+    err |= clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 5, sizeof (int), &output_width); /* NOTE(review): args 5, 6 and 8 all receive output_width; presumably 5 is the intermediate row length and 6 the step between rows - confirm against the kernel */
+    err |= clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 6, sizeof (int), &output_width);
+    err |= clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 7, sizeof (int), &output_height);
+    err |= clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 8, sizeof (int), &output_width);
+    err |= clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 9, sizeof (int), &yoffset);
+    VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+        "Error: Failed to set kernel arguments!\n",
+        ,
+    );
+
+    /* Execute the kernel */
+    err = clEnqueueNDRangeKernel( cq, cl_data.vp8_filter_block2d_second_pass_kernel, 1, NULL, &global, &local , 0, NULL, NULL);
+    VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+        "Error: Failed to execute kernel!\n",
+        printf("err = %d\n",err);,
+    );
+}
+
+/*
+ * Launch one single-pass six-tap prediction kernel.
+ *
+ *   cq                  command queue the work is enqueued on
+ *   kernel              kernel to run; it is given eight arguments: source
+ *                       buffer, source offset, source pitch, xoffset,
+ *                       yoffset, destination buffer, destination offset and
+ *                       destination pitch
+ *   local, global       work-group size and total work-items of the 1-D
+ *                       range; local is clamped to global
+ *   src_mem, dst_mem    device buffers, or NULL to have a temporary buffer
+ *                       created here from src_base-2 / dst_base
+ *   src_base, src_len   host source data and its length (used only when
+ *                       src_mem is NULL)
+ *   src_offset          offset into src_mem; moved back by two rows when
+ *                       src_mem is supplied, replaced by 2 otherwise
+ *   dst_base, dst_len   host destination and its length (used only when
+ *                       dst_mem is NULL)
+ *
+ * Temporary buffers are released before returning. The read-back of a
+ * temporary destination buffer is enqueued non-blocking (CL_FALSE), so the
+ * contents of dst_base are presumably only valid once the queue has been
+ * synchronised by the caller - confirm.
+ *
+ * This path never touches an intermediate buffer. The former !STATIC_MEM
+ * branch declared one, sized it with FData_height/FData_width (which are not
+ * in scope in this function) and never released it, so it has been removed.
+ */
+void vp8_sixtap_single_pass(
+    cl_command_queue cq,
+    cl_kernel kernel,
+    size_t local,
+    size_t global,
+    cl_mem src_mem,
+    cl_mem dst_mem,
+    unsigned char *src_base,
+    int src_offset,
+    size_t src_len,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    unsigned char *dst_base,
+    int dst_offset,
+    int dst_pitch,
+    size_t dst_len
+){
+    int err;
+
+    int free_src = 0, free_dst = 0;
+
+    /* The work-group size may not exceed the range size. */
+    if (local > global){
+        local = global;
+    }
+
+    /* Make space for kernel input/output data.
+     * Initialize the buffer as well if needed.
+     */
+    if (src_mem == NULL){
+        VP8_CL_CREATE_BUF( cq, src_mem,, sizeof (unsigned char) * src_len, src_base-2,,);
+        /* The buffer starts two bytes before src_base, so src_base itself
+         * sits at offset 2 inside it. */
+        src_offset = 2;
+        free_src = 1;
+    } else {
+        /* Step back two rows inside the caller's buffer. */
+        src_offset -= 2*src_pixels_per_line;
+    }
+
+    if (dst_mem == NULL){
+        VP8_CL_CREATE_BUF( cq, dst_mem,, sizeof (unsigned char) * dst_len + dst_offset, dst_base,, );
+        free_dst = 1;
+    }
+
+    err =  clSetKernelArg(kernel, 0, sizeof (cl_mem), &src_mem);
+    err |= clSetKernelArg(kernel, 1, sizeof (int), &src_offset);
+    err |= clSetKernelArg(kernel, 2, sizeof (cl_int), &src_pixels_per_line);
+    err |= clSetKernelArg(kernel, 3, sizeof (cl_int), &xoffset);
+    err |= clSetKernelArg(kernel, 4, sizeof (cl_int), &yoffset);
+    err |= clSetKernelArg(kernel, 5, sizeof (cl_mem), &dst_mem);
+    err |= clSetKernelArg(kernel, 6, sizeof (cl_int), &dst_offset);
+    err |= clSetKernelArg(kernel, 7, sizeof (int), &dst_pitch);
+    VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+        "Error: Failed to set kernel arguments!\n",
+        ,
+    );
+
+    /* Execute the kernel */
+    err = clEnqueueNDRangeKernel( cq, kernel, 1, NULL, &global, &local , 0, NULL, NULL);
+    VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+        "Error: Failed to execute kernel!\n",
+        printf("err = %d\n",err);,
+    );
+
+    if (free_src == 1)
+        clReleaseMemObject(src_mem);
+
+    if (free_dst == 1){
+        /* Read back the result data from the device */
+        err = clEnqueueReadBuffer(cq, dst_mem, CL_FALSE, 0, sizeof (unsigned char) * dst_len + dst_offset, dst_base, 0, NULL, NULL);
+        VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+            "Error: Failed to read output array!\n",
+            ,
+        );
+        clReleaseMemObject(dst_mem);
+    }
+}
+
+void vp8_sixtap_run_cl(
+    cl_command_queue cq,
+    cl_mem src_mem,
+    cl_mem dst_mem,
+    unsigned char *src_base,
+    int src_offset,
+    size_t src_len,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    unsigned char *dst_base,
+    int dst_offset,
+    int dst_pitch,
+    size_t dst_len,
+    unsigned int FData_height,
+    unsigned int FData_width,
+    unsigned int output_height,
+    unsigned int output_width,
+    int int_offset
+)
+{ /* Two-pass six-tap prediction: run the first-pass kernel from the source
+   * into the intermediate buffer, then the second-pass kernel from the
+   * intermediate buffer into the destination. When src_mem or dst_mem is NULL
+   * a temporary buffer is created from src_base-2 / dst_base, and a temporary
+   * destination is read back into dst_base (non-blocking) before release.
+   * With STATIC_MEM set, int_mem below is the file-scope buffer. */
+    int err;
+
+#if !STATIC_MEM
+    cl_mem int_mem;
+#endif
+
+    int free_src = 0, free_dst = 0;
+
+    /* Make space for kernel input/output data.
+     * Initialize the buffer as well if needed.
+     */
+    if (src_mem == NULL){
+        VP8_CL_CREATE_BUF( cq, src_mem,, sizeof (unsigned char) * src_len, src_base-2,,);
+        src_offset = 2; /* buffer starts two bytes before src_base */
+        free_src = 1;
+    } else {
+        src_offset -= 2*src_pixels_per_line; /* step back two rows inside the caller's buffer */
+    }
+
+    if (dst_mem == NULL){
+        VP8_CL_CREATE_BUF( cq, dst_mem,, sizeof (unsigned char) * dst_len + dst_offset, dst_base,, );
+        free_dst = 1;
+    }
+
+#if !STATIC_MEM
+    CL_CREATE_BUF( cq, int_mem,, sizeof(cl_int)*FData_height*FData_width, NULL,, ); /* NOTE(review): every other buffer in this file is created with VP8_CL_CREATE_BUF; CL_CREATE_BUF looks like a stale name - confirm */
+#endif
+
+    vp8_filter_block2d_first_pass_cl(
+        cq, src_mem, src_offset, int_mem, src_pixels_per_line,
+        FData_height, FData_width, xoffset
+    );
+
+    vp8_filter_block2d_second_pass_cl(cq,int_mem,int_offset,dst_mem,dst_offset,dst_pitch,
+            output_height,output_width,yoffset);
+
+    if (free_src == 1)
+        clReleaseMemObject(src_mem);
+
+    if (free_dst == 1){
+        /* Read back the result data from the device */
+        err = clEnqueueReadBuffer(cq, dst_mem, CL_FALSE, 0, sizeof (unsigned char) * dst_len + dst_offset, dst_base, 0, NULL, NULL);
+        VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+            "Error: Failed to read output array!\n",
+            ,
+        );
+        clReleaseMemObject(dst_mem);
+    }
+
+#if !STATIC_MEM
+    clReleaseMemObject(int_mem);
+#endif
+}
+
+/*
+ * Six-tap prediction of a 4x4 block.
+ *
+ * Built with TWO_PASS_SIXTAP the work goes to vp8_sixtap_run_cl() (first-pass
+ * kernel followed by second-pass kernel); otherwise it goes to
+ * vp8_sixtap_single_pass() using vp8_sixtap_predict_kernel.
+ * src_mem/dst_mem may be NULL, in which case the callee works from the host
+ * pointers src_base/dst_base instead.
+ */
+void vp8_sixtap_predict4x4_cl
+(
+    cl_command_queue cq,
+    unsigned char *src_base,
+    cl_mem src_mem,
+    int src_offset,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    unsigned char *dst_base,
+    cl_mem dst_mem,
+    int dst_offset,
+    int dst_pitch
+) {
+    /* 4x4 output; the filtered intermediate block is 9 rows by 4 columns. */
+    int out_cols = 4;
+    int out_rows = 4;
+    int mid_rows = 9;
+    int mid_cols = 4;
+
+    /* Lengths handed on for creating / reading back temporary buffers. */
+    int dst_bytes = DST_LEN(dst_pitch,out_rows,out_cols);
+    int src_bytes = SIXTAP_SRC_LEN(mid_cols,mid_rows,src_pixels_per_line);
+
+#if TWO_PASS_SIXTAP
+    /* Host pointer two rows above the addressed source position. */
+    unsigned char *top_row = src_base + src_offset - 2*src_pixels_per_line;
+
+    vp8_sixtap_run_cl(
+            cq, src_mem, dst_mem,
+            top_row, src_offset, src_bytes, src_pixels_per_line,
+            xoffset, yoffset,
+            dst_base, dst_offset, dst_pitch, dst_bytes,
+            mid_rows, mid_cols, out_rows, out_cols,
+            8   /* int_offset */
+    );
+#else
+    vp8_sixtap_single_pass(
+            cq,
+            cl_data.vp8_sixtap_predict_kernel,
+            cl_data.vp8_sixtap_predict_kernel_size,
+            mid_rows*mid_cols,
+            src_mem, dst_mem,
+            src_base, src_offset, src_bytes, src_pixels_per_line,
+            xoffset, yoffset,
+            dst_base, dst_offset, dst_pitch, dst_bytes
+    );
+#endif
+}
+
+/*
+ * Six-tap prediction of an 8x8 block.
+ *
+ * Built with TWO_PASS_SIXTAP the work goes to vp8_sixtap_run_cl() (first-pass
+ * kernel followed by second-pass kernel); otherwise it goes to
+ * vp8_sixtap_single_pass() using vp8_sixtap_predict8x8_kernel.
+ * src_mem/dst_mem may be NULL, in which case the callee works from the host
+ * pointers src_base/dst_base instead.
+ */
+void vp8_sixtap_predict8x8_cl
+(
+    cl_command_queue cq,
+    unsigned char *src_base,
+    cl_mem src_mem,
+    int src_offset,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    unsigned char *dst_base,
+    cl_mem dst_mem,
+    int dst_offset,
+    int dst_pitch
+) {
+    /* 8x8 output; the filtered intermediate block is 13 rows by 8 columns. */
+    int out_cols = 8;
+    int out_rows = 8;
+    int mid_rows = 13;
+    int mid_cols = 8;
+
+    /* Lengths handed on for creating / reading back temporary buffers. */
+    int dst_bytes = DST_LEN(dst_pitch,out_rows,out_cols);
+    int src_bytes = SIXTAP_SRC_LEN(mid_cols,mid_rows,src_pixels_per_line);
+
+#if TWO_PASS_SIXTAP
+    /* Host pointer two rows above the addressed source position. */
+    unsigned char *top_row = src_base + src_offset - 2*src_pixels_per_line;
+
+    vp8_sixtap_run_cl(
+            cq, src_mem, dst_mem,
+            top_row, src_offset, src_bytes, src_pixels_per_line,
+            xoffset, yoffset,
+            dst_base, dst_offset, dst_pitch, dst_bytes,
+            mid_rows, mid_cols, out_rows, out_cols,
+            16  /* int_offset */
+    );
+#else
+    vp8_sixtap_single_pass(
+            cq,
+            cl_data.vp8_sixtap_predict8x8_kernel,
+            cl_data.vp8_sixtap_predict8x8_kernel_size,
+            mid_rows*mid_cols,
+            src_mem, dst_mem,
+            src_base, src_offset, src_bytes, src_pixels_per_line,
+            xoffset, yoffset,
+            dst_base, dst_offset, dst_pitch, dst_bytes
+    );
+#endif
+}
+
+/*
+ * Six-tap prediction of an 8-wide, 4-high block.
+ *
+ * Built with TWO_PASS_SIXTAP the work goes to vp8_sixtap_run_cl() (first-pass
+ * kernel followed by second-pass kernel); otherwise it goes to
+ * vp8_sixtap_single_pass() using vp8_sixtap_predict8x4_kernel.
+ * src_mem/dst_mem may be NULL, in which case the callee works from the host
+ * pointers src_base/dst_base instead.
+ */
+void vp8_sixtap_predict8x4_cl
+(
+    cl_command_queue cq,
+    unsigned char *src_base,
+    cl_mem src_mem,
+    int src_offset,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    unsigned char *dst_base,
+    cl_mem dst_mem,
+    int dst_offset,
+    int dst_pitch
+) {
+    /* 8x4 output; the filtered intermediate block is 9 rows by 8 columns. */
+    int out_cols = 8;
+    int out_rows = 4;
+    int mid_rows = 9;
+    int mid_cols = 8;
+
+    /* Lengths handed on for creating / reading back temporary buffers. */
+    int dst_bytes = DST_LEN(dst_pitch,out_rows,out_cols);
+    int src_bytes = SIXTAP_SRC_LEN(mid_cols,mid_rows,src_pixels_per_line);
+
+#if TWO_PASS_SIXTAP
+    /* Host pointer two rows above the addressed source position. */
+    unsigned char *top_row = src_base + src_offset - 2*src_pixels_per_line;
+
+    vp8_sixtap_run_cl(
+            cq, src_mem, dst_mem,
+            top_row, src_offset, src_bytes, src_pixels_per_line,
+            xoffset, yoffset,
+            dst_base, dst_offset, dst_pitch, dst_bytes,
+            mid_rows, mid_cols, out_rows, out_cols,
+            16  /* int_offset */
+    );
+#else
+    vp8_sixtap_single_pass(
+            cq,
+            cl_data.vp8_sixtap_predict8x4_kernel,
+            cl_data.vp8_sixtap_predict8x4_kernel_size,
+            mid_rows*mid_cols,
+            src_mem, dst_mem,
+            src_base, src_offset, src_bytes, src_pixels_per_line,
+            xoffset, yoffset,
+            dst_base, dst_offset, dst_pitch, dst_bytes
+    );
+#endif
+}
+
+/* Six-tap sub-pixel prediction for a 16x16 block (OpenCL path).
+ *
+ * Derives the destination and source extents from the block geometry and
+ * hands all arguments to vp8_sixtap_run_cl() when TWO_PASS_SIXTAP is set,
+ * or to vp8_sixtap_single_pass() with the 16x16 kernel otherwise.
+ *
+ * src_base/src_mem/src_offset and dst_base/dst_mem/dst_offset are passed
+ * through unchanged, as are the strides and the x/y filter offsets.
+ */
+void vp8_sixtap_predict16x16_cl
+(
+    cl_command_queue cq,
+    unsigned char *src_base,
+    cl_mem src_mem,
+    int src_offset,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    unsigned char *dst_base,
+    cl_mem dst_mem,
+    int dst_offset,
+    int dst_pitch
+) {
+
+    /* Block geometry: 16x16 output, 21x16 intermediate (FData) area. */
+    int output_width = 16;
+    int output_height = 16;
+    int FData_height = 21;
+    int FData_width = 16;
+
+    /* Extents of the destination and source regions. */
+    int dst_len = DST_LEN(dst_pitch, output_height, output_width);
+    int src_len = SIXTAP_SRC_LEN(FData_width, FData_height, src_pixels_per_line);
+
+#if TWO_PASS_SIXTAP
+    int int_offset = 32;
+    unsigned char *src_ptr = src_base + src_offset;
+
+    vp8_sixtap_run_cl(cq, src_mem, dst_mem,
+            (src_ptr - 2 * src_pixels_per_line), src_offset, src_len,
+            src_pixels_per_line, xoffset, yoffset,
+            dst_base, dst_offset, dst_pitch, dst_len,
+            FData_height, FData_width, output_height, output_width,
+            int_offset);
+#else
+    vp8_sixtap_single_pass(cq,
+            cl_data.vp8_sixtap_predict16x16_kernel,
+            cl_data.vp8_sixtap_predict16x16_kernel_size,
+            FData_height * FData_width,
+            src_mem, dst_mem,
+            src_base, src_offset, src_len, src_pixels_per_line,
+            xoffset, yoffset,
+            dst_base, dst_offset, dst_pitch, dst_len);
+#endif
+}
+
+
+
+/* Enqueue the horizontal bilinear pass, which filters height x width samples
+ * of the source into int_mem. With src_mem == NULL a temporary device buffer
+ * is created from src_base + src_offset and released after the enqueue.
+ */
+void vp8_filter_block2d_bil_first_pass_cl(
+    cl_command_queue cq,
+    unsigned char *src_base,
+    cl_mem src_mem,
+    int src_offset,
+    cl_mem int_mem,
+    int src_pixels_per_line,
+    int height,
+    int width,
+    int xoffset
+)
+{
+    cl_kernel kernel = cl_data.vp8_filter_block2d_bil_first_pass_kernel;
+    size_t global = width*height;   /* one work-item per intermediate sample */
+    int free_src = 0;               /* set when src_mem is owned by this call */
+    int err;
+
+    if (src_mem == NULL){
+        int src_len = BIL_SRC_LEN(width,height,src_pixels_per_line);
+        /* Upload the needed part of the source as a read-only buffer. */
+        VP8_CL_CREATE_BUF(cq, src_mem, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,
+            sizeof (unsigned char) * src_len, src_base+src_offset,,
+        );
+        src_offset = 0; /* the new buffer already starts at base+offset */
+        free_src = 1;
+    }
+
+    err =  clSetKernelArg(kernel, 0, sizeof (cl_mem), &src_mem);
+    err |= clSetKernelArg(kernel, 1, sizeof (int), &src_offset);
+    err |= clSetKernelArg(kernel, 2, sizeof (cl_mem), &int_mem);
+    err |= clSetKernelArg(kernel, 3, sizeof (int), &src_pixels_per_line);
+    err |= clSetKernelArg(kernel, 4, sizeof (int), &height);
+    err |= clSetKernelArg(kernel, 5, sizeof (int), &width);
+    err |= clSetKernelArg(kernel, 6, sizeof (int), &xoffset);
+    VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+        "Error: Failed to set kernel arguments!\n", , );
+
+    /* Launch as a 1-D range; no local work size is requested. */
+    err = clEnqueueNDRangeKernel(cq, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
+    VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+        "Error: Failed to execute kernel!\n", printf("err = %d\n",err);, );
+
+    if (free_src == 1)
+        clReleaseMemObject(src_mem);
+}
+
+
+/* Enqueue the vertical bilinear pass: filter int_mem into height x width
+ * output samples. If dst_mem is NULL, a temporary device buffer mirroring
+ * the first dst_len + dst_offset bytes of dst_base is created, read back
+ * into dst_base after the kernel and then released.
+ * NOTE(review): the read-back is non-blocking (CL_FALSE); dst_base is only
+ * valid once the queue has completed - confirm that callers synchronize.
+ */
+void vp8_filter_block2d_bil_second_pass_cl(
+    cl_command_queue cq,
+    cl_mem int_mem,
+    unsigned char *dst_base,
+    cl_mem dst_mem,
+    int dst_offset,
+    int dst_pitch,
+    int height,
+    int width,
+    int yoffset
+)
+{
+    cl_kernel kernel = cl_data.vp8_filter_block2d_bil_second_pass_kernel;
+    size_t global = width*height;   /* one work-item per output sample */
+    int dst_len = DST_LEN(dst_pitch,height,width);
+    int free_dst = 0;               /* set when dst_mem is owned by this call */
+    int err;
+
+    if (dst_mem == NULL){
+        VP8_CL_CREATE_BUF(cq, dst_mem, CL_MEM_WRITE_ONLY|CL_MEM_COPY_HOST_PTR,
+            sizeof (unsigned char) * dst_len + dst_offset, dst_base,,
+        );
+        free_dst = 1;
+    }
+
+    err =  clSetKernelArg(kernel, 0, sizeof (cl_mem), &int_mem);
+    err |= clSetKernelArg(kernel, 1, sizeof (cl_mem), &dst_mem);
+    err |= clSetKernelArg(kernel, 2, sizeof (int), &dst_offset);
+    err |= clSetKernelArg(kernel, 3, sizeof (int), &dst_pitch);
+    err |= clSetKernelArg(kernel, 4, sizeof (int), &height);
+    err |= clSetKernelArg(kernel, 5, sizeof (int), &width);
+    err |= clSetKernelArg(kernel, 6, sizeof (int), &yoffset);
+    VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+        "Error: Failed to set kernel arguments!\n", , );
+
+    /* Launch as a 1-D range; no local work size is requested. */
+    err = clEnqueueNDRangeKernel(cq, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
+    VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+        "Error: Failed to execute kernel!\n", printf("err = %d\n",err);, );
+
+    if (free_dst == 1){
+        /* Copy the whole temporary buffer back to the host destination. */
+        err = clEnqueueReadBuffer(cq, dst_mem, CL_FALSE, 0,
+            sizeof (unsigned char) * dst_len + dst_offset, dst_base, 0, NULL, NULL);
+        VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+            "Error: Failed to read output array!\n", , );
+        clReleaseMemObject(dst_mem);
+    }
+}
+
+/* Bilinear sub-pixel prediction of a 4x4 block: a horizontal pass into
+ * int_mem over height + 1 rows, then a vertical pass into the destination.
+ * Without STATIC_MEM a scratch buffer is created and released per call.
+ */
+void vp8_bilinear_predict4x4_cl
+(
+    cl_command_queue cq,
+    unsigned char *src_base,
+    cl_mem src_mem,
+    int src_offset,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    unsigned char *dst_base,
+    cl_mem dst_mem,
+    int dst_offset,
+    int dst_pitch
+) {
+    const int height = 4, width = 4;
+
+#if !STATIC_MEM
+    int err;
+    cl_mem int_mem = NULL;
+    /* Flags are a bitfield, not a pointer: request read/write explicitly. */
+    VP8_CL_CREATE_BUF(NULL, int_mem, CL_MEM_READ_WRITE, sizeof(cl_int)*21*16, NULL, ,);
+#endif
+    /* First filter 1-D horizontally... */
+    vp8_filter_block2d_bil_first_pass_cl(cq, src_base, src_mem, src_offset, int_mem, src_pixels_per_line, height + 1, width, xoffset);
+    /* then 1-D vertically... */
+    vp8_filter_block2d_bil_second_pass_cl(cq, int_mem, dst_base, dst_mem, dst_offset, dst_pitch, height, width, yoffset);
+#if !STATIC_MEM
+    clReleaseMemObject(int_mem);
+#endif
+}
+
+/* Bilinear sub-pixel prediction of an 8x8 block: a horizontal pass into
+ * int_mem over height + 1 rows, then a vertical pass into the destination.
+ * Without STATIC_MEM a scratch buffer is created and released per call.
+ */
+void vp8_bilinear_predict8x8_cl
+(
+    cl_command_queue cq,
+    unsigned char *src_base,
+    cl_mem src_mem,
+    int src_offset,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    unsigned char *dst_base,
+    cl_mem dst_mem,
+    int dst_offset,
+    int dst_pitch
+) {
+    const int height = 8, width = 8;
+
+#if !STATIC_MEM
+    int err;
+    cl_mem int_mem = NULL;
+    /* Flags are a bitfield, not a pointer: request read/write explicitly. */
+    VP8_CL_CREATE_BUF(NULL, int_mem, CL_MEM_READ_WRITE, sizeof(cl_int)*21*16, NULL, ,);
+#endif
+    /* First filter 1-D horizontally... */
+    vp8_filter_block2d_bil_first_pass_cl(cq, src_base, src_mem, src_offset, int_mem, src_pixels_per_line, height + 1, width, xoffset);
+    /* then 1-D vertically... */
+    vp8_filter_block2d_bil_second_pass_cl(cq, int_mem, dst_base, dst_mem, dst_offset, dst_pitch, height, width, yoffset);
+#if !STATIC_MEM
+    clReleaseMemObject(int_mem);
+#endif
+}
+
+/* Bilinear sub-pixel prediction of an 8-wide, 4-high block: a horizontal
+ * pass into int_mem over height + 1 rows, then a vertical pass into the
+ * destination. Without STATIC_MEM a scratch buffer is created per call.
+ */
+void vp8_bilinear_predict8x4_cl
+(
+    cl_command_queue cq,
+    unsigned char *src_base,
+    cl_mem src_mem,
+    int src_offset,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    unsigned char *dst_base,
+    cl_mem dst_mem,
+    int dst_offset,
+    int dst_pitch
+) {
+    const int height = 4, width = 8;
+
+#if !STATIC_MEM
+    int err;
+    cl_mem int_mem = NULL;
+    /* Flags are a bitfield, not a pointer: request read/write explicitly. */
+    VP8_CL_CREATE_BUF(NULL, int_mem, CL_MEM_READ_WRITE, sizeof(cl_int)*21*16, NULL, ,);
+#endif
+    /* First filter 1-D horizontally... */
+    vp8_filter_block2d_bil_first_pass_cl(cq, src_base, src_mem, src_offset, int_mem, src_pixels_per_line, height + 1, width, xoffset);
+    /* then 1-D vertically... */
+    vp8_filter_block2d_bil_second_pass_cl(cq, int_mem, dst_base, dst_mem, dst_offset, dst_pitch, height, width, yoffset);
+#if !STATIC_MEM
+    clReleaseMemObject(int_mem);
+#endif
+}
+
+/* Bilinear sub-pixel prediction of a 16x16 block: a horizontal pass into
+ * int_mem over height + 1 rows, then a vertical pass into the destination.
+ * Without STATIC_MEM a scratch buffer is created and released per call.
+ */
+void vp8_bilinear_predict16x16_cl
+(
+    cl_command_queue cq,
+    unsigned char *src_base,
+    cl_mem src_mem,
+    int src_offset,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    unsigned char *dst_base,
+    cl_mem dst_mem,
+    int dst_offset,
+    int dst_pitch
+) {
+    const int height = 16, width = 16;
+
+#if !STATIC_MEM
+    int err;
+    cl_mem int_mem = NULL;
+    /* Flags are a bitfield, not a pointer: request read/write explicitly. */
+    VP8_CL_CREATE_BUF(NULL, int_mem, CL_MEM_READ_WRITE, sizeof(cl_int)*21*16, NULL, ,);
+#endif
+    /* First filter 1-D horizontally... */
+    vp8_filter_block2d_bil_first_pass_cl(cq, src_base, src_mem, src_offset, int_mem, src_pixels_per_line, height + 1, width, xoffset);
+    /* then 1-D vertically... */
+    vp8_filter_block2d_bil_second_pass_cl(cq, int_mem, dst_base, dst_mem, dst_offset, dst_pitch, height, width, yoffset);
+#if !STATIC_MEM
+    clReleaseMemObject(int_mem);
+#endif
+}
--- a/vp8/common/opencl/filter_cl.cl
+++ b/vp8/common/opencl/filter_cl.cl
@@ -0,0 +1,562 @@
+#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+
+__constant int bilinear_filters[8][2] = { /* 2-tap weights, each row sums to 128 */
+    { 128, 0},
+    { 112, 16},
+    { 96, 32},
+    { 80, 48},
+    { 64, 64},
+    { 48, 80},
+    { 32, 96},
+    { 16, 112}
+};
+/* 6-tap weights, each row sums to 128; rows are picked by filter_offset. */
+__constant short sub_pel_filters[8][8] = {
+    //These were originally 8x6, but are padded for vector ops
+    { 0, 0, 128, 0, 0, 0, 0, 0}, /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */
+    { 0, -6, 123, 12, -1, 0, 0, 0},
+    { 2, -11, 108, 36, -8, 1, 0, 0}, /* New 1/4 pel 6 tap filter */
+    { 0, -9, 93, 50, -6, 0, 0, 0},
+    { 3, -16, 77, 77, -16, 3, 0, 0}, /* New 1/2 pel 6 tap filter */
+    { 0, -6, 50, 93, -9, 0, 0, 0},
+    { 1, -8, 36, 108, -11, 2, 0, 0}, /* New 1/4 pel 6 tap filter */
+    { 0, -1, 12, 123, -6, 0, 0, 0},
+};
+
+
+/* Horizontal six-tap pass, one work-item per intermediate sample.
+ *
+ * Work-item tid reads six neighbouring bytes (columns -2..+3 around its
+ * position) from the source window starting at src_base[src_offset],
+ * weights them with sub_pel_filters[filter_offset], rounds, shifts by
+ * VP8_FILTER_SHIFT, clamps to 0..255 and stores the result in
+ * output_ptr[tid]. Work-items beyond output_width*output_height do nothing.
+ */
+kernel void vp8_filter_block2d_first_pass_kernel(
+    __global unsigned char *src_base,
+    int src_offset,
+    __global int *output_ptr,
+    unsigned int src_pixels_per_line,
+    unsigned int output_height,
+    unsigned int output_width,
+    int filter_offset
+){
+    uint tid = get_global_id(0);
+    global unsigned char *src_ptr = &src_base[src_offset];
+    //src_offset is captured above because it is reused as a scratch index below
+    __constant short *vp8_filter = sub_pel_filters[filter_offset];
+    int Temp;
+    int tap;
+
+    if (tid >= (output_width*output_height))
+        return;
+
+    /* Position inside the source: rows are src_pixels_per_line apart. */
+    src_offset = tid + (tid/output_width * (src_pixels_per_line - output_width));
+
+    /* Weighted sum of the six taps plus the rounding term. */
+    Temp = (VP8_FILTER_WEIGHT >> 1);
+    for (tap = 0; tap < 6; tap++)
+        Temp += (int)(src_ptr[src_offset - 2 + tap] * vp8_filter[tap]);
+
+    /* Normalize back to 0-255 */
+    Temp = Temp >> VP8_FILTER_SHIFT;
+    Temp = (Temp < 0) ? 0 : ((Temp > 255) ? 255 : Temp);
+
+    output_ptr[tid] = Temp;
+}
+
+/* Vertical six-tap pass, one work-item per output pixel.
+ *
+ * Work-item i reads six ints from src_base (starting at src_offset), spaced
+ * pixel_step apart at rows -2..+3 around its position, weights them with
+ * sub_pel_filters[filter_offset], rounds, shifts by VP8_FILTER_SHIFT,
+ * clamps to 0..255 and writes one byte into the output, whose rows are
+ * output_pitch apart starting at output_base[output_offset].
+ */
+kernel void vp8_filter_block2d_second_pass_kernel
+(
+    __global int *src_base,
+    int src_offset,
+    __global unsigned char *output_base,
+    int output_offset,
+    int output_pitch,
+    unsigned int src_pixels_per_line,
+    unsigned int pixel_step,
+    unsigned int output_height,
+    unsigned int output_width,
+    int filter_offset
+) {
+    uint i = get_global_id(0);
+    global int *src_ptr = &src_base[src_offset];
+    global unsigned char *output_ptr = &output_base[output_offset];
+    __constant short *vp8_filter = sub_pel_filters[filter_offset];
+    unsigned int src_increment = src_pixels_per_line - output_width;
+    int step = (int)pixel_step;
+    int row;
+    int out_offset; //Not same as output_offset...
+    int Temp;
+
+    if (i >= (output_width * output_height))
+        return;
+
+    /* Row of this pixel; selects both the source and the output row. */
+    row = i/output_width;
+    src_offset = i + (row * src_increment);
+    out_offset = i%output_width + (row * output_pitch);
+
+    /* Apply filter */
+    Temp = ((int)src_ptr[src_offset - 2*step]     * vp8_filter[0]) +
+           ((int)src_ptr[src_offset - step]       * vp8_filter[1]) +
+           ((int)src_ptr[src_offset]              * vp8_filter[2]) +
+           ((int)src_ptr[src_offset + pixel_step] * vp8_filter[3]) +
+           ((int)src_ptr[src_offset + 2*step]     * vp8_filter[4]) +
+           ((int)src_ptr[src_offset + 3*step]     * vp8_filter[5]) +
+           (VP8_FILTER_WEIGHT >> 1);   /* Rounding */
+
+    /* Normalize back to 0-255 */
+    Temp = Temp >> VP8_FILTER_SHIFT;
+    Temp = (Temp < 0) ? 0 : ((Temp > 255) ? 255 : Temp);
+
+    output_ptr[out_offset] = (unsigned char)Temp;
+}
+
+
+/* Horizontal bilinear pass, one work-item per intermediate sample.
+ *
+ * Work-item tid blends the source byte at its position with the byte to
+ * its right using bilinear_filters[filter_offset], adds the rounding term,
+ * shifts by VP8_FILTER_SHIFT and stores the int result in output_ptr, which
+ * is packed with output_width samples per row.
+ */
+kernel void vp8_filter_block2d_bil_first_pass_kernel(
+    __global unsigned char *src_base,
+    int src_offset,
+    __global int *output_ptr,
+    unsigned int src_pixels_per_line,
+    unsigned int output_height,
+    unsigned int output_width,
+    int filter_offset
+)
+{
+    uint tid = get_global_id(0);
+    global unsigned char *src_ptr = &src_base[src_offset];
+    __constant int *vp8_filter = bilinear_filters[filter_offset];
+    unsigned int row, col;
+
+    if (tid >= output_width * output_height)
+        return;
+
+    row = tid / output_width;
+    col = tid % output_width;
+    /* Source rows are src_pixels_per_line apart. */
+    src_offset = row*src_pixels_per_line + col;
+
+    /* Apply bilinear filter; row*output_width + col is tid again. */
+    output_ptr[tid] = (((int)src_ptr[src_offset]   * vp8_filter[0]) +
+             ((int)src_ptr[src_offset+1] * vp8_filter[1]) +
+             (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;
+}
+
+/* Vertical bilinear pass, one work-item per output pixel.
+ * Work-item tid blends its int sample in src_ptr (packed, output_width per
+ * row) with the sample one row below using bilinear_filters[filter_offset],
+ * adds the rounding term, shifts by VP8_FILTER_SHIFT and stores the result
+ * in the output, whose rows are output_pitch apart from output_offset.
+ */
+kernel void vp8_filter_block2d_bil_second_pass_kernel
+(
+    __global int *src_ptr,
+    __global unsigned char *output_base,
+    int output_offset,
+    int output_pitch,
+    unsigned int output_height,
+    unsigned int output_width,
+    int filter_offset
+)
+{
+    uint tid = get_global_id(0);
+    global unsigned char *output_ptr = &output_base[output_offset];
+    __constant int *vp8_filter = bilinear_filters[filter_offset];
+    unsigned int row, col;
+    int out_offset;
+    int Temp;
+
+    if (tid >= output_width * output_height)
+        return;
+
+    row = tid / output_width;
+    col = tid % output_width;
+    out_offset = row*output_pitch + col;
+
+    /* Apply filter; row*output_width + col is tid again. */
+    Temp = ((int)src_ptr[tid]                * vp8_filter[0]) +
+           ((int)src_ptr[tid + output_width] * vp8_filter[1]) +
+           (VP8_FILTER_WEIGHT / 2);
+
+    output_ptr[out_offset] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
+}
+
+
+
+
+//Called from reconinter_cl.c
+//Strided 2-D byte copy: work-item (x, y) of the 2-D range copies the byte at
+//src_base[src_offset + y*src_stride + x] to dst_base[dst_offset + y*dst_stride + x].
+//num_bytes and num_iter are accepted but not read by the kernel body.
+kernel void vp8_memcpy_kernel(
+    global unsigned char *src_base,
+    int src_offset,
+    int src_stride,
+    global unsigned char *dst_base,
+    int dst_offset,
+    int dst_stride,
+    int num_bytes,
+    int num_iter
+){
+    global unsigned char *src = &src_base[src_offset];
+    global unsigned char *dst = &dst_base[dst_offset];
+    int column, row;
+
+    row = get_global_id(1);
+    if (!(row < get_global_size(1)))
+        return;
+    column = get_global_id(0);
+    if (!(column < get_global_size(0)))
+        return;
+
+    dst[row*dst_stride + column] = src[row*src_stride + column];
+}
+
+//Not used currently.
+//Sets consecutive shorts of mem, starting at mem[offset], to newval; work-item
+//tid writes element offset+tid. The tid < size/2 bound suggests size is a
+//byte count (two bytes per short) - TODO confirm before using this helper.
+void vp8_memset_short(
+    global short *mem,
+    int offset,
+    short newval,
+    unsigned int size
+){
+    int tid = get_global_id(0);
+    if (tid < (size/2))
+        mem[offset+tid] = newval; //was tid/2, which left half the range unset
+}
+
+
+
+/* Both bilinear passes for a 4x4 block in one kernel; int_mem holds the
+ * intermediate rows. The barrier only orders work-items of one work-group. */
+__kernel void vp8_bilinear_predict4x4_kernel
+(
+        __global unsigned char *src_base,
+        int src_offset,
+        int src_pixels_per_line,
+        int xoffset,
+        int yoffset,
+        __global unsigned char *dst_base,
+        int dst_offset,
+        int dst_pitch,
+        __global int *int_mem
+){
+    int Height = 4, Width = 4;
+    /* First filter 1-D horizontally... */
+    vp8_filter_block2d_bil_first_pass_kernel(src_base, src_offset, int_mem, src_pixels_per_line, Height + 1, Width, xoffset);
+    /* ...wait until the intermediate values are written, then 1-D vertically. */
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    vp8_filter_block2d_bil_second_pass_kernel(int_mem, dst_base, dst_offset, dst_pitch, Height, Width, yoffset);
+}
+
+/* Both bilinear passes for an 8x8 block in one kernel; int_mem holds the
+ * intermediate rows. The barrier only orders work-items of one work-group.
+ */
+__kernel void vp8_bilinear_predict8x8_kernel
+(
+    __global unsigned char *src_base,
+    int src_offset,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    __global unsigned char *dst_base,
+    int dst_offset,
+    int dst_pitch,
+    __global int *int_mem
+){
+    int Height = 8, Width = 8;
+    /* First filter 1-D horizontally... */
+    vp8_filter_block2d_bil_first_pass_kernel(src_base, src_offset, int_mem, src_pixels_per_line, Height + 1, Width, xoffset);
+    /* ...wait until the intermediate values are written, then 1-D vertically. */
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    vp8_filter_block2d_bil_second_pass_kernel(int_mem, dst_base, dst_offset, dst_pitch, Height, Width, yoffset);
+}
+
+/* Both bilinear passes for an 8-wide, 4-high block; int_mem holds the
+ * intermediate rows. The barrier only orders work-items of one work-group. */
+__kernel void vp8_bilinear_predict8x4_kernel
+(
+    __global unsigned char *src_base,
+    int src_offset,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    __global unsigned char *dst_base,
+    int dst_offset,
+    int dst_pitch,
+    __global int *int_mem
+){
+    int Height = 4, Width = 8;
+    /* First filter 1-D horizontally... */
+    vp8_filter_block2d_bil_first_pass_kernel(src_base, src_offset, int_mem, src_pixels_per_line, Height + 1, Width, xoffset);
+    /* ...wait until the intermediate values are written, then 1-D vertically. */
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    vp8_filter_block2d_bil_second_pass_kernel(int_mem, dst_base, dst_offset, dst_pitch, Height, Width, yoffset);
+}
+
+/* Both bilinear passes for a 16x16 block in one kernel; int_mem holds the
+ * intermediate rows. The barrier between the passes only orders work-items
+ * of one work-group.
+ */
+__kernel void vp8_bilinear_predict16x16_kernel
+(
+    __global unsigned char *src_base,
+    int src_offset,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    __global unsigned char *dst_base,
+    int dst_offset,
+    int dst_pitch,
+    __global int *int_mem
+){
+    int Height = 16, Width = 16;
+    /* First filter 1-D horizontally... */
+    vp8_filter_block2d_bil_first_pass_kernel(src_base, src_offset, int_mem, src_pixels_per_line, Height + 1, Width, xoffset);
+    /* ...wait until the intermediate values are written, then 1-D vertically. */
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    vp8_filter_block2d_bil_second_pass_kernel(int_mem, dst_base, dst_offset, dst_pitch, Height, Width, yoffset);
+}
+/* Horizontal six-tap pass into local memory; all work-items meet at the final barrier. */
+void vp8_filter_block2d_first_pass(
+    global unsigned char *src_base,
+    int src_offset,
+    local int *output_ptr, /* receives output_width*output_height ints */
+    unsigned int src_pixels_per_line,
+    unsigned int pixel_step, /* not referenced in this function */
+    unsigned int output_height,
+    unsigned int output_width,
+    int filter_offset /* row of sub_pel_filters to apply */
+){
+    uint tid = get_global_id(0);
+    uint i = tid;
+    /* Number of work-groups in dimension 0; selects the branch taken below. */
+    int nthreads = get_global_size(0);
+    int ngroups = nthreads / get_local_size(0);
+
+    global unsigned char *src_ptr = &src_base[src_offset];
+    //Note that src_offset will be reset later, which is why we capture it now
+
+    int Temp;
+
+    __constant short *vp8_filter = sub_pel_filters[filter_offset];
+
+    if (tid < (output_width*output_height)){
+        short filter0 = vp8_filter[0];
+        short filter1 = vp8_filter[1];
+        short filter2 = vp8_filter[2];
+        short filter3 = vp8_filter[3];
+        short filter4 = vp8_filter[4];
+        short filter5 = vp8_filter[5];
+        /* With several groups each work-item fills the whole buffer itself; otherwise only entry tid. */
+        if (ngroups > 1){
+            //This is generally only true on Apple CPU-CL, which gives a group
+            //size of 1, regardless of the CPU core count.
+            for (i=0; i < output_width*output_height; i++){
+                src_offset = i + (i/output_width * (src_pixels_per_line - output_width));
+
+                Temp = (int)(src_ptr[src_offset - 2] * filter0) +
+                       (int)(src_ptr[src_offset - 1] * filter1) +
+                       (int)(src_ptr[src_offset]     * filter2) +
+                       (int)(src_ptr[src_offset + 1] * filter3) +
+                       (int)(src_ptr[src_offset + 2] * filter4) +
+                       (int)(src_ptr[src_offset + 3] * filter5) +
+                       (VP8_FILTER_WEIGHT >> 1);      /* Rounding */
+
+                /* Normalize back to 0-255 */
+                Temp >>= VP8_FILTER_SHIFT;
+
+                if (Temp < 0)
+                    Temp = 0;
+                else if ( Temp > 255 )
+                    Temp = 255;
+
+                output_ptr[i] = Temp;
+            }
+        } else {
+            src_offset = i + (i/output_width * (src_pixels_per_line - output_width));
+
+            Temp = (int)(src_ptr[src_offset - 2] * filter0) +
+                   (int)(src_ptr[src_offset - 1] * filter1) +
+                   (int)(src_ptr[src_offset]     * filter2) +
+                   (int)(src_ptr[src_offset + 1] * filter3) +
+                   (int)(src_ptr[src_offset + 2] * filter4) +
+                   (int)(src_ptr[src_offset + 3] * filter5) +
+                   (VP8_FILTER_WEIGHT >> 1);      /* Rounding */
+
+            /* Normalize back to 0-255 */
+            Temp >>= VP8_FILTER_SHIFT;
+
+            if (Temp < 0)
+                Temp = 0;
+            else if ( Temp > 255 )
+                Temp = 255;
+
+            output_ptr[i] = Temp;
+        }
+    }
+
+    //Add a fence so that no 2nd pass stuff starts before 1st pass writes are done.
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+/* Vertical six-tap pass reading the local intermediate buffer, one
+ * work-item per output pixel.
+ *
+ * Work-item i reads six ints from src_ptr, pixel_step entries apart at rows
+ * -2..+3 around its position, weights them with
+ * sub_pel_filters[filter_offset], rounds, shifts by VP8_FILTER_SHIFT,
+ * clamps to 0..255 and writes one byte to the output, whose rows are
+ * output_pitch apart starting at output_base[output_offset].
+ */
+void vp8_filter_block2d_second_pass
+(
+    local int *src_ptr,
+    global unsigned char *output_base,
+    int output_offset,
+    int output_pitch,
+    unsigned int src_pixels_per_line,
+    unsigned int pixel_step,
+    unsigned int output_height,
+    unsigned int output_width,
+    int filter_offset
+) {
+    uint i = get_global_id(0);
+    global unsigned char *output_ptr = &output_base[output_offset];
+    __constant short *vp8_filter = sub_pel_filters[filter_offset];
+    unsigned int src_increment = src_pixels_per_line - output_width;
+    int step = (int)pixel_step;
+    int row;
+    int src_offset;
+    int out_offset; //Not same as output_offset...
+    int Temp;
+
+    if (i >= (output_width * output_height))
+        return;
+
+    row = i/output_width;
+    src_offset = i + (row * src_increment);
+    out_offset = i%output_width + (row * output_pitch);
+
+    /* Apply filter */
+    Temp = ((int)src_ptr[src_offset - 2*step]     * vp8_filter[0]) +
+           ((int)src_ptr[src_offset - step]       * vp8_filter[1]) +
+           ((int)src_ptr[src_offset]              * vp8_filter[2]) +
+           ((int)src_ptr[src_offset + pixel_step] * vp8_filter[3]) +
+           ((int)src_ptr[src_offset + 2*step]     * vp8_filter[4]) +
+           ((int)src_ptr[src_offset + 3*step]     * vp8_filter[5]) +
+           (VP8_FILTER_WEIGHT >> 1);   /* Rounding */
+
+    /* Normalize back to 0-255 */
+    Temp = Temp >> VP8_FILTER_SHIFT;
+    Temp = (Temp < 0) ? 0 : ((Temp > 255) ? 255 : Temp);
+
+    output_ptr[out_offset] = (unsigned char)Temp;
+}
+
+/* Six-tap prediction of a 4x4 block, both passes in one kernel.
+ * The horizontal pass fills a 9-row by 4-column local buffer; the vertical
+ * pass starts at FData[8] (the third row) and steps 4 ints per row.
+ */
+__kernel void vp8_sixtap_predict_kernel
+(
+    __global unsigned char  *src_ptr,
+    int src_offset,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    __global unsigned char *dst_ptr,
+    int dst_offset,
+    int  dst_pitch
+){
+    local int FData[9*4];   /* Intermediate data shared by the work-group */
+    /* Horizontal pass: source bytes -> FData. */
+    vp8_filter_block2d_first_pass(src_ptr, src_offset, FData, src_pixels_per_line, 1, 9, 4, xoffset);
+    /* Vertical pass: FData -> destination bytes. */
+    vp8_filter_block2d_second_pass(&FData[8], dst_ptr, dst_offset, dst_pitch, 4, 4, 4, 4, yoffset);
+}
+
+/* Six-tap prediction of an 8x8 block, both passes in one kernel.
+ * The horizontal pass fills 13 rows of 8 ints in the local buffer; the
+ * vertical pass starts at FData[16] (the third row) and steps 8 ints per row.
+ */
+__kernel void vp8_sixtap_predict8x8_kernel
+(
+    __global unsigned char  *src_ptr,
+    int src_offset,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    __global unsigned char *dst_ptr,
+    int dst_offset,
+    int  dst_pitch
+){
+    local int FData[13*16];   /* Intermediate data shared by the work-group */
+    /* Horizontal pass: source bytes -> FData. */
+    vp8_filter_block2d_first_pass(src_ptr, src_offset, FData, src_pixels_per_line, 1, 13, 8, xoffset);
+    /* Vertical pass: FData -> destination bytes. */
+    vp8_filter_block2d_second_pass(&FData[16], dst_ptr, dst_offset, dst_pitch, 8, 8, 8, 8, yoffset);
+}
+
+/* Six-tap prediction of an 8-wide, 4-high block, both passes in one kernel.
+ * The horizontal pass fills 9 rows of 8 ints in the local buffer; the
+ * vertical pass starts at FData[16] (the third row), 8 ints per row. */
+__kernel void vp8_sixtap_predict8x4_kernel
+(
+    __global unsigned char  *src_ptr,
+    int src_offset,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    __global unsigned char *dst_ptr,
+    int dst_offset,
+    int  dst_pitch
+){
+    local int FData[13*16];   /* Intermediate data shared by the work-group */
+    /* Horizontal pass: source bytes -> FData. */
+    vp8_filter_block2d_first_pass(src_ptr, src_offset, FData, src_pixels_per_line, 1, 9, 8, xoffset);
+    /* Vertical pass: FData -> destination bytes. */
+    vp8_filter_block2d_second_pass(&FData[16], dst_ptr, dst_offset, dst_pitch, 8, 8, 4, 8, yoffset);
+}
+
+/* Six-tap prediction of a 16x16 block, both passes in one kernel.
+ * The horizontal pass fills 21 rows of 16 ints in the local buffer; the
+ * vertical pass starts at FData[32] (the third row), 16 ints per row.
+ */
+__kernel void vp8_sixtap_predict16x16_kernel
+(
+    __global unsigned char  *src_ptr,
+    int src_offset,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    __global unsigned char *dst_ptr,
+    int dst_offset,
+    int  dst_pitch
+){
+    local int FData[21*24];   /* Intermediate data shared by the work-group */
+
+    /* Horizontal pass: source bytes -> FData. */
+    vp8_filter_block2d_first_pass(src_ptr, src_offset, FData, src_pixels_per_line, 1, 21, 16, xoffset);
+    /* Vertical pass: FData -> destination bytes. */
+    vp8_filter_block2d_second_pass(&FData[32], dst_ptr, dst_offset, dst_pitch, 16, 16, 16, 16, yoffset);
+}
--- a/vp8/common/opencl/filter_cl.h
+++ b/vp8/common/opencl/filter_cl.h
@@ -0,0 +1,74 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef FILTER_CL_H_
+#define FILTER_CL_H_
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#include "vp8_opencl.h"
+
+#define VP8_FILTER_WEIGHT 128 /* equals 1 << VP8_FILTER_SHIFT */
+#define VP8_FILTER_SHIFT  7
+#define REGISTER_FILTER 1
+#define PRE_CALC_PIXEL_STEPS 1
+#define PRE_CALC_SRC_INCREMENT 1
+/* Clamp lvalue x into [min, max]. Expands to a complete if/else statement. */
+#define CLAMP(x,min,max) if ((x) < (min)) (x) = (min); else if ((x) > (max)) (x) = (max);
+/* PS2/PS3: two and three pixel steps, either precomputed variables or inline products. */
+#if PRE_CALC_PIXEL_STEPS
+#define PS2 two_pixel_steps
+#define PS3 three_pixel_steps
+#else
+#define PS2 2*(int)pixel_step
+#define PS3 3*(int)pixel_step
+#endif
+/* FILTER0..FILTER5: the six filter taps, either filterN variables or vp8_filter[] entries. */
+#if REGISTER_FILTER
+#define FILTER0 filter0
+#define FILTER1 filter1
+#define FILTER2 filter2
+#define FILTER3 filter3
+#define FILTER4 filter4
+#define FILTER5 filter5
+#else
+#define FILTER0 vp8_filter[0]
+#define FILTER1 vp8_filter[1]
+#define FILTER2 vp8_filter[2]
+#define FILTER3 vp8_filter[3]
+#define FILTER4 vp8_filter[4]
+#define FILTER5 vp8_filter[5]
+#endif
+/* SRC_INCREMENT: source line length minus output width, precomputed or inline. */
+#if PRE_CALC_SRC_INCREMENT
+#define SRC_INCREMENT src_increment
+#else
+#define SRC_INCREMENT (src_pixels_per_line - output_width)
+#endif
+
+#define FILTER_OFFSET //Filter data stored as CL constant memory
+#define FILTER_REF sub_pel_filters[filter_offset]
+/* Defined elsewhere; presumably the build options and source file name of the filter program - TODO confirm. */
+extern const char *filterCompileOptions;
+extern const char *filter_cl_file_name;
+
+//Buffer lengths in elements; every argument is parenthesized so expressions work.
+//SIXTAP adds 5 because the filter also reads around each sample (negative indexes).
+#define SIXTAP_SRC_LEN(out_width,out_height,src_px) ((out_width)*(out_height) + (((out_width)*(out_height)-1)/(out_width))*((src_px) - (out_width)) + 5)
+#define BIL_SRC_LEN(out_width,out_height,src_px) ((out_height) * (src_px) + (out_width))
+#define DST_LEN(dst_pitch,dst_height,dst_width) ((dst_pitch) * (dst_height) + (dst_width))
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* FILTER_CL_H_ */
--- a/vp8/common/opencl/idct_cl.h
+++ b/vp8/common/opencl/idct_cl.h
@@ -0,0 +1,45 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef IDCT_OPENCL_H
+#define IDCT_OPENCL_H
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#include "vp8_opencl.h"
+#include "vp8/common/blockd.h"
+
+/* Prototype shape used by the inverse Walsh entry points declared below. */
+#define prototype_second_order_cl(sym) \
+    void sym(BLOCKD *b)
+/* Prototype shape used by the 4x4 IDCT entry points declared below. */
+#define prototype_idct_cl(sym) \
+    void sym(BLOCKD *b, int pitch)
+/* Prototype shape used by vp8_dc_only_idct_add_cl; no trailing continuation. */
+#define prototype_idct_scalar_add_cl(sym) \
+    void sym(BLOCKD *b, cl_int use_diff, int diff_offset, int qcoeff_offset, \
+             int pred_offset, unsigned char *output, cl_mem out_mem, int out_offset, size_t out_size, \
+             int pitch, int stride)
+
+extern prototype_idct_cl(vp8_short_idct4x4llm_1_cl);
+extern prototype_idct_cl(vp8_short_idct4x4llm_cl);
+extern prototype_idct_scalar_add_cl(vp8_dc_only_idct_add_cl);
+
+extern prototype_second_order_cl(vp8_short_inv_walsh4x4_1_cl);
+extern prototype_second_order_cl(vp8_short_inv_walsh4x4_cl);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif
--- a/vp8/common/opencl/idctllm_cl.c
+++ b/vp8/common/opencl/idctllm_cl.c
@@ -0,0 +1,325 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+
+//ACW: Remove me after debugging.
+#include <stdio.h>
+#include <string.h>
+
+#include "idct_cl.h"
+#include "idctllm_cl.h"
+#include "blockd_cl.h"
+
+/* Release the IDCT program and the kernels that cl_init_idct() creates. */
+void cl_destroy_idct(){
+
+    if (cl_data.idct_program != NULL) {
+        clReleaseProgram(cl_data.idct_program);
+        cl_data.idct_program = NULL;
+    }
+
+    VP8_CL_RELEASE_KERNEL(cl_data.vp8_short_inv_walsh4x4_1_kernel);
+    VP8_CL_RELEASE_KERNEL(cl_data.vp8_short_inv_walsh4x4_1st_pass_kernel);
+    VP8_CL_RELEASE_KERNEL(cl_data.vp8_short_inv_walsh4x4_2nd_pass_kernel);
+    VP8_CL_RELEASE_KERNEL(cl_data.vp8_dc_only_idct_add_kernel);
+
+    /* The vp8_short_idct4x4llm(_1) kernels are not created by
+     * cl_init_idct(), so they are not released here either. */
+}
+
+/*
+ * Load the IDCT/Walsh OpenCL program and create the inverse Walsh and
+ * DC-only IDCT kernels from it.
+ *
+ * Returns CL_SUCCESS, or VP8_CL_TRIED_BUT_FAILED when the program cannot
+ * be loaded.
+ */
+int cl_init_idct() {
+    /* Not referenced directly below; presumably used inside
+     * VP8_CL_CREATE_KERNEL -- confirm in vp8_opencl.h. */
+    int err;
+    const int load_status = cl_load_program(&cl_data.idct_program,
+            idctllm_cl_file_name, idctCompileOptions);
+
+    /* The program is built from the file-defined source code */
+    if (load_status != CL_SUCCESS) {
+        return VP8_CL_TRIED_BUT_FAILED;
+    }
+
+    /* One kernel object per entry point we wish to run */
+    VP8_CL_CREATE_KERNEL(cl_data,idct_program,vp8_short_inv_walsh4x4_1_kernel,"vp8_short_inv_walsh4x4_1_kernel");
+    VP8_CL_CREATE_KERNEL(cl_data,idct_program,vp8_short_inv_walsh4x4_1st_pass_kernel,"vp8_short_inv_walsh4x4_1st_pass_kernel");
+    VP8_CL_CREATE_KERNEL(cl_data,idct_program,vp8_short_inv_walsh4x4_2nd_pass_kernel,"vp8_short_inv_walsh4x4_2nd_pass_kernel");
+    VP8_CL_CREATE_KERNEL(cl_data,idct_program,vp8_dc_only_idct_add_kernel,"vp8_dc_only_idct_add_kernel");
+
+    ////idct4x4llm kernels are only useful for the encoder
+    //VP8_CL_CREATE_KERNEL(cl_data,idct_program,vp8_short_idct4x4llm_1_kernel,"vp8_short_idct4x4llm_1_kernel");
+    //VP8_CL_CREATE_KERNEL(cl_data,idct_program,vp8_short_idct4x4llm_kernel,"vp8_short_idct4x4llm_kernel");
+
+    return CL_SUCCESS;
+}
+
+/* Larger of x and y. Arguments are parenthesized so operator expressions
+ * expand correctly; note that the selected argument is evaluated twice. */
+#define max(x,y) ((x) > (y) ? (x) : (y))
+//#define NO_CL
+
+/*
+ * OpenCL 4x4 inverse DCT for block b. Only useful for encoder... Untested...
+ *
+ * Input:  16 shorts at b->dqcoeff_base + b->dqcoeff_offset.
+ * Output: b->diff_base[b->diff_offset]; (4 + (pitch/2)*3) shorts are copied
+ *         to the device and read back.
+ *
+ * Falls back to vp8_short_idct4x4llm_c() when OpenCL is not initialized.
+ * NOTE(review): the fallback call handed to the VP8_CL_ macros is presumably
+ * executed by them on failure before they return -- confirm in vp8_opencl.h.
+ * NOTE(review): cl_init_idct() currently leaves the creation of
+ * vp8_short_idct4x4llm_kernel commented out.
+ */
+void vp8_short_idct4x4llm_cl(BLOCKD *b, int pitch)
+{
+    int err;
+
+    short *input = b->dqcoeff_base + b->dqcoeff_offset;
+    short *output = &b->diff_base[b->diff_offset];
+
+    cl_mem src_mem, dst_mem;
+
+    //1 instance for now. This should be split into 2-pass * 4 thread.
+    size_t global = 1;
+
+    if (cl_initialized != CL_SUCCESS){
+        vp8_short_idct4x4llm_c(input,output,pitch);
+        return;
+    }
+
+    VP8_CL_CREATE_BUF(b->cl_commands, src_mem,,
+            sizeof(short)*16, input,
+            vp8_short_idct4x4llm_c(input,output,pitch),
+    );
+
+    VP8_CL_CREATE_BUF(b->cl_commands, dst_mem,,
+            sizeof(short)*(4+(pitch/2)*3), output,
+            vp8_short_idct4x4llm_c(input,output,pitch),
+    );
+
+    //Set arguments and run kernel
+    err = clSetKernelArg(cl_data.vp8_short_idct4x4llm_kernel, 0, sizeof (cl_mem), &src_mem);
+    err |= clSetKernelArg(cl_data.vp8_short_idct4x4llm_kernel, 1, sizeof (cl_mem), &dst_mem);
+    err |= clSetKernelArg(cl_data.vp8_short_idct4x4llm_kernel, 2, sizeof (int), &pitch);
+    VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to set kernel arguments!\n",
+        vp8_short_idct4x4llm_c(input,output,pitch),
+    );
+    
+    /* Execute the kernel */
+    err = clEnqueueNDRangeKernel(b->cl_commands, cl_data.vp8_short_idct4x4llm_kernel, 1, NULL, &global, NULL , 0, NULL, NULL);
+    VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to execute kernel!\n",
+        printf("err = %d\n",err);
+        vp8_short_idct4x4llm_c(input,output,pitch),
+    );
+
+    /* Read back the result data from the device. The read is blocking:
+     * nothing else in this function waits for the queue, so a non-blocking
+     * read could leave output unwritten when the function returns. */
+    err = clEnqueueReadBuffer(b->cl_commands, dst_mem, CL_TRUE, 0, sizeof(short)*(4+pitch/2*3), output, 0, NULL, NULL);
+    VP8_CL_CHECK_SUCCESS(b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to read output array!\n",
+        vp8_short_idct4x4llm_c(input,output,pitch),
+    );
+
+    clReleaseMemObject(src_mem);
+    clReleaseMemObject(dst_mem);
+
+    return;
+}
+
+/*
+ * OpenCL DC-only 4x4 inverse DCT for block b. Only useful for encoder...
+ * Untested...
+ *
+ * Input:  the single short at b->dqcoeff_base + b->dqcoeff_offset.
+ * Output: b->diff_base[b->diff_offset]; (4 + (pitch/2)*3) shorts are copied
+ *         to the device and read back.
+ *
+ * Falls back to vp8_short_idct4x4llm_1_c() when OpenCL is not initialized.
+ * NOTE(review): the fallback call handed to the VP8_CL_ macros is presumably
+ * executed by them on failure before they return -- confirm in vp8_opencl.h.
+ * NOTE(review): cl_init_idct() currently leaves the creation of
+ * vp8_short_idct4x4llm_1_kernel commented out.
+ */
+void vp8_short_idct4x4llm_1_cl(BLOCKD *b, int pitch)
+{
+    int err;
+    size_t global = 4; /* one work-item per output row */
+
+    short *input = b->dqcoeff_base + b->dqcoeff_offset;
+    short *output = &b->diff_base[b->diff_offset];
+
+    cl_mem src_mem, dst_mem;
+
+    if (cl_initialized != CL_SUCCESS){
+        vp8_short_idct4x4llm_1_c(input,output,pitch);
+        return;
+    }
+
+    /* NOTE(review): debugging trace on stdout; consider removing. */
+    printf("vp8_short_idct4x4llm_1_cl\n");
+
+    VP8_CL_CREATE_BUF(b->cl_commands, src_mem,,
+            sizeof(short), input,
+            vp8_short_idct4x4llm_1_c(input,output,pitch),
+    );
+
+    VP8_CL_CREATE_BUF(b->cl_commands, dst_mem,,
+            sizeof(short)*(4+(pitch/2)*3), output,
+            vp8_short_idct4x4llm_1_c(input,output,pitch),
+    );
+
+    //Set arguments and run kernel
+    err = clSetKernelArg(cl_data.vp8_short_idct4x4llm_1_kernel, 0, sizeof (cl_mem), &src_mem);
+    err |= clSetKernelArg(cl_data.vp8_short_idct4x4llm_1_kernel, 1, sizeof (cl_mem), &dst_mem);
+    err |= clSetKernelArg(cl_data.vp8_short_idct4x4llm_1_kernel, 2, sizeof (int), &pitch);
+    VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to set kernel arguments!\n",
+        vp8_short_idct4x4llm_1_c(input,output,pitch),
+    );
+
+    /* Execute the kernel */
+    err = clEnqueueNDRangeKernel(b->cl_commands, cl_data.vp8_short_idct4x4llm_1_kernel, 1, NULL, &global, NULL , 0, NULL, NULL);
+    VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to execute kernel!\n",
+        printf("err = %d\n",err);
+        vp8_short_idct4x4llm_1_c(input,output,pitch),
+    );
+
+    /* Read back the result data from the device. The read is blocking:
+     * nothing else in this function waits for the queue, so a non-blocking
+     * read could leave output unwritten when the function returns. */
+    err = clEnqueueReadBuffer(b->cl_commands, dst_mem, CL_TRUE, 0, sizeof(short)*(4+pitch/2*3), output, 0, NULL, NULL);
+    VP8_CL_CHECK_SUCCESS(b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to read output array!\n",
+        vp8_short_idct4x4llm_1_c(input,output,pitch),
+    );
+
+    clReleaseMemObject(src_mem);
+    clReleaseMemObject(dst_mem);
+
+    return;
+
+}
+
+/*
+ * Enqueue the DC-only IDCT-and-add kernel (16 work-items, one per pixel of
+ * the 4x4 block) on b->cl_commands.
+ *
+ * The kernel reads b->cl_predictor_mem, b->cl_diff_mem, b->cl_qcoeff_mem and
+ * b->cl_dequant_mem at the given offsets and writes into dst_mem.
+ *
+ * dst_mem == NULL: a temporary buffer of dest_size bytes is created for
+ *   dst_base, the result is read back into dst_base and the buffer is
+ *   released before returning.
+ * dst_mem != NULL: the result stays in dst_mem; dst_base is not touched.
+ *
+ * NOTE(review): unlike the sibling entry points there is no cl_initialized
+ * check and no C fallback is passed to the VP8_CL_ macros.
+ */
+void vp8_dc_only_idct_add_cl(BLOCKD *b, cl_int use_diff, int diff_offset, 
+        int qcoeff_offset, int pred_offset,
+        unsigned char *dst_base, cl_mem dst_mem, int dst_offset, size_t dest_size,
+        int pitch, int stride
+)
+{
+    
+    int err;
+    size_t global = 16;
+
+    /* Set when this call owns (and must release) the destination buffer. */
+    int free_mem = 0;
+
+    if (dst_mem == NULL){
+        VP8_CL_CREATE_BUF(b->cl_commands, dst_mem,,
+                dest_size, dst_base,,
+        );
+        free_mem = 1;
+    }
+
+    //Set arguments and run kernel
+    err =  clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 0, sizeof (cl_mem), &b->cl_predictor_mem);
+    err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 1, sizeof (int), &pred_offset);
+    err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 2, sizeof (cl_mem), &dst_mem);
+    err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 3, sizeof (int), &dst_offset);
+    err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 4, sizeof (int), &pitch);
+    err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 5, sizeof (int), &stride);
+    err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 6, sizeof (cl_int), &use_diff);
+    err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 7, sizeof (cl_mem), &b->cl_diff_mem);
+    err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 8, sizeof (int), &diff_offset);
+    err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 9, sizeof (cl_mem), &b->cl_qcoeff_mem);
+    err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 10, sizeof (int), &qcoeff_offset);
+    err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 11, sizeof (cl_mem), &b->cl_dequant_mem);
+    VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to set kernel arguments!\n",,
+    );
+
+    /* Execute the kernel */
+    err = clEnqueueNDRangeKernel(b->cl_commands, cl_data.vp8_dc_only_idct_add_kernel, 1, NULL, &global, NULL , 0, NULL, NULL);
+    VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to execute kernel!\n",
+        printf("err = %d\n",err);,
+    );
+
+
+    if (free_mem == 1){
+        /* Read back the result data from the device. The read is blocking:
+         * the temporary buffer is released right afterwards and nothing
+         * else here waits for the queue, so a non-blocking read could leave
+         * dst_base unwritten when the function returns. */
+        err = clEnqueueReadBuffer(b->cl_commands, dst_mem, CL_TRUE, 0,
+                dest_size, dst_base, 0, NULL, NULL);
+
+        VP8_CL_CHECK_SUCCESS(b->cl_commands, err != CL_SUCCESS,
+            "Error: Failed to read output array!\n",,
+        );
+
+        clReleaseMemObject(dst_mem);
+    }
+
+    return;
+}
+
+/* Host (C) fallback: full inverse Walsh transform of block b. */
+static void inv_walsh4x4_host_fallback(BLOCKD *b)
+{
+    vp8_short_inv_walsh4x4_c(b->dqcoeff_base+b->dqcoeff_offset,
+            &b->diff_base[b->diff_offset]);
+}
+
+/*
+ * Inverse 4x4 Walsh transform of block b, enqueued on b->cl_commands as a
+ * first-pass kernel followed by a second-pass kernel (4 work-items each).
+ *
+ * The kernels work on b->cl_dqcoeff_mem / b->cl_diff_mem; nothing is read
+ * back to host memory here. When OpenCL is not initialized the C
+ * implementation runs on the host buffers instead.
+ */
+void vp8_short_inv_walsh4x4_cl(BLOCKD *b)
+{
+    int err;
+    size_t global = 4;
+    cl_kernel first_pass = cl_data.vp8_short_inv_walsh4x4_1st_pass_kernel;
+    cl_kernel second_pass = cl_data.vp8_short_inv_walsh4x4_2nd_pass_kernel;
+
+    if (cl_initialized != CL_SUCCESS){
+        inv_walsh4x4_host_fallback(b);
+        return;
+    }
+
+    /* First pass: dqcoeff -> diff */
+    err = clSetKernelArg(first_pass, 0, sizeof (cl_mem), &b->cl_dqcoeff_mem);
+    err |= clSetKernelArg(first_pass, 1, sizeof(int), &b->dqcoeff_offset);
+    err |= clSetKernelArg(first_pass, 2, sizeof (cl_mem), &b->cl_diff_mem);
+    err |= clSetKernelArg(first_pass, 3, sizeof(int), &b->diff_offset);
+    VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to set kernel arguments!\n",
+        inv_walsh4x4_host_fallback(b),
+    );
+
+    err = clEnqueueNDRangeKernel(b->cl_commands, first_pass, 1, NULL, &global, NULL , 0, NULL, NULL);
+    VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to execute kernel!\n",
+        printf("err = %d\n",err);
+        inv_walsh4x4_host_fallback(b),
+    );
+
+    /* Second pass: in place on diff */
+    err = clSetKernelArg(second_pass, 0, sizeof (cl_mem), &b->cl_diff_mem);
+    err |= clSetKernelArg(second_pass, 1, sizeof(int), &b->diff_offset);
+    VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to set kernel arguments!\n",
+        inv_walsh4x4_host_fallback(b),
+    );
+
+    err = clEnqueueNDRangeKernel(b->cl_commands, second_pass, 1, NULL, &global, NULL , 0, NULL, NULL);
+    VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to execute kernel!\n",
+        printf("err = %d\n",err);
+        inv_walsh4x4_host_fallback(b),
+    );
+}
+
+/* Host (C) fallback: DC-only inverse Walsh transform of block b. */
+static void inv_walsh4x4_1_host_fallback(BLOCKD *b)
+{
+    vp8_short_inv_walsh4x4_1_c(b->dqcoeff_base + b->dqcoeff_offset,
+            &b->diff_base[b->diff_offset]);
+}
+
+/*
+ * DC-only inverse 4x4 Walsh transform of block b, enqueued on
+ * b->cl_commands with 4 work-items.
+ *
+ * The kernel reads b->cl_dqcoeff_mem and writes b->cl_diff_mem; nothing is
+ * read back to host memory here. When OpenCL is not initialized the C
+ * implementation runs on the host buffers instead.
+ */
+void vp8_short_inv_walsh4x4_1_cl(BLOCKD *b)
+{
+    int err;
+    size_t global = 4;
+    cl_kernel walsh_dc = cl_data.vp8_short_inv_walsh4x4_1_kernel;
+
+    if (cl_initialized != CL_SUCCESS){
+        inv_walsh4x4_1_host_fallback(b);
+        return;
+    }
+
+    /* Kernel arguments: (src buffer, src offset, dst buffer, dst offset) */
+    err = clSetKernelArg(walsh_dc, 0, sizeof (cl_mem), &b->cl_dqcoeff_mem);
+    err |= clSetKernelArg(walsh_dc, 1, sizeof (int), &b->dqcoeff_offset);
+    err |= clSetKernelArg(walsh_dc, 2, sizeof (cl_mem), &b->cl_diff_mem);
+    err |= clSetKernelArg(walsh_dc, 3, sizeof (int), &b->diff_offset);
+    VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to set kernel arguments!\n",
+        inv_walsh4x4_1_host_fallback(b),
+    );
+
+    /* Execute the kernel */
+    err = clEnqueueNDRangeKernel(b->cl_commands, walsh_dc, 1, NULL, &global, NULL , 0, NULL, NULL);
+    VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to execute kernel!\n",
+        printf("err = %d\n",err);
+        inv_walsh4x4_1_host_fallback(b),
+    );
+}
--- a/vp8/common/opencl/idctllm_cl.cl
+++ b/vp8/common/opencl/idctllm_cl.cl
@@ -0,0 +1,309 @@
+#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+
+/* Fixed-point IDCT constants; every product using them is shifted right by
+ * 16 bits:
+ *   cospi8sqrt2minus1 = (sqrt(2)*cos(pi/8) - 1) * 65536
+ *   sinpi8sqrt2       =  sqrt(2)*sin(pi/8)      * 65536
+ * rounding is the bias added before each >> 16 (currently none). */
+__constant int cospi8sqrt2minus1 = 20091;
+__constant int sinpi8sqrt2      = 35468;
+__constant int rounding = 0;
+
+
+/* Forward declarations: both passes are called as plain functions by
+ * vp8_short_idct4x4llm_kernel before they are defined. */
+kernel void vp8_short_idct4x4llm_1st_pass_kernel(global short*,global short *,int);
+kernel void vp8_short_idct4x4llm_2nd_pass_kernel(global short*,int);
+
+
+/*
+ * Full 4x4 inverse DCT: runs the first pass from input into output, then
+ * the second pass in place on output. pitch is in bytes (each pass converts
+ * it to a stride in shorts with pitch >> 1).
+ * Every work-item that runs this kernel performs the whole transform; the
+ * host launches it with a global size of 1.
+ */
+__kernel void vp8_short_idct4x4llm_kernel(
+    __global short *input,
+    __global short *output,
+    int pitch
+){
+    vp8_short_idct4x4llm_1st_pass_kernel(input,output,pitch);
+    vp8_short_idct4x4llm_2nd_pass_kernel(output,pitch);
+}
+
+/*
+ * First IDCT pass: for each of the 4 input columns, combine the entries at
+ * column, column+4, column+8 and column+12 and store the four results down
+ * the same column of op, with rows (pitch >> 1) shorts apart.
+ */
+__kernel void vp8_short_idct4x4llm_1st_pass_kernel(
+    __global short *ip,
+    __global short *op,
+    int pitch
+)
+{
+    const int shortpitch = pitch >> 1;
+    int col;
+
+    for (col = 0; col < 4; col++)
+    {
+        /* All four inputs of the column are read before anything is stored. */
+        const int in0 = ip[col];
+        const int in1 = ip[col + 4];
+        const int in2 = ip[col + 8];
+        const int in3 = ip[col + 12];
+
+        const int a1 = in0 + in2;
+        const int b1 = in0 - in2;
+
+        const int c1 = ((in1 * sinpi8sqrt2 + rounding) >> 16)
+                     - (in3 + ((in3 * cospi8sqrt2minus1 + rounding) >> 16));
+
+        const int d1 = (in1 + ((in1 * cospi8sqrt2minus1 + rounding) >> 16))
+                     + ((in3 * sinpi8sqrt2 + rounding) >> 16);
+
+        op[col + shortpitch*0] = a1 + d1;
+        op[col + shortpitch*3] = a1 - d1;
+
+        op[col + shortpitch*1] = b1 + c1;
+        op[col + shortpitch*2] = b1 - c1;
+    }
+}
+
+/*
+ * Second IDCT pass, in place: transforms each of the 4 rows of output
+ * (rows are pitch >> 1 shorts apart) and scales results by (x + 4) >> 3.
+ */
+__kernel void vp8_short_idct4x4llm_2nd_pass_kernel(
+    __global short *output,
+    int pitch
+)
+{
+    const int shortpitch = pitch >> 1;
+    int row;
+
+    for (row = 0; row < 4; row++)
+    {
+        __global short *line = output + row * shortpitch;
+
+        /* The whole row is read before any element is overwritten. */
+        const int in0 = line[0];
+        const int in1 = line[1];
+        const int in2 = line[2];
+        const int in3 = line[3];
+
+        const int a1 = in0 + in2;
+        const int b1 = in0 - in2;
+
+        const int c1 = ((in1 * sinpi8sqrt2 + rounding) >> 16)
+                     - (in3 + ((in3 * cospi8sqrt2minus1 + rounding) >> 16));
+
+        const int d1 = (in1 + ((in1 * cospi8sqrt2minus1 + rounding) >> 16))
+                     + ((in3 * sinpi8sqrt2 + rounding) >> 16);
+
+        line[0] = (a1 + d1 + 4) >> 3;
+        line[3] = (a1 - d1 + 4) >> 3;
+
+        line[1] = (b1 + c1 + 4) >> 3;
+        line[2] = (b1 - c1 + 4) >> 3;
+    }
+}
+
+/*
+ * DC-only inverse DCT: work-items 0..3 each fill one 4-element output row
+ * (rows are pitch >> 1 shorts apart) with (input[0] + 4) >> 3.
+ */
+__kernel void vp8_short_idct4x4llm_1_kernel(
+    __global short *input,
+    __global short *output,
+    int pitch
+)
+{
+    const int dc = (input[0] + 4) >> 3;
+    const int tid = get_global_id(0);
+
+    if (tid < 4){
+        __global short *row = output + (pitch >> 1) * tid;
+        int k;
+
+        for (k = 0; k < 4; k++)
+            row[k] = dc;
+    }
+}
+
+/*
+ * DC-only IDCT plus prediction add. Work-item tid (0..15) handles pixel
+ * (row tid / 4, column tid % 4) of the 4x4 block:
+ *   dc  = use_diff == 1 ? diff_base[diff_offset]
+ *                       : qcoeff_base[qcoeff_offset] * dequant[0]
+ *   dst = clamp(((dc + 4) >> 3) + pred, 0, 255)
+ * pitch is the predictor row stride, stride the destination row stride.
+ */
+__kernel void vp8_dc_only_idct_add_kernel(
+    __global unsigned char *pred_base,
+    int pred_offset,
+    __global unsigned char *dst_base,
+    int dst_offset,
+    int pitch,
+    int stride,
+    int use_diff,
+    global short *diff_base,
+    int diff_offset,
+    global short *qcoeff_base,
+    int qcoeff_offset,
+    global short *dequant
+)
+{
+    const int tid = get_global_id(0);
+
+    if (tid >= 16)
+        return;
+
+    int dc;
+    if (use_diff == 1)
+        dc = diff_base[diff_offset];
+    else
+        dc = qcoeff_base[qcoeff_offset] * dequant[0];
+    dc = (dc + 4) >> 3;
+
+    const int row = tid / 4;
+    const int col = tid % 4;
+
+    int pixel = dc + pred_base[pred_offset + row * pitch + col];
+    pixel = (pixel < 0) ? 0 : ((pixel > 255) ? 255 : pixel);
+
+    dst_base[dst_offset + row * stride + col] = (unsigned char) pixel;
+}
+
+
+/*
+ * First (vertical) pass of the inverse 4x4 Walsh transform.
+ *
+ * Reads 16 shorts at src_base + src_offset and writes 16 shorts at
+ * output_base + out_offset; work-item tid (0..3) processes column tid.
+ *
+ * The output pointer is derived from out_offset. It was previously derived
+ * from src_offset, which only produced the right location when both offsets
+ * happened to be equal, while the second pass always reads at out_offset.
+ */
+__kernel void vp8_short_inv_walsh4x4_1st_pass_kernel(
+    __global short *src_base,
+    int src_offset,
+    __global short *output_base,
+    int out_offset
+)
+{
+
+    __global short *input = src_base + src_offset;
+    __global short *output = output_base + out_offset;
+    int tid = get_global_id(0);
+
+#define VEC_WALSH 0
+#if VEC_WALSH
+    //4-short vectors to calculate things in
+    short4 a,b,c,d;
+
+    if (tid == 0){
+        //first pass loop in vector form
+        a = vload4(0,input) + vload4(3,input);
+        b = vload4(1,input) + vload4(2,input);
+        c = vload4(1,input) - vload4(2,input);
+        d = vload4(0,input) - vload4(3,input);
+        vstore4(a + b, 0, output);
+        vstore4(c + d, 1, output);
+        vstore4(a - b, 2, output);
+        vstore4(d - c, 3, output);
+    }
+#else
+
+    int a1, b1, c1, d1;
+    global short *ip = input;
+    global short *op = output;
+
+    int offset;
+
+    if (tid < 4){
+        offset = tid;
+        a1 = ip[offset] + ip[offset + 12];
+        b1 = ip[offset + 4] + ip[offset + 8];
+        c1 = ip[offset + 4] - ip[offset + 8];
+        d1 = ip[offset] - ip[offset + 12];
+
+        op[offset] = a1 + b1;
+        op[offset + 4] = c1 + d1;
+        op[offset + 8] = a1 - b1;
+        op[offset + 12] = d1 - c1;
+    }
+#endif
+}
+
+/*
+ * Second (horizontal) pass of the inverse 4x4 Walsh transform, in place on
+ * the 16 shorts at output_base + out_offset. Work-item tid (0..3) processes
+ * row tid and scales each result by (x + 3) >> 3.
+ */
+__kernel void vp8_short_inv_walsh4x4_2nd_pass_kernel(
+    __global short *output_base,
+    int out_offset
+)
+{
+    const int tid = get_global_id(0);
+
+    if (tid < 4){
+        __global short *row = output_base + out_offset + 4 * tid;
+
+        /* Butterflies are computed from the row before it is overwritten. */
+        const int a1 = row[0] + row[3];
+        const int b1 = row[1] + row[2];
+        const int c1 = row[1] - row[2];
+        const int d1 = row[0] - row[3];
+
+        row[0] = ((a1 + b1) + 3) >> 3;
+        row[1] = ((c1 + d1) + 3) >> 3;
+        row[2] = ((a1 - b1) + 3) >> 3;
+        row[3] = ((d1 - c1) + 3) >> 3;
+    }
+}
+
+/*
+ * DC-only inverse Walsh transform: work-items 0..3 each store four copies
+ * of (src[0] + 3) >> 3 into shorts [4*tid, 4*tid + 3] of the destination,
+ * where src = src_data + src_offset and the destination starts at
+ * dst_data + dst_offset.
+ */
+__kernel void vp8_short_inv_walsh4x4_1_kernel(
+    __global short *src_data,
+    int src_offset,
+    __global short *dst_data,
+    int dst_offset
+){
+    const int tid = get_global_id(0);
+
+    if (tid < 4)
+    {
+        const int dc = (src_data[src_offset] + 3) >> 3;
+
+        /* A scalar inside a vector literal is replicated to every lane. */
+        vstore4((short4)((short)dc), tid, dst_data + dst_offset);
+    }
+}
--- a/vp8/common/opencl/idctllm_cl.h
+++ b/vp8/common/opencl/idctllm_cl.h
@@ -0,0 +1,26 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8_opencl.h"
+#include "vp8/common/blockd.h"
+
+/*
+ * Clamp the lvalue x into [min, max] in place.
+ * Arguments are parenthesized so operator expressions expand correctly.
+ * The expansion is still an if/else statement that supplies its own
+ * trailing ';', so existing uses with or without a semicolon keep working;
+ * do not use it as the unbraced body of another if/else.
+ */
+#define CLAMP(x,min,max) if ((x) < (min)) (x) = (min); else if ( (x) > (max) ) (x) = (max);
+
+//External functions that are fallbacks if CL is unavailable
+extern void vp8_short_idct4x4llm_c(short *input, short *output, int pitch);
+extern void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch);
+extern void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride);
+extern void vp8_short_inv_walsh4x4_c(short *input, short *output);
+extern void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
+
+/* Compile options and source path passed to cl_load_program() for the IDCT
+ * program.
+ * NOTE(review): these are definitions, not declarations, and this header
+ * has no include guard; including it from more than one translation unit
+ * would produce duplicate symbols. */
+const char *idctCompileOptions = "-Ivp8/common/opencl";
+const char *idctllm_cl_file_name = "vp8/common/opencl/idctllm_cl.cl";
+
--- a/vp8/common/opencl/loopfilter.cl
+++ b/vp8/common/opencl/loopfilter.cl
@@ -0,0 +1,427 @@
+#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+
+/* Short aliases for the 8-bit pixel / filter-value types. */
+typedef unsigned char uc;
+typedef signed char sc;
+
+/* Forward declarations of the helpers defined after the kernels. */
+__inline signed char vp8_filter_mask(sc, sc, uc, uc, uc, uc, uc, uc, uc, uc);
+__inline signed char vp8_simple_filter_mask(signed char, signed char, uc, uc, uc, uc);
+__inline signed char vp8_hevmask(signed char, uc, uc, uc, uc);
+__inline signed char vp8_signed_char_clamp(int);
+
+__inline void vp8_mbfilter(signed char mask,signed char hev,global uc *op2,
+    global uc *op1,global uc *op0,global uc *oq0,global uc *oq1,global uc *oq2);
+
+void vp8_simple_filter(signed char mask,global uc *base, int op1_off,int op0_off,int oq0_off,int oq1_off);
+
+
+/* Loop-filter limit/threshold tables, 16 entries each.
+ * NOTE(review): presumably mirrors the host-side loop_filter_info layout;
+ * no kernel visible in this part of the file references it -- confirm. */
+typedef struct
+{
+    signed char lim[16];
+    signed char flim[16];
+    signed char thr[16];
+    signed char mbflim[16];
+    signed char mbthr[16];
+    signed char uvlim[16];
+    signed char uvflim[16];
+    signed char uvthr[16];
+    signed char uvmbflim[16];
+    signed char uvmbthr[16];
+} loop_filter_info;
+
+
+
+
+/*
+ * Normal (inner-edge) loop filter for one pixel position across an edge.
+ *
+ * base + op1_off / op0_off are the two pixels before the edge and
+ * base + oq0_off / oq1_off the two after it; all four are rewritten in
+ * place. mask and hev are all-ones/all-zeros byte masks: mask gates the
+ * whole filter, hev selects whether the outer-tap difference contributes
+ * and suppresses the adjustment of the outer pixels.
+ */
+void vp8_filter(
+    signed char mask,
+    signed char hev,
+    global uc *base,
+    int op1_off,
+    int op0_off,
+    int oq0_off,
+    int oq1_off
+)
+{
+
+    global uc *op1 = &base[op1_off];
+    global uc *op0 = &base[op0_off];
+    global uc *oq0 = &base[oq0_off];
+    global uc *oq1 = &base[oq1_off];
+
+    signed char ps0, qs0;
+    signed char ps1, qs1;
+    /* Note: this local shares its name with the enclosing function. */
+    signed char vp8_filter, Filter1, Filter2;
+    signed char u;
+
+    /* Flip the top bit to map pixel values 0..255 onto -128..127. */
+    ps1 = (signed char) * op1 ^ 0x80;
+    ps0 = (signed char) * op0 ^ 0x80;
+    qs0 = (signed char) * oq0 ^ 0x80;
+    qs1 = (signed char) * oq1 ^ 0x80;
+
+    /* add outer taps if we have high edge variance */
+    vp8_filter = vp8_signed_char_clamp(ps1 - qs1);
+    vp8_filter &= hev;
+
+    /* inner taps */
+    vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0));
+    vp8_filter &= mask;
+
+    /* save bottom 3 bits so that we round one side +4 and the other +3
+     * if it equals 4 we'll set to adjust by -1 to account for the fact
+     * we'd round 3 the other way
+     */
+    Filter1 = vp8_signed_char_clamp(vp8_filter + 4);
+    Filter2 = vp8_signed_char_clamp(vp8_filter + 3);
+    Filter1 >>= 3;
+    Filter2 >>= 3;
+    /* Flip the top bit again when storing to return to 0..255. */
+    u = vp8_signed_char_clamp(qs0 - Filter1);
+    *oq0 = u ^ 0x80;
+    u = vp8_signed_char_clamp(ps0 + Filter2);
+    *op0 = u ^ 0x80;
+    vp8_filter = Filter1;
+
+    /* outer tap adjustments */
+    vp8_filter += 1;
+    vp8_filter >>= 1;
+    vp8_filter &= ~hev;
+
+    u = vp8_signed_char_clamp(qs1 - vp8_filter);
+    *oq1 = u ^ 0x80;
+    u = vp8_signed_char_clamp(ps1 + vp8_filter);
+    *op1 = u ^ 0x80;
+}
+
+
+/*
+ * Normal loop filter across a horizontal edge. Work-item i filters the
+ * column at s_base[s_off + i], reading pixels from 4 rows above to 3 rows
+ * below the edge (p is the row pitch) and the i-th limit/flimit/thresh
+ * entries. off_stride is unused.
+ */
+kernel void vp8_loop_filter_horizontal_edge_kernel
+(
+    global unsigned char *s_base,
+    int s_off,
+    int p, /* pitch */
+    global signed char *flimit,
+    global signed char *limit,
+    global signed char *thresh,
+    int off_stride
+)
+{
+    const int i = get_global_id(0);
+
+    if (i >= get_global_size(0))
+        return;
+
+    const int pos = s_off + i;
+    global uc *s = s_base + pos;
+
+    const signed char mask = vp8_filter_mask(limit[i], flimit[i],
+            s[-4*p], s[-3*p], s[-2*p], s[-p],
+            s[0], s[p], s[2*p], s[3*p]);
+
+    /* high edge variance */
+    const signed char hev = vp8_hevmask(thresh[i], s[-2*p], s[-p], s[0], s[p]);
+
+    vp8_filter(mask, hev, s_base, pos - 2 * p, pos - p, pos, pos + p);
+}
+
+
+/*
+ * Normal loop filter across a vertical edge. Work-item i filters the row
+ * at s_base[s_off + p*i], reading pixels from 4 to the left up to 3 to the
+ * right of the edge and the i-th limit/flimit/thresh entries.
+ * off_stride is unused.
+ */
+kernel void vp8_loop_filter_vertical_edge_kernel
+(
+    global unsigned char *s_base,
+    int s_off,
+    int p,
+    global signed char *flimit,
+    global signed char *limit,
+    global signed char *thresh,
+    int off_stride
+)
+{
+    const int i = get_global_id(0);
+
+    if (i >= get_global_size(0))
+        return;
+
+    const int pos = s_off + p * i;
+    global uc *s = s_base + pos;
+
+    const signed char mask = vp8_filter_mask(limit[i], flimit[i],
+            s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]);
+
+    /* high edge variance */
+    const signed char hev = vp8_hevmask(thresh[i], s[-2], s[-1], s[0], s[1]);
+
+    vp8_filter(mask, hev, s_base, pos - 2, pos - 1, pos, pos + 1);
+}
+
+
+/*
+ * Macroblock loop filter across a horizontal edge. Work-item i filters the
+ * column at s_base[s_off + i]; vp8_mbfilter() rewrites the three pixels on
+ * each side of the edge (p is the row pitch). off_stride is unused.
+ */
+kernel void vp8_mbloop_filter_horizontal_edge_kernel
+(
+    global unsigned char *s_base,
+    int s_off,
+    int p,
+    global signed char *flimit,
+    global signed char *limit,
+    global signed char *thresh,
+    int off_stride
+)
+{
+    const int i = get_global_id(0);
+
+    if (i >= get_global_size(0))
+        return;
+
+    global uc *edge = s_base + s_off + i;
+
+    const signed char mask = vp8_filter_mask(limit[i], flimit[i],
+            edge[-4*p], edge[-3*p], edge[-2*p], edge[-1*p],
+            edge[0*p], edge[1*p], edge[2*p], edge[3*p]);
+
+    /* high edge variance */
+    const signed char hev = vp8_hevmask(thresh[i],
+            edge[-2*p], edge[-1*p], edge[0*p], edge[1*p]);
+
+    vp8_mbfilter(mask, hev,
+            edge - 3 * p, edge - 2 * p, edge - 1 * p,
+            edge, edge + 1 * p, edge + 2 * p);
+}
+
+
+/*
+ * Macroblock loop filter across a vertical edge. Work-item i filters the
+ * row at s_base[s_off + p*i]; vp8_mbfilter() rewrites the three pixels on
+ * each side of the edge. off_stride is unused.
+ */
+kernel void vp8_mbloop_filter_vertical_edge_kernel
+(
+    global unsigned char *s_base,
+    int s_off,
+    int p,
+    global signed char *flimit,
+    global signed char *limit,
+    global signed char *thresh,
+    int off_stride
+)
+{
+    const int i = get_global_id(0);
+
+    if (i >= get_global_size(0))
+        return;
+
+    global uc *edge = s_base + s_off + p * i;
+
+    const signed char mask = vp8_filter_mask(limit[i], flimit[i],
+            edge[-4], edge[-3], edge[-2], edge[-1],
+            edge[0], edge[1], edge[2], edge[3]);
+
+    /* high edge variance */
+    const signed char hev = vp8_hevmask(thresh[i],
+            edge[-2], edge[-1], edge[0], edge[1]);
+
+    vp8_mbfilter(mask, hev, edge - 3, edge - 2, edge - 1,
+            edge, edge + 1, edge + 2);
+}
+
+
+/*
+ * Simple loop filter across a horizontal edge. Work-item i filters the
+ * column at s_base[s_off + i] using two pixels on each side of the edge
+ * (p is the row pitch). thresh and off_stride are unused.
+ */
+kernel void vp8_loop_filter_simple_horizontal_edge_kernel
+(
+    global unsigned char *s_base,
+    int s_off,
+    int p,
+    global const signed char *flimit,
+    global const signed char *limit,
+    global const signed char *thresh,
+    int off_stride
+)
+{
+    const int i = get_global_id(0);
+    (void) thresh;
+
+    if (i >= get_global_size(0))
+        return;
+
+    const int pos = s_off + i;
+    global unsigned char *s = s_base + pos;
+
+    const signed char mask = vp8_simple_filter_mask(limit[i], flimit[i],
+            s[-2*p], s[-p], s[0], s[p]);
+
+    vp8_simple_filter(mask, s_base, pos - 2 * p, pos - 1 * p, pos, pos + 1 * p);
+}
+
+
+/*
+ * Simple loop filter across a vertical edge. Work-item i filters the row
+ * at s_base[s_off + p*i] using two pixels on each side of the edge.
+ * thresh and off_stride are unused.
+ */
+kernel void vp8_loop_filter_simple_vertical_edge_kernel
+(
+    global unsigned char *s_base,
+    int s_off,
+    int p,
+    global signed char *flimit,
+    global signed char *limit,
+    global signed char *thresh,
+    int off_stride
+)
+{
+    const int i = get_global_id(0);
+    (void) thresh;
+
+    if (i >= get_global_size(0))
+        return;
+
+    const int pos = s_off + p * i;
+    global unsigned char *s = s_base + pos;
+
+    const signed char mask = vp8_simple_filter_mask(limit[i], flimit[i],
+            s[-2], s[-1], s[0], s[1]);
+
+    vp8_simple_filter(mask, s_base, pos - 2, pos - 1, pos, pos + 1);
+}
+
+
+
+//Inline and non-kernel functions follow.
+
+/*
+ * Macroblock-edge filter for one pixel position across an edge.
+ *
+ * op2/op1/op0 point at the three pixels before the edge and oq0/oq1/oq2 at
+ * the three after it; all six are rewritten in place. mask and hev are
+ * all-ones/all-zeros byte masks: where hev is set only the narrow
+ * adjustment of the two edge pixels is applied, otherwise the wider
+ * 27/18/9-weighted adjustment is applied.
+ */
+__inline void vp8_mbfilter(
+    signed char mask,
+    signed char hev,
+    global uc *op2,
+    global uc *op1,
+    global uc *op0,
+    global uc *oq0,
+    global uc *oq1,
+    global uc *oq2
+)
+{
+    signed char s, u;
+    signed char vp8_filter, Filter1, Filter2;
+    /* Flip the top bit to map pixel values 0..255 onto -128..127. */
+    signed char ps2 = (signed char) * op2 ^ 0x80;
+    signed char ps1 = (signed char) * op1 ^ 0x80;
+    signed char ps0 = (signed char) * op0 ^ 0x80;
+    signed char qs0 = (signed char) * oq0 ^ 0x80;
+    signed char qs1 = (signed char) * oq1 ^ 0x80;
+    signed char qs2 = (signed char) * oq2 ^ 0x80;
+
+    /* outer and inner taps (the outer taps are always included here;
+     * hev is applied separately below) */
+    vp8_filter = vp8_signed_char_clamp(ps1 - qs1);
+    vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0));
+    vp8_filter &= mask;
+
+    Filter2 = vp8_filter;
+    Filter2 &= hev;
+
+    /* save bottom 3 bits so that we round one side +4 and the other +3 */
+    Filter1 = vp8_signed_char_clamp(Filter2 + 4);
+    Filter2 = vp8_signed_char_clamp(Filter2 + 3);
+    Filter1 >>= 3;
+    Filter2 >>= 3;
+    qs0 = vp8_signed_char_clamp(qs0 - Filter1);
+    ps0 = vp8_signed_char_clamp(ps0 + Filter2);
+
+
+    /* only apply wider filter if not high edge variance */
+    vp8_filter &= ~hev;
+    Filter2 = vp8_filter;
+
+    /* roughly 3/7th difference across boundary */
+    u = vp8_signed_char_clamp((63 + Filter2 * 27) >> 7);
+    s = vp8_signed_char_clamp(qs0 - u);
+    *oq0 = s ^ 0x80;
+    s = vp8_signed_char_clamp(ps0 + u);
+    *op0 = s ^ 0x80;
+
+    /* roughly 2/7th difference across boundary */
+    u = vp8_signed_char_clamp((63 + Filter2 * 18) >> 7);
+    s = vp8_signed_char_clamp(qs1 - u);
+    *oq1 = s ^ 0x80;
+    s = vp8_signed_char_clamp(ps1 + u);
+    *op1 = s ^ 0x80;
+
+    /* roughly 1/7th difference across boundary */
+    u = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7);
+    s = vp8_signed_char_clamp(qs2 - u);
+    *oq2 = s ^ 0x80;
+    s = vp8_signed_char_clamp(ps2 + u);
+    *op2 = s ^ 0x80;
+}
+
+
+/* Saturate t to the signed 8-bit range [-128, 127]. */
+__inline signed char vp8_signed_char_clamp(int t)
+{
+    if (t < -128)
+        return (signed char) -128;
+    if (t > 127)
+        return (signed char) 127;
+    return (signed char) t;
+}
+
+
+/* High-edge-variance test: returns 11111111 (-1) when |p1 - p0| or
+ * |q1 - q0| exceeds thresh, 00000000 otherwise. */
+__inline signed char vp8_hevmask(signed char thresh, uc p1, uc p0, uc q0, uc q1)
+{
+    const int high_variance = (abs(p1 - p0) > thresh)
+                           || (abs(q1 - q0) > thresh);
+
+    return high_variance ? (signed char) -1 : (signed char) 0;
+}
+
+
+/* Filter mask for the normal and macroblock filters: returns 11111111 (-1)
+ * when every neighbouring-pixel difference is within limit and the
+ * cross-edge measure is within flimit * 2 + limit, 00000000 otherwise. */
+__inline signed char vp8_filter_mask(
+    signed char limit,
+    signed char flimit,
+     uc p3, uc p2, uc p1, uc p0, uc q0, uc q1, uc q2, uc q3)
+{
+    const int too_rough =
+           (abs(p3 - p2) > limit)
+        || (abs(p2 - p1) > limit)
+        || (abs(p1 - p0) > limit)
+        || (abs(q1 - q0) > limit)
+        || (abs(q2 - q1) > limit)
+        || (abs(q3 - q2) > limit)
+        || (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > flimit * 2 + limit);
+
+    return too_rough ? (signed char) 0 : (signed char) -1;
+}
+
+/* Filter mask for the simple filter: returns 11111111 (-1) when the
+ * cross-edge measure is within flimit * 2 + limit, 00000000 otherwise. */
+__inline signed char vp8_simple_filter_mask(
+    signed char limit,
+    signed char flimit,
+    uc p1,
+    uc p0,
+    uc q0,
+    uc q1
+)
+{
+    if (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  <= flimit * 2 + limit)
+        return (signed char) -1;
+
+    return (signed char) 0;
+}
+
+/*
+ * Simple loop filter for one pixel position across an edge.
+ *
+ * base + op1_off / op0_off are the two pixels before the edge and
+ * base + oq0_off / oq1_off the two after it. All four are read, but only
+ * the two pixels adjacent to the edge (op0, oq0) are rewritten. mask is an
+ * all-ones/all-zeros byte mask that gates the adjustment.
+ */
+void vp8_simple_filter(
+    signed char mask,
+    global uc *base,
+    int op1_off,
+    int op0_off,
+    int oq0_off,
+    int oq1_off
+)
+{
+
+    global uc *op1 = base + op1_off;
+    global uc *op0 = base + op0_off;
+    global uc *oq0 = base + oq0_off;
+    global uc *oq1 = base + oq1_off;
+
+    signed char vp8_filter, Filter1, Filter2;
+    /* Flip the top bit to map pixel values 0..255 onto -128..127. */
+    signed char p1 = (signed char) * op1 ^ 0x80;
+    signed char p0 = (signed char) * op0 ^ 0x80;
+    signed char q0 = (signed char) * oq0 ^ 0x80;
+    signed char q1 = (signed char) * oq1 ^ 0x80;
+    signed char u;
+
+    vp8_filter = vp8_signed_char_clamp(p1 - q1);
+    vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (q0 - p0));
+    vp8_filter &= mask;
+
+    /* save bottom 3 bits so that we round one side +4 and the other +3 */
+    Filter1 = vp8_signed_char_clamp(vp8_filter + 4);
+    Filter1 >>= 3;
+    u = vp8_signed_char_clamp(q0 - Filter1);
+    *oq0  = u ^ 0x80;
+
+    Filter2 = vp8_signed_char_clamp(vp8_filter + 3);
+    Filter2 >>= 3;
+    u = vp8_signed_char_clamp(p0 + Filter2);
+    *op0 = u ^ 0x80;
+}
--- a/vp8/common/opencl/loopfilter_cl.c
+++ b/vp8/common/opencl/loopfilter_cl.c
@@ -0,0 +1,457 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "../../../vpx_ports/config.h"
+#include "loopfilter_cl.h"
+#include "../onyxc_int.h"
+
+#include "vpx_config.h"
+#include "vp8_opencl.h"
+#include "blockd_cl.h"
+
+const char *loopFilterCompileOptions = "-Ivp8/common/opencl";   /* options handed to cl_load_program() by cl_init_loop_filter() */
+const char *loop_filter_cl_file_name = "vp8/common/opencl/loopfilter.cl";   /* kernel source path handed to cl_load_program() */
+
+typedef unsigned char uc;
+
+extern void vp8_loop_filter_frame   /* defined outside this file; named in the VP8_CL_SET_BUF call in vp8_loop_filter_frame_cl() */
+(
+    VP8_COMMON *cm,
+    MACROBLOCKD *mbd,
+    int default_filt_lvl
+);
+
+prototype_loopfilter_cl(vp8_loop_filter_horizontal_edge_cl);   /* the six edge-level entry points called by the block functions below */
+prototype_loopfilter_cl(vp8_loop_filter_vertical_edge_cl);
+prototype_loopfilter_cl(vp8_mbloop_filter_horizontal_edge_cl);
+prototype_loopfilter_cl(vp8_mbloop_filter_vertical_edge_cl);
+prototype_loopfilter_cl(vp8_loop_filter_simple_horizontal_edge_cl);
+prototype_loopfilter_cl(vp8_loop_filter_simple_vertical_edge_cl);
+
+/* Horizontal MB filtering: runs the macroblock horizontal-edge filter at y_off, u_off and v_off, passing lfi->mbflim as flimit. */
+void vp8_loop_filter_mbh_cl(
+    MACROBLOCKD *x,
+    cl_mem buf_base,
+    int y_off,
+    int u_off,
+    int v_off,
+    int y_stride,
+    int uv_stride,
+    loop_filter_info *lfi,
+    int simpler_lpf
+)
+{
+    (void) simpler_lpf;   /* accepted but not used */
+
+    vp8_mbloop_filter_horizontal_edge_cl(x, buf_base, y_off, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2, 1);   /* Y: count 2 */
+    vp8_mbloop_filter_horizontal_edge_cl(x, buf_base, u_off, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1, 1);   /* U: count 1 */
+    vp8_mbloop_filter_horizontal_edge_cl(x, buf_base, v_off, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1, 1);   /* V: count 1 */
+}
+
+void vp8_loop_filter_mbhs_cl(MACROBLOCKD *x, cl_mem buf_base, int y_off, int u_off, int v_off,
+                            int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{   /* Simple-filter variant of the MB horizontal edge: a single pass at y_off; u_off and v_off are not used. */
+    (void) uv_stride;
+    (void) simpler_lpf;
+    vp8_loop_filter_simple_horizontal_edge_cl(x, buf_base, y_off, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2, 1);   /* Y only: count 2 */
+}
+
+/* Vertical MB Filtering: runs the macroblock vertical-edge filter at y_off, u_off and v_off, passing lfi->mbflim as flimit. */
+void vp8_loop_filter_mbv_cl(MACROBLOCKD *x, cl_mem buf_base, int y_off, int u_off, int v_off,
+                           int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+    (void) simpler_lpf;   /* accepted but not used */
+
+    vp8_mbloop_filter_vertical_edge_cl(x, buf_base, y_off, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2, 1);   /* Y: count 2 */
+    vp8_mbloop_filter_vertical_edge_cl(x, buf_base, u_off, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1, 1);   /* U: count 1 */
+    vp8_mbloop_filter_vertical_edge_cl(x, buf_base, v_off, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1, 1);   /* V: count 1 */
+}
+
+void vp8_loop_filter_mbvs_cl(MACROBLOCKD *x, cl_mem buf_base, int y_off, int u_off, int v_off,
+                            int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{   /* Simple-filter variant of the MB vertical edge: a single pass at y_off; u_off and v_off are not used. */
+    (void) uv_stride;
+    (void) simpler_lpf;
+    vp8_loop_filter_simple_vertical_edge_cl(x, buf_base, y_off, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2, 1);   /* Y only: count 2 */
+}
+
+/* Horizontal B filtering: edges 4, 8 and 12 rows below y_off, and 4 rows below u_off and v_off. */
+void vp8_loop_filter_bh_cl(MACROBLOCKD *x, cl_mem buf_base, int y_off, int u_off, int v_off,
+                          int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+    int row;
+    (void) simpler_lpf;
+
+    for (row = 4; row <= 12; row += 4)
+        vp8_loop_filter_horizontal_edge_cl(x, buf_base, y_off + row * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1);
+
+    vp8_loop_filter_horizontal_edge_cl(x, buf_base, u_off + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1, 1);
+    vp8_loop_filter_horizontal_edge_cl(x, buf_base, v_off + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1, 1);
+}
+
+/* Simple-filter horizontal B filtering: edges 4, 8 and 12 rows below y_off; u_off and v_off are not used. */
+void vp8_loop_filter_bhs_cl(MACROBLOCKD *x, cl_mem buf_base, int y_off, int u_off, int v_off,
+                           int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+    int row;
+    (void) uv_stride;
+    (void) simpler_lpf;
+    for (row = 4; row <= 12; row += 4)
+        vp8_loop_filter_simple_horizontal_edge_cl(x, buf_base, y_off + row * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1);
+}
+
+/* Vertical B filtering: edges 4, 8 and 12 columns right of y_off, and 4 columns right of u_off and v_off. */
+void vp8_loop_filter_bv_cl(MACROBLOCKD *x, cl_mem buf_base, int y_off, int u_off, int v_off,
+                          int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+    int col;
+    (void) simpler_lpf;
+
+    for (col = 4; col <= 12; col += 4)
+        vp8_loop_filter_vertical_edge_cl(x, buf_base, y_off + col, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1);
+
+    vp8_loop_filter_vertical_edge_cl(x, buf_base, u_off + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1, 1);
+    vp8_loop_filter_vertical_edge_cl(x, buf_base, v_off + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1, 1);
+}
+
+/* Simple-filter vertical B filtering: edges 4, 8 and 12 columns right of y_off; u_off and v_off are not used. */
+void vp8_loop_filter_bvs_cl(MACROBLOCKD *x, cl_mem buf_base, int y_off, int u_off, int v_off,
+                           int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+    int col;
+    (void) uv_stride;
+    (void) simpler_lpf;
+    for (col = 4; col <= 12; col += 4)
+        vp8_loop_filter_simple_vertical_edge_cl(x, buf_base, y_off + col, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1);
+}
+
+void vp8_init_loop_filter_cl(VP8_COMMON *cm)
+{
+    /*
+     * Rebuilds every entry of cm->lf_info, one per filter level from 0 to
+     * MAX_LOOP_FILTER. vp8_loop_filter_frame_cl() calls this when the filter
+     * type or the sharpness level differs from the previous frame's.
+     *
+     * Elements 0..15 of four arrays are written per entry, all 16 alike:
+     *   lim    - inside limit derived from the level and cm->sharpness_level
+     *   mbflim - level + 2
+     *   flim   - level
+     *   thr    - threshold chosen by level and by key/non-key frame type
+     */
+    loop_filter_info *table = cm->lf_info;
+    const int sharp = cm->sharpness_level;
+    const int is_key = (cm->frame_type == KEY_FRAME);
+
+    /* Added to the level to form mbflim. */
+    const int mb_edge_boost = 2;
+    int level;
+
+    for (level = 0; level <= MAX_LOOP_FILTER; level++)
+    {
+        int hev;
+        int inside;
+        int k;
+
+        /* Threshold steps up at levels 15, 20 (non-key frames only) and 40. */
+        if (level >= 40)
+            hev = is_key ? 2 : 3;
+        else if (level >= 20)
+            hev = is_key ? 1 : 2;
+        else if (level >= 15)
+            hev = 1;
+        else
+            hev = 0;
+
+        /* Sharpness halves the limit once when > 0 and once more when > 4. */
+        inside = level >> (sharp > 0);
+        inside >>= (sharp > 4);
+
+        /* A positive sharpness also caps the limit at 9 - sharpness. */
+        if (sharp > 0 && inside > 9 - sharp)
+            inside = 9 - sharp;
+
+        /* The limit never drops below 1. */
+        if (inside < 1)
+            inside = 1;
+
+        /* Broadcast the four values to elements 0..15. */
+        for (k = 0; k < 16; k++)
+        {
+            table[level].lim[k] = inside;
+            table[level].mbflim[k] = level + mb_edge_boost;
+            table[level].flim[k] = level;
+            table[level].thr[k] = hev;
+        }
+    }
+}
+
+/* Refreshes only the thr (threshold) arrays of the filter table for the given
+ * frame type; lim, mbflim and flim are left untouched.
+ *   lfi        - table indexed by filter level, 0..MAX_LOOP_FILTER
+ *   frame_type - compared against KEY_FRAME
+ *
+ * Usage note (it names the non-_cl functions): put vp8_init_loop_filter() in
+ * vp8dx_create_decompressor() and call only vp8_frame_init_loop_filter() while
+ * decoding each frame. Checking last_frame_type lets the caller skip this
+ * function most of the time.
+ */
+void vp8_frame_init_loop_filter_cl(loop_filter_info *lfi, int frame_type)
+{
+    const int is_key = (frame_type == KEY_FRAME);
+    int level;
+
+    /* One table entry per possible loop filter level. */
+    for (level = 0; level <= MAX_LOOP_FILTER; level++)
+    {
+        int hev;
+        int k;
+
+        /* Key frames:   levels 0-14 -> 0, 15-39 -> 1, 40 and up -> 2.
+         * Other frames: levels 0-14 -> 0, 15-19 -> 1, 20-39 -> 2, 40 and up -> 3.
+         */
+        if (level >= 40)
+            hev = is_key ? 2 : 3;
+        else if (level >= 20)
+            hev = is_key ? 1 : 2;
+        else if (level >= 15)
+            hev = 1;
+        else
+            hev = 0;
+
+        /* Each of elements 0..15 gets the same threshold. */
+        for (k = 0; k < 16; k++)
+        {
+            lfi[level].thr[k] = hev;
+        }
+    }
+}
+
+
+/* Applies the reference-frame and mode deltas of the current macroblock to
+ * *filter_level in place. This might not need to be copied from loopfilter.c.
+ */
+void vp8_adjust_mb_lf_value_cl(MACROBLOCKD *mbd, int *filter_level)
+{
+    MB_MODE_INFO *info = &mbd->mode_info_context->mbmi;
+
+    /* With the delta feature off the level is left exactly as passed in. */
+    if (!mbd->mode_ref_lf_delta_enabled)
+        return;
+
+    /* Delta selected by the reference frame. */
+    *filter_level += mbd->ref_lf_deltas[info->ref_frame];
+
+    /* Delta selected by the prediction mode. */
+    if (info->ref_frame == INTRA_FRAME)
+    {
+        /* Among intra modes only B_PRED carries a mode delta (index 0). */
+        if (info->mode == B_PRED)
+            *filter_level += mbd->mode_lf_deltas[0];
+    }
+    else
+    {
+        /* Index 1: zero motion; 3: split motion; 2: every other inter mode. */
+        int which = 2;
+        if (info->mode == ZEROMV)
+            which = 1;
+        else if (info->mode == SPLITMV)
+            which = 3;
+
+        *filter_level += mbd->mode_lf_deltas[which];
+    }
+
+    /* Clamp the result to 0..MAX_LOOP_FILTER. */
+    if (*filter_level > MAX_LOOP_FILTER)
+        *filter_level = MAX_LOOP_FILTER;
+    else if (*filter_level < 0)
+        *filter_level = 0;
+}
+
+
+//Start of externally callable functions.
+
+int cl_init_loop_filter() {   /* Loads the loop filter program and creates its six kernels; returns VP8_CL_TRIED_BUT_FAILED if the load fails, CL_SUCCESS at the end. */
+    int err;   /* not referenced directly here; presumably used inside VP8_CL_CREATE_KERNEL - confirm */
+
+    // Create the filter compute program from the file-defined source code
+    if ( cl_load_program(&cl_data.loop_filter_program, loop_filter_cl_file_name,
+            loopFilterCompileOptions) != CL_SUCCESS )
+        return VP8_CL_TRIED_BUT_FAILED;
+
+    // Create the compute kernels in the program we wish to run
+    VP8_CL_CREATE_KERNEL(cl_data,loop_filter_program,vp8_loop_filter_horizontal_edge_kernel,"vp8_loop_filter_horizontal_edge_kernel");   /* NOTE(review): any failure handling lives inside the macro - confirm what it does */
+    VP8_CL_CREATE_KERNEL(cl_data,loop_filter_program,vp8_loop_filter_vertical_edge_kernel,"vp8_loop_filter_vertical_edge_kernel");
+    VP8_CL_CREATE_KERNEL(cl_data,loop_filter_program,vp8_mbloop_filter_horizontal_edge_kernel,"vp8_mbloop_filter_horizontal_edge_kernel");
+    VP8_CL_CREATE_KERNEL(cl_data,loop_filter_program,vp8_mbloop_filter_vertical_edge_kernel,"vp8_mbloop_filter_vertical_edge_kernel");
+    VP8_CL_CREATE_KERNEL(cl_data,loop_filter_program,vp8_loop_filter_simple_horizontal_edge_kernel,"vp8_loop_filter_simple_horizontal_edge_kernel");
+    VP8_CL_CREATE_KERNEL(cl_data,loop_filter_program,vp8_loop_filter_simple_vertical_edge_kernel,"vp8_loop_filter_simple_vertical_edge_kernel");
+
+    return CL_SUCCESS;
+}
+
+void cl_destroy_loop_filter(){   /* Releases the loop filter program and the six kernels held in cl_data. */
+
+    if (cl_data.loop_filter_program)
+        clReleaseProgram(cl_data.loop_filter_program);   /* the program is released before the kernels created from it */
+
+    VP8_CL_RELEASE_KERNEL(cl_data.vp8_loop_filter_horizontal_edge_kernel);
+    VP8_CL_RELEASE_KERNEL(cl_data.vp8_loop_filter_vertical_edge_kernel);
+    VP8_CL_RELEASE_KERNEL(cl_data.vp8_mbloop_filter_horizontal_edge_kernel);
+    VP8_CL_RELEASE_KERNEL(cl_data.vp8_mbloop_filter_vertical_edge_kernel);
+    VP8_CL_RELEASE_KERNEL(cl_data.vp8_loop_filter_simple_horizontal_edge_kernel);
+    VP8_CL_RELEASE_KERNEL(cl_data.vp8_loop_filter_simple_vertical_edge_kernel);
+
+    cl_data.loop_filter_program = NULL;   /* makes the NULL check above skip the release on a repeated call */
+}
+
+
+void vp8_loop_filter_set_baselines_cl(MACROBLOCKD *mbd, int default_filt_lvl, int *baseline_filter_level){
+    /* Fills baseline_filter_level[0..MAX_MB_SEGMENTS-1]; without segmentation every entry is default_filt_lvl. */
+    const int per_segment = mbd->segmentation_enabled;
+    int seg;
+
+    for (seg = 0; seg < MAX_MB_SEGMENTS; seg++)
+    {
+        int level = default_filt_lvl;
+        if (per_segment && mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
+        {
+            /* Absolute mode: the stored segment value is used as is, with no clamp. */
+            level = mbd->segment_feature_data[MB_LVL_ALT_LF][seg];
+        }
+        else if (per_segment)
+        {
+            /* Delta mode: offset from the default, clamped to 0..MAX_LOOP_FILTER. */
+            level += mbd->segment_feature_data[MB_LVL_ALT_LF][seg];
+            if (level < 0)
+                level = 0;
+            else if (level > MAX_LOOP_FILTER)
+                level = MAX_LOOP_FILTER;
+        }
+        baseline_filter_level[seg] = level;
+    }
+}
+
+void vp8_loop_filter_frame_cl
+(
+    VP8_COMMON *cm,
+    MACROBLOCKD *mbd,
+    int default_filt_lvl
+)
+{   /* Loop-filters every macroblock of cm->frame_to_show through the OpenCL edge filters, then reads the buffer back into post->buffer_alloc. */
+    YV12_BUFFER_CONFIG *post = cm->frame_to_show;
+    loop_filter_info *lfi = cm->lf_info;
+    FRAME_TYPE frame_type = cm->frame_type;
+    LOOPFILTERTYPE filter_type = cm->filter_type;
+
+    int mb_row;
+    int mb_col;
+
+    int baseline_filter_level[MAX_MB_SEGMENTS];
+    int filter_level;
+    int alt_flt_enabled = mbd->segmentation_enabled;
+
+    int err;
+    unsigned char *buf_base;
+    int y_off, u_off, v_off;
+    /* Planes are tracked as int offsets from post->buffer_alloc rather than as host pointers. */
+
+    mbd->mode_info_context = cm->mi;          /* Point at base of Mb MODE_INFO list */
+
+    /* Note the baseline filter values for each segment */
+    vp8_loop_filter_set_baselines_cl(mbd, default_filt_lvl, baseline_filter_level);
+
+    /* Initialize the loop filter for this frame: full rebuild if the filter type or sharpness changed, thresholds only if just the frame type changed. */
+    if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level))
+        vp8_init_loop_filter_cl(cm);
+    else if (frame_type != cm->last_frame_type)
+        vp8_frame_init_loop_filter_cl(lfi, frame_type);
+
+    /* Set up the buffer offsets */
+
+    buf_base = post->buffer_alloc;
+    y_off = post->y_buffer - buf_base;
+    u_off = post->u_buffer - buf_base;
+    v_off = post->v_buffer - buf_base;
+
+    VP8_CL_SET_BUF(mbd->cl_commands, post->buffer_mem, post->buffer_size, post->buffer_alloc,
+            vp8_loop_filter_frame(cm,mbd,default_filt_lvl),);   /* NOTE(review): the vp8_loop_filter_frame() call is a macro argument, presumably the failure fallback - confirm against the macro definition */
+
+    /* vp8_filter each macro block */
+    for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+    {
+        for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+        {
+            int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0;
+
+            filter_level = baseline_filter_level[Segment];
+
+            /* Distance of Mb to the various image edges.
+             * These are specified to 8th pel as they are always compared to
+             * values that are in 1/8th pel units. Apply any context driven MB
+             * level adjustment.
+             */
+            filter_level = vp8_adjust_mb_lf_value(mbd, filter_level);   /* NOTE(review): this is not the vp8_adjust_mb_lf_value_cl() copy defined in this file, and no prototype is visible here - confirm one is in scope */
+
+            if (filter_level)   /* a level of 0 skips the macroblock entirely */
+            {
+                if (mb_col > 0){   /* MB vertical edge: skipped in the first column */
+                    if (filter_type == NORMAL_LOOPFILTER)
+                        vp8_loop_filter_mbv_cl(mbd, post->buffer_mem, y_off, u_off, v_off, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+                    else
+                        vp8_loop_filter_mbvs_cl(mbd, post->buffer_mem, y_off, u_off, v_off, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+                }
+
+                if (mbd->mode_info_context->mbmi.dc_diff > 0){   /* inner vertical block edges only when dc_diff > 0 */
+                    if (filter_type == NORMAL_LOOPFILTER)
+                        vp8_loop_filter_bv_cl(mbd, post->buffer_mem, y_off, u_off, v_off, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+                    else
+                        vp8_loop_filter_bvs_cl(mbd, post->buffer_mem, y_off, u_off, v_off, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+                }
+
+                /* don't apply across umv border */
+                if (mb_row > 0){   /* MB horizontal edge: skipped in the first row */
+                    if (filter_type == NORMAL_LOOPFILTER)
+                        vp8_loop_filter_mbh_cl(mbd, post->buffer_mem, y_off, u_off, v_off, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+                    else
+                        vp8_loop_filter_mbhs_cl(mbd, post->buffer_mem, y_off, u_off, v_off, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+                }
+
+                if (mbd->mode_info_context->mbmi.dc_diff > 0){   /* inner horizontal block edges only when dc_diff > 0 */
+                    if (filter_type == NORMAL_LOOPFILTER)
+                        vp8_loop_filter_bh_cl(mbd, post->buffer_mem, y_off, u_off, v_off, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+                    else
+                        vp8_loop_filter_bhs_cl(mbd, post->buffer_mem, y_off, u_off, v_off, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+                }
+            }
+
+            y_off += 16;   /* next macroblock: 16 luma and 8 chroma positions further along the row */
+            u_off += 8;
+            v_off += 8;
+
+            mbd->mode_info_context++;     /* step to next MB */
+        }
+
+        y_off += post->y_stride  * 16 - post->y_width;   /* on to the next macroblock row (exact when y_width is 16 * cm->mb_cols) */
+        u_off += post->uv_stride *  8 - post->uv_width;
+        v_off += post->uv_stride *  8 - post->uv_width;
+
+        mbd->mode_info_context++;         /* Skip border mb */
+    }
+
+    //Retrieve buffer contents (CL_FALSE requests a non-blocking read)
+    err = clEnqueueReadBuffer(mbd->cl_commands, post->buffer_mem, CL_FALSE, 0, post->buffer_size, post->buffer_alloc, 0, NULL, NULL);
+    VP8_CL_CHECK_SUCCESS(mbd->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to read loop filter output!\n",
+        ,
+    );
+
+    VP8_CL_FINISH(mbd->cl_commands);   /* presumably waits for the queue, so the non-blocking read has completed on return - confirm */
+}
--- a/vp8/common/opencl/loopfilter_cl.h
+++ b/vp8/common/opencl/loopfilter_cl.h
@@ -0,0 +1,48 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef loopfilter_cl_h
+#define loopfilter_cl_h
+
+#include "../../../vpx_ports/mem.h"
+
+#include "../onyxc_int.h"
+#include "blockd_cl.h"
+#include "../loopfilter.h"
+
+#define prototype_loopfilter_cl(sym) \
+    void sym(MACROBLOCKD*, cl_mem src_base, int src_offset,  \
+             int pitch, const signed char *flimit, \
+             const signed char *limit, const signed char *thresh, int count, int block_cnt)
+/* prototype_loopfilter_cl (above) declares an edge filter taking a cl_mem base plus offset. NOTE(review): the block-level macro below takes host pointers, unlike the vp8_loop_filter_*_cl block functions in loopfilter_cl.c (cl_mem base plus int offsets) - confirm which is intended. */
+#define prototype_loopfilter_block_cl(sym) \
+    void sym(MACROBLOCKD*, unsigned char *y, unsigned char *u, unsigned char *v,\
+             int ystride, int uv_stride, loop_filter_info *lfi, int simpler)
+
+extern void vp8_loop_filter_frame_cl
+(
+    VP8_COMMON *cm,
+    MACROBLOCKD *mbd,
+    int default_filt_lvl
+);
+/* NOTE(review): no definitions of the vp8_lf_*_cl names declared below appear in loopfilter_cl.c - confirm where they come from. */
+extern prototype_loopfilter_block_cl(vp8_lf_normal_mb_v_cl);
+extern prototype_loopfilter_block_cl(vp8_lf_normal_b_v_cl);
+extern prototype_loopfilter_block_cl(vp8_lf_normal_mb_h_cl);
+extern prototype_loopfilter_block_cl(vp8_lf_normal_b_h_cl);
+extern prototype_loopfilter_block_cl(vp8_lf_simple_mb_v_cl);
+extern prototype_loopfilter_block_cl(vp8_lf_simple_b_v_cl);
+extern prototype_loopfilter_block_cl(vp8_lf_simple_mb_h_cl);
+extern prototype_loopfilter_block_cl(vp8_lf_simple_b_h_cl);
+
+typedef prototype_loopfilter_block_cl((*vp8_lf_block_cl_fn_t));   /* pointer to a function with the block-level signature */
+
+#endif
--- a/vp8/common/opencl/loopfilter_filters_cl.c
+++ b/vp8/common/opencl/loopfilter_filters_cl.c
@@ -0,0 +1,187 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+
+#include <stdio.h>
+
+#include "vpx_ports/config.h"
+#include "vp8_opencl.h"
+#include "blockd_cl.h"
+
+//#include "loopfilter_cl.h"
+//#include "../onyxc_int.h"
+
+typedef unsigned char uc;
+
+static void vp8_loop_filter_cl_run(   /* Shared launcher for the six edge filters: sets seven kernel arguments, enqueues a 2-D range, then calls VP8_CL_FINISH. */
+    cl_command_queue cq,
+    cl_kernel kernel,
+    cl_mem buf_mem,
+    int s_off,
+    int p,
+    const signed char *flimit,
+    const signed char *limit,
+    const signed char *thresh,
+    int count,
+    int block_cnt
+){
+    size_t global[] = {count,block_cnt};   /* 2-D global work size */
+    int err;
+
+    cl_mem flimit_mem;
+    cl_mem limit_mem;
+    cl_mem thresh_mem;
+
+    VP8_CL_CREATE_BUF(cq, flimit_mem, , sizeof(uc)*16, flimit,, );   /* a 16-byte buffer per threshold array, created on every call */
+    VP8_CL_CREATE_BUF(cq, limit_mem, , sizeof(uc)*16, limit,, );
+    VP8_CL_CREATE_BUF(cq, thresh_mem, , sizeof(uc)*16, thresh,, );
+
+    err = 0;   /* overwritten by the next statement */
+    err = clSetKernelArg(kernel, 0, sizeof (cl_mem), &buf_mem);
+    err |= clSetKernelArg(kernel, 1, sizeof (cl_int), &s_off);   /* return codes are OR-ed together, so only zero / non-zero is meaningful afterwards */
+    err |= clSetKernelArg(kernel, 2, sizeof (cl_int), &p);
+    err |= clSetKernelArg(kernel, 3, sizeof (cl_mem), &flimit_mem);
+    err |= clSetKernelArg(kernel, 4, sizeof (cl_mem), &limit_mem);
+    err |= clSetKernelArg(kernel, 5, sizeof (cl_mem), &thresh_mem);
+    err |= clSetKernelArg(kernel, 6, sizeof (cl_int), &block_cnt);
+    VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+        "Error: Failed to set kernel arguments!\n",,
+    );
+
+    /* Execute the kernel; the NULL local work size leaves the work-group size to the implementation */
+    err = clEnqueueNDRangeKernel(cq, kernel, 2, NULL, global, NULL , 0, NULL, NULL);
+    VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+        "Error: Failed to execute kernel!\n",
+        printf("err = %d\n",err);,
+    );
+
+    clReleaseMemObject(flimit_mem);   /* NOTE(review): not reached if a VP8_CL_CHECK_SUCCESS above returns early - confirm against the macro */
+    clReleaseMemObject(limit_mem);
+    clReleaseMemObject(thresh_mem);
+
+    VP8_CL_FINISH(cq);
+}
+
+/*
+ * Enqueues vp8_loop_filter_horizontal_edge_kernel on x->cl_commands.
+ *   s_base / s_off        - frame buffer object and offset of the edge in it
+ *   p                     - pitch
+ *   flimit, limit, thresh - passed to vp8_loop_filter_cl_run(), which creates a 16-byte buffer for each
+ *   count                 - multiplied by 8 to form the first global work size
+ *   block_cnt             - second global work size, also kernel argument 6
+ */
+void vp8_loop_filter_horizontal_edge_cl(MACROBLOCKD *x, cl_mem s_base, int s_off, int p,
+    const signed char *flimit, const signed char *limit, const signed char *thresh,
+    int count, int block_cnt)
+{
+    const int work_items = count * 8;
+
+    vp8_loop_filter_cl_run(x->cl_commands,
+        cl_data.vp8_loop_filter_horizontal_edge_kernel,
+        s_base, s_off, p, flimit, limit, thresh, work_items, block_cnt);
+}
+
+/*
+ * Enqueues vp8_loop_filter_vertical_edge_kernel on x->cl_commands.
+ *   s_base / s_off        - frame buffer object and offset of the edge in it
+ *   p                     - passed through as kernel argument 2
+ *   flimit, limit, thresh - passed to vp8_loop_filter_cl_run(), which creates a 16-byte buffer for each
+ *   count                 - multiplied by 8 to form the first global work size
+ *   block_cnt             - second global work size, also kernel argument 6
+ */
+void vp8_loop_filter_vertical_edge_cl(MACROBLOCKD *x, cl_mem s_base, int s_off, int p,
+    const signed char *flimit, const signed char *limit, const signed char *thresh,
+    int count, int block_cnt)
+{
+    const int work_items = count * 8;
+
+    vp8_loop_filter_cl_run(x->cl_commands,
+        cl_data.vp8_loop_filter_vertical_edge_kernel,
+        s_base, s_off, p, flimit, limit, thresh, work_items, block_cnt);
+}
+
+/*
+ * Enqueues vp8_mbloop_filter_horizontal_edge_kernel on x->cl_commands.
+ *   s_base / s_off        - frame buffer object and offset of the edge in it
+ *   p                     - passed through as kernel argument 2
+ *   flimit, limit, thresh - passed to vp8_loop_filter_cl_run(), which creates a 16-byte buffer for each
+ *   count                 - multiplied by 8 to form the first global work size
+ *   block_cnt             - second global work size, also kernel argument 6
+ */
+void vp8_mbloop_filter_horizontal_edge_cl(MACROBLOCKD *x, cl_mem s_base, int s_off, int p,
+    const signed char *flimit, const signed char *limit, const signed char *thresh,
+    int count, int block_cnt)
+{
+    const int work_items = count * 8;
+
+    vp8_loop_filter_cl_run(x->cl_commands,
+        cl_data.vp8_mbloop_filter_horizontal_edge_kernel,
+        s_base, s_off, p, flimit, limit, thresh, work_items, block_cnt);
+}
+
+
+/*
+ * Enqueues vp8_mbloop_filter_vertical_edge_kernel on x->cl_commands.
+ *   s_base / s_off        - frame buffer object and offset of the edge in it
+ *   p                     - passed through as kernel argument 2
+ *   flimit, limit, thresh - passed to vp8_loop_filter_cl_run(), which creates a 16-byte buffer for each
+ *   count                 - multiplied by 8 to form the first global work size
+ *   block_cnt             - second global work size, also kernel argument 6
+ */
+void vp8_mbloop_filter_vertical_edge_cl(MACROBLOCKD *x, cl_mem s_base, int s_off, int p,
+    const signed char *flimit, const signed char *limit, const signed char *thresh,
+    int count, int block_cnt)
+{
+    const int work_items = count * 8;
+
+    vp8_loop_filter_cl_run(x->cl_commands,
+        cl_data.vp8_mbloop_filter_vertical_edge_kernel,
+        s_base, s_off, p, flimit, limit, thresh, work_items, block_cnt);
+}
+
+/*
+ * Enqueues vp8_loop_filter_simple_horizontal_edge_kernel on x->cl_commands.
+ *   s_base / s_off        - frame buffer object and offset of the edge in it
+ *   p                     - passed through as kernel argument 2
+ *   flimit, limit, thresh - passed to vp8_loop_filter_cl_run(), which creates a 16-byte buffer for each
+ *   count                 - multiplied by 8 to form the first global work size
+ *   block_cnt             - second global work size, also kernel argument 6
+ */
+void vp8_loop_filter_simple_horizontal_edge_cl(MACROBLOCKD *x, cl_mem s_base, int s_off, int p,
+    const signed char *flimit, const signed char *limit, const signed char *thresh,
+    int count, int block_cnt)
+{
+    const int work_items = count * 8;
+
+    vp8_loop_filter_cl_run(x->cl_commands,
+        cl_data.vp8_loop_filter_simple_horizontal_edge_kernel,
+        s_base, s_off, p, flimit, limit, thresh, work_items, block_cnt);
+}
+
+/*
+ * Enqueues vp8_loop_filter_simple_vertical_edge_kernel on x->cl_commands.
+ *   s_base / s_off        - frame buffer object and offset of the edge in it
+ *   p                     - passed through as kernel argument 2
+ *   flimit, limit, thresh - passed to vp8_loop_filter_cl_run(), which creates a 16-byte buffer for each
+ *   count                 - multiplied by 8 to form the first global work size
+ *   block_cnt             - second global work size, also kernel argument 6
+ */
+void vp8_loop_filter_simple_vertical_edge_cl(MACROBLOCKD *x, cl_mem s_base, int s_off, int p,
+    const signed char *flimit, const signed char *limit, const signed char *thresh,
+    int count, int block_cnt)
+{
+    const int work_items = count * 8;
+
+    vp8_loop_filter_cl_run(x->cl_commands,
+        cl_data.vp8_loop_filter_simple_vertical_edge_kernel,
+        s_base, s_off, p, flimit, limit, thresh, work_items, block_cnt);
+}
--- a/vp8/common/opencl/opencl_systemdependent.c
+++ b/vp8/common/opencl/opencl_systemdependent.c
@@ -0,0 +1,41 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "../subpixel.h"
+#include "subpixel_cl.h"
+#include "../onyxc_int.h"
+#include "vp8_opencl.h"
+
+#if HAVE_DLOPEN
+#include "dynamic_cl.h"
+#endif
+
+/* Loads the OpenCL library when built with HAVE_DLOPEN, then stores the result of cl_common_init() in cl_initialized. */
+void vp8_arch_opencl_common_init(VP8_COMMON *ctx)
+{
+    (void) ctx;
+
+#if HAVE_DLOPEN
+    /* _WIN32 is the compiler-defined macro; plain WIN32 can be absent in strict ISO modes. */
+#if defined(_WIN32) || defined(WIN32) //Windows .dll has no lib prefix and no extension
+    cl_loaded = load_cl("OpenCL");
+#else   //But *nix needs full name
+    cl_loaded = load_cl("libOpenCL.so");
+#endif
+    /* Only initialise OpenCL when the library actually loaded. */
+    if (cl_loaded == CL_SUCCESS)
+        cl_initialized = cl_common_init();
+    else
+        cl_initialized = VP8_CL_TRIED_BUT_FAILED;
+#else //!HAVE_DLOPEN (e.g. Apple)
+    cl_initialized = cl_common_init();
+#endif
+}
--- a/vp8/common/opencl/reconinter_cl.c
+++ b/vp8/common/opencl/reconinter_cl.c
@@ -0,0 +1,641 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+//for the decoder, all subpixel prediction is done in this file.
+//
+//Need to determine some sort of mechanism for easily determining SIXTAP/BILINEAR
+//and what arguments to feed into the kernels. These kernels SHOULD be 2-pass,
+//and ideally there'd be a data structure that determined what static arguments
+//to pass in.
+//
+//Also, the only external functions being called here are the subpixel prediction
+//functions. Hopefully this means no worrying about when to copy data back/forth.
+
+#include "../../../vpx_ports/config.h"
+//#include "../recon.h"
+#include "../subpixel.h"
+//#include "../blockd.h"
+//#include "../reconinter.h"
+#if CONFIG_RUNTIME_CPU_DETECT
+//#include "../onyxc_int.h"
+#endif
+
+#include "vp8_opencl.h"
+#include "filter_cl.h"
+#include "reconinter_cl.h"
+#include "blockd_cl.h"
+
+#include <stdio.h>
+
+/* use this define on systems where unaligned int reads and writes are
+ * not allowed, i.e. ARM architectures
+ */
+/*#define MUST_BE_ALIGNED*/
+
+/* Block indices visited by the 8x8 luma prediction loops in this file; each
+ * entry is used as x->block[bbb[i]] for i = 0..3.
+ * NOTE(review): presumably the top-left 4x4 block of each 8x8 luma quadrant,
+ * assuming the 16 luma blocks are stored 4 per row -- confirm in blockd.h.
+ */
+static const int bbb[4] = {0, 2, 8, 10};
+
+/* Host-side strided copy.
+ *
+ * Copies num_iter rows of num_bytes bytes, one byte at a time, from
+ * src_base + src_offset to dst_base + dst_offset. Row r starts
+ * r * src_stride bytes into the source and r * dst_stride bytes into the
+ * destination. Nothing is copied when num_iter or num_bytes is <= 0.
+ */
+static void vp8_memcpy(
+    unsigned char *src_base,
+    int src_offset,
+    int src_stride,
+    unsigned char *dst_base,
+    int dst_offset,
+    int dst_stride,
+    int num_bytes,
+    int num_iter
+){
+    unsigned char *from = src_base + src_offset;
+    unsigned char *to = dst_base + dst_offset;
+    int row;
+
+    for (row = 0; row < num_iter; ++row){
+        int col;
+
+        for (col = 0; col < num_bytes; ++col)
+            to[row * dst_stride + col] = from[row * src_stride + col];
+    }
+}
+
+/* Copy num_blocks rectangles from one OpenCL buffer to another.
+ *
+ * cq          command queue the copies/kernels are enqueued on
+ * src_mem     source buffer
+ * src_offsets per-block start offset into src_mem (num_blocks entries)
+ * src_stride  distance between consecutive source rows
+ * dst_mem     destination buffer
+ * dst_offsets per-block start offset into dst_mem (num_blocks entries)
+ * dst_stride  distance between consecutive destination rows
+ * num_bytes   bytes per row
+ * num_iter    number of rows per block
+ * num_blocks  number of blocks to copy
+ *
+ * Two implementations are selected at compile time:
+ *  - MEM_COPY_KERNEL: one launch of cl_data.vp8_memcpy_kernel per block.
+ *  - otherwise: one clEnqueueCopyBuffer() per row of every block.
+ *
+ * Nothing here waits for completion; the work is only enqueued on cq.
+ */
+static void vp8_copy_mem_cl(
+    cl_command_queue cq,
+    cl_mem src_mem,
+    int *src_offsets,
+    int src_stride,
+    cl_mem dst_mem,
+    int *dst_offsets,
+    int dst_stride,
+    int num_bytes,
+    int num_iter,
+    int num_blocks
+){
+
+    int err,block;
+
+#if MEM_COPY_KERNEL
+    /* Only the first two entries are consumed: the kernel is enqueued below
+     * with work_dim == 2. */
+    size_t global[3] = {num_bytes, num_iter, num_blocks};
+
+    size_t local[3];
+    local[0] = global[0];
+    local[1] = global[1];
+    local[2] = global[2];
+
+    /* Arguments that are the same for every block are set once up front;
+     * arguments 1 and 4 (the offsets) are set per block inside the loop. */
+    err  = clSetKernelArg(cl_data.vp8_memcpy_kernel, 0, sizeof (cl_mem), &src_mem);
+    err |= clSetKernelArg(cl_data.vp8_memcpy_kernel, 2, sizeof (int), &src_stride);
+    err |= clSetKernelArg(cl_data.vp8_memcpy_kernel, 3, sizeof (cl_mem), &dst_mem);
+    err |= clSetKernelArg(cl_data.vp8_memcpy_kernel, 5, sizeof (int), &dst_stride);
+    err |= clSetKernelArg(cl_data.vp8_memcpy_kernel, 6, sizeof (int), &num_bytes);
+    err |= clSetKernelArg(cl_data.vp8_memcpy_kernel, 7, sizeof (int), &num_iter);
+    VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+        "Error: Failed to set kernel arguments!\n",
+        return,
+    );
+
+    for (block = 0; block < num_blocks; block++){
+
+        /* Set kernel arguments */
+        err = clSetKernelArg(cl_data.vp8_memcpy_kernel, 1, sizeof (int), &src_offsets[block]);
+        err |= clSetKernelArg(cl_data.vp8_memcpy_kernel, 4, sizeof (int), &dst_offsets[block]);
+        VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+            "Error: Failed to set kernel arguments!\n",
+            return,
+        );
+
+        /* Execute the kernel */
+        /* If the whole block exceeds vp8_memcpy_kernel_size, pass a NULL
+         * local size and let the runtime choose; otherwise run the block as
+         * a single work-group (local == global). */
+        if (num_bytes * num_iter > cl_data.vp8_memcpy_kernel_size){
+            err = clEnqueueNDRangeKernel( cq, cl_data.vp8_memcpy_kernel, 2, NULL, global, NULL , 0, NULL, NULL);
+        } else {
+            err = clEnqueueNDRangeKernel( cq, cl_data.vp8_memcpy_kernel, 2, NULL, global, local , 0, NULL, NULL);
+        }
+
+        VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+            "Error: Failed to execute kernel!\n",
+            return,
+        );
+    }
+#else
+    int iter;
+    /* Fallback: copy each row of each block with its own buffer-to-buffer
+     * copy command. */
+    for (block=0; block < num_blocks; block++){
+        for (iter = 0; iter < num_iter; iter++){
+            err = clEnqueueCopyBuffer(cq, src_mem, dst_mem,
+                    src_offsets[block]+iter*src_stride,
+                    dst_offsets[block]+iter*dst_stride,
+                    num_bytes, 0, NULL, NULL
+                  );
+            /* NOTE(review): unlike the kernel path, the action arguments are
+             * left empty here, so presumably the loop keeps going after a
+             * failed copy -- confirm against VP8_CL_CHECK_SUCCESS. */
+            VP8_CL_CHECK_SUCCESS(cq, err != CL_SUCCESS, "Error copying between buffers\n",
+                    ,
+            );
+        }
+    }
+#endif
+}
+
+/* Build the 4x4 inter prediction for block d into its CL predictor buffer.
+ *
+ * The reference position is d->pre displaced by the whole-pixel part of the
+ * block's motion vector (mv >> 3). If the vector has a fractional part
+ * (low three bits of row or column set) the 4x4 sixtap or bilinear
+ * predictor is run, selected by d->sixtap_filter; otherwise the 4x4 area is
+ * copied unchanged. pitch is the row stride used in the predictor buffer.
+ */
+static void vp8_build_inter_predictors_b_cl(MACROBLOCKD *x, BLOCKD *d, int pitch)
+{
+    const int mv_row = d->bmi.mv.as_mv.row;
+    const int mv_col = d->bmi.mv.as_mv.col;
+
+    /* *d->base_pre is the start of the macroblock's y_buffer, u_buffer,
+     * or v_buffer in the reference frame. */
+    unsigned char *ref_base = *(d->base_pre);
+    cl_mem ref_mem = x->pre.buffer_mem;
+
+    int block_off = d->pre + (mv_row >> 3) * d->pre_stride + (mv_col >> 3);
+    /* Distance of that plane from the start of the reference allocation,
+     * which is what ref_mem offsets are measured from. */
+    int plane_dist = ref_base - x->pre.buffer_alloc;
+    int ref_off = plane_dist + block_off;
+
+    if (!((mv_row | mv_col) & 7))
+    {
+        /* Whole-pixel motion: straight 4x4 copy */
+        vp8_copy_mem_cl(d->cl_commands, ref_mem, &ref_off, d->pre_stride,
+                        d->cl_predictor_mem, &d->predictor_offset, pitch, 4, 4, 1);
+        return;
+    }
+
+    if (d->sixtap_filter == CL_TRUE)
+    {
+        vp8_sixtap_predict4x4_cl(d->cl_commands, ref_base, ref_mem, ref_off,
+                                 d->pre_stride, mv_col & 7, mv_row & 7,
+                                 d->predictor_base, d->cl_predictor_mem,
+                                 d->predictor_offset, pitch);
+    }
+    else
+    {
+        vp8_bilinear_predict4x4_cl(d->cl_commands, ref_base, ref_mem, ref_off,
+                                   d->pre_stride, mv_col & 7, mv_row & 7,
+                                   d->predictor_base, d->cl_predictor_mem,
+                                   d->predictor_offset, pitch);
+    }
+}
+
+
+/* Build an 8x8 inter prediction starting at block d into d's CL predictor
+ * buffer.
+ *
+ * A motion vector with a fractional part (any of the low three bits of row
+ * or column set) goes through the 8x8 sixtap or bilinear predictor, chosen
+ * by d->sixtap_filter; a whole-pixel vector becomes a plain 8x8 copy.
+ * pitch is the row stride used in the predictor buffer.
+ */
+static void vp8_build_inter_predictors4b_cl(MACROBLOCKD *x, BLOCKD *d, int pitch)
+{
+    const int mv_row = d->bmi.mv.as_mv.row;
+    const int mv_col = d->bmi.mv.as_mv.col;
+
+    unsigned char *ref_base = *(d->base_pre);
+    cl_mem ref_mem = x->pre.buffer_mem;
+
+    int block_off = d->pre + (mv_row >> 3) * d->pre_stride + (mv_col >> 3);
+    int plane_dist = ref_base - x->pre.buffer_alloc;
+    int ref_off = plane_dist + block_off;
+
+    if ((mv_row | mv_col) & 7)
+    {
+        /* Fractional motion: run the sub-pixel filter */
+        vp8_subpix_cl_fn_t predict8x8;
+
+        if (d->sixtap_filter == CL_TRUE)
+            predict8x8 = vp8_sixtap_predict8x8_cl;
+        else
+            predict8x8 = vp8_bilinear_predict8x8_cl;
+
+        predict8x8(d->cl_commands, ref_base, ref_mem, ref_off, d->pre_stride,
+                   mv_col & 7, mv_row & 7,
+                   d->predictor_base, d->cl_predictor_mem,
+                   d->predictor_offset, pitch);
+    }
+    else
+    {
+        /* Whole-pixel motion: copy memory directly from src to dest */
+        vp8_copy_mem_cl(d->cl_commands, ref_mem, &ref_off, d->pre_stride,
+                        d->cl_predictor_mem, &d->predictor_offset, pitch, 8, 8, 1);
+    }
+}
+
+/* Build an 8-wide, 4-high inter prediction starting at block d into d's CL
+ * predictor buffer (used when two horizontally adjacent 4x4 blocks share a
+ * motion vector).
+ *
+ * Fractional motion vectors use the 8x4 sixtap or bilinear predictor, chosen
+ * by d->sixtap_filter; whole-pixel vectors become a copy of 4 rows of
+ * 8 bytes. pitch is the row stride used in the predictor buffer.
+ */
+static void vp8_build_inter_predictors2b_cl(MACROBLOCKD *x, BLOCKD *d, int pitch)
+{
+    const int mv_row = d->bmi.mv.as_mv.row;
+    const int mv_col = d->bmi.mv.as_mv.col;
+
+    unsigned char *ref_base = *(d->base_pre);
+    cl_mem ref_mem = x->pre.buffer_mem;
+
+    int block_off = d->pre + (mv_row >> 3) * d->pre_stride + (mv_col >> 3);
+    int plane_dist = ref_base - x->pre.buffer_alloc;
+    int ref_off = plane_dist + block_off;
+
+    if (!((mv_row | mv_col) & 7))
+    {
+        vp8_copy_mem_cl(d->cl_commands, ref_mem, &ref_off, d->pre_stride,
+                        d->cl_predictor_mem, &d->predictor_offset, pitch, 8, 4, 1);
+    }
+    else
+    {
+        vp8_subpix_cl_fn_t predict8x4 = vp8_bilinear_predict8x4_cl;
+
+        if (d->sixtap_filter == CL_TRUE)
+            predict8x4 = vp8_sixtap_predict8x4_cl;
+
+        predict8x4(d->cl_commands, ref_base, ref_mem, ref_off, d->pre_stride,
+                   mv_col & 7, mv_row & 7,
+                   d->predictor_base, d->cl_predictor_mem,
+                   d->predictor_offset, pitch);
+    }
+}
+
+
+/* Build the chroma (U and V) inter prediction for macroblock x into the CL
+ * predictor buffer (x->cl_predictor_mem).
+ *
+ * Non-split inter macroblocks predict each 8x8 chroma plane in one step,
+ * using block 16's motion vector for both planes. Every other case walks
+ * blocks 16..23 in pairs and predicts either one 8x4 area (when the pair
+ * shares a motion vector) or two separate 4x4 blocks.
+ *
+ * The function is bracketed by vp8_cl_mb_prep() / vp8_cl_mb_finish().
+ * NOTE(review): presumably these move the predictor / reference data to and
+ * from the device -- confirm in their definitions.
+ */
+void vp8_build_inter_predictors_mbuv_cl(MACROBLOCKD *x)
+{
+    int i;
+
+    vp8_cl_mb_prep(x, PREDICTOR|PRE_BUF);
+
+#if !ONE_CQ_PER_MB
+    VP8_CL_FINISH(x->cl_commands);
+#endif
+
+    if (x->mode_info_context->mbmi.ref_frame != INTRA_FRAME &&
+        x->mode_info_context->mbmi.mode != SPLITMV)
+    {
+
+        unsigned char *pred_base = x->predictor;
+        /* NOTE(review): presumably U follows the 16x16 luma area (256 bytes)
+         * and V follows the 8x8 U area (256 + 64) in the predictor buffer. */
+        int upred_offset = 256;
+        int vpred_offset = 320;
+
+        int mv_row = x->block[16].bmi.mv.as_mv.row;
+        int mv_col = x->block[16].bmi.mv.as_mv.col;
+        int offset;
+
+        /* Plane offsets are measured from the start of the reference
+         * allocation, which pre_mem offsets are relative to. */
+        unsigned char *pre_base = x->pre.buffer_alloc;
+        cl_mem pre_mem = x->pre.buffer_mem;
+        int upre_off = x->pre.u_buffer - pre_base;
+        int vpre_off = x->pre.v_buffer - pre_base;
+        int pre_stride = x->block[16].pre_stride;
+
+        /* Whole-pixel part of the motion vector */
+        offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
+
+        /* Fractional part present: sub-pixel filter; otherwise plain copy */
+        if ((mv_row | mv_col) & 7)
+        {
+            if (cl_initialized == CL_SUCCESS && x->sixtap_filter == CL_TRUE){
+                vp8_sixtap_predict8x8_cl(x->block[16].cl_commands,pre_base, pre_mem, upre_off+offset, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, upred_offset, 8);
+                vp8_sixtap_predict8x8_cl(x->block[20].cl_commands,pre_base, pre_mem, vpre_off+offset, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, vpred_offset, 8);
+            }
+            else{
+                vp8_bilinear_predict8x8_cl(x->block[16].cl_commands,pre_base, pre_mem, upre_off+offset, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, upred_offset, 8);
+                vp8_bilinear_predict8x8_cl(x->block[20].cl_commands,pre_base, pre_mem, vpre_off+offset, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, vpred_offset, 8);
+            }
+        }
+        else
+        {
+            /* Both planes are copied in one batched call on block 16's queue */
+            int pre_offsets[2] = {upre_off+offset, vpre_off+offset};
+            int pred_offsets[2] = {upred_offset,vpred_offset};
+            vp8_copy_mem_cl(x->block[16].cl_commands, pre_mem, pre_offsets, pre_stride, x->cl_predictor_mem, pred_offsets, 8, 8, 8, 2);
+        }
+    }
+    else
+    {
+        // Can probably batch these operations as well, but not tested in decoder
+        // (or at least not with the test videos I've been using).
+        for (i = 16; i < 24; i += 2)
+        {
+            BLOCKD *d0 = &x->block[i];
+            BLOCKD *d1 = &x->block[i+1];
+            if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+                vp8_build_inter_predictors2b_cl(x, d0, 8);
+            else
+            {
+                vp8_build_inter_predictors_b_cl(x, d0, 8);
+                vp8_build_inter_predictors_b_cl(x, d1, 8);
+            }
+        }
+    }
+
+#if !ONE_CQ_PER_MB
+    VP8_CL_FINISH(x->block[0].cl_commands);
+    VP8_CL_FINISH(x->block[16].cl_commands);
+    VP8_CL_FINISH(x->block[20].cl_commands);
+#endif
+
+    vp8_cl_mb_finish(x, PREDICTOR);
+}
+
+/* Build the full inter prediction (luma and both chroma planes) for
+ * macroblock x into the CL predictor buffer (x->cl_predictor_mem).
+ *
+ * Non-split inter macroblocks are predicted as one 16x16 luma area plus two
+ * 8x8 chroma areas. Otherwise luma is predicted either as four 8x8 areas
+ * (partitioning < 3) or pairwise over the sixteen 4x4 blocks, and chroma is
+ * predicted pairwise over blocks 16..23.
+ *
+ * For each area a motion vector with a fractional part (low three bits set)
+ * selects the sixtap/bilinear predictor; a whole-pixel vector is a copy.
+ * vp8_cl_mb_finish(x, PREDICTOR) is called at the end.
+ */
+void vp8_build_inter_predictors_mb_cl(MACROBLOCKD *x)
+{
+    //If CL is running in encoder, need to call following before proceeding.
+    //vp8_cl_mb_prep(x, PRE_BUF);
+
+#if !ONE_CQ_PER_MB
+    VP8_CL_FINISH(x->cl_commands);
+#endif
+
+    if (x->mode_info_context->mbmi.ref_frame != INTRA_FRAME &&
+        x->mode_info_context->mbmi.mode != SPLITMV)
+    {
+        int offset;
+        unsigned char *pred_base = x->predictor;
+        /* NOTE(review): presumably U follows the 16x16 luma area (256 bytes)
+         * and V follows the 8x8 U area (256 + 64) in the predictor buffer. */
+        int upred_offset = 256;
+        int vpred_offset = 320;
+
+        int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
+        int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
+        int pre_stride = x->block[0].pre_stride;
+
+        /* Offsets are measured from the start of the reference allocation;
+         * the luma offset already includes the whole-pixel displacement. */
+        unsigned char *pre_base = x->pre.buffer_alloc;
+        cl_mem pre_mem = x->pre.buffer_mem;
+        int ypre_off = x->pre.y_buffer - pre_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
+        int upre_off = x->pre.u_buffer - pre_base;
+        int vpre_off = x->pre.v_buffer - pre_base;
+
+        /* Luma: 16x16 */
+        if ((mv_row | mv_col) & 7)
+        {
+            if (cl_initialized == CL_SUCCESS && x->sixtap_filter == CL_TRUE){
+                vp8_sixtap_predict16x16_cl(x->block[0].cl_commands, pre_base, pre_mem, ypre_off, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, 0, 16);
+            }
+            else
+                vp8_bilinear_predict16x16_cl(x->block[0].cl_commands, pre_base, pre_mem,  ypre_off, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, 0, 16);
+        }
+        else
+        {
+            //16x16 copy
+            int pred_off = 0;
+            vp8_copy_mem_cl(x->block[0].cl_commands, pre_mem, &ypre_off, pre_stride, x->cl_predictor_mem, &pred_off, 16, 16, 16, 1);
+        }
+
+
+        /* Chroma: two 8x8 planes using block 16's motion vector and half
+         * the luma stride. */
+        mv_row = x->block[16].bmi.mv.as_mv.row;
+        mv_col = x->block[16].bmi.mv.as_mv.col;
+        pre_stride >>= 1;
+        offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
+
+        if ((mv_row | mv_col) & 7)
+        {
+            if (x->sixtap_filter == CL_TRUE){
+                vp8_sixtap_predict8x8_cl(x->block[16].cl_commands, pre_base, pre_mem, upre_off+offset, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, upred_offset, 8);
+                vp8_sixtap_predict8x8_cl(x->block[20].cl_commands, pre_base, pre_mem, vpre_off+offset, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, vpred_offset, 8);
+            }
+            else {
+                vp8_bilinear_predict8x8_cl(x->block[16].cl_commands, pre_base, pre_mem, upre_off+offset, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, upred_offset, 8);
+                vp8_bilinear_predict8x8_cl(x->block[20].cl_commands, pre_base, pre_mem, vpre_off+offset, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, vpred_offset, 8);
+            }
+        }
+        else
+        {
+            int pre_off = upre_off + offset;
+            vp8_copy_mem_cl(x->block[16].cl_commands, pre_mem, &pre_off, pre_stride, x->cl_predictor_mem, &upred_offset, 8, 8, 8, 1);
+            pre_off = vpre_off + offset;
+            vp8_copy_mem_cl(x->block[20].cl_commands, pre_mem, &pre_off, pre_stride, x->cl_predictor_mem, &vpred_offset, 8, 8, 8, 1);
+        }
+    }
+    else
+    {
+        int i;
+
+        /* Luma: four 8x8 areas when partitioning < 3, else pairs of 4x4 */
+        if (x->mode_info_context->mbmi.partitioning < 3)
+        {
+            for (i = 0; i < 4; i++)
+            {
+                BLOCKD *d = &x->block[bbb[i]];
+                vp8_build_inter_predictors4b_cl(x, d, 16);
+            }
+        }
+        else
+        {
+            /* This loop can be done in any order... No dependencies.*/
+            /* Also, d0/d1 can be decoded simultaneously */
+            for (i = 0; i < 16; i += 2)
+            {
+                BLOCKD *d0 = &x->block[i];
+                BLOCKD *d1 = &x->block[i+1];
+
+                if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+                    vp8_build_inter_predictors2b_cl(x, d0, 16);
+                else
+                {
+                    vp8_build_inter_predictors_b_cl(x, d0, 16);
+                    vp8_build_inter_predictors_b_cl(x, d1, 16);
+                }
+            }
+        }
+
+        /* Another case of re-orderable/batchable loop */
+        for (i = 16; i < 24; i += 2)
+        {
+            BLOCKD *d0 = &x->block[i];
+            BLOCKD *d1 = &x->block[i+1];
+
+            if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+                vp8_build_inter_predictors2b_cl(x, d0, 8);
+            else
+            {
+                vp8_build_inter_predictors_b_cl(x, d0, 8);
+                vp8_build_inter_predictors_b_cl(x, d1, 8);
+            }
+        }
+    }
+
+#if !ONE_CQ_PER_MB
+    VP8_CL_FINISH(x->block[0].cl_commands);
+    VP8_CL_FINISH(x->block[16].cl_commands);
+    VP8_CL_FINISH(x->block[20].cl_commands);
+#endif
+
+    vp8_cl_mb_finish(x, PREDICTOR);
+}
+
+
+/* The following functions are written for skip_recon_mb() to call. Since there is no recon in this
+ * situation, we can write the result directly to dst buffer instead of writing it to predictor
+ * buffer and then copying it to dst buffer.
+ */
+/* Predict the 4x4 block d straight into the destination frame buffer
+ * (x->dst.buffer_mem) at dst_offset, using d->dst_stride as the row stride.
+ *
+ * A fractional motion vector runs the 4x4 sixtap or bilinear predictor
+ * (selected by d->sixtap_filter) with a NULL host destination pointer; a
+ * whole-pixel vector is a plain 4x4 copy.
+ */
+static void vp8_build_inter_predictors_b_s_cl(MACROBLOCKD *x, BLOCKD *d, int dst_offset)
+{
+    const int mv_row = d->bmi.mv.as_mv.row;
+    const int mv_col = d->bmi.mv.as_mv.col;
+
+    unsigned char *ref_base = *(d->base_pre);
+    cl_mem ref_mem = x->pre.buffer_mem;
+    cl_mem out_mem = x->dst.buffer_mem;
+
+    int ref_stride = d->pre_stride;
+    int out_stride = d->dst_stride;
+
+    int block_off = d->pre + (mv_row >> 3) * d->pre_stride + (mv_col >> 3);
+    int plane_dist = ref_base - x->pre.buffer_alloc;
+    int ref_off = plane_dist + block_off;
+
+    if (!((mv_row | mv_col) & 7))
+    {
+        /* Whole-pixel motion: copy the 4x4 area as is */
+        vp8_copy_mem_cl(d->cl_commands, ref_mem, &ref_off, ref_stride,
+                        out_mem, &dst_offset, out_stride, 4, 4, 1);
+        return;
+    }
+
+    if (d->sixtap_filter == CL_TRUE)
+    {
+        vp8_sixtap_predict4x4_cl(d->cl_commands, ref_base, ref_mem, ref_off,
+                                 ref_stride, mv_col & 7, mv_row & 7,
+                                 NULL, out_mem, dst_offset, out_stride);
+    }
+    else
+    {
+        vp8_bilinear_predict4x4_cl(d->cl_commands, ref_base, ref_mem, ref_off,
+                                   ref_stride, mv_col & 7, mv_row & 7,
+                                   NULL, out_mem, dst_offset, out_stride);
+    }
+}
+
+
+/* Build the inter prediction for macroblock x directly into the destination
+ * frame buffer (x->dst.buffer_mem) instead of the predictor buffer. Intended
+ * for the skip-reconstruction path, where the prediction is the final
+ * output.
+ *
+ * Non-split macroblocks are predicted as one 16x16 luma area and two 8x8
+ * chroma areas. The SPLITMV branch mirrors vp8_build_inter_predictors_mb_cl
+ * but, per the comment inside it, is never reached by the decoder.
+ *
+ * The function is bracketed by vp8_cl_mb_prep(x, DST_BUF) and
+ * vp8_cl_mb_finish(x, DST_BUF).
+ */
+void vp8_build_inter_predictors_mb_s_cl(MACROBLOCKD *x)
+{
+    cl_mem dst_mem = NULL;
+    cl_mem pre_mem = x->pre.buffer_mem;
+
+    /* Destination plane offsets, measured from the start of the destination
+     * frame allocation (what dst_mem offsets are relative to). */
+    unsigned char *dst_base = x->dst.buffer_alloc;
+    int ydst_off = x->dst.y_buffer - dst_base;
+    int udst_off = x->dst.u_buffer - dst_base;
+    int vdst_off = x->dst.v_buffer - dst_base;
+
+    dst_mem = x->dst.buffer_mem;
+    vp8_cl_mb_prep(x, DST_BUF);
+
+#if !ONE_CQ_PER_MB
+    VP8_CL_FINISH(x->cl_commands);
+#endif
+
+    if (x->mode_info_context->mbmi.mode != SPLITMV)
+    {
+        int offset;
+        unsigned char *pre_base = x->pre.buffer_alloc;
+        int ypre_off = x->pre.y_buffer - pre_base;
+        int upre_off = x->pre.u_buffer - pre_base;
+        int vpre_off = x->pre.v_buffer - pre_base;
+
+        int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
+        int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
+        /* NOTE(review): the reference stride is taken from the destination
+         * frame; this assumes both frames share the same stride -- confirm. */
+        int pre_stride = x->dst.y_stride;
+
+        /* Whole-pixel part of the luma motion vector */
+        int ptr_offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
+
+        /* Luma: 16x16 */
+        if ((mv_row | mv_col) & 7)
+        {
+            if (x->sixtap_filter == CL_TRUE){
+                vp8_sixtap_predict16x16_cl(x->block[0].cl_commands, pre_base, pre_mem, ypre_off+ptr_offset, pre_stride, mv_col & 7, mv_row & 7, dst_base, dst_mem, ydst_off, x->dst.y_stride);
+            }
+            else
+                vp8_bilinear_predict16x16_cl(x->block[0].cl_commands, pre_base, pre_mem, ypre_off+ptr_offset, pre_stride, mv_col & 7, mv_row & 7, dst_base, dst_mem, ydst_off, x->dst.y_stride);
+        }
+        else
+        {
+            int pre_off = ypre_off+ptr_offset;
+            vp8_copy_mem_cl(x->block[0].cl_commands, pre_mem, &pre_off, pre_stride, dst_mem, &ydst_off, x->dst.y_stride, 16, 16, 1);
+        }
+
+        /* Chroma: two 8x8 planes using block 16's motion vector and half
+         * the luma stride. */
+        mv_row = x->block[16].bmi.mv.as_mv.row;
+        mv_col = x->block[16].bmi.mv.as_mv.col;
+        pre_stride >>= 1;
+        offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
+
+        if ((mv_row | mv_col) & 7)
+        {
+            if (x->sixtap_filter == CL_TRUE){
+                vp8_sixtap_predict8x8_cl(x->block[16].cl_commands, pre_base, pre_mem, upre_off+offset, pre_stride, mv_col & 7, mv_row & 7, dst_base, dst_mem, udst_off, x->dst.uv_stride);
+                vp8_sixtap_predict8x8_cl(x->block[20].cl_commands, pre_base, pre_mem, vpre_off+offset, pre_stride, mv_col & 7, mv_row & 7, dst_base, dst_mem, vdst_off, x->dst.uv_stride);
+            } else {
+                vp8_bilinear_predict8x8_cl(x->block[16].cl_commands, pre_base, pre_mem, upre_off+offset, pre_stride, mv_col & 7, mv_row & 7, dst_base, dst_mem, udst_off, x->dst.uv_stride);
+                vp8_bilinear_predict8x8_cl(x->block[20].cl_commands, pre_base, pre_mem, vpre_off+offset, pre_stride, mv_col & 7, mv_row & 7, dst_base, dst_mem, vdst_off, x->dst.uv_stride);
+            }
+        }
+        else
+        {
+            /* Both planes are copied in one batched call on block 16's queue */
+            int pre_offsets[2] = {upre_off+offset, vpre_off+offset};
+            int dst_offsets[2] = {udst_off,vdst_off};
+            vp8_copy_mem_cl(x->block[16].cl_commands, pre_mem, pre_offsets, pre_stride, dst_mem, dst_offsets, x->dst.uv_stride, 8, 8, 2);
+        }
+
+    }
+    else
+    {
+        /* note: this whole ELSE part is not executed at all, so there is no
+         * way to test the correctness of this modification. If something
+         * turns out to be wrong later, go back to what is done in
+         * build_inter_predictors_mb.
+         *
+         * ACW: Not sure who the above comment belongs to, but it is
+         *      accurate for the decoder. Verified by reverse trace of source
+         *
+         * NOTE(review): every sub-block below is written to the same
+         * destination offset (ydst_off), including the chroma blocks. That
+         * looks wrong for a real split macroblock; per-block destination
+         * offsets need to be worked out before this path is enabled.
+         */
+        int i;
+
+        if (x->mode_info_context->mbmi.partitioning < 3)
+        {
+            for (i = 0; i < 4; i++)
+            {
+                BLOCKD *d = &x->block[bbb[i]];
+
+                {
+                    unsigned char *ptr_base = *(d->base_pre);
+                    int pre_off = ptr_base - x->pre.buffer_alloc;
+
+                    int ptr_offset = d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
+
+                    pre_off += ptr_offset;
+
+                    if ( (d->bmi.mv.as_mv.row | d->bmi.mv.as_mv.col) & 7)
+                    {
+                        if (x->sixtap_filter == CL_TRUE)
+                            vp8_sixtap_predict8x8_cl(d->cl_commands, ptr_base, pre_mem, pre_off, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst_base, dst_mem, ydst_off, x->dst.y_stride);
+                        else
+                            vp8_bilinear_predict8x8_cl(d->cl_commands, ptr_base, pre_mem, pre_off, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst_base, dst_mem, ydst_off, x->dst.y_stride);
+                    }
+                    else
+                    {
+                        vp8_copy_mem_cl(x->block[0].cl_commands, pre_mem, &pre_off, d->pre_stride, dst_mem, &ydst_off, x->dst.y_stride, 8, 8, 1);
+                    }
+                }
+            }
+        }
+        else
+        {
+            for (i = 0; i < 16; i += 2)
+            {
+                BLOCKD *d0 = &x->block[i];
+                BLOCKD *d1 = &x->block[i+1];
+
+                if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+                {
+                    /*vp8_build_inter_predictors2b(x, d0, 16);*/
+                    unsigned char *ptr_base = *(d0->base_pre);
+
+                    int pre_off = ptr_base - x->pre.buffer_alloc;
+
+                    int ptr_offset = d0->pre + (d0->bmi.mv.as_mv.row >> 3) * d0->pre_stride + (d0->bmi.mv.as_mv.col >> 3);
+                    pre_off += ptr_offset;
+
+                    if ( (d0->bmi.mv.as_mv.row | d0->bmi.mv.as_mv.col) & 7)
+                    {
+                        if (d0->sixtap_filter == CL_TRUE)
+                            vp8_sixtap_predict8x4_cl(d0->cl_commands, ptr_base, pre_mem, pre_off, d0->pre_stride, d0->bmi.mv.as_mv.col & 7, d0->bmi.mv.as_mv.row & 7, dst_base, dst_mem, ydst_off, x->dst.y_stride);
+                        else
+                            vp8_bilinear_predict8x4_cl(d0->cl_commands, ptr_base, pre_mem,pre_off, d0->pre_stride, d0->bmi.mv.as_mv.col & 7, d0->bmi.mv.as_mv.row & 7, dst_base, dst_mem, ydst_off, x->dst.y_stride);
+                    }
+                    else
+                    {
+                        vp8_copy_mem_cl(x->block[0].cl_commands, pre_mem, &pre_off, d0->pre_stride, dst_mem, &ydst_off, x->dst.y_stride, 8, 4, 1);
+                    }
+                }
+                else
+                {
+                    vp8_build_inter_predictors_b_s_cl(x,d0, ydst_off);
+                    vp8_build_inter_predictors_b_s_cl(x,d1, ydst_off);
+                }
+            }
+        }
+
+        for (i = 16; i < 24; i += 2)
+        {
+            BLOCKD *d0 = &x->block[i];
+            BLOCKD *d1 = &x->block[i+1];
+
+            if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+            {
+                /*vp8_build_inter_predictors2b(x, d0, 8);*/
+                unsigned char *ptr_base = *(d0->base_pre);
+                int ptr_offset = d0->pre + (d0->bmi.mv.as_mv.row >> 3) * d0->pre_stride + (d0->bmi.mv.as_mv.col >> 3);
+                int pre_off = ptr_base - x->pre.buffer_alloc + ptr_offset;
+
+                if ( (d0->bmi.mv.as_mv.row | d0->bmi.mv.as_mv.col) & 7)
+                {
+                    /* Select the filter the same way as the luma loop above.
+                     * The previous test "sixtap_filter || CL_TRUE" was always
+                     * true, which made the bilinear branch unreachable. */
+                    if (d0->sixtap_filter == CL_TRUE)
+                        vp8_sixtap_predict8x4_cl(d0->cl_commands, ptr_base, pre_mem, pre_off, d0->pre_stride,
+                            d0->bmi.mv.as_mv.col & 7, d0->bmi.mv.as_mv.row & 7,
+                            dst_base, dst_mem, ydst_off, x->dst.uv_stride);
+                    else
+                        vp8_bilinear_predict8x4_cl(d0->cl_commands, ptr_base, pre_mem, pre_off, d0->pre_stride,
+                            d0->bmi.mv.as_mv.col & 7, d0->bmi.mv.as_mv.row & 7,
+                            dst_base, dst_mem, ydst_off, x->dst.uv_stride);
+                }
+                else
+                {
+                    vp8_copy_mem_cl(x->block[0].cl_commands, pre_mem, &pre_off,
+                        d0->pre_stride, dst_mem, &ydst_off, x->dst.uv_stride, 8, 4, 1);
+                }
+            }
+            else
+            {
+                vp8_build_inter_predictors_b_s_cl(x,d0, ydst_off);
+                vp8_build_inter_predictors_b_s_cl(x,d1, ydst_off);
+            }
+        } //end for
+    }
+
+#if !ONE_CQ_PER_MB
+    VP8_CL_FINISH(x->block[0].cl_commands);
+    VP8_CL_FINISH(x->block[16].cl_commands);
+    VP8_CL_FINISH(x->block[20].cl_commands);
+#endif
+
+    vp8_cl_mb_finish(x, DST_BUF);
+}
--- a/vp8/common/opencl/reconinter_cl.h
+++ b/vp8/common/opencl/reconinter_cl.h
@@ -0,0 +1,25 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_RECONINTER_CL_H
+#define __INC_RECONINTER_CL_H
+
+#include "blockd_cl.h"
+#include "subpixel_cl.h"
+#include "filter_cl.h"
+
+/* Build the luma and chroma inter prediction for a macroblock into the CL
+ * predictor buffer. */
+extern void vp8_build_inter_predictors_mb_cl(MACROBLOCKD *x);
+/* Build only the chroma (U/V) inter prediction into the CL predictor buffer. */
+extern void vp8_build_inter_predictors_mbuv_cl(MACROBLOCKD *x);
+
+/* Variant for the skip-reconstruction path: writes the prediction directly
+ * into the destination frame buffer instead of the predictor buffer. */
+extern void vp8_build_inter_predictors_mb_s_cl(MACROBLOCKD *x);
+//extern void vp8_build_inter_predictors_b_cl(BLOCKD *d, int pitch);
+
+#endif
--- a/vp8/common/opencl/subpixel_cl.h
+++ b/vp8/common/opencl/subpixel_cl.h
@@ -0,0 +1,46 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef SUBPIXEL_CL_H
+#define SUBPIXEL_CL_H
+
+#include "../blockd.h"
+
+/* Note:
+ *
+ * This platform is commonly built for runtime CPU detection. If you modify
+ * any of the function mappings present in this file, be sure to also update
+ * them in the function pointer initialization code
+ */
+
+/* Common signature shared by every OpenCL sub-pixel predictor.
+ *
+ *   cq          command queue to run on
+ *   src_base    host pointer for the source data
+ *   src_mem     OpenCL buffer holding the source data
+ *   src_offset  offset of the first source pixel within src_mem
+ *   src_pitch   source row stride
+ *   xofst/yofst fractional motion vector parts; callers pass
+ *               (mv_col & 7) and (mv_row & 7)
+ *   dst_base    host pointer for the destination (some callers pass NULL)
+ *   dst_mem     OpenCL buffer receiving the prediction
+ *   dst_offset  offset of the first destination pixel within dst_mem
+ *   dst_pitch   destination row stride
+ */
+#define prototype_subpixel_predict_cl(sym) \
+    void sym(cl_command_queue cq, unsigned char *src_base, cl_mem src_mem, int src_offset, \
+            int src_pitch, int xofst, int yofst, \
+             unsigned char *dst_base, cl_mem dst_mem, int dst_offset, int dst_pitch)
+
+/* One predictor per filter type (sixtap, bilinear) and block size. */
+extern prototype_subpixel_predict_cl(vp8_sixtap_predict16x16_cl);
+extern prototype_subpixel_predict_cl(vp8_sixtap_predict8x8_cl);
+extern prototype_subpixel_predict_cl(vp8_sixtap_predict8x4_cl);
+extern prototype_subpixel_predict_cl(vp8_sixtap_predict4x4_cl);
+extern prototype_subpixel_predict_cl(vp8_bilinear_predict16x16_cl);
+extern prototype_subpixel_predict_cl(vp8_bilinear_predict8x8_cl);
+extern prototype_subpixel_predict_cl(vp8_bilinear_predict8x4_cl);
+extern prototype_subpixel_predict_cl(vp8_bilinear_predict4x4_cl);
+
+/* Pointer to any function with the signature above. */
+typedef prototype_subpixel_predict_cl((*vp8_subpix_cl_fn_t));
+
+//typedef enum
+//{
+//    SIXTAP = 0,
+//    BILINEAR = 1
+//} SUBPIX_TYPE;
+
+#endif
--- a/vp8/common/opencl/vp8_opencl.c
+++ b/vp8/common/opencl/vp8_opencl.c
@@ -0,0 +1,342 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include "vp8_opencl.h"
+
+/* Global OpenCL status. Starts as VP8_CL_NOT_INITIALIZED; cl_destroy() stores
+ * its new_status argument here, and the code in this file treats CL_SUCCESS
+ * as "OpenCL is usable". */
+int cl_initialized = VP8_CL_NOT_INITIALIZED;
+/* Shared OpenCL state: selected device, context, programs and kernels. */
+VP8_COMMON_CL cl_data;
+
+//Initialization functions for various CL programs.
+//Each is called from cl_common_init() when its ENABLE_CL_* switch is on.
+extern int cl_init_filter();
+extern int cl_init_idct();
+extern int cl_init_loop_filter();
+
+//Common CL destructors
+//Each is called from cl_destroy() when its ENABLE_CL_* switch is on.
+extern void cl_destroy_loop_filter();
+extern void cl_destroy_filter();
+extern void cl_destroy_idct();
+
+//Destructors for encoder/decoder-specific bits
+extern void cl_decode_destroy();
+extern void cl_encode_destroy();
+
+/**
+ * Tear down the shared OpenCL state and record a new global status.
+ *
+ * Does nothing unless cl_initialized is currently CL_SUCCESS. Otherwise it
+ * waits for the given command queue to drain, calls the per-feature destroy
+ * hooks that are compiled in, releases the command queue and the shared
+ * context, and finally stores new_status in cl_initialized.
+ *
+ * NOTE(review): because of the early return, a call made while the status is
+ * still VP8_CL_NOT_INITIALIZED (for example from the error macro used inside
+ * cl_common_init()) releases nothing and leaves cl_initialized untouched.
+ * Confirm that initialization-failure paths do not rely on this for cleanup.
+ *
+ * @param cq          Command queue to finish and release; may be NULL.
+ * @param new_status  Value written to cl_initialized once teardown is done.
+ */
+void cl_destroy(cl_command_queue cq, int new_status) {
+
+    //Only tear down a context that was successfully brought up.
+    if (cl_initialized != CL_SUCCESS)
+        return;
+
+    //Wait on any pending operations to complete... frees up all of our pointers
+    if (cq != NULL)
+        clFinish(cq);
+
+#if ENABLE_CL_SUBPIXEL
+    //Release the objects that we've allocated on the GPU
+    cl_destroy_filter();
+#endif
+
+#if ENABLE_CL_IDCT_DEQUANT
+    cl_destroy_idct();
+
+    //Decoder teardown is nested in this block, so it is only compiled when
+    //ENABLE_CL_IDCT_DEQUANT is on.
+#if CONFIG_VP8_DECODER
+    if (cl_data.cl_decode_initialized == CL_SUCCESS)
+        cl_decode_destroy();
+#endif
+
+#endif
+#if ENABLE_CL_LOOPFILTER
+    cl_destroy_loop_filter();
+#endif
+
+
+#if CONFIG_VP8_ENCODER
+    //placeholder for if/when encoder CL gets implemented
+#endif
+
+    //The queue is released only after the feature hooks above have run.
+    if (cq){
+        clReleaseCommandQueue(cq);
+    }
+
+    //Clear the handle after release so a later call cannot release it twice.
+    if (cl_data.context){
+        clReleaseContext(cl_data.context);
+        cl_data.context = NULL;
+    }
+
+    cl_initialized = new_status;
+
+    return;
+}
+
+/**
+ * Look up the OpenCL device type of a device.
+ *
+ * @param dev  Device to query.
+ * @return The value reported for CL_DEVICE_TYPE, or CL_INVALID_DEVICE
+ *         (converted to cl_device_type) when the query fails.
+ */
+cl_device_type device_type(cl_device_id dev){
+    cl_device_type kind;
+
+    if (clGetDeviceInfo(dev, CL_DEVICE_TYPE, sizeof(kind), &kind, NULL) != CL_SUCCESS)
+        return CL_INVALID_DEVICE;
+
+    return kind;
+}
+
+/**
+ * Select an OpenCL device, create the shared context and initialize the
+ * CL programs that are enabled at compile time.
+ *
+ * Every platform is searched for a device that advertises
+ * cl_khr_byte_addressable_store. A GPU is preferred and ends the search as
+ * soon as one is found; otherwise the last usable device seen on any platform
+ * is kept as the fallback.
+ *
+ * This function does not write cl_initialized itself.
+ * NOTE(review): presumably the caller stores the return value there -- confirm.
+ *
+ * @return CL_SUCCESS on success; the current cl_initialized value if
+ *         initialization was already attempted; VP8_CL_TRIED_BUT_FAILED if no
+ *         platform, usable device or context could be obtained; otherwise the
+ *         error returned by the failing cl_init_*() call.
+ */
+int cl_common_init() {
+    int err,i,dev;
+    cl_platform_id platform_ids[MAX_NUM_PLATFORMS];
+    cl_uint num_found = 0, num_devices = 0;
+    cl_device_id devices[MAX_NUM_DEVICES];
+
+    //Don't allow multiple CL contexts..
+    if (cl_initialized != VP8_CL_NOT_INITIALIZED)
+        return cl_initialized;
+
+    // Connect to a compute device
+    err = clGetPlatformIDs(MAX_NUM_PLATFORMS, platform_ids, &num_found);
+
+    if (err != CL_SUCCESS) {
+        fprintf(stderr, "Couldn't query platform IDs\n");
+        return VP8_CL_TRIED_BUT_FAILED;
+    }
+
+    if (num_found == 0) {
+        fprintf(stderr, "No platforms found\n");
+        return VP8_CL_TRIED_BUT_FAILED;
+    }
+
+    //clGetPlatformIDs reports how many platforms exist, which can be more
+    //than the number of entries it was allowed to store in platform_ids.
+    if (num_found > MAX_NUM_PLATFORMS)
+        num_found = MAX_NUM_PLATFORMS;
+
+    //No device chosen yet. This is cleared once, not per platform, so that a
+    //usable fallback found on an earlier platform survives while later
+    //platforms are searched for a GPU.
+    cl_data.device_id = NULL;
+
+    //printf("Enumerating %d platform(s)\n", num_found);
+    //Enumerate the platforms found
+    for (i = 0; i < (int)num_found; i++){
+        char buf[2048];
+        size_t len;
+
+        err = clGetPlatformInfo( platform_ids[i], CL_PLATFORM_VENDOR, sizeof(buf), buf, &len);
+        if (err != CL_SUCCESS){
+            fprintf(stderr, "Error retrieving platform vendor for platform %d",i);
+            continue;
+        }
+        //printf("Platform %d: %s\n",i,buf);
+
+        //If you need to force a platform (e.g. CPU-only testing), uncomment this
+        //if (strstr(buf,"NVIDIA"))
+        //    continue;
+
+        //Try to find a valid compute device
+        //Favor the GPU, but fall back to any other available device if necessary
+#ifdef __APPLE__
+        printf("Apple system. Running CL as CPU-only for now...\n");
+        err = clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_CPU, MAX_NUM_DEVICES, devices, &num_devices);
+#else
+        err = clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_ALL, MAX_NUM_DEVICES, devices, &num_devices);
+#endif //__APPLE__
+        //num_devices is only meaningful when the query succeeded (a platform
+        //without a matching device reports an error), so skip this platform.
+        if (err != CL_SUCCESS)
+            continue;
+
+        //As with platforms, the reported count may exceed the array capacity.
+        if (num_devices > MAX_NUM_DEVICES)
+            num_devices = MAX_NUM_DEVICES;
+
+        //printf("found %d devices\n", num_devices);
+        for( dev = 0; dev < (int)num_devices; dev++ ){
+            char ext[2048];
+            //Get info for this device.
+            err = clGetDeviceInfo(devices[dev], CL_DEVICE_EXTENSIONS,
+                    sizeof(ext),ext,NULL);
+            //On failure this logs and moves on to the next device; the
+            //trailing return inside the macro is never reached.
+            VP8_CL_CHECK_SUCCESS(NULL,err != CL_SUCCESS,
+                    "Error retrieving device extension list",continue, 0);
+            //printf("Device %d supports: %s\n",dev,ext);
+
+            //The kernels in VP8 require byte-addressable stores, which is an
+            //extension. It's required in OpenCL 1.1, but not all devices
+            //support it.
+            if (strstr(ext,"cl_khr_byte_addressable_store")){
+                //We found a valid device, so use it. But if we find a GPU
+                //(maybe this is one), prefer that.
+                cl_data.device_id = devices[dev];
+
+                if ( device_type(devices[dev]) == CL_DEVICE_TYPE_GPU ){
+                    //printf("Device %d is a GPU\n",dev);
+                    break;
+                }
+            }
+        }
+
+        //If we've found a usable GPU, stop looking.
+        if (cl_data.device_id != NULL && device_type(cl_data.device_id) == CL_DEVICE_TYPE_GPU )
+            break;
+
+    }
+
+    if (cl_data.device_id == NULL){
+        printf("Error: Failed to find a valid OpenCL device. Using CPU paths\n");
+        return VP8_CL_TRIED_BUT_FAILED;
+    }
+
+    // Create the compute context
+    cl_data.context = clCreateContext(0, 1, &cl_data.device_id, NULL, NULL, &err);
+    if (!cl_data.context) {
+        printf("Error: Failed to create a compute context!\n");
+        return VP8_CL_TRIED_BUT_FAILED;
+    }
+
+    //Initialize programs to null value
+    //Enables detection of if they've been initialized as well.
+    cl_data.filter_program = NULL;
+    cl_data.idct_program = NULL;
+    cl_data.loop_filter_program = NULL;
+
+#if ENABLE_CL_SUBPIXEL
+    err = cl_init_filter();
+    if (err != CL_SUCCESS)
+        return err;
+#endif
+
+#if ENABLE_CL_IDCT_DEQUANT
+    err = cl_init_idct();
+    if (err != CL_SUCCESS)
+        return err;
+#endif
+
+#if ENABLE_CL_LOOPFILTER
+
+    err = cl_init_loop_filter();
+    if (err != CL_SUCCESS)
+        return err;
+#endif
+
+    return CL_SUCCESS;
+}
+
+/**
+ * Read an entire file into a newly allocated, NUL-terminated buffer.
+ *
+ * file_name is tried exactly as given first. If that fails, it is looked up
+ * inside the directory returned by vpx_codec_lib_dir().
+ *
+ * @param file_name  Name of the file to load.
+ * @return Buffer that the caller must free(), or NULL if the file cannot be
+ *         found, is empty, cannot be measured or read in full, or if memory
+ *         allocation fails.
+ */
+char *cl_read_file(const char* file_name) {
+    long pos;
+    char *bytes;
+    FILE *f;
+
+    f = fopen(file_name, "rb");
+
+    if (f == NULL) {
+        const char *lib_dir = vpx_codec_lib_dir();
+        char *fullpath;
+        //printf("Couldn't find %s\n", file_name);
+
+        //Without an install directory there is no second place to look.
+        if (lib_dir == NULL) {
+            fprintf(stderr,"Couldn't find CL source at %s\n", file_name);
+            return NULL;
+        }
+
+        //Generate a file path for the CL sources using the library install dir
+        //(+2: one byte for the separator, one for the terminator)
+        fullpath = malloc(strlen(lib_dir) + strlen(file_name) + 2);
+        if (fullpath == NULL) {
+           return NULL;
+        }
+        strcpy(fullpath, lib_dir);
+        strcat(fullpath, "/"); //Will need to be changed for MSVS
+        strcat(fullpath, file_name);
+
+        //printf("Looking in %s\n", fullpath);
+
+        f = fopen(fullpath, "rb");
+        if (f == NULL) {
+            fprintf(stderr,"Couldn't find CL source at %s or %s\n", file_name, fullpath);
+            free(fullpath);
+            return NULL;
+        }
+
+        //printf("Found cl source at %s\n", fullpath);
+        free(fullpath);
+    }
+
+    //Measure the file. ftell() returns -1 on failure, which must never be
+    //used as an allocation or read size. An empty file has nothing to load
+    //and is reported as a failure, too.
+    if (fseek(f, 0, SEEK_END) != 0 ||
+        (pos = ftell(f)) <= 0 ||
+        fseek(f, 0, SEEK_SET) != 0) {
+        fclose(f);
+        return NULL;
+    }
+
+    bytes = malloc((size_t)pos + 1);
+
+    if (bytes == NULL) {
+        fclose(f);
+        return NULL;
+    }
+
+    //Read the contents as a single item, so anything short of the whole file
+    //counts as a failure.
+    if (fread(bytes, (size_t)pos, 1, f) != 1) {
+        free(bytes);
+        fclose(f);
+        return NULL;
+    }
+
+    bytes[pos] = '\0'; //null terminate the source string
+    fclose(f);
+
+    return bytes;
+}
+
+/**
+ * Print the OpenCL build log of a program, as built for cl_data.device_id,
+ * to stdout.
+ *
+ * The log length is queried first and a buffer of that size is allocated.
+ * If the length query or the allocation fails, a message is printed and
+ * nothing else is attempted.
+ *
+ * @param prog_ref  Points to the program whose build log should be shown.
+ */
+void show_build_log(cl_program *prog_ref){
+    size_t len = 0;
+    char *buffer;
+    int err = clGetProgramBuildInfo(*prog_ref, cl_data.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &len);
+
+    //Without a valid length there is nothing sensible to allocate.
+    if (err != CL_SUCCESS){
+        printf("Error: Could not get length of CL build log\n");
+        return;
+    }
+
+    //Nothing to show for an empty log.
+    if (len == 0)
+        return;
+
+    buffer = (char*) malloc(len);
+    if (buffer == NULL) {
+        printf("Error: Couldn't allocate compile output buffer memory\n");
+        return;
+    }
+
+    err = clGetProgramBuildInfo(*prog_ref, cl_data.device_id, CL_PROGRAM_BUILD_LOG, len, buffer, NULL);
+    if (err != CL_SUCCESS) {
+        printf("Error: Could not get CL build log\n");
+
+    } else {
+        printf("Compile output: %s\n", buffer);
+    }
+    free(buffer);
+}
+
+/**
+ * Load an OpenCL source file, create a program from it in cl_data.context
+ * and build it.
+ *
+ * @param prog_ref   Receives the program. Set to NULL when the source cannot
+ *                   be read or the program cannot be created. When only the
+ *                   build step fails, the program object is left in place and
+ *                   is not released here.
+ * @param file_name  Source file, located via cl_read_file().
+ * @param opts       Build options handed to clBuildProgram().
+ * @return CL_SUCCESS, or VP8_CL_TRIED_BUT_FAILED on any failure.
+ */
+int cl_load_program(cl_program *prog_ref, const char *file_name, const char *opts) {
+
+    int err = CL_SUCCESS;
+    char *kernel_src = cl_read_file(file_name);
+
+    *prog_ref = NULL;
+    if (kernel_src == NULL) {
+        cl_destroy(NULL, VP8_CL_TRIED_BUT_FAILED);
+        printf("Couldn't find OpenCL source files. \nUsing software path.\n");
+        return VP8_CL_TRIED_BUT_FAILED;
+    }
+
+    *prog_ref = clCreateProgramWithSource(cl_data.context, 1, (const char**)&kernel_src, NULL, &err);
+    //The source text is not used again in this function, so it is freed here.
+    free(kernel_src);
+
+    if (*prog_ref == NULL) {
+        //Report the OpenCL error code as well; returning before the code was
+        //printed made creation failures impossible to diagnose.
+        printf("Error: Couldn't create program\n");
+        printf("Error creating program: %d\n", err);
+        return VP8_CL_TRIED_BUT_FAILED;
+    }
+
+    if (err != CL_SUCCESS) {
+        printf("Error creating program: %d\n", err);
+    }
+
+    /* Build the program executable */
+    err = clBuildProgram(*prog_ref, 0, NULL, opts, NULL, NULL);
+    if (err != CL_SUCCESS) {
+        printf("Error: Failed to build program executable for %s!\n", file_name);
+
+        show_build_log(prog_ref);
+
+        return VP8_CL_TRIED_BUT_FAILED;
+    }
+
+    return CL_SUCCESS;
+}
--- a/vp8/common/opencl/vp8_opencl.h
+++ b/vp8/common/opencl/vp8_opencl.h
@@ -0,0 +1,192 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_OPENCL_H
+#define	VP8_OPENCL_H
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#include "../../../vpx_config.h"
+
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+#if HAVE_DLOPEN
+#include "dynamic_cl.h"
+#endif
+
+/* Compile-time switches. In vp8_opencl.c the ENABLE_CL_* switches decide
+ * which cl_init_*() / cl_destroy_*() hooks are compiled into
+ * cl_common_init() and cl_destroy().
+ * NOTE(review): TWO_PASS_SIXTAP, MEM_COPY_KERNEL and ONE_CQ_PER_MB are
+ * consumed outside this header -- check their users before changing them.
+ */
+#define ENABLE_CL_IDCT_DEQUANT 0
+#define ENABLE_CL_SUBPIXEL 1
+#define TWO_PASS_SIXTAP 0
+#define MEM_COPY_KERNEL 1
+#define ONE_CQ_PER_MB 1 //Value of 0 is racy... still experimental.
+#define ENABLE_CL_LOOPFILTER 0
+
+/* Shared helpers implemented in vp8_opencl.c. */
+extern char *cl_read_file(const char* file_name);
+extern int cl_common_init();
+extern void cl_destroy(cl_command_queue cq, int new_status);
+extern int cl_load_program(cl_program *prog_ref, const char *file_name, const char *opts);
+
+/* Capacity of the platform/device arrays used during device selection. */
+#define MAX_NUM_PLATFORMS 4
+#define MAX_NUM_DEVICES 10
+
+/* Values of cl_initialized other than CL_SUCCESS. */
+#define VP8_CL_TRIED_BUT_FAILED 1
+#define VP8_CL_NOT_INITIALIZED -1
+extern int cl_initialized;
+
+/* Directory searched by cl_read_file() when a source file is not found
+ * under the name given. */
+extern const char *vpx_codec_lib_dir(void);
+
+/* General note for the macros below: they expand to bare statements (not
+ * do { } while (0) blocks), several of them read or assign a variable named
+ * `err` that must exist in the calling scope, and the ones built on
+ * VP8_CL_CHECK_SUCCESS can return from the calling function.
+ */
+
+/* Block until all work queued on cq has completed; no-op unless
+ * cl_initialized is CL_SUCCESS. */
+#define VP8_CL_FINISH(cq) \
+    if (cl_initialized == CL_SUCCESS){ \
+        /* Wait for kernels to finish. */ \
+        clFinish(cq); \
+    }
+
+/* Enqueue a barrier on cq; no-op unless cl_initialized is CL_SUCCESS. */
+#define VP8_CL_BARRIER(cq) \
+    if (cl_initialized == CL_SUCCESS){ \
+        /* Insert a barrier into the command queue. */ \
+        clEnqueueBarrier(cq); \
+    }
+
+/* If cond is true: print msg to stderr, call
+ * cl_destroy(cq, VP8_CL_TRIED_BUT_FAILED), execute the statement alt, then
+ * return retCode from the calling function. When alt transfers control
+ * itself (e.g. `continue`), the return is never reached.
+ * msg is passed to fprintf as the format string, so it must be a literal
+ * without stray '%' characters.
+ */
+#define VP8_CL_CHECK_SUCCESS(cq,cond,msg,alt,retCode) \
+    if ( cond ){ \
+        fprintf(stderr, msg);  \
+        cl_destroy(cq, VP8_CL_TRIED_BUT_FAILED); \
+        alt; \
+        return retCode; \
+    }
+
+/* Store the CL_KERNEL_WORK_GROUP_SIZE of cl_data.<kernel> on
+ * cl_data.device_id into cl_data.<kernel_size>. On failure the calling
+ * function returns VP8_CL_TRIED_BUT_FAILED. Assigns `err`.
+ */
+#define VP8_CL_CALC_LOCAL_SIZE(kernel, kernel_size) \
+    err = clGetKernelWorkGroupInfo( cl_data.kernel, \
+  	cl_data.device_id, \
+  	CL_KERNEL_WORK_GROUP_SIZE, \
+  	sizeof(size_t), \
+  	&cl_data.kernel_size, \
+  	NULL);\
+    VP8_CL_CHECK_SUCCESS(NULL, err != CL_SUCCESS, \
+        "Error: Failed to calculate local size of kernel!\n", \
+        ,\
+        VP8_CL_TRIED_BUT_FAILED \
+    ); \
+
+/* Create the kernel called str_name from data.<program> and store it in
+ * data.<name>. On failure the calling function returns
+ * VP8_CL_TRIED_BUT_FAILED. Assigns `err`.
+ */
+#define VP8_CL_CREATE_KERNEL(data,program,name,str_name) \
+    data.name = clCreateKernel(data.program, str_name , &err); \
+    VP8_CL_CHECK_SUCCESS(NULL, err != CL_SUCCESS || !data.name, \
+        "Error: Failed to create compute kernel "#str_name"!\n", \
+        ,\
+        VP8_CL_TRIED_BUT_FAILED \
+    );
+
+/* Enqueue a non-blocking read of bufSize bytes from the start of bufRef into
+ * dstPtr. On failure the calling function returns err. Because the read is
+ * non-blocking, dstPtr is only valid once the queue has been synchronized.
+ */
+#define VP8_CL_READ_BUF(cq, bufRef, bufSize, dstPtr) \
+    err = clEnqueueReadBuffer(cq, bufRef, CL_FALSE, 0, bufSize , dstPtr, 0, NULL, NULL); \
+    VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS, \
+        "Error: Failed to read from GPU!\n",, err \
+    ); \
+
+/* Enqueue a non-blocking write of bufSize bytes from dataPtr to the start of
+ * bufRef. On failure altPath is executed and the calling function returns
+ * retCode.
+ */
+#define VP8_CL_SET_BUF(cq, bufRef, bufSize, dataPtr, altPath, retCode) \
+    { \
+        err = clEnqueueWriteBuffer(cq, bufRef, CL_FALSE, 0, \
+            bufSize, dataPtr, 0, NULL, NULL); \
+        \
+        VP8_CL_CHECK_SUCCESS(cq, err != CL_SUCCESS, \
+            "Error: Failed to write to buffer!\n", \
+            altPath, retCode\
+        ); \
+    } \
+
+/* Create a bufSize-byte buffer in cl_data.context and, when dataPtr is not
+ * NULL, upload dataPtr into it. On allocation failure altPath is executed
+ * and the calling function returns retCode.
+ * NOTE(review): bufType is accepted but not used; the buffer is always
+ * created with CL_MEM_READ_WRITE.
+ */
+#define VP8_CL_CREATE_BUF(cq, bufRef, bufType, bufSize, dataPtr, altPath, retCode) \
+    bufRef = clCreateBuffer(cl_data.context, CL_MEM_READ_WRITE, bufSize, NULL, NULL); \
+    if (dataPtr != NULL && bufRef != NULL){ \
+        VP8_CL_SET_BUF(cq, bufRef, bufSize, dataPtr, altPath, retCode)\
+    } \
+    VP8_CL_CHECK_SUCCESS(cq, !bufRef, \
+        "Error: Failed to allocate buffer. Using CPU path!\n", \
+        altPath, retCode\
+    ); \
+
+/* Release kernel if it is set, then reset it to NULL. This expands to two
+ * statements, so it must not be used as the body of an unbraced if/else or
+ * loop: only the release would be conditional.
+ */
+#define VP8_CL_RELEASE_KERNEL(kernel) \
+    if (kernel) \
+        clReleaseKernel(kernel); \
+    kernel = NULL;
+
+/* Process-wide OpenCL state shared by the VP8 CL code paths: the selected
+ * device and context, one program per feature area, and the kernels created
+ * from those programs. Fields named *_kernel_size have type size_t and are
+ * the kind of target VP8_CL_CALC_LOCAL_SIZE writes a work-group size into.
+ */
+typedef struct VP8_COMMON_CL {
+    cl_device_id device_id; // compute device id
+    cl_context context; // compute context
+    //cl_command_queue commands; // compute command queue
+
+    cl_program filter_program; // compute program for subpixel/bilinear filters
+    cl_kernel vp8_sixtap_predict_kernel;
+    size_t    vp8_sixtap_predict_kernel_size;
+    cl_kernel vp8_sixtap_predict8x4_kernel;
+    size_t    vp8_sixtap_predict8x4_kernel_size;
+    cl_kernel vp8_sixtap_predict8x8_kernel;
+    size_t    vp8_sixtap_predict8x8_kernel_size;
+    cl_kernel vp8_sixtap_predict16x16_kernel;
+    size_t    vp8_sixtap_predict16x16_kernel_size;
+
+    cl_kernel vp8_bilinear_predict4x4_kernel;
+    cl_kernel vp8_bilinear_predict8x4_kernel;
+    cl_kernel vp8_bilinear_predict8x8_kernel;
+    cl_kernel vp8_bilinear_predict16x16_kernel;
+
+    cl_kernel vp8_filter_block2d_first_pass_kernel;
+    size_t    vp8_filter_block2d_first_pass_kernel_size;
+    cl_kernel vp8_filter_block2d_second_pass_kernel;
+    size_t    vp8_filter_block2d_second_pass_kernel_size;
+
+    cl_kernel vp8_filter_block2d_bil_first_pass_kernel;
+    size_t    vp8_filter_block2d_bil_first_pass_kernel_size;
+    cl_kernel vp8_filter_block2d_bil_second_pass_kernel;
+    size_t    vp8_filter_block2d_bil_second_pass_kernel_size;
+
+    cl_kernel vp8_memcpy_kernel;
+    size_t    vp8_memcpy_kernel_size;
+    cl_kernel vp8_memset_short_kernel;
+
+    // inverse transform (IDCT / Walsh) program and kernels
+    cl_program idct_program;
+    cl_kernel vp8_short_inv_walsh4x4_1_kernel;
+    cl_kernel vp8_short_inv_walsh4x4_1st_pass_kernel;
+    cl_kernel vp8_short_inv_walsh4x4_2nd_pass_kernel;
+    cl_kernel vp8_dc_only_idct_add_kernel;
+    //Note that the following 2 kernels are encoder-only. Not used in decoder.
+    cl_kernel vp8_short_idct4x4llm_1_kernel;
+    cl_kernel vp8_short_idct4x4llm_kernel;
+
+    // loop filter program and kernels
+    cl_program loop_filter_program;
+    cl_kernel vp8_loop_filter_horizontal_edge_kernel;
+    cl_kernel vp8_loop_filter_vertical_edge_kernel;
+    cl_kernel vp8_mbloop_filter_horizontal_edge_kernel;
+    cl_kernel vp8_mbloop_filter_vertical_edge_kernel;
+    cl_kernel vp8_loop_filter_simple_horizontal_edge_kernel;
+    cl_kernel vp8_loop_filter_simple_vertical_edge_kernel;
+
+    // dequantization program and kernels
+    cl_program dequant_program;
+    cl_kernel vp8_dequant_dc_idct_add_kernel;
+    cl_kernel vp8_dequant_idct_add_kernel;
+    cl_kernel vp8_dequantize_b_kernel;
+
+    // cl_destroy() calls cl_decode_destroy() only when cl_decode_initialized
+    // is CL_SUCCESS.
+    cl_int cl_decode_initialized;
+    cl_int cl_encode_initialized;
+    
+} VP8_COMMON_CL;
+
+extern VP8_COMMON_CL cl_data;
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* VP8_OPENCL_H */
+
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -211,7 +211,7 @@ void vp8_post_proc_down_and_across_c
    }
 }

-int vp8_q2mbl(int x)
+static int q2mbl(int x)
 {
    if (x < 20) x = 20;

@@ -314,8 +314,8 @@ static void vp8_deblock_and_de_macro_block(YV12_BUFFER_CONFIG         *source,
    (void) flag;

    POSTPROC_INVOKE(rtcd, downacross)(source->y_buffer, post->y_buffer, source->y_stride,  post->y_stride, source->y_height, source->y_width,  ppl);
-    POSTPROC_INVOKE(rtcd, across)(post->y_buffer, post->y_stride, post->y_height, post->y_width, vp8_q2mbl(q));
-    POSTPROC_INVOKE(rtcd, down)(post->y_buffer, post->y_stride, post->y_height, post->y_width, vp8_q2mbl(q));
+    POSTPROC_INVOKE(rtcd, across)(post->y_buffer, post->y_stride, post->y_height, post->y_width, q2mbl(q));
+    POSTPROC_INVOKE(rtcd, down)(post->y_buffer, post->y_stride, post->y_height, post->y_width, q2mbl(q));

    POSTPROC_INVOKE(rtcd, downacross)(source->u_buffer, post->u_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);
    POSTPROC_INVOKE(rtcd, downacross)(source->v_buffer, post->v_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);
--- a/vp8/common/quant_common.c
+++ b/vp8/common/quant_common.c
@@ -66,6 +66,7 @@ int vp8_dc2quant(int QIndex, int Delta)
    return retval;

 }
+
 int vp8_dc_uv_quant(int QIndex, int Delta)
 {
    int retval;
@@ -116,6 +117,7 @@ int vp8_ac2quant(int QIndex, int Delta)

    return retval;
 }
+
 int vp8_ac_uv_quant(int QIndex, int Delta)
 {
    int retval;
--- a/vp8/common/recon.c
+++ b/vp8/common/recon.c
@@ -110,19 +110,19 @@ void vp8_recon_mby_c(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
 {
 #if ARCH_ARM
    BLOCKD *b = &x->block[0];
-    RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+    RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);

    /*b = &x->block[4];*/
    b += 4;
-    RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+    RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);

    /*b = &x->block[8];*/
    b += 4;
-    RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+    RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);

    /*b = &x->block[12];*/
    b += 4;
-    RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+    RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
 #else
    int i;

@@ -130,7 +130,7 @@ void vp8_recon_mby_c(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
    {
        BLOCKD *b = &x->block[i];

-        RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+        RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
    }
 #endif
 }
@@ -140,27 +140,27 @@ void vp8_recon_mb_c(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
 #if ARCH_ARM
    BLOCKD *b = &x->block[0];

-    RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+    RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
    b += 4;
-    RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+    RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
    b += 4;
-    RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+    RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
    b += 4;
-    RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+    RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
    b += 4;

    /*b = &x->block[16];*/

-    RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+    RECON_INVOKE(rtcd, recon2)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
    b++;
    b++;
-    RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+    RECON_INVOKE(rtcd, recon2)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
    b++;
    b++;
-    RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+    RECON_INVOKE(rtcd, recon2)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
    b++;
    b++;
-    RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+    RECON_INVOKE(rtcd, recon2)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
 #else
    int i;

@@ -168,14 +168,14 @@ void vp8_recon_mb_c(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
    {
        BLOCKD *b = &x->block[i];

-        RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+        RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
    }

    for (i = 16; i < 24; i += 2)
    {
        BLOCKD *b = &x->block[i];

-        RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+        RECON_INVOKE(rtcd, recon2)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
    }
 #endif
 }
--- a/vp8/common/reconinter.c
+++ b/vp8/common/reconinter.c
@@ -8,7 +8,6 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-
 #include "vpx_ports/config.h"
 #include "recon.h"
 #include "subpixel.h"
@@ -18,6 +17,12 @@
 #include "onyxc_int.h"
 #endif

+#if CONFIG_OPENCL
+#include "opencl/vp8_opencl.h"
+#include "opencl/filter_cl.h"
+#include "opencl/reconinter_cl.h"
+#endif
+
 /* use this define on systems where unaligned int reads and writes are
 * not allowed, i.e. ARM architectures
 */
@@ -27,7 +32,7 @@
 static const int bbb[4] = {0, 2, 8, 10};


-
+//Copy 16 x 16-bytes from src to dst.
 void vp8_copy_mem16x16_c(
    unsigned char *src,
    int src_stride,
@@ -37,6 +42,9 @@ void vp8_copy_mem16x16_c(

    int r;

+	//Set this up as a 2D kernel. Each loop iteration is X, each byte/int within
+	//is the Y address.
+
    for (r = 0; r < 16; r++)
    {
 #ifdef MUST_BE_ALIGNED
@@ -71,6 +79,7 @@ void vp8_copy_mem16x16_c(

 }

+//Copy 8 x 8-bytes
 void vp8_copy_mem8x8_c(
    unsigned char *src,
    int src_stride,
@@ -136,43 +145,41 @@ void vp8_copy_mem8x4_c(
 void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, vp8_subpix_fn_t sppf)
 {
    int r;
-    unsigned char *ptr_base;
-    unsigned char *ptr;
-    unsigned char *pred_ptr = d->predictor;

-    ptr_base = *(d->base_pre);
+    //d->base_pre is the start of the previous frame's y_buffer, u_buffer, or v_buffer
+    unsigned char *ptr_base = *(d->base_pre);
+    int ptr_offset = d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
+
+    unsigned char *pred_ptr = d->predictor_base + d->predictor_offset;

    if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
    {
-        ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
-        sppf(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, pred_ptr, pitch);
+        sppf(ptr_base+ptr_offset, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, pred_ptr, pitch);
    }
    else
    {
-        ptr_base += d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
-        ptr = ptr_base;

        for (r = 0; r < 4; r++)
        {
 #ifdef MUST_BE_ALIGNED
-            pred_ptr[0]  = ptr[0];
-            pred_ptr[1]  = ptr[1];
-            pred_ptr[2]  = ptr[2];
-            pred_ptr[3]  = ptr[3];
+            pred_ptr[0]  = ptr_base[ptr_offset];
+            pred_ptr[1]  = ptr_base[ptr_offset+1];
+            pred_ptr[2]  = ptr_base[ptr_offset+2];
+            pred_ptr[3]  = ptr_base[ptr_offset+3];
 #else
-            *(int *)pred_ptr = *(int *)ptr ;
+            *(int *)pred_ptr = *(int *)(ptr_base+ptr_offset) ;
 #endif
            pred_ptr     += pitch;
-            ptr         += d->pre_stride;
+            ptr_offset   += d->pre_stride;
        }
    }
 }

-void vp8_build_inter_predictors4b(MACROBLOCKD *x, BLOCKD *d, int pitch)
+static void build_inter_predictors4b(MACROBLOCKD *x, BLOCKD *d, int pitch)
 {
    unsigned char *ptr_base;
    unsigned char *ptr;
-    unsigned char *pred_ptr = d->predictor;
+    unsigned char *pred_ptr = d->predictor_base + d->predictor_offset;

    ptr_base = *(d->base_pre);
    ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
@@ -187,11 +194,11 @@ void vp8_build_inter_predictors4b(MACROBLOCKD *x, BLOCKD *d, int pitch)
    }
 }

-void vp8_build_inter_predictors2b(MACROBLOCKD *x, BLOCKD *d, int pitch)
+static void build_inter_predictors2b(MACROBLOCKD *x, BLOCKD *d, int pitch)
 {
    unsigned char *ptr_base;
    unsigned char *ptr;
-    unsigned char *pred_ptr = d->predictor;
+    unsigned char *pred_ptr = d->predictor_base + d->predictor_offset;

    ptr_base = *(d->base_pre);
    ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
@@ -206,11 +213,22 @@ void vp8_build_inter_predictors2b(MACROBLOCKD *x, BLOCKD *d, int pitch)
    }
 }

-
+/* Encoder only */
 void vp8_build_inter_predictors_mbuv(MACROBLOCKD *x)
 {
    int i;

+#if CONFIG_OPENCL
+    if ( 0 && cl_initialized == CL_SUCCESS ){
+        vp8_build_inter_predictors_mbuv_cl(x);
+        VP8_CL_FINISH(x->cl_commands);
+        VP8_CL_FINISH(x->block[0].cl_commands);
+        VP8_CL_FINISH(x->block[16].cl_commands);
+        VP8_CL_FINISH(x->block[20].cl_commands);
+        return;
+    }
+#endif
+
    if (x->mode_info_context->mbmi.ref_frame != INTRA_FRAME &&
        x->mode_info_context->mbmi.mode != SPLITMV)
    {
@@ -229,8 +247,8 @@ void vp8_build_inter_predictors_mbuv(MACROBLOCKD *x)

        if ((mv_row | mv_col) & 7)
        {
-            x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr, 8);
-            x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vpred_ptr, 8);
+                x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr, 8);
+                x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vpred_ptr, 8);
        }
        else
        {
@@ -246,7 +264,7 @@ void vp8_build_inter_predictors_mbuv(MACROBLOCKD *x)
            BLOCKD *d1 = &x->block[i+1];

            if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
-                vp8_build_inter_predictors2b(x, d0, 8);
+                build_inter_predictors2b(x, d0, 8);
            else
            {
                vp8_build_inter_predictors_b(d0, 8, x->subpixel_predict);
@@ -260,8 +278,8 @@ void vp8_build_inter_predictors_mbuv(MACROBLOCKD *x)
 void vp8_build_inter_predictors_mby(MACROBLOCKD *x)
 {

-  if (x->mode_info_context->mbmi.ref_frame != INTRA_FRAME &&
-      x->mode_info_context->mbmi.mode != SPLITMV)
+    if (x->mode_info_context->mbmi.ref_frame != INTRA_FRAME &&
+        x->mode_info_context->mbmi.mode != SPLITMV)
    {
        unsigned char *ptr_base;
        unsigned char *ptr;
@@ -275,7 +293,7 @@ void vp8_build_inter_predictors_mby(MACROBLOCKD *x)

        if ((mv_row | mv_col) & 7)
        {
-            x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, pred_ptr, 16);
+                x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, pred_ptr, 16);
        }
        else
        {
@@ -291,7 +309,7 @@ void vp8_build_inter_predictors_mby(MACROBLOCKD *x)
            for (i = 0; i < 4; i++)
            {
                BLOCKD *d = &x->block[bbb[i]];
-                vp8_build_inter_predictors4b(x, d, 16);
+                build_inter_predictors4b(x, d, 16);
            }

        }
@@ -303,7 +321,7 @@ void vp8_build_inter_predictors_mby(MACROBLOCKD *x)
                BLOCKD *d1 = &x->block[i+1];

                if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
-                    vp8_build_inter_predictors2b(x, d0, 16);
+                    build_inter_predictors2b(x, d0, 16);
                else
                {
                    vp8_build_inter_predictors_b(d0, 16, x->subpixel_predict);
@@ -354,8 +372,8 @@ void vp8_build_inter_predictors_mb(MACROBLOCKD *x)

        if ((mv_row | mv_col) & 7)
        {
-            x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr, 8);
-            x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vpred_ptr, 8);
+                x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr, 8);
+                x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vpred_ptr, 8);
        }
        else
        {
@@ -372,7 +390,7 @@ void vp8_build_inter_predictors_mb(MACROBLOCKD *x)
            for (i = 0; i < 4; i++)
            {
                BLOCKD *d = &x->block[bbb[i]];
-                vp8_build_inter_predictors4b(x, d, 16);
+                build_inter_predictors4b(x, d, 16);
            }
        }
        else
@@ -383,7 +401,7 @@ void vp8_build_inter_predictors_mb(MACROBLOCKD *x)
                BLOCKD *d1 = &x->block[i+1];

                if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
-                    vp8_build_inter_predictors2b(x, d0, 16);
+                    build_inter_predictors2b(x, d0, 16);
                else
                {
                    vp8_build_inter_predictors_b(d0, 16, x->subpixel_predict);
@@ -400,7 +418,7 @@ void vp8_build_inter_predictors_mb(MACROBLOCKD *x)
            BLOCKD *d1 = &x->block[i+1];

            if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
-                vp8_build_inter_predictors2b(x, d0, 8);
+                build_inter_predictors2b(x, d0, 8);
            else
            {
                vp8_build_inter_predictors_b(d0, 8, x->subpixel_predict);
@@ -492,7 +510,7 @@ void vp8_build_uvmvs(MACROBLOCKD *x, int fullpixel)
 }


-/* The following functions are wriiten for skip_recon_mb() to call. Since there is no recon in this
+/* The following functions are written for skip_recon_mb() to call. Since there is no recon in this
 * situation, we can write the result directly to dst buffer instead of writing it to predictor
 * buffer and then copying it to dst buffer.
 */
@@ -501,22 +519,20 @@ static void vp8_build_inter_predictors_b_s(BLOCKD *d, unsigned char *dst_ptr, vp
    int r;
    unsigned char *ptr_base;
    unsigned char *ptr;
-    /*unsigned char *pred_ptr = d->predictor;*/
+    /*unsigned char *pred_ptr = d->predictor_base + d->predictor_offset;*/
    int dst_stride = d->dst_stride;
    int pre_stride = d->pre_stride;
+    int ptr_offset = d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);

    ptr_base = *(d->base_pre);
+    ptr = ptr_base + ptr_offset;

    if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
    {
-        ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
        sppf(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst_ptr, dst_stride);
    }
    else
    {
-        ptr_base += d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
-        ptr = ptr_base;
-
        for (r = 0; r < 4; r++)
        {
 #ifdef MUST_BE_ALIGNED
@@ -534,14 +550,17 @@ static void vp8_build_inter_predictors_b_s(BLOCKD *d, unsigned char *dst_ptr, vp
 }


-
 void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x)
 {
-    /*unsigned char *pred_ptr = x->block[0].predictor;
-    unsigned char *dst_ptr = *(x->block[0].base_dst) + x->block[0].dst;*/
-    unsigned char *pred_ptr = x->predictor;
    unsigned char *dst_ptr = x->dst.y_buffer;

+#if CONFIG_OPENCL && ENABLE_CL_SUBPIXEL
+    if (cl_initialized == CL_SUCCESS){
+        vp8_build_inter_predictors_mb_s_cl(x);
+        return;
+    }
+#endif
+
    if (x->mode_info_context->mbmi.mode != SPLITMV)
    {
        int offset;
@@ -563,7 +582,7 @@ void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x)

        if ((mv_row | mv_col) & 7)
        {
-            x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/
+                x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/
        }
        else
        {
@@ -579,8 +598,8 @@ void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x)

        if ((mv_row | mv_col) & 7)
        {
-            x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, udst_ptr, x->dst.uv_stride);
-            x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vdst_ptr, x->dst.uv_stride);
+                x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, udst_ptr, x->dst.uv_stride);
+                x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vdst_ptr, x->dst.uv_stride);
        }
        else
        {
@@ -592,6 +611,8 @@ void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x)
    {
        /* note: this whole ELSE part is not executed at all. So, no way to test the correctness of my modification. Later,
         * if sth is wrong, go back to what it is in build_inter_predictors_mb.
+         *
+         * ACW: note: Not sure who the above comment belongs to.
         */
        int i;

@@ -600,12 +621,11 @@ void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x)
            for (i = 0; i < 4; i++)
            {
                BLOCKD *d = &x->block[bbb[i]];
-                /*vp8_build_inter_predictors4b(x, d, 16);*/
+                /*build_inter_predictors4b(x, d, 16);*/

                {
                    unsigned char *ptr_base;
                    unsigned char *ptr;
-                    unsigned char *pred_ptr = d->predictor;

                    ptr_base = *(d->base_pre);
                    ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
@@ -621,7 +641,7 @@ void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x)
                }
            }
        }
-        else
+		else
        {
            for (i = 0; i < 16; i += 2)
            {
@@ -630,10 +650,9 @@ void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x)

                if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
                {
-                    /*vp8_build_inter_predictors2b(x, d0, 16);*/
+                    /*build_inter_predictors2b(x, d0, 16);*/
                    unsigned char *ptr_base;
                    unsigned char *ptr;
-                    unsigned char *pred_ptr = d0->predictor;

                    ptr_base = *(d0->base_pre);
                    ptr = ptr_base + d0->pre + (d0->bmi.mv.as_mv.row >> 3) * d0->pre_stride + (d0->bmi.mv.as_mv.col >> 3);
@@ -662,10 +681,9 @@ void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x)

            if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
            {
-                /*vp8_build_inter_predictors2b(x, d0, 8);*/
+                /*build_inter_predictors2b(x, d0, 8);*/
                unsigned char *ptr_base;
                unsigned char *ptr;
-                unsigned char *pred_ptr = d0->predictor;

                ptr_base = *(d0->base_pre);
                ptr = ptr_base + d0->pre + (d0->bmi.mv.as_mv.row >> 3) * d0->pre_stride + (d0->bmi.mv.as_mv.col >> 3);
--- a/vp8/common/reconintra.c
+++ b/vp8/common/reconintra.c
@@ -24,7 +24,7 @@ void vp8_recon_intra_mbuv(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
    for (i = 16; i < 24; i += 2)
    {
        BLOCKD *b = &x->block[i];
-        RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+        RECON_INVOKE(rtcd, recon2)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
    }
 }

--- a/vp8/common/reconintra4x4.c
+++ b/vp8/common/reconintra4x4.c
@@ -124,6 +124,18 @@ void vp8_predict_intra4x4(BLOCKD *x,
    case B_LD_PRED:
    {
        unsigned char *ptr = Above;
+
+#if 0
+        //More readable version of the unrolled loop
+        int stride = 16, r=0, c=0;
+        for (r=0; r < 4; r++){
+            for (c=0; c < 4; c++){
+                int off = r+c;
+                int off2 = off > 5 ? 5: off; //Clamp off2 to 5 so [3,3] reads no further than ptr[7]
+                predictor[r*stride+c] = (ptr[off] + ptr[off+1]*2 + ptr[off2+2] + 2)>>2;
+            }
+        }
+#else
        predictor[0 * 16 + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2;
        predictor[0 * 16 + 1] =
            predictor[1 * 16 + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2;
@@ -140,7 +152,8 @@ void vp8_predict_intra4x4(BLOCKD *x,
        predictor[2 * 16 + 3] =
            predictor[3 * 16 + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2;
        predictor[3 * 16 + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2;
-
+#endif
+        
    }
    break;
    case B_RD_PRED:
@@ -311,5 +324,3 @@ void vp8_intra_prediction_down_copy(MACROBLOCKD *x)
    *dst_ptr1 = *src_ptr;
    *dst_ptr2 = *src_ptr;
 }
-
-
--- a/vp8/common/swapyv12buffer.c
+++ b/vp8/common/swapyv12buffer.c
@@ -11,10 +11,16 @@

 #include "swapyv12buffer.h"

+
+
 void vp8_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, YV12_BUFFER_CONFIG *last_frame)
 {
    unsigned char *temp;
-
+#if CONFIG_OPENCL
+    cl_mem temp_mem;
+#endif
+    int temp_size;
+    
    temp = last_frame->buffer_alloc;
    last_frame->buffer_alloc = new_frame->buffer_alloc;
    new_frame->buffer_alloc = temp;
@@ -31,4 +37,14 @@ void vp8_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, YV12_BUFFER_CONFIG *las
    last_frame->v_buffer = new_frame->v_buffer;
    new_frame->v_buffer = temp;

+    temp_size = last_frame->buffer_size;
+    last_frame->buffer_size = new_frame->buffer_size;
+    new_frame->buffer_size = temp_size;
+
+#if CONFIG_OPENCL
+    temp_mem = last_frame->buffer_mem;
+    last_frame->buffer_mem = new_frame->buffer_mem;
+    new_frame->buffer_mem = temp_mem;
+#endif
+
 }
--- a/vp8/common/x86/boolcoder.cxx
+++ b/vp8/common/x86/boolcoder.cxx
@@ -1,494 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-
-/* Arithmetic bool coder with largish probability range.
-   Timothy S Murphy  6 August 2004 */
-
-#include <assert.h>
-#include <math.h>
-
-#include "bool_coder.h"
-
-#if tim_vp8
-    extern "C" {
-#       include "VP8cx/treewriter.h"
-    }
-#endif
-
-int_types::~int_types() {}
-
-void bool_coder_spec::check_prec() const {
-    assert( w  &&  (r==Up || w > 1)  &&  w < 24  &&  (ebias || w < 17));
-}
-
-bool bool_coder_spec::float_init( uint Ebits, uint Mbits) {
-    uint b = (ebits = Ebits) + (mbits = Mbits);
-    if( b) {
-        assert( ebits < 6  &&  w + mbits < 31);
-        assert( ebits + mbits  <  sizeof(Index) * 8);
-        ebias = (1 << ebits) + 1 + mbits;
-        mmask = (1 << mbits) - 1;
-        max_index = ( ( half_index = 1 << b ) << 1) - 1;
-    } else {
-        ebias = 0;
-        max_index = 255;
-        half_index = 128;
-    }
-    check_prec();
-    return b? 1:0;
-}
-
-void bool_coder_spec::cost_init()
-{
-    static cdouble c = -(1 << 20)/log( 2.);
-
-    FILE *f = fopen( "costs.txt", "w");
-    assert( f);
-
-    assert( sizeof(int) >= 4);  /* for C interface */
-    assert( max_index <= 255);   /* size of Ctbl */
-    uint i = 0;  do {
-        cdouble p = ( *this)( (Index) i);
-        Ctbl[i] = (uint32) ( log( p) * c);
-        fprintf(
-            f, "cost( %d -> %10.7f) = %10d = %12.5f bits\n",
-            i, p, Ctbl[i], (double) Ctbl[i] / (1<<20)
-        );
-    } while( ++i <= max_index);
-    fclose( f);
-}
-
-bool_coder_spec_explicit_table::bool_coder_spec_explicit_table(
-    cuint16 tbl[256], Rounding rr, uint prec
-)
-  : bool_coder_spec( prec, rr)
-{
-    check_prec();
-    uint i = 0;
-    if( tbl)
-        do { Ptbl[i] = tbl[i];}  while( ++i < 256);
-    else
-        do { Ptbl[i] = i << 8;}  while( ++i < 256);
-    cost_init();
-}
-
-
-bool_coder_spec_exponential_table::bool_coder_spec_exponential_table(
-    uint x, Rounding rr, uint prec
-)
-  : bool_coder_spec( prec, rr)
-{
-    assert( x > 1  &&  x <= 16);
-    check_prec();
-    Ptbl[128] = 32768u;
-    Ptbl[0] = (uint16) pow( 2., 16. - x);
-    --x;
-    int i=1;  do {
-        cdouble d = pow( .5, 1. + (1. - i/128.)*x) * 65536.;
-        uint16 v = (uint16) d;
-        if( v < i)
-            v = i;
-        Ptbl[256-i] = (uint16) ( 65536U - (Ptbl[i] = v));
-    } while( ++i < 128);
-    cost_init();
-}
-
-bool_coder_spec::bool_coder_spec( FILE *fp) {
-    fscanf( fp, "%d", &w);
-    int v;
-    fscanf( fp, "%d", &v);
-    assert( 0 <= v  &&  v <= 2);
-    r = (Rounding) v;
-    fscanf( fp, "%d", &ebits);
-    fscanf( fp, "%d", &mbits);
-    if( float_init( ebits, mbits))
-        return;
-    int i=0;  do {
-        uint v;
-        fscanf( fp, "%d", &v);
-        assert( 0 <=v  &&  v <= 65535U);
-        Ptbl[i] = v;
-    } while( ++i < 256);
-    cost_init();
-}
-
-void bool_coder_spec::dump( FILE *fp) const {
-    fprintf( fp, "%d %d %d %d\n", w, (int) r, ebits, mbits);
-    if( ebits  ||  mbits)
-        return;
-    int i=0;  do { fprintf( fp, "%d\n", Ptbl[i]);}  while( ++i < 256);
-}
-
-vp8bc_index_t bool_coder_spec::operator()( double p) const
-{
-    if( p <= 0.)
-        return 0;
-    if( p >= 1.)
-        return max_index;
-    if( ebias) {
-        if( p > .5)
-            return max_index - ( *this)( 1. - p);
-        int e;
-        uint m = (uint) ldexp( frexp( p, &e), mbits + 2);
-        uint x = 1 << (mbits + 1);
-        assert( x <= m  &&  m < x<<1);
-        if( (m = (m >> 1) + (m & 1)) >= x) {
-            m = x >> 1;
-            ++e;
-        }
-        int y = 1 << ebits;
-        if( (e += y) >= y)
-            return half_index - 1;
-        if( e < 0)
-            return 0;
-        return (Index) ( (e << mbits) + (m & mmask));
-    }
-
-    cuint16 v = (uint16) (p * 65536.);
-    int i = 128;
-    int j = 128;
-    uint16 w;
-    while( w = Ptbl[i], j >>= 1) {
-        if( w < v)
-            i += j;
-        else if( w == v)
-            return (uchar) i;
-        else
-            i -= j;
-    }
-    if( w > v) {
-        cuint16 x = Ptbl[i-1];
-        if( v <= x  ||  w - v > v - x)
-            --i;
-    } else if( w < v  &&  i < 255) {
-        cuint16 x = Ptbl[i+1];
-        if( x <= v  ||  x - v < v - w)
-            ++i;
-    }
-    return (Index) i;
-}
-
-double bool_coder_spec::operator()( Index i) const {
-    if( !ebias)
-        return Ptbl[i]/65536.;
-    if( i >= half_index)
-        return 1. - ( *this)( (Index) (max_index - i));
-    return ldexp( (double)mantissa( i), - (int) exponent( i));
-}
-
-
-
-void bool_writer::carry() {
-    uchar *p = B;
-    assert( p > Bstart);
-    while( *--p == 255) { assert( p > Bstart);  *p = 0;}
-    ++*p;
-}
-
-
-bool_writer::bool_writer( c_spec& s, uchar *Dest, size_t Len)
-  : bool_coder( s),
-    Bstart( Dest),
-    Bend( Len? Dest+Len : 0),
-    B( Dest)
-{
-    assert( Dest);
-    reset();
-}
-
-bool_writer::~bool_writer() { flush();}
-
-#if 1
-    extern "C" { int bc_v = 0;}
-#else
-#   define bc_v 0
-#endif
-
-
-void bool_writer::raw( bool value, uint32 s) {
-    uint32 L = Low;
-
-    assert( Range >= min_range  &&  Range <= spec.max_range());
-    assert( !is_toast  &&  s  &&  s < Range);
-
-    if( bc_v) printf(
-        "Writing a %d, B %x  Low %x  Range %x  s %x   blag %d ...\n",
-        value? 1:0, B-Bstart, Low, Range, s, bit_lag
-    );
-    if( value) {
-        L += s;
-        s = Range - s;
-    } else
-        s -= rinc;
-    if( s < min_range) {
-        int ct = bit_lag;  do {
-            if( !--ct) {
-                ct = 8;
-                if( L & (1 << 31))
-                    carry();
-                assert( !Bend  ||  B < Bend);
-                *B++ = (uchar) (L >> 23);
-                L &= (1<<23) - 1;
-            }
-        } while( L += L, (s += s + rinc) < min_range);
-        bit_lag = ct;
-    }
-    Low = L;
-    Range = s;
-    if( bc_v)
-        printf(
-            "...done, B %x  Low %x  Range %x  blag %d \n",
-                B-Bstart, Low, Range, bit_lag
-        );
-}
-
-bool_writer& bool_writer::flush() {
-    if( is_toast)
-        return *this;
-    int b = bit_lag;
-    uint32 L = Low;
-    assert( b);
-    if( L & (1 << (32 - b)))
-        carry();
-    L <<= b & 7;
-    b >>= 3;
-    while( --b >= 0)
-        L <<= 8;
-    b = 4;
-    assert( !Bend  ||  B + 4 <= Bend);
-    do {
-        *B++ = (uchar) (L >> 24);
-        L <<= 8;
-    } while( --b);
-    is_toast = 1;
-    return *this;
-}
-
-
-bool_reader::bool_reader( c_spec& s, cuchar *src, size_t Len)
-  : bool_coder( s),
-    Bstart( src),
-    B( src),
-    Bend( Len? src+Len : 0),
-    shf( 32 - s.w),
-    bct( 8)
-{
-    int i = 4;  do { Low <<= 8;  Low |= *B++;}  while( --i);
-}
-
-
-bool bool_reader::raw( uint32 s) {
-
-    bool val = 0;
-    uint32 L = Low;
-    cuint32 S = s << shf;
-
-    assert( Range >= min_range  &&  Range <= spec.max_range());
-    assert( s  &&  s < Range  &&  (L >> shf) < Range);
-
-    if( bc_v)
-        printf(
-            "Reading, B %x  Low %x  Range %x  s %x  bct %d ...\n",
-            B-Bstart, Low, Range, s, bct
-        );
-
-    if( L >= S) {
-        L -= S;
-        s = Range - s;
-        assert( L < (s << shf));
-        val = 1;
-    } else
-        s -= rinc;
-    if( s < min_range) {
-        int ct = bct;
-        do {
-            assert( ~L & (1 << 31));
-            L += L;
-            if( !--ct) {
-                ct = 8;
-                if( !Bend  ||  B < Bend)
-                    L |= *B++;
-            }
-        } while( (s += s + rinc) < min_range);
-        bct = ct;
-    }
-    Low = L;
-    Range = s;
-    if( bc_v)
-        printf(
-            "...done, val %d  B %x  Low %x  Range %x  bct %d\n",
-            val? 1:0, B-Bstart, Low, Range, bct
-        );
-    return val;
-}
-
-
-/* C interfaces */
-
-// spec interface
-
-struct NS : bool_coder_namespace {
-    static Rounding r( vp8bc_c_prec *p, Rounding rr =down_full) {
-        return p? (Rounding) p->r : rr;
-    }
-};
-
-bool_coder_spec *vp8bc_vp6spec() {
-    return new bool_coder_spec_explicit_table( 0, bool_coder_namespace::Down, 8);
-}
-bool_coder_spec *vp8bc_float_spec(
-    unsigned int Ebits, unsigned int Mbits, vp8bc_c_prec *p
-) {
-    return new bool_coder_spec_float( Ebits, Mbits, NS::r( p), p? p->prec : 12);
-}
-bool_coder_spec *vp8bc_literal_spec(
-    const unsigned short m[256], vp8bc_c_prec *p
-) {
-    return new bool_coder_spec_explicit_table( m, NS::r( p), p? p->prec : 16);
-}
-bool_coder_spec *vp8bc_exponential_spec( unsigned int x, vp8bc_c_prec *p)
-{
-    return new bool_coder_spec_exponential_table( x, NS::r( p), p? p->prec : 16);
-}
-bool_coder_spec *vp8bc_spec_from_file( FILE *fp) {
-    return new bool_coder_spec( fp);
-}
-void vp8bc_destroy_spec( c_bool_coder_spec *p) { delete p;}
-
-void vp8bc_spec_to_file( c_bool_coder_spec *p, FILE *fp) { p->dump( fp);}
-
-vp8bc_index_t vp8bc_index( c_bool_coder_spec *p, double x) {
-    return ( *p)( x);
-}
-
-vp8bc_index_t vp8bc_index_from_counts(
-    c_bool_coder_spec *p, unsigned int L, unsigned int R
-) {
-    return ( *p)( (R += L)? (double) L/R : .5);
-}
-
-double vp8bc_probability( c_bool_coder_spec *p, vp8bc_index_t i) {
-    return ( *p)( i);
-}
-
-vp8bc_index_t vp8bc_complement( c_bool_coder_spec *p, vp8bc_index_t i) {
-    return p->complement( i);
-}
-unsigned int vp8bc_cost_zero( c_bool_coder_spec *p, vp8bc_index_t i) {
-    return p->cost_zero( i);
-}
-unsigned int vp8bc_cost_one( c_bool_coder_spec *p, vp8bc_index_t i) {
-    return p->cost_one( i);
-}
-unsigned int vp8bc_cost_bit( c_bool_coder_spec *p, vp8bc_index_t i, int v) {
-    return p->cost_bit( i, v);
-}
-
-#if tim_vp8
-    extern "C" int tok_verbose;
-
-#   define dbg_l 1000000
-
-    static vp8bc_index_t dbg_i [dbg_l];
-    static char dbg_v [dbg_l];
-    static size_t dbg_w = 0, dbg_r = 0;
-#endif
-
-// writer interface
-
-bool_writer *vp8bc_create_writer(
-    c_bool_coder_spec *p, unsigned char *D, size_t L
-) {
-    return new bool_writer( *p, D, L);
-}
-
-size_t vp8bc_destroy_writer( bool_writer *p) {
-    const size_t s = p->flush().bytes_written();
-    delete p;
-    return s;
-}
-
-void vp8bc_write_bool( bool_writer *p, int v, vp8bc_index_t i)
-{
-#   if tim_vp8
-        // bc_v = dbg_w < 10;
-        if( bc_v = tok_verbose)
-            printf( " writing %d at prob %d\n", v? 1:0, i);
-        accum_entropy_bc( &p->Spec(), i, v);
-
-        ( *p)( i, (bool) v);
-
-        if( dbg_w < dbg_l) {
-            dbg_i [dbg_w] = i;
-            dbg_v [dbg_w++] = v? 1:0;
-        }
-#   else
-        ( *p)( i, (bool) v);
-#   endif
-}
-
-void vp8bc_write_bits( bool_writer *p, unsigned int v, int n)
-{
-#   if tim_vp8
-        {
-            c_bool_coder_spec * const s = & p->Spec();
-            const vp8bc_index_t i = s->half_index();
-            int m = n;
-            while( --m >= 0)
-                accum_entropy_bc( s, i, (v>>m) & 1);
-        }
-#   endif
-
-    p->write_bits( n, v);
-}
-
-c_bool_coder_spec *vp8bc_writer_spec( c_bool_writer *w) { return & w->Spec();}
-
-// reader interface
-
-bool_reader *vp8bc_create_reader(
-    c_bool_coder_spec *p, const unsigned char *S, size_t L
-) {
-    return new bool_reader( *p, S, L);
-}
-
-void vp8bc_destroy_reader( bool_reader * p) { delete p;}
-
-int vp8bc_read_bool( bool_reader *p, vp8bc_index_t i)
-{
-#   if tim_vp8
-        // bc_v = dbg_r < 10;
-        bc_v = tok_verbose;
-        const int v = ( *p)( i)? 1:0;
-        if( tok_verbose)
-            printf( " reading %d at prob %d\n", v, i);
-        if( dbg_r < dbg_l) {
-            assert( dbg_r <= dbg_w);
-            if( i != dbg_i[dbg_r]  ||  v != dbg_v[dbg_r]) {
-                printf(
-        "Position %d: INCORRECTLY READING %d  prob %d, wrote %d  prob %d\n",
-                    dbg_r, v, i, dbg_v[dbg_r], dbg_i[dbg_r]
-                );
-            }
-            ++dbg_r;
-        }
-        return v;
-#   else
-        return ( *p)( i)? 1:0;
-#   endif
-}
-
-unsigned int vp8bc_read_bits( bool_reader *p, int n) { return p->read_bits( n);}
-
-c_bool_coder_spec *vp8bc_reader_spec( c_bool_reader *r) { return & r->Spec();}
-
-#undef bc_v
--- a/vp8/common/x86/subpixel_mmx.asm
+++ b/vp8/common/x86/subpixel_mmx.asm
@@ -113,97 +113,6 @@ nextrow:
    ret


-;
-; THIS FUNCTION APPEARS TO BE UNUSED
-;
-;void vp8_filter_block1d_v6_mmx
-;(
-;   short *src_ptr,
-;   unsigned char *output_ptr,
-;   unsigned int pixels_per_line,
-;   unsigned int pixel_step,
-;   unsigned int output_height,
-;   unsigned int output_width,
-;   short * vp8_filter
-;)
-global sym(vp8_filter_block1d_v6_mmx)
-sym(vp8_filter_block1d_v6_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        movq      mm5, [GLOBAL(rd)]
-        push        rbx
-        mov         rbx, arg(6) ;vp8_filter
-        movq      mm1, [rbx + 16]             ; do both the negative taps first!!!
-        movq      mm2, [rbx + 32]         ;
-        movq      mm6, [rbx + 48]        ;
-        movq      mm7, [rbx + 64]        ;
-
-        movsxd      rdx, dword ptr arg(2) ;pixels_per_line
-        mov         rdi, arg(1) ;output_ptr
-        mov         rsi, arg(0) ;src_ptr
-        sub         rsi, rdx
-        sub         rsi, rdx
-        movsxd      rcx, DWORD PTR arg(4) ;output_height
-        movsxd      rax, DWORD PTR arg(5) ;output_width      ; destination pitch?
-        pxor        mm0, mm0              ; mm0 = 00000000
-
-
-nextrow_v:
-        movq        mm3, [rsi+rdx]        ; mm3 = p0..p8  = row -1
-        pmullw      mm3, mm1              ; mm3 *= kernel 1 modifiers.
-
-
-        movq        mm4, [rsi + 4*rdx]      ; mm4 = p0..p3  = row 2
-        pmullw      mm4, mm7              ; mm4 *= kernel 4 modifiers.
-        paddsw      mm3, mm4              ; mm3 += mm4
-
-        movq        mm4, [rsi + 2*rdx]           ; mm4 = p0..p3  = row 0
-        pmullw      mm4, mm2              ; mm4 *= kernel 2 modifiers.
-        paddsw      mm3, mm4              ; mm3 += mm4
-
-        movq        mm4, [rsi]            ; mm4 = p0..p3  = row -2
-        pmullw      mm4, [rbx]            ; mm4 *= kernel 0 modifiers.
-        paddsw      mm3, mm4              ; mm3 += mm4
-
-
-        add         rsi, rdx              ; move source forward 1 line to avoid 3 * pitch
-        movq        mm4, [rsi + 2*rdx]     ; mm4 = p0..p3  = row 1
-        pmullw      mm4, mm6              ; mm4 *= kernel 3 modifiers.
-        paddsw      mm3, mm4              ; mm3 += mm4
-
-        movq        mm4, [rsi + 4*rdx]    ; mm4 = p0..p3  = row 3
-        pmullw      mm4, [rbx +80]        ; mm4 *= kernel 3 modifiers.
-        paddsw      mm3, mm4              ; mm3 += mm4
-
-
-        paddsw      mm3, mm5               ; mm3 += round value
-        psraw       mm3, VP8_FILTER_SHIFT     ; mm3 /= 128
-        packuswb    mm3, mm0              ; pack and saturate
-
-        movd        [rdi],mm3             ; store the results in the destination
-
-        add         rdi,rax;
-
-        dec         rcx                   ; decrement count
-        jnz         nextrow_v             ; next row
-
-        pop         rbx
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
 ;void vp8_filter_block1dc_v6_mmx
 ;(
 ;   short *src_ptr,
--- a/vp8/common/x86/subpixel_ssse3.asm
+++ b/vp8/common/x86/subpixel_ssse3.asm
@@ -194,10 +194,6 @@ sym(vp8_filter_block1d16_h6_ssse3):

    mov         rdi, arg(2)                     ;output_ptr

-;;
-;;    cmp         esi, DWORD PTR [rax]
-;;    je          vp8_filter_block1d16_h4_ssse3
-
    mov         rsi, arg(0)                     ;src_ptr

    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
@@ -271,61 +267,7 @@ filter_block1d16_h6_rowloop_ssse3:
    pop rdi
    pop rsi
    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-vp8_filter_block1d16_h4_ssse3:
-    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
-
-    mov         rsi, arg(0)             ;src_ptr
-    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
-    movsxd      rcx, dword ptr arg(4)   ;output_height
-    movsxd      rdx, dword ptr arg(3)   ;output_pitch
-
-filter_block1d16_h4_rowloop_ssse3:
-    movdqu      xmm1,   XMMWORD PTR [rsi - 2]
-
-    movdqa      xmm2, xmm1
-    pshufb      xmm1, [GLOBAL(shuf2b)]
-    pshufb      xmm2, [GLOBAL(shuf3b)]
-    pmaddubsw   xmm1, xmm5
-
-    movdqu      xmm3,   XMMWORD PTR [rsi + 6]
-
-    pmaddubsw   xmm2, xmm6
-    movdqa      xmm0, xmm3
-    pshufb      xmm3, [GLOBAL(shuf3b)]
-    pshufb      xmm0, [GLOBAL(shuf2b)]
-
-    paddsw      xmm1, [GLOBAL(rd)]
-    paddsw      xmm1, xmm2
-
-    pmaddubsw   xmm0, xmm5
-    pmaddubsw   xmm3, xmm6
-
-    psraw       xmm1, 7
-    packuswb    xmm1, xmm1
-    lea         rsi,    [rsi + rax]
-    paddsw      xmm3, xmm0
-    paddsw      xmm3, [GLOBAL(rd)]
-    psraw       xmm3, 7
-    packuswb    xmm3, xmm3
-
-    punpcklqdq  xmm1, xmm3
-
-    movdqa      XMMWORD Ptr [rdi], xmm1
-
-    add         rdi, rdx
-    dec         rcx
-    jnz         filter_block1d16_h4_rowloop_ssse3
-
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
+    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
--- a/vp8/decoder/arm/dequantize_arm.c
+++ b/vp8/decoder/arm/dequantize_arm.c
@@ -27,8 +27,8 @@ extern void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
 void vp8_dequantize_b_neon(BLOCKD *d)
 {
    int i;
-    short *DQ  = d->dqcoeff;
-    short *Q   = d->qcoeff;
+    short *DQ  = d->dqcoeff_base + d->dqcoeff_offset;
+    short *Q   = d->qcoeff_base + d->qcoeff_offset;
    short *DQC = d->dequant;

    vp8_dequantize_b_loop_neon(Q, DQC, DQ);
@@ -39,8 +39,8 @@ void vp8_dequantize_b_neon(BLOCKD *d)
 void vp8_dequantize_b_v6(BLOCKD *d)
 {
    int i;
-    short *DQ  = d->dqcoeff;
-    short *Q   = d->qcoeff;
+    short *DQ  = d->dqcoeff_base + d->dqcoeff_offset;
+    short *Q   = d->qcoeff_base + d->qcoeff_offset;
    short *DQC = d->dequant;

    vp8_dequantize_b_loop_v6(Q, DQC, DQ);
--- a/vp8/decoder/decodemv.c
+++ b/vp8/decoder/decodemv.c
@@ -228,15 +228,8 @@ unsigned int vp8_mv_cont_count[5][4] =
 };
 #endif

-unsigned char vp8_mbsplit_offset[4][16] = {
-    { 0,  8,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
-    { 0,  2,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
-    { 0,  2,  8, 10,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
-    { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15}
-};
-
-unsigned char vp8_mbsplit_fill_count[4] = {8, 8, 4, 1};
-unsigned char vp8_mbsplit_fill_offset[4][16] = {
+static const unsigned char mbsplit_fill_count[4] = {8, 8, 4, 1};
+static const unsigned char mbsplit_fill_offset[4][16] = {
    { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15},
    { 0,  1,  4,  5,  8,  9, 12, 13,  2,  3,   6,  7, 10, 11, 14, 15},
    { 0,  1,  4,  5,  2,  3,  6,  7,  8,  9,  12, 13, 10, 11, 14, 15},
@@ -246,7 +239,7 @@ unsigned char vp8_mbsplit_fill_offset[4][16] = {



-void vp8_mb_mode_mv_init(VP8D_COMP *pbi)
+static void mb_mode_mv_init(VP8D_COMP *pbi)
 {
    vp8_reader *const bc = & pbi->bc;
    MV_CONTEXT *const mvc = pbi->common.fc.mvc;
@@ -287,7 +280,7 @@ void vp8_mb_mode_mv_init(VP8D_COMP *pbi)
    }
 }

-void vp8_read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
+static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
                            int mb_row, int mb_col)
 {
    const MV Zero = { 0, 0};
@@ -405,10 +398,10 @@ void vp8_read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
                    /* Fill (uniform) modes, mvs of jth subset.
                     Must do it here because ensuing subsets can
                     refer back to us via "left" or "above". */
-                    unsigned char *fill_offset;
-                    unsigned int fill_count = vp8_mbsplit_fill_count[s];
+                    const unsigned char *fill_offset;
+                    unsigned int fill_count = mbsplit_fill_count[s];

-                    fill_offset = &vp8_mbsplit_fill_offset[s][(unsigned char)j * vp8_mbsplit_fill_count[s]];
+                    fill_offset = &mbsplit_fill_offset[s][(unsigned char)j * mbsplit_fill_count[s]];

                    do {
                        mi->bmi[ *fill_offset] = bmi;
@@ -525,7 +518,7 @@ void vp8_decode_mode_mvs(VP8D_COMP *pbi)
    MODE_INFO *mi = pbi->common.mi;
    int mb_row = -1;

-    vp8_mb_mode_mv_init(pbi);
+    mb_mode_mv_init(pbi);

    while (++mb_row < pbi->common.mb_rows)
    {
@@ -543,11 +536,11 @@ void vp8_decode_mode_mvs(VP8D_COMP *pbi)

        while (++mb_col < pbi->common.mb_cols)
        {
-            /*vp8_read_mb_modes_mv(pbi, xd->mode_info_context, &xd->mode_info_context->mbmi, mb_row, mb_col);*/
+            /*read_mb_modes_mv(pbi, xd->mode_info_context, &xd->mode_info_context->mbmi, mb_row, mb_col);*/
            if(pbi->common.frame_type == KEY_FRAME)
                vp8_kfread_modes(pbi, mi, mb_row, mb_col);
            else
-                vp8_read_mb_modes_mv(pbi, mi, &mi->mbmi, mb_row, mb_col);
+                read_mb_modes_mv(pbi, mi, &mi->mbmi, mb_row, mb_col);

            mi++;       /* next macroblock */
        }
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -17,7 +17,6 @@
 #include "vp8/common/reconinter.h"
 #include "dequantize.h"
 #include "detokenize.h"
-#include "vp8/common/invtrans.h"
 #include "vp8/common/alloccommon.h"
 #include "vp8/common/entropymode.h"
 #include "vp8/common/quant_common.h"
@@ -33,10 +32,21 @@
 #include "vp8/common/threading.h"
 #include "decoderthreading.h"
 #include "dboolhuff.h"
+#include "vp8/common/blockd.h"

 #include <assert.h>
 #include <stdio.h>

+#include "vpx_config.h"
+#if CONFIG_OPENCL
+#include "vp8/common/opencl/vp8_opencl.h"
+#include "vp8/common/opencl/blockd_cl.h"
+#include "opencl/dequantize_cl.h"
+#include "opencl/decodframe_cl.h"
+#endif
+
+#define PROFILE_OUTPUT 0
+
 void vp8cx_init_de_quantizer(VP8D_COMP *pbi)
 {
    int i;
@@ -98,6 +108,10 @@ void mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd)

    xd->block[24].dequant = pc->Y2dequant[QIndex];

+#if CONFIG_OPENCL && ENABLE_CL_IDCT_DEQUANT
+    mb_init_dequantizer_cl(xd);
+#endif
+
 }

 #if CONFIG_RUNTIME_CPU_DETECT
@@ -121,6 +135,14 @@ static void skip_recon_mb(VP8D_COMP *pbi, MACROBLOCKD *xd)
    else
    {
        vp8_build_inter_predictors_mb_s(xd);
+#if CONFIG_OPENCL
+        VP8_CL_FINISH(xd->cl_commands);
+#if !ONE_CQ_PER_MB
+        VP8_CL_FINISH(xd->block[0].cl_commands);
+        VP8_CL_FINISH(xd->block[16].cl_commands);
+        VP8_CL_FINISH(xd->block[20].cl_commands);
+#endif
+#endif
    }
 }

@@ -175,8 +197,9 @@ void clamp_mvs(MACROBLOCKD *xd)

 }

-void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)
+static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)
 {
+
    int eobtotal = 0;
    int i, do_clamp = xd->mode_info_context->mbmi.need_to_clamp_mvs;

@@ -197,6 +220,27 @@ void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)

    xd->mode_info_context->mbmi.dc_diff = 1;

+#if PROFILE_OUTPUT
+     if (xd->frame_type == KEY_FRAME)
+         printf("Intra-Coded MB\n");
+     else{
+         if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME){
+             printf("Intra-Coded Inter-Frame MB\n");
+         } else {
+            printf("Inter-Coded MB\n");
+         }
+     }
+#endif
+
+#if CONFIG_OPENCL
+    //If OpenCL is enabled and initialized, use the CL-specific decoder for the
+    //remainder of MB decoding.
+    if (cl_initialized == CL_SUCCESS){
+        vp8_decode_macroblock_cl(pbi, xd, eobtotal);
+        return;
+    }
+#endif
+
    if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV && eobtotal == 0)
    {
        xd->mode_info_context->mbmi.dc_diff = 0;
@@ -229,68 +273,68 @@ void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)
    if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV)
    {
        BLOCKD *b = &xd->block[24];
+        short *qcoeff = b->qcoeff_base + b->qcoeff_offset;
+        vp8_second_order_fn_t second_order;
+
        DEQUANT_INVOKE(&pbi->dequant, block)(b);

        /* do 2nd order transform on the dc block */
-        if (xd->eobs[24] > 1)
-        {
-            IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);
-            ((int *)b->qcoeff)[0] = 0;
-            ((int *)b->qcoeff)[1] = 0;
-            ((int *)b->qcoeff)[2] = 0;
-            ((int *)b->qcoeff)[3] = 0;
-            ((int *)b->qcoeff)[4] = 0;
-            ((int *)b->qcoeff)[5] = 0;
-            ((int *)b->qcoeff)[6] = 0;
-            ((int *)b->qcoeff)[7] = 0;
-        }
-        else
-        {
-            IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff);
-            ((int *)b->qcoeff)[0] = 0;
+        if (xd->eobs[24] > 1){
+            second_order = IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16);
+            ((int *)qcoeff)[0] = 0;
+            ((int *)qcoeff)[1] = 0;
+            ((int *)qcoeff)[2] = 0;
+            ((int *)qcoeff)[3] = 0;
+            ((int *)qcoeff)[4] = 0;
+            ((int *)qcoeff)[5] = 0;
+            ((int *)qcoeff)[6] = 0;
+            ((int *)qcoeff)[7] = 0;
+        } else {
+            second_order = IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1);
+            ((int *)qcoeff)[0] = 0;
        }

+        second_order(b->dqcoeff_base + b->dqcoeff_offset, &b->diff_base[b->diff_offset]);
        DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block)
-                        (xd->qcoeff, xd->block[0].dequant,
-                         xd->predictor, xd->dst.y_buffer,
-                         xd->dst.y_stride, xd->eobs, xd->block[24].diff);
+            (xd->qcoeff, xd->block[0].dequant,
+             xd->predictor, xd->dst.y_buffer,
+             xd->dst.y_stride, xd->eobs, &xd->block[24].diff_base[xd->block[24].diff_offset]);
    }
    else if ((xd->frame_type == KEY_FRAME  ||  xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) && xd->mode_info_context->mbmi.mode == B_PRED)
    {
        for (i = 0; i < 16; i++)
        {
-
            BLOCKD *b = &xd->block[i];
-            vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+            short *qcoeff = b->qcoeff_base + b->qcoeff_offset;
+            vp8_predict_intra4x4(b, b->bmi.mode, b->predictor_base + b->predictor_offset);

            if (xd->eobs[i] > 1)
            {
                DEQUANT_INVOKE(&pbi->dequant, idct_add)
-                    (b->qcoeff, b->dequant,  b->predictor,
+                    (qcoeff, b->dequant,  b->predictor_base + b->predictor_offset,
                    *(b->base_dst) + b->dst, 16, b->dst_stride);
            }
            else
            {
                IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
-                    (b->qcoeff[0] * b->dequant[0], b->predictor,
+                    (qcoeff[0] * b->dequant[0], b->predictor_base + b->predictor_offset,
                    *(b->base_dst) + b->dst, 16, b->dst_stride);
-                ((int *)b->qcoeff)[0] = 0;
+                ((int *)qcoeff)[0] = 0;
            }
        }
-
    }
    else
    {
        DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)
-                        (xd->qcoeff, xd->block[0].dequant,
-                         xd->predictor, xd->dst.y_buffer,
-                         xd->dst.y_stride, xd->eobs);
+            (xd->qcoeff, xd->block[0].dequant,
+             xd->predictor, xd->dst.y_buffer,
+             xd->dst.y_stride, xd->eobs);
    }

    DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block)
-                    (xd->qcoeff+16*16, xd->block[16].dequant,
-                     xd->predictor+16*16, xd->dst.u_buffer, xd->dst.v_buffer,
-                     xd->dst.uv_stride, xd->eobs+16);
+        (xd->qcoeff+16*16, xd->block[16].dequant,
+         xd->predictor+16*16, xd->dst.u_buffer, xd->dst.v_buffer,
+         xd->dst.uv_stride, xd->eobs+16);
 }


@@ -320,10 +364,8 @@ FILE *vpxlog = 0;



-void vp8_decode_mb_row(VP8D_COMP *pbi,
-                       VP8_COMMON *pc,
-                       int mb_row,
-                       MACROBLOCKD *xd)
+static void
+decode_mb_row(VP8D_COMP *pbi, VP8_COMMON *pc, int mb_row, MACROBLOCKD *xd)
 {

    int i;
@@ -345,6 +387,13 @@ void vp8_decode_mb_row(VP8D_COMP *pbi,
    xd->mb_to_top_edge = -((mb_row * 16)) << 3;
    xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;

+
+    xd->dst.buffer_alloc = pc->yv12_fb[dst_fb_idx].buffer_alloc;
+    xd->dst.buffer_size = pc->yv12_fb[dst_fb_idx].buffer_size;
+#if CONFIG_OPENCL
+    xd->dst.buffer_mem = pc->yv12_fb[dst_fb_idx].buffer_mem;
+#endif
+
    for (mb_col = 0; mb_col < pc->mb_cols; mb_col++)
    {

@@ -380,6 +429,11 @@ void vp8_decode_mb_row(VP8D_COMP *pbi,
        xd->pre.y_buffer = pc->yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
        xd->pre.u_buffer = pc->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
        xd->pre.v_buffer = pc->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
+        xd->pre.buffer_alloc = pc->yv12_fb[ref_fb_idx].buffer_alloc;
+        xd->pre.buffer_size = pc->yv12_fb[ref_fb_idx].buffer_size;
+#if CONFIG_OPENCL
+        xd->pre.buffer_mem = pc->yv12_fb[ref_fb_idx].buffer_mem;
+#endif

        if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME)
        {
@@ -395,7 +449,7 @@ void vp8_decode_mb_row(VP8D_COMP *pbi,
        else
        pbi->debugoutput =0;
        */
-        vp8_decode_macroblock(pbi, xd);
+        decode_macroblock(pbi, xd);

        /* check if the boolean decoder has suffered an error */
        xd->corrupted |= vp8dx_bool_error(xd->current_bc);
@@ -519,7 +573,7 @@ static void init_frame(VP8D_COMP *pbi)
        vpx_memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
        xd->mb_segement_abs_delta = SEGMENT_DELTADATA;

-        /* reset the mode ref deltasa for loop filter */
+        /* reset the mode ref deltas for loop filter */
        vpx_memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
        vpx_memset(xd->mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));

@@ -537,14 +591,13 @@ static void init_frame(VP8D_COMP *pbi)
    }
    else
    {
-        if (!pc->use_bilinear_mc_filter)
-            pc->mcomp_filter_type = SIXTAP;
-        else
-            pc->mcomp_filter_type = BILINEAR;

-        /* To enable choice of different interploation filters */
+        /* To enable choice of different interpolation filters */
        if (pc->mcomp_filter_type == SIXTAP)
        {
+#if CONFIG_OPENCL
+            xd->sixtap_filter = CL_TRUE;
+#endif
            xd->subpixel_predict      = SUBPIX_INVOKE(RTCD_VTABLE(subpix), sixtap4x4);
            xd->subpixel_predict8x4   = SUBPIX_INVOKE(RTCD_VTABLE(subpix), sixtap8x4);
            xd->subpixel_predict8x8   = SUBPIX_INVOKE(RTCD_VTABLE(subpix), sixtap8x8);
@@ -552,6 +605,9 @@ static void init_frame(VP8D_COMP *pbi)
        }
        else
        {
+#if CONFIG_OPENCL
+            xd->sixtap_filter = CL_FALSE;
+#endif
            xd->subpixel_predict      = SUBPIX_INVOKE(RTCD_VTABLE(subpix), bilinear4x4);
            xd->subpixel_predict8x4   = SUBPIX_INVOKE(RTCD_VTABLE(subpix), bilinear8x4);
            xd->subpixel_predict8x8   = SUBPIX_INVOKE(RTCD_VTABLE(subpix), bilinear8x8);
@@ -567,6 +623,7 @@ static void init_frame(VP8D_COMP *pbi)
    xd->corrupted = 0; /* init without corruption */
 }

+
 int vp8_decode_frame(VP8D_COMP *pbi)
 {
    vp8_reader *const bc = & pbi->bc;
@@ -616,9 +673,12 @@ int vp8_decode_frame(VP8D_COMP *pbi)
        pc->vert_scale = data[6] >> 6;
        data += 7;

+        //Allow resolution changes on key frames.
        if (Width != pc->Width  ||  Height != pc->Height)
        {
+#if CONFIG_MULTITHREAD
            int prev_mb_rows = pc->mb_rows;
+#endif

            if (pc->Width <= 0)
            {
@@ -809,19 +869,17 @@ int vp8_decode_frame(VP8D_COMP *pbi)

    pc->refresh_last_frame = pc->frame_type == KEY_FRAME  ||  vp8_read_bit(bc);

-    if (0)
-    {
-        FILE *z = fopen("decodestats.stt", "a");
-        fprintf(z, "%6d F:%d,G:%d,A:%d,L:%d,Q:%d\n",
-                pc->current_video_frame,
-                pc->frame_type,
-                pc->refresh_golden_frame,
-                pc->refresh_alt_ref_frame,
-                pc->refresh_last_frame,
-                pc->base_qindex);
-        fclose(z);
-    }
-
+#if 0
+   FILE *z = fopen("decodestats.stt", "a");
+    fprintf(z, "%6d F:%d,G:%d,A:%d,L:%d,Q:%d\n",
+            pc->current_video_frame,
+            pc->frame_type,
+            pc->refresh_golden_frame,
+            pc->refresh_alt_ref_frame,
+            pc->refresh_last_frame,
+            pc->base_qindex);
+    fclose(z);
+#endif

    {
        /* read coef probability tree */
@@ -842,6 +900,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)
                    }
    }

+    //Set up the macroblock's previous/destination buffers
    vpx_memcpy(&xd->pre, &pc->yv12_fb[pc->lst_fb_idx], sizeof(YV12_BUFFER_CONFIG));
    vpx_memcpy(&xd->dst, &pc->yv12_fb[pc->new_fb_idx], sizeof(YV12_BUFFER_CONFIG));

@@ -851,13 +910,13 @@ int vp8_decode_frame(VP8D_COMP *pbi)
 #endif
        vp8_setup_intra_recon(&pc->yv12_fb[pc->new_fb_idx]);

+    /* clear out the coeff buffer */
+    vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
+
    vp8_setup_block_dptrs(xd);

    vp8_build_block_doffsets(xd);

-    /* clear out the coeff buffer */
-    vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
-
    /* Read the mb_no_coeff_skip flag */
    pc->mb_no_coeff_skip = (int)vp8_read_bit(bc);

@@ -868,6 +927,13 @@ int vp8_decode_frame(VP8D_COMP *pbi)

    vpx_memcpy(&xd->block[0].bmi, &xd->mode_info_context->bmi[0], sizeof(B_MODE_INFO));

+#if PROFILE_OUTPUT
+    if (pc->frame_type == KEY_FRAME)
+        printf("Key Frame\n");
+    else
+        printf("Inter-Frame\n");
+#endif
+
 #if CONFIG_MULTITHREAD
    if (pbi->b_multithreaded_rd && pc->multi_token_partition != ONE_PARTITION)
    {
@@ -888,7 +954,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)
        int ibc = 0;
        int num_part = 1 << pc->multi_token_partition;

-        /* Decode the individual macro block */
+        /* Decode the individual macro blocks */
        for (mb_row = 0; mb_row < pc->mb_rows; mb_row++)
        {

@@ -901,11 +967,14 @@ int vp8_decode_frame(VP8D_COMP *pbi)
                    ibc = 0;
            }

-            vp8_decode_mb_row(pbi, pc, mb_row, xd);
+            decode_mb_row(pbi, pc, mb_row, xd);
        }
    }

-
+#if CONFIG_OPENCL
+    vp8_decode_frame_cl_finish(pbi);
+#endif
+    
    stop_token_decoder(pbi);

    /* Collect information about decoder corruption. */
--- a/vp8/decoder/dequantize.c
+++ b/vp8/decoder/dequantize.c
@@ -21,8 +21,8 @@ extern void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch);
 void vp8_dequantize_b_c(BLOCKD *d)
 {
    int i;
-    short *DQ  = d->dqcoeff;
-    short *Q   = d->qcoeff;
+    short *DQ  = d->dqcoeff_base + d->dqcoeff_offset;
+    short *Q   = d->qcoeff_base + d->qcoeff_offset;
    short *DQC = d->dequant;

    for (i = 0; i < 16; i++)
--- a/vp8/decoder/detokenize.c
+++ b/vp8/decoder/detokenize.c
@@ -19,7 +19,13 @@
 #define BOOL_DATA UINT8

 #define OCB_X PREV_COEF_CONTEXTS * ENTROPY_NODES
-DECLARE_ALIGNED(16, UINT8, vp8_coef_bands_x[16]) = { 0, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X, 6 * OCB_X, 4 * OCB_X, 5 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 7 * OCB_X};
+DECLARE_ALIGNED(16, static const unsigned char, coef_bands_x[16]) = /* offsets added to Prob, indexed by c in vp8_decode_mb_tokens */
+{ /* every entry is a multiple of OCB_X; NOTE(review): entries are unsigned char - confirm 7 * OCB_X fits in 255 */
+    0 * OCB_X, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X,
+    6 * OCB_X, 4 * OCB_X, 5 * OCB_X, 6 * OCB_X,
+    6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X,
+    6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 7 * OCB_X
+};
 #define EOB_CONTEXT_NODE            0
 #define ZERO_CONTEXT_NODE           1
 #define ONE_CONTEXT_NODE            2
@@ -135,7 +141,7 @@ DECLARE_ALIGNED(16, extern const unsigned char, vp8dx_bitreader_norm[256]);
            Prob = coef_probs; \
            if(c<15) {\
            ++c; \
-            Prob += vp8_coef_bands_x[c]; \
+            Prob += coef_bands_x[c]; \
            goto branch; \
            } goto BLOCK_FINISHED; /*for malformed input */\
        } \
@@ -149,8 +155,8 @@ DECLARE_ALIGNED(16, extern const unsigned char, vp8dx_bitreader_norm[256]);
    Prob = coef_probs + (ENTROPY_NODES*2); \
    if(c < 15){\
        qcoeff_ptr [ scan[c] ] = (INT16) v; \
-        ++c; \
-        goto DO_WHILE; }\
+        continue; \
+    }\
    qcoeff_ptr [ scan[15] ] = (INT16) v; \
    goto BLOCK_FINISHED;

@@ -175,7 +181,7 @@ int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
 {
    ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)x->above_context;
    ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)x->left_context;
-    const VP8_COMMON *const oc = & dx->common;
+    const FRAME_CONTEXT * const fc = &dx->common.fc;

    BOOL_DECODER *bc = x->current_bc;

@@ -230,7 +236,7 @@ int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
    range   = bc->range;


-    coef_probs = oc->fc.coef_probs [type] [ 0 ] [0];
+    coef_probs = fc->coef_probs [type] [ 0 ] [0];

 BLOCK_LOOP:
    a = A + vp8_block2above[i];
@@ -243,8 +249,9 @@ BLOCK_LOOP:
    Prob = coef_probs;
    Prob += v * ENTROPY_NODES;

-DO_WHILE:
-    Prob += vp8_coef_bands_x[c];
+do{
+
+    Prob += coef_bands_x[c];
    DECODE_AND_BRANCH_IF_ZERO(Prob[EOB_CONTEXT_NODE], BLOCK_FINISHED);

 CHECK_0_:
@@ -322,9 +329,8 @@ ONE_CONTEXT_NODE_0_:
    if (c < 15)
    {
        qcoeff_ptr [ scan[c] ] = (INT16) v;
-        ++c;
-        goto DO_WHILE;
    }
+} while (c++ < 15);

    qcoeff_ptr [ scan[15] ] = (INT16) v;
 BLOCK_FINISHED:
@@ -342,7 +348,7 @@ BLOCK_FINISHED:
        type = 0;
        i = 0;
        stop = 16;
-        coef_probs = oc->fc.coef_probs [type] [ 0 ] [0];
+        coef_probs = fc->coef_probs [type] [ 0 ] [0];
        qcoeff_ptr -= (24*16 + 16);
        goto BLOCK_LOOP;
    }
@@ -350,7 +356,7 @@ BLOCK_FINISHED:
    if (i == 16)
    {
        type = 2;
-        coef_probs = oc->fc.coef_probs [type] [ 0 ] [0];
+        coef_probs = fc->coef_probs [type] [ 0 ] [0];
        stop = 24;
        goto BLOCK_LOOP;
    }
--- a/vp8/decoder/generic/dsystemdependent.c
+++ b/vp8/decoder/generic/dsystemdependent.c
@@ -15,6 +15,7 @@

 extern void vp8_arch_x86_decode_init(VP8D_COMP *pbi);
 extern void vp8_arch_arm_decode_init(VP8D_COMP *pbi);
+extern void vp8_arch_opencl_decode_init(VP8D_COMP *pbi);

 void vp8_dmachine_specific_config(VP8D_COMP *pbi)
 {
@@ -36,4 +37,8 @@ void vp8_dmachine_specific_config(VP8D_COMP *pbi)
 #if ARCH_ARM
    vp8_arch_arm_decode_init(pbi);
 #endif
+
+#if CONFIG_OPENCL && (ENABLE_CL_IDCT_DEQUANT)
+    vp8_arch_opencl_decode_init(pbi);
+#endif
 }
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c
@@ -34,45 +34,21 @@
 #include "vpx_ports/arm.h"
 #endif

+#include "vpx_config.h"
+#if CONFIG_OPENCL
+#include "vp8/common/opencl/blockd_cl.h"
+#include "vp8/common/opencl/vp8_opencl.h"
+#endif
+
 extern void vp8_init_loop_filter(VP8_COMMON *cm);
 extern void vp8cx_init_de_quantizer(VP8D_COMP *pbi);

-#if CONFIG_DEBUG
-void vp8_recon_write_yuv_frame(unsigned char *name, YV12_BUFFER_CONFIG *s)
-{
-    FILE *yuv_file = fopen((char *)name, "ab");
-    unsigned char *src = s->y_buffer;
-    int h = s->y_height;
-
-    do
-    {
-        fwrite(src, s->y_width, 1,  yuv_file);
-        src += s->y_stride;
-    }
-    while (--h);
-
-    src = s->u_buffer;
-    h = s->uv_height;
-
-    do
-    {
-        fwrite(src, s->uv_width, 1,  yuv_file);
-        src += s->uv_stride;
-    }
-    while (--h);
-
-    src = s->v_buffer;
-    h = s->uv_height;
-
-    do
-    {
-        fwrite(src, s->uv_width, 1, yuv_file);
-        src += s->uv_stride;
-    }
-    while (--h);
-
-    fclose(yuv_file);
-}
+#define PROFILE_OUTPUT 0
+#if PROFILE_OUTPUT
+struct vpx_usec_timer frame_timer;
+struct vpx_usec_timer loop_filter_timer;
+unsigned int total_mb = 0;
+unsigned int total_loop_filter = 0;
 #endif

 void vp8dx_initialize()
@@ -113,7 +89,6 @@ VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf)
    pbi->common.current_video_frame = 0;
    pbi->ready_for_new_data = 1;

-    pbi->CPUFreq = 0; /*vp8_get_processor_freq();*/
 #if CONFIG_MULTITHREAD
    pbi->max_threads = oxcf->max_threads;
    vp8_decoder_create_threads(pbi);
@@ -151,39 +126,11 @@ void vp8dx_remove_decompressor(VP8D_PTR ptr)
    vp8_decoder_remove_threads(pbi);
 #endif
    vp8_remove_common(&pbi->common);
+
    vpx_free(pbi);
 }


-void vp8dx_set_setting(VP8D_PTR comp, VP8D_SETTING oxst, int x)
-{
-    VP8D_COMP *pbi = (VP8D_COMP *) comp;
-
-    (void) pbi;
-    (void) x;
-
-    switch (oxst)
-    {
-    case VP8D_OK:
-        break;
-    }
-}
-
-int vp8dx_get_setting(VP8D_PTR comp, VP8D_SETTING oxst)
-{
-    VP8D_COMP *pbi = (VP8D_COMP *) comp;
-
-    (void) pbi;
-
-    switch (oxst)
-    {
-    case VP8D_OK:
-        break;
-    }
-
-    return -1;
-}
-
 int vp8dx_get_reference(VP8D_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
 {
    VP8D_COMP *pbi = (VP8D_COMP *) ptr;
@@ -203,6 +150,8 @@ int vp8dx_get_reference(VP8D_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_C

    return 0;
 }
+
+
 int vp8dx_set_reference(VP8D_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
 {
    VP8D_COMP *pbi = (VP8D_COMP *) ptr;
@@ -316,7 +265,6 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
    VP8D_COMP *pbi = (VP8D_COMP *) ptr;
    VP8_COMMON *cm = &pbi->common;
    int retcode = 0;
-    struct vpx_usec_timer timer;

    /*if(pbi->ready_for_new_data == 0)
        return -1;*/
@@ -381,14 +329,87 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign

    pbi->common.error.setjmp = 1;

-    vpx_usec_timer_start(&timer);
-
    /*cm->current_video_frame++;*/
    pbi->Source = source;
    pbi->source_sz = size;

+#if CONFIG_OPENCL
+    pbi->mb.cl_commands = NULL;
+    if (cl_initialized == CL_SUCCESS){
+        int err;
+        //Create command queue for macroblock.
+        pbi->mb.cl_commands = clCreateCommandQueue(cl_data.context, cl_data.device_id, 0, &err);
+        if (!pbi->mb.cl_commands || err != CL_SUCCESS) {
+            printf("Error: Failed to create a command queue!\n");
+            cl_destroy(NULL, VP8_CL_TRIED_BUT_FAILED);
+        }
+
+        pbi->mb.cl_diff_mem = NULL;
+        pbi->mb.cl_predictor_mem = NULL;
+        pbi->mb.cl_qcoeff_mem = NULL;
+        pbi->mb.cl_dqcoeff_mem = NULL;
+        pbi->mb.cl_eobs_mem = NULL;
+
+#define SET_ON_ALLOC 0
+#if SET_ON_ALLOC
+        
+#if ENABLE_CL_SUBPIXEL || ENABLE_CL_IDCT_DEQUANT
+            VP8_CL_CREATE_BUF(pbi->mb.cl_commands, pbi->mb.cl_predictor_mem, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,
+                    sizeof(cl_uchar)*384, pbi->mb.predictor, goto BUF_DONE, -1);
+#endif
+
+#if ENABLE_CL_IDCT_DEQUANT
+            VP8_CL_CREATE_BUF(pbi->mb.cl_commands, pbi->mb.cl_diff_mem, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,
+                    sizeof(cl_short)*400, pbi->mb.diff, goto BUF_DONE, -1);
+
+            VP8_CL_CREATE_BUF(pbi->mb.cl_commands, pbi->mb.cl_qcoeff_mem, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,
+                    sizeof(cl_short)*400, pbi->mb.qcoeff, goto BUF_DONE,-1);
+
+            VP8_CL_CREATE_BUF(pbi->mb.cl_commands, pbi->mb.cl_dqcoeff_mem, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,
+                    sizeof(cl_short)*400, pbi->mb.dqcoeff, goto BUF_DONE,-1);
+
+            VP8_CL_CREATE_BUF(pbi->mb.cl_commands, pbi->mb.cl_eobs_mem, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,
+                    sizeof(cl_char)*25, pbi->mb.eobs, goto BUF_DONE,-1);
+#endif
+#else
+#if ENABLE_CL_IDCT_DEQUANT || ENABLE_CL_SUBPIXEL
+            VP8_CL_CREATE_BUF(pbi->mb.cl_commands, pbi->mb.cl_predictor_mem, CL_MEM_READ_WRITE,
+                    sizeof(cl_uchar)*384, NULL, goto BUF_DONE,-1);
+#endif
+
+#if ENABLE_CL_IDCT_DEQUANT
+            VP8_CL_CREATE_BUF(pbi->mb.cl_commands, pbi->mb.cl_diff_mem, CL_MEM_READ_WRITE,
+                    sizeof(cl_short)*400, NULL, goto BUF_DONE,-1);
+
+            VP8_CL_CREATE_BUF(pbi->mb.cl_commands, pbi->mb.cl_qcoeff_mem, CL_MEM_READ_WRITE,
+                    sizeof(cl_short)*400, NULL, goto BUF_DONE,-1);
+
+            VP8_CL_CREATE_BUF(pbi->mb.cl_commands, pbi->mb.cl_dqcoeff_mem, CL_MEM_READ_WRITE,
+                    sizeof(cl_short)*400, NULL, goto BUF_DONE,-1);
+
+            VP8_CL_CREATE_BUF(pbi->mb.cl_commands, pbi->mb.cl_eobs_mem, CL_MEM_READ_WRITE,
+                    sizeof(cl_char) * 25, NULL, goto BUF_DONE,-1);
+#endif
+#endif
+    }
+#if ENABLE_CL_IDCT_DEQUANT || ENABLE_CL_SUBPIXEL
+    BUF_DONE:
+#endif
+#endif
+
+#if PROFILE_OUTPUT
+    printf("Frame size = %d * %d\n", cm->Height, cm->Width);
+    printf("Macroblocks = %d * %d\n", cm->mb_rows, cm->mb_cols);
+
+    vpx_usec_timer_start(&frame_timer);
+#endif
    retcode = vp8_decode_frame(pbi);

+#if PROFILE_OUTPUT
+    vpx_usec_timer_mark(&frame_timer);
+    total_mb += vpx_usec_timer_elapsed(&frame_timer);
+#endif
+
    if (retcode < 0)
    {
 #if HAVE_ARMV7
@@ -443,36 +464,56 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign

        if(pbi->common.filter_level)
        {
+
+#if PROFILE_OUTPUT
            struct vpx_usec_timer lpftimer;
            vpx_usec_timer_start(&lpftimer);
+#endif
+           
            /* Apply the loop filter if appropriate. */
-
            vp8_loop_filter_frame(cm, &pbi->mb, cm->filter_level);

+#if PROFILE_OUTPUT
            vpx_usec_timer_mark(&lpftimer);
            pbi->time_loop_filtering += vpx_usec_timer_elapsed(&lpftimer);

+            printf("Loop Filter\n");
+            total_loop_filter += vpx_usec_timer_elapsed(&lpftimer);
+#if 0
+            if (pbi->common.filter_type == NORMAL_LOOPFILTER){
+                printf("Normal LF Time (us): %d\n", vpx_usec_timer_elapsed(&lpftimer));
+            } else {
+                printf("Simple LF Time (us): %d\n", vpx_usec_timer_elapsed(&lpftimer));
+            }
+#endif
+#endif
+
            cm->last_frame_type = cm->frame_type;
            cm->last_filter_type = cm->filter_type;
            cm->last_sharpness_level = cm->sharpness_level;
        }
+#if PROFILE_OUTPUT
+        else {
+            printf("No Loop Filter\n");
+        }
+#endif
        vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
    }

-#if 0
-    /* DEBUG code */
-    /*vp8_recon_write_yuv_frame("recon.yuv", cm->frame_to_show);*/
-    if (cm->current_video_frame <= 5)
-        write_dx_frame_to_file(cm->frame_to_show, cm->current_video_frame);
+#if CONFIG_OPENCL
+    if (cl_initialized == CL_SUCCESS){
+        //Copy buffer_alloc to buffer_mem so YV12_BUFFER_CONFIG can be used as
+        //a reference frame (i.e. YV12..buffer_mem contains the same data as buffer_alloc).
+        vp8_cl_mb_prep(&pbi->mb, DST_BUF);
+
+        if (pbi->mb.cl_commands != NULL)
+            clReleaseCommandQueue(pbi->mb.cl_commands);
+        pbi->mb.cl_commands = NULL;
+    }
 #endif

    vp8_clear_system_state();

-    vpx_usec_timer_mark(&timer);
-    pbi->decode_microseconds = vpx_usec_timer_elapsed(&timer);
-
-    pbi->time_decoding += pbi->decode_microseconds;
-
    /*vp8_print_modes_and_motion_vectors( cm->mi, cm->mb_rows,cm->mb_cols, cm->current_video_frame);*/

    if (cm->show_frame)
@@ -524,8 +565,18 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
    }
 #endif
    pbi->common.error.setjmp = 0;
+
+
+#if PROFILE_OUTPUT
+    //Dump the total MB/Loop Filter processing times.
+    //This is cumulative between frames, so only use the last output value.
+    printf("MB Time (us): %d, LF Time (us): %d\n", total_mb, total_loop_filter);
+#endif
+
+
    return retcode;
 }
+
 int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, vp8_ppflags_t *flags)
 {
    int ret = -1;
--- a/vp8/decoder/onyxd_int.h
+++ b/vp8/decoder/onyxd_int.h
@@ -81,12 +81,6 @@ typedef struct VP8Decompressor
    const unsigned char *Source;
    unsigned int   source_sz;

-
-    unsigned int CPUFreq;
-    unsigned int decode_microseconds;
-    unsigned int time_decoding;
-    unsigned int time_loop_filtering;
-
 #if CONFIG_MULTITHREAD
    /* variable for threading */

--- a/vp8/decoder/opencl/decodframe_cl.c
+++ b/vp8/decoder/opencl/decodframe_cl.c
@@ -0,0 +1,357 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "../onyxd_int.h"
+#include "vp8/common/header.h"
+#include "vp8/common/reconintra.h"
+#include "vp8/common/reconintra4x4.h"
+#include "vp8/common/recon.h"
+#include "vp8/common/reconinter.h"
+//#include "../dequantize.h"
+//#include "../detokenize.h"
+//#include "vp8/common/alloccommon.h"
+//#include "vp8/common/entropymode.h"
+//#include "vp8/common/quant_common.h"
+//#include "vpx_scale/vpxscale.h"
+//#include "vpx_scale/yv12extend.h"
+//#include "vp8/common/setupintrarecon.h"
+
+//#include "../decodemv.h"
+//#include "vp8/common/extend.h"
+//#include "vpx_mem/vpx_mem.h"
+//#include "vp8/common/idct.h"
+//#include "../dequantize.h"
+//#include "vp8/common/predictdc.h"
+//#include "vp8/common/threading.h"
+//#include "../decoderthreading.h"
+//#include "../dboolhuff.h"
+//#include "vp8/common/blockd.h"
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "vpx_config.h"
+#if CONFIG_OPENCL
+#include "vp8/common/opencl/vp8_opencl.h"
+#include "vp8/common/opencl/blockd_cl.h"
+#include "vp8/common/opencl/reconinter_cl.h"
+#include "dequantize_cl.h"
+#endif
+
+#define PROFILE_OUTPUT 0
+
+//Implemented in ../decodframe.c
+extern void mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd);
+
+void mb_init_dequantizer_cl(MACROBLOCKD *xd){ //Called from mb_init_dequantizer() when CONFIG_OPENCL && ENABLE_CL_IDCT_DEQUANT
+    int i, err; //NOTE(review): err is not referenced directly here; presumably VP8_CL_CREATE_BUF assigns it - confirm
+    //Set up per-block dequant CL memory: one buffer of 16 cl_shorts for each of the 25 blocks in xd.
+    //Eventually, might be able to set up one large buffer containing the dequant data of every block.
+    if (cl_initialized == CL_SUCCESS){ //Does nothing unless OpenCL initialized successfully
+        for (i=0; i < 25; i++){ //NOTE(review): cl_dequant_mem is assigned on every call; verify any previous buffer is released
+
+#if 1 //Initialize CL memory on allocation? (1: the block's dequant table is passed as the host pointer)
+            VP8_CL_CREATE_BUF(xd->cl_commands, xd->block[i].cl_dequant_mem,
+                ,
+                16*sizeof(cl_short),
+                xd->block[i].dequant,,
+            ); //The flags, failure-action and return-value arguments are intentionally left empty
+#else //Alternative: same buffer size, but NULL is passed as the host pointer
+            VP8_CL_CREATE_BUF(xd->cl_commands, xd->block[i].cl_dequant_mem,
+                ,
+                16*sizeof(cl_short),
+                NULL,,
+            );
+#endif
+        }
+    }
+}
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define RTCD_VTABLE(x) (&(pbi)->common.rtcd.x)
+#else
+#define RTCD_VTABLE(x) NULL
+#endif
+
+/* Prediction-only path: vp8_decode_macroblock_cl() calls this when eobtotal == 0 and the mode is neither B_PRED
+ *  nor SPLITMV. Modified from skip_recon_mb(): instead of writing the result to the predictor buffer and then
+ *  copying it to the dst buffer, the result is written directly to the dst buffer, eliminating the extra copy. */
+static void skip_recon_mb_cl(VP8D_COMP *pbi, MACROBLOCKD *xd)
+{
+    if (xd->frame_type == KEY_FRAME  ||  xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
+    { //Intra macroblock: no _cl predictor variant is used on this path
+
+        vp8_build_intra_predictors_mbuv_s(xd);
+        RECON_INVOKE(&pbi->common.rtcd.recon,
+                     build_intra_predictors_mby_s)(xd);
+
+    }
+    else
+    { //Inter macroblock
+#if ENABLE_CL_SUBPIXEL
+        if (cl_initialized == CL_SUCCESS)
+        {
+            vp8_build_inter_predictors_mb_s_cl(xd);
+        } else //Binds to the braces after #endif, i.e. the non-CL predictor below
+#endif
+        {
+            vp8_build_inter_predictors_mb_s(xd);
+        }
+        VP8_CL_FINISH(xd->cl_commands); //Invoked for both the CL and non-CL predictor; presumably waits on the queue - macro defined elsewhere
+#if !ONE_CQ_PER_MB //Blocks 0, 16 and 20 carry their own command queues here; NOTE(review): presumably the Y, U and V queues - confirm
+        VP8_CL_FINISH(xd->block[0].cl_commands);
+        VP8_CL_FINISH(xd->block[16].cl_commands);
+        VP8_CL_FINISH(xd->block[20].cl_commands);
+#endif
+    }
+}
+
+void vp8_decode_macroblock_cl(VP8D_COMP *pbi, MACROBLOCKD *xd, int eobtotal)
+{
+    int i;
+
+    if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV && eobtotal == 0)
+    {
+        xd->mode_info_context->mbmi.dc_diff = 0;
+        skip_recon_mb_cl(pbi, xd);
+        return;
+    }
+
+    if (xd->segmentation_enabled)
+        mb_init_dequantizer(pbi, xd);
+
+    /* do prediction */
+    if (xd->frame_type == KEY_FRAME  ||  xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
+    {
+        vp8_build_intra_predictors_mbuv(xd);
+
+        if (xd->mode_info_context->mbmi.mode != B_PRED)
+        {
+            RECON_INVOKE(&pbi->common.rtcd.recon,
+                         build_intra_predictors_mby)(xd);
+        } else {
+            vp8_intra_prediction_down_copy(xd);
+        }
+    }
+    else
+    {
+#if ENABLE_CL_SUBPIXEL
+        vp8_build_inter_predictors_mb_cl(xd);
+#else
+        vp8_build_inter_predictors_mb(xd);
+#endif
+
+#if !ENABLE_CL_IDCT_DEQUANT
+        //Wait for inter-predict if dequant/IDCT is being done on the CPU
+        VP8_CL_FINISH(xd->cl_commands);
+#endif
+    }
+
+    /* dequantization and idct */
+    if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV)
+    {
+        BLOCKD *b = &xd->block[24];
+        short *qcoeff = b->qcoeff_base + b->qcoeff_offset;
+        vp8_second_order_fn_t second_order;
+
+#if ENABLE_CL_IDCT_DEQUANT
+        if (cl_initialized == CL_SUCCESS){
+            vp8_cl_block_prep(b, DEQUANT|QCOEFF);
+            vp8_dequantize_b_cl(b);
+            vp8_cl_block_finish(b, DQCOEFF);
+            VP8_CL_FINISH(b->cl_commands); //Keep until qcoeff memset below is CL
+        }
+        else
+#endif
+        {
+            DEQUANT_INVOKE(&pbi->dequant, block)(b);
+        }
+
+
+        /* do 2nd order transform on the dc block */
+        if (xd->eobs[24] > 1){
+            second_order = IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16);
+            ((int *)qcoeff)[0] = 0;
+            ((int *)qcoeff)[1] = 0;
+            ((int *)qcoeff)[2] = 0;
+            ((int *)qcoeff)[3] = 0;
+            ((int *)qcoeff)[4] = 0;
+            ((int *)qcoeff)[5] = 0;
+            ((int *)qcoeff)[6] = 0;
+            ((int *)qcoeff)[7] = 0;
+        } else {
+            second_order = IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1);
+            ((int *)qcoeff)[0] = 0;
+        }
+
+#if ENABLE_CL_IDCT_DEQUANT
+        if (cl_initialized == CL_SUCCESS){
+            int y_off = xd->dst.y_buffer - xd->dst.buffer_alloc;
+            vp8_cl_block_prep(b, DQCOEFF|DIFF);
+
+            if (xd->eobs[24] > 1)
+            {
+                vp8_short_inv_walsh4x4_cl(b);
+            } else {
+                vp8_short_inv_walsh4x4_1_cl(b);
+            }
+            vp8_cl_block_finish(b, DIFF);
+
+            vp8_dequant_dc_idct_add_y_block_cl(&xd->block[0], 
+                    xd->dst.buffer_alloc, xd->dst.buffer_mem, y_off, xd->dst.y_stride, xd->eobs,
+                    xd->block[24].diff_offset);
+        }
+        else
+#endif
+        {
+            second_order(b->dqcoeff_base + b->dqcoeff_offset, &b->diff_base[b->diff_offset]);
+            DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block)
+                (xd->qcoeff, xd->block[0].dequant,
+                 xd->predictor, xd->dst.y_buffer,
+                 xd->dst.y_stride, xd->eobs, &xd->block[24].diff_base[xd->block[24].diff_offset]);
+        }
+    }
+    else if ((xd->frame_type == KEY_FRAME  ||  xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) && xd->mode_info_context->mbmi.mode == B_PRED)
+    {
+#if ENABLE_CL_IDCT_DEQUANT
+        if (cl_initialized == CL_SUCCESS)
+            vp8_cl_mb_prep(xd, DST_BUF);
+#endif
+        for (i = 0; i < 16; i++)
+        {
+            BLOCKD *b = &xd->block[i];
+            short *qcoeff = b->qcoeff_base + b->qcoeff_offset;
+#if ENABLE_CL_IDCT_DEQUANT
+            VP8_CL_FINISH(b->cl_commands);
+#endif
+            vp8_predict_intra4x4(b, b->bmi.mode, b->predictor_base + b->predictor_offset);
+
+#if ENABLE_CL_IDCT_DEQUANT
+            if (cl_initialized == CL_SUCCESS){
+                size_t dst_size = (4*b->dst_stride + b->dst + 4);
+                cl_mem dst_mem = xd->dst.buffer_mem;
+
+                int dst_off = *(b->base_dst) - xd->dst.buffer_alloc;
+
+                if (xd->eobs[i] > 1)
+                {
+                    vp8_cl_block_prep(b, QCOEFF|DEQUANT|PREDICTOR);
+                    vp8_dequant_idct_add_cl(b, *(b->base_dst), dst_mem, dst_off+b->dst, dst_size, b->qcoeff_offset, b->predictor_offset, 16, b->dst_stride, DEQUANT_INVOKE(&pbi->dequant, idct_add));
+                    vp8_cl_block_finish(b, QCOEFF);
+                }
+                else
+                {
+                    vp8_cl_block_prep(b, PREDICTOR|DIFF|QCOEFF|DEQUANT);
+                    vp8_dc_only_idct_add_cl(b, CL_FALSE, 0, b->qcoeff_offset, b->predictor_offset,
+                        *(b->base_dst), dst_mem, dst_off+b->dst, dst_size, 16, b->dst_stride);
+                    VP8_CL_FINISH(b->cl_commands);
+                    ((int *)(b->qcoeff_base + b->qcoeff_offset))[0] = 0; //Move into follow-up kernel?
+                }
+                vp8_cl_mb_finish(xd,DST_BUF);
+            }
+            else
+#endif
+            {
+                if (xd->eobs[i] > 1)
+                {
+                    DEQUANT_INVOKE(&pbi->dequant, idct_add)
+                        (qcoeff, b->dequant,  b->predictor_base + b->predictor_offset,
+                        *(b->base_dst) + b->dst, 16, b->dst_stride);
+                }
+                else
+                {
+                    IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
+                        (qcoeff[0] * b->dequant[0], b->predictor_base + b->predictor_offset,
+                        *(b->base_dst) + b->dst, 16, b->dst_stride);
+                    ((int *)qcoeff)[0] = 0;
+                }
+            }
+            
+        }
+    }
+    else
+    {
+#if ENABLE_CL_IDCT_DEQUANT
+        if (cl_initialized == CL_SUCCESS){
+            vp8_cl_mb_prep(xd,DST_BUF);
+            vp8_dequant_idct_add_y_block_cl(pbi, xd);
+            vp8_cl_mb_finish(xd,DST_BUF);
+        }
+        else
+#endif
+        {
+            DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)
+                (xd->qcoeff, xd->block[0].dequant,
+                 xd->predictor, xd->dst.y_buffer,
+                 xd->dst.y_stride, xd->eobs);
+        }
+    }
+
+#if ENABLE_CL_IDCT_DEQUANT
+    if (cl_initialized == CL_SUCCESS){
+        vp8_cl_mb_prep(xd,DST_BUF);
+        vp8_dequant_idct_add_uv_block_cl(pbi, xd,  DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block));
+        vp8_cl_mb_finish(xd,DST_BUF);
+        VP8_CL_FINISH(xd->cl_commands);
+    } else
+#endif
+    {
+    DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block)
+        (xd->qcoeff+16*16, xd->block[16].dequant,
+         xd->predictor+16*16, xd->dst.u_buffer, xd->dst.v_buffer,
+         xd->dst.uv_stride, xd->eobs+16);
+    }
+}
+
+void vp8_decode_frame_cl_finish(VP8D_COMP *pbi){
+    /* End-of-frame cleanup for the OpenCL path: drain the queues, then release CL buffers. */
+    //If using OpenCL, free all of the GPU buffers we've allocated.
+    if (cl_initialized == CL_SUCCESS){
+#if ENABLE_CL_IDCT_DEQUANT
+        int i;
+#endif
+
+        //Wait for stuff to finish, just in case
+        clFinish(pbi->mb.cl_commands);
+
+#if !ONE_CQ_PER_MB
+        clFinish(pbi->mb.block[0].cl_commands);
+        clFinish(pbi->mb.block[16].cl_commands);
+        clFinish(pbi->mb.block[20].cl_commands);
+        clReleaseCommandQueue(pbi->mb.block[0].cl_commands);
+        clReleaseCommandQueue(pbi->mb.block[16].cl_commands);
+        clReleaseCommandQueue(pbi->mb.block[20].cl_commands);
+#endif
+
+#if ENABLE_CL_IDCT_DEQUANT || ENABLE_CL_SUBPIXEL
+        //Free Predictor CL buffer
+        if (pbi->mb.cl_predictor_mem != NULL)
+            clReleaseMemObject(pbi->mb.cl_predictor_mem);
+#endif
+
+#if ENABLE_CL_IDCT_DEQUANT
+        //Free other CL Block/MBlock buffers
+        if (pbi->mb.cl_diff_mem != NULL)
+            clReleaseMemObject(pbi->mb.cl_diff_mem);
+        if (pbi->mb.cl_qcoeff_mem != NULL)
+            clReleaseMemObject(pbi->mb.cl_qcoeff_mem);
+        if (pbi->mb.cl_dqcoeff_mem != NULL)
+            clReleaseMemObject(pbi->mb.cl_dqcoeff_mem);
+        if (pbi->mb.cl_eobs_mem != NULL)
+            clReleaseMemObject(pbi->mb.cl_eobs_mem);
+        for (i = 0; i < 25; i++){
+            if (pbi->mb.block[i].cl_dequant_mem != NULL) //never hand a NULL handle to the CL runtime
+                clReleaseMemObject(pbi->mb.block[i].cl_dequant_mem);
+            pbi->mb.block[i].cl_dequant_mem = NULL;
+        }
+#endif
+    }
+}
--- a/vp8/decoder/opencl/decodframe_cl.h
+++ b/vp8/decoder/opencl/decodframe_cl.h
@@ -0,0 +1,31 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_DECODFRAME_CL_H
+#define VP8_DECODFRAME_CL_H
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#include "../onyxd_int.h"
+#include "vp8/common/blockd.h"
+
+//Decoder entry points of the OpenCL path (implemented in decodframe_cl.c)
+extern void mb_init_dequantizer_cl(MACROBLOCKD *xd);
+extern void vp8_decode_frame_cl_finish(VP8D_COMP *pbi); //waits on the CL queues and releases CL buffers when OpenCL is initialized
+extern void vp8_decode_macroblock_cl(VP8D_COMP *pbi, MACROBLOCKD *xd, int eobtotal);
+
+
+#ifdef  __cplusplus
+}
+#endif
+
+#endif  /* VP8_DECODFRAME_CL_H */
--- a/vp8/decoder/opencl/dequantize_cl.c
+++ b/vp8/decoder/opencl/dequantize_cl.c
@@ -0,0 +1,214 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+//ACW: Remove me after debugging.
+#include <stdio.h>
+#include <string.h>
+
+#include "vp8/common/opencl/blockd_cl.h"
+#include "vp8/common/opencl/idct_cl.h"
+#include "dequantize_cl.h"
+
+const char *dequantCompileOptions = "";
+const char *dequant_cl_file_name = "vp8/decoder/opencl/dequantize_cl.cl";
+
+void cl_memset_short(short *s, int c, size_t n) { /* n is a byte count: stores c into n/sizeof(short) consecutive shorts starting at s */
+    for (n /= sizeof(short); n > 0; --n)
+        *s++ = c; /* the int value is converted to short by the assignment */
+}
+
+void vp8_memset_short_cl(cl_mem mem, int offset, short val){
+    /* Empty stub: currently a no-op; mem, offset and val are all unused. */
+}
+
+int cl_destroy_dequant(){ /* Releases the three dequant kernels and the dequant program; always returns CL_SUCCESS. */
+    printf("Freeing dequant decoder resources\n");
+
+    VP8_CL_RELEASE_KERNEL(cl_data.vp8_dequant_dc_idct_add_kernel);
+    VP8_CL_RELEASE_KERNEL(cl_data.vp8_dequant_idct_add_kernel);
+    VP8_CL_RELEASE_KERNEL(cl_data.vp8_dequantize_b_kernel);
+
+    if (cl_data.dequant_program) /* skip the release when no program handle is set */
+        clReleaseProgram(cl_data.dequant_program);
+    cl_data.dequant_program = NULL; /* cleared unconditionally so the handle is not reused */
+
+    return CL_SUCCESS;
+}
+
+int cl_init_dequant() { /* Builds the dequant program and creates its kernels; returns CL_SUCCESS or VP8_CL_TRIED_BUT_FAILED. */
+    int err; /* not referenced directly below; presumably used inside VP8_CL_CREATE_KERNEL - confirm against the macro */
+
+    //Loads the source named by dequant_cl_file_name, compiled with dequantCompileOptions.
+
+    // Create the compute program from the file-defined source code
+    if (cl_load_program(&cl_data.dequant_program, dequant_cl_file_name,
+            dequantCompileOptions) != CL_SUCCESS)
+        return VP8_CL_TRIED_BUT_FAILED;
+
+    // Create the compute kernels in the program we wish to run
+    VP8_CL_CREATE_KERNEL(cl_data,dequant_program,vp8_dequant_dc_idct_add_kernel,"vp8_dequant_dc_idct_add_kernel");
+    VP8_CL_CREATE_KERNEL(cl_data,dequant_program,vp8_dequant_idct_add_kernel,"vp8_dequant_idct_add_kernel");
+    VP8_CL_CREATE_KERNEL(cl_data,dequant_program,vp8_dequantize_b_kernel,"vp8_dequantize_b_kernel");
+
+    //NOTE(review): reaching this point is treated as success; whether a failed kernel creation returns early depends on VP8_CL_CREATE_KERNEL - confirm.
+
+    return CL_SUCCESS;
+}
+
+void vp8_dequantize_b_cl(BLOCKD *d)
+{ /* Enqueues vp8_dequantize_b_kernel for block d on d->cl_commands. */
+    int err;
+    size_t global = 16; /* one work-item per coefficient (the scalar kernel path handles tid < 16) */
+    /* NOTE(review): VP8_CL_CHECK_SUCCESS appears to run its 4th argument (here the C routine vp8_dequantize_b_c) on failure - confirm against the macro. */
+    /* Set kernel arguments: dqcoeff buffer + offset, qcoeff buffer + offset, dequant buffer */
+    err = 0;
+    err = clSetKernelArg(cl_data.vp8_dequantize_b_kernel, 0, sizeof (cl_mem), &d->cl_dqcoeff_mem);
+    err |= clSetKernelArg(cl_data.vp8_dequantize_b_kernel, 1, sizeof (cl_int), &d->dqcoeff_offset);
+    err |= clSetKernelArg(cl_data.vp8_dequantize_b_kernel, 2, sizeof (cl_mem), &d->cl_qcoeff_mem);
+    err |= clSetKernelArg(cl_data.vp8_dequantize_b_kernel, 3, sizeof (cl_int), &d->qcoeff_offset);
+    err |= clSetKernelArg(cl_data.vp8_dequantize_b_kernel, 4, sizeof (cl_mem), &d->cl_dequant_mem);
+    VP8_CL_CHECK_SUCCESS( d->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to set kernel arguments!\n",
+        vp8_dequantize_b_c(d),
+    );
+
+    /* Execute the kernel */
+    err = clEnqueueNDRangeKernel( d->cl_commands, cl_data.vp8_dequantize_b_kernel, 1, NULL, &global, NULL , 0, NULL, NULL);
+    VP8_CL_CHECK_SUCCESS( d->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to execute kernel!\n",
+        printf("err = %d\n",err);\
+        vp8_dequantize_b_c(d),
+    );
+    /* The kernel is only enqueued here; this function performs no clFinish and no read-back itself. */
+}
+
+void vp8_dequant_idct_add_cl(BLOCKD *b, unsigned char *dest_base, cl_mem dest_mem, int dest_offset, size_t dst_size, int q_offset, int pred_offset, int pitch, int stride, vp8_dequant_idct_add_fn_t idct_add)
+{ /* Enqueues vp8_dequant_idct_add_kernel for one block; idct_add is the C routine passed to the error-check macro. */
+    int err;
+    size_t global = 1; /* the kernel is launched as a single work-item */
+    //dest_mem is a parameter: when it is NULL a temporary buffer is created below and released before returning
+    int free_mem = 0; /* becomes 1 when the destination buffer was created here */
+
+    if (dest_mem == NULL){
+        //Initialize destination memory (dst_size bytes copied from dest_base)
+        VP8_CL_CREATE_BUF(b->cl_commands, dest_mem, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,
+                dst_size, dest_base,,
+        );
+        free_mem = 1;
+    }
+    
+    /* Set kernel arguments (order must match the kernel's parameter list) */
+    err = 0;
+    err = clSetKernelArg(cl_data.vp8_dequant_idct_add_kernel, 0, sizeof (cl_mem), &b->cl_qcoeff_mem);
+    err |= clSetKernelArg(cl_data.vp8_dequant_idct_add_kernel, 1, sizeof (int), &q_offset);
+    err |= clSetKernelArg(cl_data.vp8_dequant_idct_add_kernel, 2, sizeof (cl_mem), &b->cl_dequant_mem);
+    err |= clSetKernelArg(cl_data.vp8_dequant_idct_add_kernel, 3, sizeof (cl_mem), &b->cl_predictor_mem);
+    err |= clSetKernelArg(cl_data.vp8_dequant_idct_add_kernel, 4, sizeof (int), &pred_offset);
+    err |= clSetKernelArg(cl_data.vp8_dequant_idct_add_kernel, 5, sizeof (cl_mem), &dest_mem);
+    err |= clSetKernelArg(cl_data.vp8_dequant_idct_add_kernel, 6, sizeof (int), &dest_offset);
+    err |= clSetKernelArg(cl_data.vp8_dequant_idct_add_kernel, 7, sizeof (int), &pitch);
+    err |= clSetKernelArg(cl_data.vp8_dequant_idct_add_kernel, 8, sizeof (int), &stride);
+    VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to set kernel arguments!\n",
+        idct_add(b->qcoeff_base+q_offset, b->dequant,  b->predictor_base + pred_offset,
+            dest_base + dest_offset, pitch, stride),
+    );
+
+    /* Execute the kernel */
+    err = clEnqueueNDRangeKernel( b->cl_commands, cl_data.vp8_dequant_idct_add_kernel, 1, NULL, &global, NULL , 0, NULL, NULL);
+    VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to execute kernel!\n",
+        printf("err = %d\n",err);\
+        idct_add(b->qcoeff_base+q_offset, b->dequant,  b->predictor_base + pred_offset,
+            dest_base + dest_offset, pitch, stride),
+    );
+
+    if (free_mem == 1){
+        /* Read back the result data from the device (non-blocking read of dst_size bytes into dest_base) */
+        err = clEnqueueReadBuffer(b->cl_commands, dest_mem, CL_FALSE, 0, dst_size, dest_base, 0, NULL, NULL);
+        VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+            "Error: Failed to read output array!\n",
+            idct_add(b->qcoeff_base+q_offset, b->dequant,  b->predictor_base + pred_offset,
+                dest_base + dest_offset, pitch, stride),
+        );
+
+        //CL Spec says this can be freed without clFinish first
+        clReleaseMemObject(dest_mem);
+    }
+
+    return;
+}
+
+//Can modify arguments. Only called from vp8_dequant_dc_idct_add_y_block_cl. Enqueues vp8_dequant_dc_idct_add_kernel for one block.
+void vp8_dequant_dc_idct_add_cl(
+    BLOCKD *b,
+    int qcoeff_offset,
+    int pred_offset,
+    unsigned char *dest_base,
+    int dest_off,
+    int pitch,
+    int stride,
+    int Dc_offset)
+{
+    int err;
+    int dq_offset = 0; /* passed to the kernel as its dequant_offset argument */
+    unsigned char *dest = dest_base + dest_off;
+ 
+    cl_mem dest_mem = NULL;
+    size_t dest_size;
+    size_t global = 1; /* the kernel is launched as a single work-item */
+    int dest_offset=0; /* always 0 here, so it adds nothing to dest_size below */
+
+    //Initialize dest_mem: a temporary buffer copied from dest, created and released on every call
+    dest_size = sizeof(cl_uchar)*(4*stride + dest_offset + 4);
+    VP8_CL_CREATE_BUF(b->cl_commands, dest_mem, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,
+            dest_size, dest,,
+    );
+
+    //Assuming that all input cl_mem has been initialized outside of this Fn.
+
+    /* Set kernel arguments (order must match the kernel's parameter list) */
+    err = 0;
+    err = clSetKernelArg(cl_data.vp8_dequant_dc_idct_add_kernel, 0, sizeof (cl_mem), &b->cl_qcoeff_mem);
+    err |= clSetKernelArg(cl_data.vp8_dequant_dc_idct_add_kernel, 1, sizeof (int), &qcoeff_offset);
+    err |= clSetKernelArg(cl_data.vp8_dequant_dc_idct_add_kernel, 2, sizeof (cl_mem), &b->cl_dequant_mem);
+    err |= clSetKernelArg(cl_data.vp8_dequant_dc_idct_add_kernel, 3, sizeof(int), &dq_offset);
+    err |= clSetKernelArg(cl_data.vp8_dequant_dc_idct_add_kernel, 4, sizeof (cl_mem), &b->cl_predictor_mem);
+    err |= clSetKernelArg(cl_data.vp8_dequant_dc_idct_add_kernel, 5, sizeof (int), &pred_offset);
+    err |= clSetKernelArg(cl_data.vp8_dequant_dc_idct_add_kernel, 6, sizeof (cl_mem), &b->cl_diff_mem);
+    err |= clSetKernelArg(cl_data.vp8_dequant_dc_idct_add_kernel, 7, sizeof (int), &Dc_offset);
+    err |= clSetKernelArg(cl_data.vp8_dequant_dc_idct_add_kernel, 8, sizeof (cl_mem), &dest_mem);
+    err |= clSetKernelArg(cl_data.vp8_dequant_dc_idct_add_kernel, 9, sizeof (int), &pitch);
+    err |= clSetKernelArg(cl_data.vp8_dequant_dc_idct_add_kernel, 10, sizeof (int), &stride);
+
+    VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to set kernel arguments!\n",,
+    );
+
+    /* Execute the kernel (unlike vp8_dequant_idct_add_cl, the checks in this function pass no C fallback) */
+    err = clEnqueueNDRangeKernel( b->cl_commands, cl_data.vp8_dequant_dc_idct_add_kernel, 1, NULL, &global, NULL , 0, NULL, NULL);
+    VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to execute kernel!\n",
+        printf("err = %d\n",err);,
+    );
+
+    /* Read back the result data from the device (non-blocking read of dest_size bytes into dest) */
+    err = clEnqueueReadBuffer(b->cl_commands, dest_mem, CL_FALSE, 0, dest_size, dest, 0, NULL, NULL);
+    VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to read output array!\n",,
+    );
+
+    //CL Spec says this can be freed without clFinish first
+    clReleaseMemObject(dest_mem);
+    dest_mem = NULL;
+
+    return;
+
+}
--- a/vp8/decoder/opencl/dequantize_cl.cl
+++ b/vp8/decoder/opencl/dequantize_cl.cl
@@ -0,0 +1,272 @@
+#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+
+
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+__constant int cospi8sqrt2minus1 = 20091;
+__constant int sinpi8sqrt2      = 35468;
+__constant int rounding = 0;
+
+void vp8_short_idct4x4llm(__global short*, short*, int);
+void cl_memset_short(__global short*, int, size_t);
+
+#define USE_VECTORS 0
+
+__kernel void vp8_dequantize_b_kernel(
+    __global short *dqcoeff_base,
+    int dqcoeff_offset,
+    __global short *qcoeff_base,
+    int qcoeff_offset,
+    __global short *dequant
+)
+{ /* Writes DQ[i] = Q[i] * dequant[i] for 16 values, with DQ and Q located at the given offsets. */
+    __global short *DQ  = dqcoeff_base + dqcoeff_offset;
+    __global short *Q   = qcoeff_base  + qcoeff_offset;
+    /* Vector path: each work-item processes all 16 values. Scalar path: one value per work-item id. */
+#if USE_VECTORS
+    vstore16(vload16(0,Q) * vload16(0,dequant), 0, DQ);
+#else
+    int tid = get_global_id(0);
+    if (tid < 16) /* work-items beyond the 16 values do nothing */
+    {
+        DQ[tid] = Q[tid] * dequant[tid];
+    }
+
+#endif
+}
+
+__kernel void vp8_dequant_idct_add_kernel(
+    __global short *input_base,
+    int input_offset,
+    __global short *dq,
+    __global unsigned char *pred_base,
+    int pred_offset,
+    __global unsigned char *dest_base,
+    int dest_offset,
+    int pitch,
+    int stride
+)
+{ /* Dequantizes 16 coefficients in place, inverse-transforms them, adds the prediction and stores clamped bytes to dest. */
+    short output[16];
+    short *diff_ptr = output;
+    int r, c;
+    int i;
+    __global unsigned char *dest = dest_base + dest_offset;
+    __global short *input = input_base + input_offset;
+    __global unsigned char *pred = pred_base + pred_offset;
+    /* Step 1: input[i] *= dq[i] for all 16 entries (result is written back into input). */
+#if USE_VECTORS
+    vstore16( (short16)vload16(0,dq) * (short16)vload16(0,input) , 0, input);
+#else
+    for (i = 0; i < 16; i++)
+    {
+        input[i] = dq[i] * input[i];
+    }
+#endif
+
+    /* the idct halves ( >> 1) the pitch, so 4 << 1 yields an output row step of 4 shorts */
+    vp8_short_idct4x4llm(input, output, 4 << 1);
+
+    //Note, remember to copy back the input buffer (qcoeff) to system memory. 32 bytes = the 16 shorts just consumed.
+    cl_memset_short(input, 0, 32);
+    /* Step 3: dest = clamp(residual + prediction, 0, 255) for a 4x4 area; pred advances by pitch, dest by stride. */
+    for (r = 0; r < 4; r++)
+    {
+        for (c = 0; c < 4; c++)
+        {
+            int a = diff_ptr[c] + pred[c];
+
+            if (a < 0)
+                a = 0;
+
+            if (a > 255)
+                a = 255;
+
+            dest[c] = (unsigned char) a;
+        }
+
+        dest += stride;
+        diff_ptr += 4;
+        pred += pitch;
+    }
+}
+
+
+__kernel void vp8_dequant_dc_idct_add_kernel(
+    __global short *qcoeff_base,
+    int qcoeff_offset,
+
+    __global short *dequant_base,
+    int dequant_offset,
+
+    __global unsigned char *pred_base,
+    int pred_offset,
+
+    __global short *diff_base,
+    int diff_offset,
+
+    __global unsigned char *dest,
+
+    int pitch,
+    int stride
+)
+{ /* Like vp8_dequant_idct_add_kernel, but coefficient 0 is taken from diff_base[diff_offset] instead of being dequantized. */
+    int i;
+    short output[16];
+    short *diff_ptr = output;
+    int r, c;
+
+    global short *input = &qcoeff_base[qcoeff_offset];
+    global short *dq = &dequant_base[dequant_offset];
+    global unsigned char *pred = pred_base + pred_offset;
+
+    //A modified input buffer... copy back to System memory when done!
+    input[0] = diff_base[diff_offset];
+    /* NOTE(review): the scalar loop starts at 1 and leaves input[0] unscaled, but the USE_VECTORS path multiplies all 16 lanes including index 0 - confirm before enabling USE_VECTORS. */
+#if USE_VECTORS
+    vstore16( (short16)vload16(0,dq) * (short16)vload16(0,input) , 0, input);
+#else
+    for (i = 1; i < 16; i++)
+    {
+        input[i] = dq[i] * input[i];
+    }
+#endif
+    
+    /* the idct halves ( >> 1) the pitch, so 4 << 1 yields an output row step of 4 shorts */
+    vp8_short_idct4x4llm(input, output, 4 << 1);
+    /* Zero the 16 consumed coefficients (32 bytes) in the device buffer. */
+    cl_memset_short(input, 0, 32);
+    /* dest = clamp(residual + prediction, 0, 255) for a 4x4 area; pred advances by pitch, dest by stride. */
+    for (r = 0; r < 4; r++)
+    {
+        for (c = 0; c < 4; c++)
+        {
+            int a = diff_ptr[c] + pred[c];
+
+            if (a < 0)
+                a = 0;
+
+            if (a > 255)
+                a = 255;
+
+            dest[c] = (unsigned char) a;
+        }
+
+        dest += stride;
+        diff_ptr += 4;
+        pred += pitch;
+    }
+}
+
+
+
+
+//Helper function (no __kernel qualifier) copied from common/opencl/idctllm_cl.cl: two-pass 4x4 inverse transform of input into output.
+void vp8_short_idct4x4llm(
+    __global short *input,
+    short *output,
+    int pitch
+)
+{
+    int i;
+    int a1, b1, c1, d1;
+
+    __global short *ip = input;
+    short *op = output;
+    int temp1, temp2;
+    int shortpitch = pitch >> 1; /* pitch is halved to get the step between output rows, in shorts */
+    /* Pass 1: for each of the 4 columns, combine the entries at offsets 0, 4, 8 and 12. */
+    for (i = 0; i < 4; i++)
+    {
+        a1 = ip[0] + ip[8];
+        b1 = ip[0] - ip[8];
+
+        temp1 = (ip[4] * sinpi8sqrt2 + rounding) >> 16;
+        temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1 + rounding) >> 16);
+        c1 = temp1 - temp2;
+
+        temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1 + rounding) >> 16);
+        temp2 = (ip[12] * sinpi8sqrt2 + rounding) >> 16;
+        d1 = temp1 + temp2;
+
+        op[shortpitch*0] = a1 + d1;
+        op[shortpitch*3] = a1 - d1;
+
+        op[shortpitch*1] = b1 + c1;
+        op[shortpitch*2] = b1 - c1;
+
+        ip++;
+        op++;
+    }
+
+    op = output;
+    /* Pass 2: in place on each output row (entries 0..3), with the final (x + 4) >> 3 rounding. */
+    for (i = 0; i < 4; i++)
+    {
+        a1 = op[0] + op[2];
+        b1 = op[0] - op[2];
+
+        temp1 = (op[1] * sinpi8sqrt2 + rounding) >> 16;
+        temp2 = op[3] + ((op[3] * cospi8sqrt2minus1 + rounding) >> 16);
+        c1 = temp1 - temp2;
+
+        temp1 = op[1] + ((op[1] * cospi8sqrt2minus1 + rounding) >> 16);
+        temp2 = (op[3] * sinpi8sqrt2 + rounding) >> 16;
+        d1 = temp1 + temp2;
+
+
+        op[0] = (a1 + d1 + 4) >> 3;
+        op[3] = (a1 - d1 + 4) >> 3;
+
+        op[1] = (b1 + c1 + 4) >> 3;
+        op[2] = (b1 - c1 + 4) >> 3;
+
+        op += shortpitch;
+    }
+
+}
+
+void vp8_dc_only_idct_add_kernel( /* NOTE(review): declared without __kernel and not called anywhere in this .cl file */
+    short input_dc,
+    __global unsigned char *pred_ptr,
+    __global unsigned char *dst_ptr,
+    int pitch,
+    int stride
+)
+{
+    int a1 = ((input_dc + 4) >> 3); /* rounded DC value added to every predicted pixel */
+    int r, c;
+    int pred_offset,dst_offset;
+    /* Each work-item id below 16 handles one pixel of the 4x4 area: row = tid / 4, column = tid % 4. */
+    int tid = get_global_id(0);
+    if (tid < 16){
+        r = tid / 4;
+        c = tid % 4;
+
+        pred_offset = r * pitch;
+        dst_offset = r * stride;
+        int a = a1 + pred_ptr[pred_offset + c] ;
+
+        if (a < 0)
+            a = 0;
+        else if (a > 255)
+            a = 255;
+
+        dst_ptr[dst_offset + c] = (unsigned char) a ;
+    }
+}
+
+void cl_memset_short(__global short *s, int c, size_t n) { /* n is a byte count: stores c into n/2 consecutive shorts starting at s */
+    int i;
+    for (i = 0; i < n/2; i++)
+        *s++ = c; /* the int value is converted to short by the assignment */
+}
--- a/vp8/decoder/opencl/dequantize_cl.h
+++ b/vp8/decoder/opencl/dequantize_cl.h
@@ -0,0 +1,74 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef DEQUANTIZE_CL_H
+#define DEQUANTIZE_CL_H
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#include "vp8/decoder/onyxd_int.h"
+#include "vp8/decoder/dequantize.h"
+#include "vp8/common/opencl/vp8_opencl.h"
+
+#define prototype_dequant_block_cl(sym) \
+    void sym(BLOCKD *x)
+
+#define prototype_dequant_idct_add_cl(sym) \
+    void sym(BLOCKD *b, unsigned char *dest_base,cl_mem dest_mem, int dest_offset, size_t dest_size, int q_offset, \
+             int pred_offset, int pitch, int stride, \
+             vp8_dequant_idct_add_fn_t idct_add)
+
+#define prototype_dequant_dc_idct_add_cl(sym) \
+    void sym(BLOCKD* b, int qcoeff_offset, \
+             int pred_offset, unsigned char *dest_base, int dst_offset, \
+             int pitch, int stride, \
+             int dc)
+
+#define prototype_dequant_dc_idct_add_y_block_cl(sym) \
+    void sym(BLOCKD *b, \
+             unsigned char *dst_base, cl_mem dst_mem, int dst_off,\
+             int stride, char *eobs, int dc_offset)
+
+#define prototype_dequant_idct_add_y_block_cl(sym) \
+    void sym(VP8D_COMP *pbi, MACROBLOCKD *xd)
+
+#define prototype_dequant_idct_add_uv_block_cl(sym) \
+    void sym(VP8D_COMP *pbi, MACROBLOCKD *xd, \
+        vp8_dequant_idct_add_uv_block_fn_t idct_add_uv_block)
+
+
+    
+extern prototype_dequant_block_cl(vp8_dequantize_b_cl);
+
+//CL functions
+extern prototype_dequant_idct_add_cl(vp8_dequant_idct_add_cl);
+
+//Also CL: dequantize_cl.c implements this by enqueueing vp8_dequant_dc_idct_add_kernel
+extern prototype_dequant_dc_idct_add_cl(vp8_dequant_dc_idct_add_cl);
+
+
+//Block-level drivers (idct_blk_cl.c): they walk the 4x4 blocks of a macroblock and call the CL helpers per block.
+extern prototype_dequant_dc_idct_add_y_block_cl(vp8_dequant_dc_idct_add_y_block_cl);
+extern prototype_dequant_idct_add_y_block_cl(vp8_dequant_idct_add_y_block_cl);
+extern prototype_dequant_idct_add_uv_block_cl(vp8_dequant_idct_add_uv_block_cl);
+
+
+
+extern const char *dequantCompileOptions;
+extern const char *dequant_cl_file_name;
+
+#ifdef  __cplusplus
+}
+#endif
+
+#endif
--- a/vp8/decoder/opencl/idct_blk_cl.c
+++ b/vp8/decoder/opencl/idct_blk_cl.c
@@ -0,0 +1,196 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp8/decoder/onyxd_int.h"
+#include "vpx_ports/config.h"
+#include "../../common/idct.h"
+#include "vp8/common/opencl/blockd_cl.h"
+#include "dequantize_cl.h"
+
+//Walks the 4x4 grid of Y blocks: eob > 1 -> vp8_dequant_dc_idct_add_cl, otherwise vp8_dc_only_idct_add_cl. (TODO from author: change q/dq/pre/eobs/dc to offsets)
+void vp8_dequant_dc_idct_add_y_block_cl(
+    BLOCKD *b,
+    unsigned char *dst_base, //xd->dst.buffer_alloc
+    cl_mem dst_mem,     //not referenced in the body
+    int dst_off,
+    int stride,         //xd->dst.y_stride
+    char *eobs,         //xd->eobs
+    int dc_offset       //xd->block[24].diff_offset
+)
+{
+    int i, j;
+    int q_offset = 0;
+    int pre_offset = 0;
+    int dst_offset = 0;
+    unsigned char *dst = dst_base+dst_off;
+    size_t dst_size = 16*(stride+1);
+    /* NOTE(review): vp8_cl_block_prep/finish presumably copy the flagged buffers to/from the device - confirm in blockd_cl. */
+    vp8_cl_block_prep(b, QCOEFF|DEQUANT|DIFF|PREDICTOR);
+    for (i = 0; i < 4; i++)
+    {
+        for (j = 0; j < 4; j++)
+        {
+            if (*eobs++ > 1){
+                vp8_dequant_dc_idct_add_cl (b, q_offset, pre_offset, dst, dst_offset, 16, stride, dc_offset);
+            }
+            else{
+                vp8_dc_only_idct_add_cl(b, CL_TRUE, dc_offset, 0, pre_offset, dst, NULL, dst_offset, dst_size, 16, stride);
+            }
+            /* Advance to the next block in the row: 16 coefficients, 4 pixels, one DC entry. */
+            q_offset   += 16;
+            pre_offset += 4;
+            dst_offset += 4;
+            dc_offset++;
+        }
+        /* Move to the first block of the next block row (predictor rows are 16 wide, destination rows are stride wide). */
+        pre_offset += 64 - 16;
+        dst_offset += 4*stride - 16;
+    }
+
+    vp8_cl_block_finish(b, QCOEFF);
+
+}
+
+void vp8_dequant_idct_add_y_block_cl (VP8D_COMP *pbi, MACROBLOCKD *xd)
+{ /* Walks the 4x4 grid of Y blocks: eob > 1 -> vp8_dequant_idct_add_cl, otherwise vp8_dc_only_idct_add_cl. */
+    int i, j;
+
+    short *q = xd->qcoeff;
+    int q_offset = 0;
+    int pre_offset = 0;
+    cl_mem dst_mem = xd->dst.buffer_mem;
+    unsigned char *dst = xd->dst.buffer_alloc;
+    int dst_offset = xd->dst.y_buffer - dst; /* offset of the Y plane position inside the frame allocation */
+    int stride = xd->dst.y_stride;
+    char *eobs = xd->eobs;
+    int dst_size = 16 * (stride + 1);
+
+    /* NOTE(review): vp8_cl_mb_prep/vp8_cl_block_prep presumably upload the flagged buffers to the device - confirm in blockd_cl. */
+    vp8_cl_mb_prep(xd,PREDICTOR|DIFF|QCOEFF);
+    for (i = 0; i < 4; i++)
+    {
+        for (j = 0; j < 4; j++)
+        {
+            if (*eobs++ > 1){
+                vp8_cl_block_prep(&xd->block[0], DEQUANT);
+                vp8_dequant_idct_add_cl(&xd->block[0], dst, dst_mem, dst_offset, dst_size+dst_offset, q_offset, pre_offset, 16, stride, pbi->dequant.idct_add);
+                vp8_cl_block_finish(&xd->block[0], QCOEFF);
+            }
+            else
+            {
+                vp8_cl_block_prep(&xd->block[0], DEQUANT);
+                vp8_dc_only_idct_add_cl(&xd->block[0], CL_FALSE, 0, q_offset, pre_offset, dst, dst_mem, dst_offset, dst_size+dst_offset, 16, stride);
+                VP8_CL_FINISH(xd->cl_commands);
+                ((int *)(q+q_offset))[0] = 0; /* clears the first int-sized chunk of this block's host qcoeff */
+                vp8_cl_mb_prep(xd,QCOEFF);
+            }
+            /* Advance to the next block in the row: 16 coefficients, 4 pixels. */
+            q_offset   += 16;
+            pre_offset += 4;
+            dst_offset += 4;
+        }
+        /* Move to the first block of the next block row. */
+        pre_offset += 64 - 16;
+        dst_offset += 4*stride - 16;
+    }
+
+}
+
+void vp8_dequant_idct_add_uv_block_cl(VP8D_COMP *pbi, MACROBLOCKD *xd,
+        vp8_dequant_idct_add_uv_block_fn_t idct_add_uv_block
+)
+{ /* Processes a 2x2 grid of U blocks, then a 2x2 grid of V blocks; idct_add_uv_block is not referenced in the body. */
+    int i, j;
+
+    int block_num = 16;
+    BLOCKD b = xd->block[block_num]; /* by-value copy of block 16 */
+
+    short *q = xd->qcoeff;
+
+    cl_mem dst_mem = xd->dst.buffer_mem;
+    unsigned char *dst = xd->dst.buffer_alloc;
+    int u_off = xd->dst.u_buffer - dst; /* offsets of the U and V positions inside the frame allocation */
+    int v_off = xd->dst.v_buffer - dst;
+
+    int stride = xd->dst.uv_stride;
+    size_t dst_size = 8*(stride+1);
+    char *eobs = xd->eobs+16;
+
+    int pre_offset = block_num*16;
+    int q_offset = block_num*16;
+    int dst_offset = 0;
+    /* First pass: U blocks, written at u_off + dst_offset. */
+    vp8_cl_mb_prep(xd, DIFF|QCOEFF|PREDICTOR);
+    for (i = 0; i < 2; i++)
+    {
+        for (j = 0; j < 2; j++)
+        {
+            if (*eobs++ > 1){
+                vp8_cl_block_prep(&xd->block[0], DEQUANT); /* NOTE(review): preps block[0], while the call below uses b (copy of block[16]) and the V loop preps &b - confirm */
+                vp8_dequant_idct_add_cl(&b, dst, dst_mem, u_off+dst_offset, u_off+dst_size, q_offset, pre_offset, 8, stride, DEQUANT_INVOKE (&pbi->dequant, idct_add));
+            }
+            else
+            {
+                vp8_cl_block_prep(&xd->block[block_num], DEQUANT);
+                vp8_dc_only_idct_add_cl (&b, CL_FALSE, 0, q_offset, pre_offset, dst, dst_mem, u_off+dst_offset, u_off+dst_size, 8, stride);
+                
+                //Need round trip + finish until qcoeff set in CL
+                vp8_cl_block_finish(&xd->block[0], QCOEFF); /* NOTE(review): the V loop calls vp8_cl_mb_finish(xd,QCOEFF) at this point instead - confirm which is intended */
+                VP8_CL_FINISH(xd->cl_commands);
+                ((int *)(q+q_offset))[0] = 0; /* clears the first int-sized chunk of this block's host qcoeff */
+                vp8_cl_mb_prep(xd,QCOEFF);
+            }
+
+            q_offset    += 16;
+            pre_offset  += 4;
+            dst_offset += 4;
+        }
+
+        pre_offset  += 32 - 8;
+        dst_offset += 4*stride - 8;
+    }
+
+    //Second pass: V blocks, written at v_off + dst_offset; q_offset, pre_offset and eobs continue from the U pass.
+
+    dst_offset = 0;
+    for (i = 0; i < 2; i++)
+    {
+        for (j = 0; j < 2; j++)
+        {
+            if (*eobs++ > 1){
+                vp8_cl_block_prep(&b, DEQUANT);
+                vp8_dequant_idct_add_cl (&b, dst, dst_mem, v_off+dst_offset, v_off+dst_size, q_offset,
+                        pre_offset, 8, stride, DEQUANT_INVOKE (&pbi->dequant, idct_add));
+            }
+            else
+            {
+                vp8_cl_block_prep(&b, DEQUANT);
+                vp8_dc_only_idct_add_cl (&b, CL_FALSE, 0, q_offset, pre_offset,
+                        dst, dst_mem, v_off+dst_offset, v_off+dst_size, 8, stride);
+
+                //Eventually replace with memset kernel call to prevent round trip
+                vp8_cl_mb_finish(xd,QCOEFF);
+                VP8_CL_FINISH(xd->cl_commands);
+                ((int *)(q+q_offset))[0] = 0;
+                vp8_cl_mb_prep(xd,QCOEFF);
+            }
+
+            q_offset    += 16;
+            pre_offset  += 4;
+            dst_offset += 4;
+        }
+
+        pre_offset  += 32 - 8;
+        dst_offset += 4*stride - 8;
+    }
+    
+    vp8_cl_mb_finish(xd,QCOEFF);
+
+}
--- a/vp8/decoder/opencl/opencl_systemdependent.c
+++ b/vp8/decoder/opencl/opencl_systemdependent.c
@@ -0,0 +1,25 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vp8/decoder/onyxd_int.h"
+
+#include "vp8/common/opencl/vp8_opencl.h"
+#include "vp8_decode_cl.h"
+
+void vp8_arch_opencl_decode_init(VP8D_COMP *pbi)
+{
+    /* Runs the decoder-side CL setup only when OpenCL initialized successfully; pbi is not used. */
+    if (cl_initialized == CL_SUCCESS){
+        cl_decode_init(); /* the returned status is ignored here */
+    }
+
+}
--- a/vp8/decoder/opencl/vp8_decode_cl.c
+++ b/vp8/decoder/opencl/vp8_decode_cl.c
@@ -0,0 +1,38 @@
+#include "vpx_ports/config.h"
+
+#include "../../common/opencl/vp8_opencl.h"
+#include "vp8_decode_cl.h"
+
+#include <stdio.h>
+
+extern int cl_init_dequant();
+extern int cl_destroy_dequant();
+
+int cl_decode_destroy(){
+    int err = CL_SUCCESS; //stays CL_SUCCESS when no CL decoder feature is compiled in
+#if ENABLE_CL_IDCT_DEQUANT
+    //Release the dequant program/kernels and keep the status for the caller
+    err = cl_destroy_dequant();
+#endif
+    
+    return err;
+}
+
+int cl_decode_init()
+{ /* Initializes the decoder's CL programs; returns CL_SUCCESS or the error from cl_init_dequant(). */
+#if ENABLE_CL_IDCT_DEQUANT
+    int err;
+#endif
+
+    //Initialize programs to null value
+    //Enables detection of if they've been initialized as well.
+    cl_data.dequant_program = NULL;
+
+#if ENABLE_CL_IDCT_DEQUANT
+    err = cl_init_dequant();
+    if (err != CL_SUCCESS) //propagate the failure to the caller
+        return err;
+#endif
+
+    return CL_SUCCESS;
+}
--- a/vp8/decoder/opencl/vp8_decode_cl.h
+++ b/vp8/decoder/opencl/vp8_decode_cl.h
@@ -0,0 +1,24 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_OPENCL_DECODE_H
+#define VP8_OPENCL_DECODE_H
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+int cl_decode_init();
+
+#ifdef  __cplusplus
+}
+#endif
+
+#endif  /* VP8_OPENCL_DECODE_H */
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -33,7 +33,7 @@ extern void vp8_build_uvmvs(MACROBLOCKD *x, int fullpixel);
 #define RTCD_VTABLE(x) NULL
 #endif

-void vp8_setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count)
+static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count)
 {
    VP8_COMMON *const pc = & pbi->common;
    int i, j;
@@ -87,7 +87,7 @@ void vp8_setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC
 }


-void vp8mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int mb_col)
+static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int mb_col)
 {
    int eobtotal = 0;
    int i, do_clamp = xd->mode_info_context->mbmi.need_to_clamp_mvs;
@@ -151,51 +151,53 @@ void vp8mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int mb
    if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV)
    {
        BLOCKD *b = &xd->block[24];
+        short *qcoeff = b->qcoeff_base + b->qcoeff_offset;
        DEQUANT_INVOKE(&pbi->dequant, block)(b);

        /* do 2nd order transform on the dc block */
        if (xd->eobs[24] > 1)
        {
-            IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);
-            ((int *)b->qcoeff)[0] = 0;
-            ((int *)b->qcoeff)[1] = 0;
-            ((int *)b->qcoeff)[2] = 0;
-            ((int *)b->qcoeff)[3] = 0;
-            ((int *)b->qcoeff)[4] = 0;
-            ((int *)b->qcoeff)[5] = 0;
-            ((int *)b->qcoeff)[6] = 0;
-            ((int *)b->qcoeff)[7] = 0;
+            IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(b->dqcoeff_base + b->dqcoeff_offset, &b->diff_base[b->diff_offset]);
+            ((int *)qcoeff)[0] = 0;
+            ((int *)qcoeff)[1] = 0;
+            ((int *)qcoeff)[2] = 0;
+            ((int *)qcoeff)[3] = 0;
+            ((int *)qcoeff)[4] = 0;
+            ((int *)qcoeff)[5] = 0;
+            ((int *)qcoeff)[6] = 0;
+            ((int *)qcoeff)[7] = 0;
        }
        else
        {
-            IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff);
-            ((int *)b->qcoeff)[0] = 0;
+            IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(b->dqcoeff_base + b->dqcoeff_offset, &b->diff_base[b->diff_offset]);
+            ((int *)qcoeff)[0] = 0;
        }

        DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block)
                        (xd->qcoeff, xd->block[0].dequant,
                         xd->predictor, xd->dst.y_buffer,
-                         xd->dst.y_stride, xd->eobs, xd->block[24].diff);
+                         xd->dst.y_stride, xd->eobs, &xd->block[24].diff_base[xd->block[24].diff_offset]);
    }
    else if ((xd->frame_type == KEY_FRAME  ||  xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) && xd->mode_info_context->mbmi.mode == B_PRED)
    {
        for (i = 0; i < 16; i++)
        {
            BLOCKD *b = &xd->block[i];
-            vp8mt_predict_intra4x4(pbi, xd, b->bmi.mode, b->predictor, mb_row, mb_col, i);
+            short *qcoeff = b->qcoeff_base + b->qcoeff_offset;
+            vp8mt_predict_intra4x4(pbi, xd, b->bmi.mode, b->predictor_base + b->predictor_offset, mb_row, mb_col, i);

            if (xd->eobs[i] > 1)
            {
                DEQUANT_INVOKE(&pbi->dequant, idct_add)
-                    (b->qcoeff, b->dequant,  b->predictor,
+                    (qcoeff, b->dequant,  b->predictor_base + b->predictor_offset,
                    *(b->base_dst) + b->dst, 16, b->dst_stride);
            }
            else
            {
                IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
-                    (b->qcoeff[0] * b->dequant[0], b->predictor,
+                    (qcoeff[0] * b->dequant[0], b->predictor_base + b->predictor_offset,
                    *(b->base_dst) + b->dst, 16, b->dst_stride);
-                ((int *)b->qcoeff)[0] = 0;
+                ((int *)qcoeff)[0] = 0;
            }
        }
    }
@@ -214,7 +216,7 @@ void vp8mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int mb
 }


-THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
+static THREAD_FUNCTION thread_decoding_proc(void *p_data)
 {
    int ithread = ((DECODETHREAD_DATA *)p_data)->ithread;
    VP8D_COMP *pbi = (VP8D_COMP *)(((DECODETHREAD_DATA *)p_data)->ptr1);
@@ -318,7 +320,7 @@ THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
                        xd->pre.v_buffer = pc->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;

                        vp8_build_uvmvs(xd, pc->full_pixel);
-                        vp8mt_decode_macroblock(pbi, xd, mb_row, mb_col);
+                        decode_macroblock(pbi, xd, mb_row, mb_col);

                        if (pbi->common.filter_level)
                        {
@@ -450,7 +452,7 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi)
            pbi->de_thread_data[ithread].ptr1     = (void *)pbi;
            pbi->de_thread_data[ithread].ptr2     = (void *) &pbi->mb_row_di[ithread];

-            pthread_create(&pbi->h_decoding_thread[ithread], 0, vp8_thread_decoding_proc, (&pbi->de_thread_data[ithread]));
+            pthread_create(&pbi->h_decoding_thread[ithread], 0, thread_decoding_proc, (&pbi->de_thread_data[ithread]));
        }

        sem_init(&pbi->h_event_end_decoding, 0, 0);
@@ -632,7 +634,7 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi)
 }


-void vp8mt_lpf_init( VP8D_COMP *pbi, int default_filt_lvl)
+static void lpf_init( VP8D_COMP *pbi, int default_filt_lvl)
 {
    VP8_COMMON *cm  = &pbi->common;
    MACROBLOCKD *mbd = &pbi->mb;
@@ -715,10 +717,10 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
            vpx_memset(pbi->mt_uleft_col[i], (unsigned char)129, 8);
            vpx_memset(pbi->mt_vleft_col[i], (unsigned char)129, 8);
        }
-        vp8mt_lpf_init(pbi, pc->filter_level);
+        lpf_init(pbi, pc->filter_level);
    }

-    vp8_setup_decoding_thread_data(pbi, xd, pbi->mb_row_di, pbi->decoding_thread_count);
+    setup_decoding_thread_data(pbi, xd, pbi->mb_row_di, pbi->decoding_thread_count);

    for (i = 0; i < pbi->decoding_thread_count; i++)
        sem_post(&pbi->h_event_start_decoding[i]);
@@ -803,7 +805,7 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
                }

                vp8_build_uvmvs(xd, pc->full_pixel);
-                vp8mt_decode_macroblock(pbi, xd, mb_row, mb_col);
+                decode_macroblock(pbi, xd, mb_row, mb_col);

                /* check if the boolean decoder has suffered an error */
                xd->corrupted |= vp8dx_bool_error(xd->current_bc);
--- a/vp8/decoder/x86/x86_dsystemdependent.c
+++ b/vp8/decoder/x86/x86_dsystemdependent.c
@@ -17,10 +17,10 @@
 #if HAVE_MMX
 void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q);

-void vp8_dequantize_b_mmx(BLOCKD *d)
+static void dequantize_b_mmx(BLOCKD *d)
 {
-    short *sq = (short *) d->qcoeff;
-    short *dq = (short *) d->dqcoeff;
+    short *sq = (short *) d->qcoeff_base + d->qcoeff_offset;
+    short *dq = (short *) d->dqcoeff_base + d->dqcoeff_offset;
    short *q = (short *) d->dequant;
    vp8_dequantize_b_impl_mmx(sq, dq, q);
 }
@@ -41,7 +41,7 @@ void vp8_arch_x86_decode_init(VP8D_COMP *pbi)
 #if HAVE_MMX
    if (flags & HAS_MMX)
    {
-        pbi->dequant.block               = vp8_dequantize_b_mmx;
+        pbi->dequant.block               = dequantize_b_mmx;
        pbi->dequant.idct_add            = vp8_dequant_idct_add_mmx;
        pbi->dequant.dc_idct_add         = vp8_dequant_dc_idct_add_mmx;
        pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_mmx;
--- a/vp8/encoder/arm/arm_csystemdependent.c
+++ b/vp8/encoder/arm/arm_csystemdependent.c
@@ -59,17 +59,17 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
        cpi->rtcd.variance.get4x4sse_cs          = vp8_get4x4sse_cs_c;*/

        /*cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_c;
-        cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_c;
-        cpi->rtcd.fdct.fast4x4                   = vp8_fast_fdct4x4_c;
-        cpi->rtcd.fdct.fast8x4                   = vp8_fast_fdct8x4_c;*/
+        cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_c;*/
+        cpi->rtcd.fdct.fast4x4                   = vp8_fast_fdct4x4_armv6;
+        cpi->rtcd.fdct.fast8x4                   = vp8_fast_fdct8x4_armv6;
        cpi->rtcd.fdct.walsh_short4x4            = vp8_short_walsh4x4_armv6;

        /*cpi->rtcd.encodemb.berr                  = vp8_block_error_c;
        cpi->rtcd.encodemb.mberr                 = vp8_mbblock_error_c;
-        cpi->rtcd.encodemb.mbuverr               = vp8_mbuverror_c;
-        cpi->rtcd.encodemb.subb                  = vp8_subtract_b_c;
-        cpi->rtcd.encodemb.submby                = vp8_subtract_mby_c;
-        cpi->rtcd.encodemb.submbuv               = vp8_subtract_mbuv_c;*/
+        cpi->rtcd.encodemb.mbuverr               = vp8_mbuverror_c;*/
+        cpi->rtcd.encodemb.subb                  = vp8_subtract_b_armv6;
+        cpi->rtcd.encodemb.submby                = vp8_subtract_mby_armv6;
+        cpi->rtcd.encodemb.submbuv               = vp8_subtract_mbuv_armv6;

        /*cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b;*/
        cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_armv6;
--- a/vp8/encoder/arm/armv6/vp8_fast_fdct4x4_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_fast_fdct4x4_armv6.asm
@@ -0,0 +1,262 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT |vp8_fast_fdct4x4_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY
+; r0 = short *input, r1 = short *output, r2 = int pitch (prototype mirrors vp8_short_fdct4x4_c)
+|vp8_fast_fdct4x4_armv6| PROC
+    ; pitch (r2) is added to the byte address in r0 after each row of four inputs.
+    stmfd       sp!, {r4 - r12, lr} ; save work registers and return address
+
+    ; PART 1
+
+    ; coeffs 0-3
+    ldrd        r4, r5, [r0]        ; [i1 | i0] [i3 | i2]
+
+    ldr         r10, c7500
+    ldr         r11, c14500
+    ldr         r12, c0x22a453a0    ; [2217*4 | 5352*4]
+    ldr         lr, c0x00080008
+    ror         r5, r5, #16         ; [i2 | i3]
+
+    qadd16      r6, r4, r5          ; [i1+i2 | i0+i3] = [b1 | a1] without shift
+    qsub16      r7, r4, r5          ; [i1-i2 | i0-i3] = [c1 | d1] without shift
+
+    add         r0, r0, r2          ; update input pointer
+
+    qadd16      r7, r7, r7          ; 2*[c1|d1] --> we can use smlad and smlsd
+                                    ; with 2217*4 and 5352*4 without losing the
+                                    ; sign bit (overflow)
+
+    smuad       r4, r6, lr          ; o0 = (i1+i2)*8 + (i0+i3)*8
+    smusd       r5, r6, lr          ; o2 = (i1+i2)*8 - (i0+i3)*8
+
+    smlad       r6, r7, r12, r11    ; o1 = (c1 * 2217 + d1 * 5352 +  14500)
+    smlsdx      r7, r7, r12, r10    ; o3 = (d1 * 2217 - c1 * 5352 +   7500)
+
+    ldrd        r8, r9, [r0]        ; [i5 | i4] [i7 | i6]
+
+    pkhbt       r3, r4, r6, lsl #4  ; [o1 | o0], keep in register for PART 2
+    pkhbt       r6, r5, r7, lsl #4  ; [o3 | o2]
+
+    str         r6, [r1, #4]
+
+    ; coeffs 4-7
+    ror         r9, r9, #16         ; [i6 | i7]
+
+    qadd16      r6, r8, r9          ; [i5+i6 | i4+i7] = [b1 | a1] without shift
+    qsub16      r7, r8, r9          ; [i5-i6 | i4-i7] = [c1 | d1] without shift
+
+    add         r0, r0, r2          ; update input pointer
+
+    qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
+                                    ; with 2217*4 and 5352*4 without losing the
+                                    ; sign bit (overflow)
+
+    smuad       r9, r6, lr          ; o4 = (i5+i6)*8 + (i4+i7)*8
+    smusd       r8, r6, lr          ; o6 = (i5+i6)*8 - (i4+i7)*8
+
+    smlad       r6, r7, r12, r11    ; o5 = (c1 * 2217 + d1 * 5352 +  14500)
+    smlsdx      r7, r7, r12, r10    ; o7 = (d1 * 2217 - c1 * 5352 +   7500)
+
+    ldrd        r4, r5, [r0]        ; [i9 | i8] [i11 | i10]
+
+    pkhbt       r9, r9, r6, lsl #4  ; [o5 | o4], keep in register for PART 2
+    pkhbt       r6, r8, r7, lsl #4  ; [o7 | o6]
+
+    str         r6, [r1, #12]
+
+    ; coeffs 8-11
+    ror         r5, r5, #16         ; [i10 | i11]
+
+    qadd16      r6, r4, r5          ; [i9+i10 | i8+i11]=[b1 | a1] without shift
+    qsub16      r7, r4, r5          ; [i9-i10 | i8-i11]=[c1 | d1] without shift
+
+    add         r0, r0, r2          ; update input pointer
+
+    qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
+                                    ; with 2217*4 and 5352*4 without losing the
+                                    ; sign bit (overflow)
+
+    smuad       r2, r6, lr          ; o8 = (i9+i10)*8 + (i8+i11)*8
+    smusd       r8, r6, lr          ; o10 = (i9+i10)*8 - (i8+i11)*8
+
+    smlad       r6, r7, r12, r11    ; o9 = (c1 * 2217 + d1 * 5352 +  14500)
+    smlsdx      r7, r7, r12, r10    ; o11 = (d1 * 2217 - c1 * 5352 +   7500)
+
+    ldrd        r4, r5, [r0]        ; [i13 | i12] [i15 | i14]
+
+    pkhbt       r2, r2, r6, lsl #4  ; [o9 | o8], keep in register for PART 2
+    pkhbt       r6, r8, r7, lsl #4  ; [o11 | o10]
+
+    str         r6, [r1, #20]
+
+    ; coeffs 12-15
+    ror         r5, r5, #16         ; [i14 | i15]
+
+    qadd16      r6, r4, r5          ; [i13+i14 | i12+i15]=[b1|a1] without shift
+    qsub16      r7, r4, r5          ; [i13-i14 | i12-i15]=[c1|d1] without shift
+
+    qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
+                                    ; with 2217*4 and 5352*4 without losing the
+                                    ; sign bit (overflow)
+
+    smuad       r4, r6, lr          ; o12 = (i13+i14)*8 + (i12+i15)*8
+    smusd       r5, r6, lr          ; o14 = (i13+i14)*8 - (i12+i15)*8
+
+    smlad       r6, r7, r12, r11    ; o13 = (c1 * 2217 + d1 * 5352 +  14500)
+    smlsdx      r7, r7, r12, r10    ; o15 = (d1 * 2217 - c1 * 5352 +   7500)
+
+    pkhbt       r0, r4, r6, lsl #4  ; [o13 | o12], keep in register for PART 2
+    pkhbt       r6, r5, r7, lsl #4  ; [o15 | o14]
+
+    str         r6, [r1, #28]
+
+
+    ; PART 2 -------------------------------------------------
+    ldr         r11, c12000
+    ldr         r10, c51000
+    ldr         lr, c0x00070007
+
+    qadd16      r4, r3, r0          ; a1 = [i1+i13 | i0+i12]
+    qadd16      r5, r9, r2          ; b1 = [i5+i9  |  i4+i8]
+    qsub16      r6, r9, r2          ; c1 = [i5-i9  |  i4-i8]
+    qsub16      r7, r3, r0          ; d1 = [i1-i13 | i0-i12]
+
+    qadd16      r4, r4, lr          ; a1 + 7
+
+    add         r0, r11, #0x10000   ; add (d!=0)
+
+    qadd16      r2, r4, r5          ; a1 + b1 + 7
+    qsub16      r3, r4, r5          ; a1 - b1 + 7
+
+    ldr         r12, c0x08a914e8    ; [2217 | 5352]
+
+    lsl         r8, r2, #16         ; prepare bottom halfword for scaling
+    asr         r2, r2, #4          ; scale top halfword
+    lsl         r9, r3, #16         ; prepare bottom halfword for scaling
+    asr         r3, r3, #4          ; scale top halfword
+    pkhtb       r4, r2, r8, asr #20 ; pack and scale bottom halfword
+    pkhtb       r5, r3, r9, asr #20 ; pack and scale bottom halfword
+
+    smulbt      r2, r6, r12         ; [ ------ | c1*2217]
+    str         r4, [r1, #0]        ; [     o1 |      o0]
+    smultt      r3, r6, r12         ; [c1*2217 | ------ ]
+    str         r5, [r1, #16]       ; [     o9 |      o8]
+
+    smlabb      r8, r7, r12, r2     ; [ ------ | d1*5352]
+    smlatb      r9, r7, r12, r3     ; [d1*5352 | ------ ]
+
+    smulbb      r2, r6, r12         ; [ ------ | c1*5352]
+    smultb      r3, r6, r12         ; [c1*5352 | ------ ]
+
+    lsls        r6, r7, #16         ; d1 != 0 ?
+    addeq       r8, r8, r11         ; c1_b*2217+d1_b*5352+12000 + (d==0)
+    addne       r8, r8, r0          ; c1_b*2217+d1_b*5352+12000 + (d!=0)
+    asrs        r6, r7, #16         ; d1 (top halfword) != 0 ?
+    addeq       r9, r9, r11         ; c1_t*2217+d1_t*5352+12000 + (d==0)
+    addne       r9, r9, r0          ; c1_t*2217+d1_t*5352+12000 + (d!=0)
+
+    smlabt      r4, r7, r12, r10    ; [ ------ | d1*2217] + 51000
+    smlatt      r5, r7, r12, r10    ; [d1*2217 | ------ ] + 51000
+
+    pkhtb       r9, r9, r8, asr #16
+
+    sub         r4, r4, r2
+    sub         r5, r5, r3
+
+    ldr         r3, [r1, #4]        ; [i3 | i2]
+
+    pkhtb       r5, r5, r4, asr #16 ; [o13|o12]
+
+    str         r9, [r1, #8]        ; [o5 | o4]
+
+    ldr         r9, [r1, #12]       ; [i7 | i6]
+    ldr         r8, [r1, #28]       ; [i15|i14]
+    ldr         r2, [r1, #20]       ; [i11|i10]
+    str         r5, [r1, #24]       ; [o13|o12]
+
+    qadd16      r4, r3, r8          ; a1 = [i3+i15 | i2+i14]
+    qadd16      r5, r9, r2          ; b1 = [i7+i11 | i6+i10]
+
+    qadd16      r4, r4, lr          ; a1 + 7
+
+    qsub16      r6, r9, r2          ; c1 = [i7-i11 | i6-i10]
+    qadd16      r2, r4, r5          ; a1 + b1 + 7
+    qsub16      r7, r3, r8          ; d1 = [i3-i15 | i2-i14]
+    qsub16      r3, r4, r5          ; a1 - b1 + 7
+
+    lsl         r8, r2, #16         ; prepare bottom halfword for scaling
+    asr         r2, r2, #4          ; scale top halfword
+    lsl         r9, r3, #16         ; prepare bottom halfword for scaling
+    asr         r3, r3, #4          ; scale top halfword
+    pkhtb       r4, r2, r8, asr #20 ; pack and scale bottom halfword
+    pkhtb       r5, r3, r9, asr #20 ; pack and scale bottom halfword
+
+    smulbt      r2, r6, r12         ; [ ------ | c1*2217]
+    str         r4, [r1, #4]        ; [     o3 |      o2]
+    smultt      r3, r6, r12         ; [c1*2217 | ------ ]
+    str         r5, [r1, #20]       ; [    o11 |     o10]
+
+    smlabb      r8, r7, r12, r2     ; [ ------ | d1*5352]
+    smlatb      r9, r7, r12, r3     ; [d1*5352 | ------ ]
+
+    smulbb      r2, r6, r12         ; [ ------ | c1*5352]
+    smultb      r3, r6, r12         ; [c1*5352 | ------ ]
+
+    lsls        r6, r7, #16         ; d1 != 0 ?
+    addeq       r8, r8, r11         ; c1_b*2217+d1_b*5352+12000 + (d==0)
+    addne       r8, r8, r0          ; c1_b*2217+d1_b*5352+12000 + (d!=0)
+
+    asrs        r6, r7, #16         ; d1 (top halfword) != 0 ?
+    addeq       r9, r9, r11         ; c1_t*2217+d1_t*5352+12000 + (d==0)
+    addne       r9, r9, r0          ; c1_t*2217+d1_t*5352+12000 + (d!=0)
+
+    smlabt      r4, r7, r12, r10    ; [ ------ | d1*2217] + 51000
+    smlatt      r5, r7, r12, r10    ; [d1*2217 | ------ ] + 51000
+
+    pkhtb       r9, r9, r8, asr #16
+
+    sub         r4, r4, r2
+    sub         r5, r5, r3
+
+    str         r9, [r1, #12]       ; [o7 | o6]
+    pkhtb       r5, r5, r4, asr #16 ; [o15|o14]
+
+    str         r5, [r1, #28]       ; [o15|o14]
+
+    ldmfd       sp!, {r4 - r12, pc} ; restore registers and return
+
+    ENDP
+
+; Used constants
+c7500
+    DCD     7500
+c14500
+    DCD     14500
+c0x22a453a0
+    DCD     0x22a453a0
+c0x00080008
+    DCD     0x00080008
+c12000
+    DCD     12000
+c51000
+    DCD     51000
+c0x00070007
+    DCD     0x00070007
+c0x08a914e8
+    DCD     0x08a914e8
+
+    END
--- a/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm
@@ -0,0 +1,265 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_subtract_mby_armv6|
+    EXPORT  |vp8_subtract_mbuv_armv6|
+    EXPORT  |vp8_subtract_b_armv6|
+
+    INCLUDE asm_enc_offsets.asm
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    BLOCK *be
+; r1    BLOCKD *bd
+; r2    int pitch
+|vp8_subtract_b_armv6| PROC
+    ; Writes four rows of four 16-bit differences (src - pred) to the diff buffer.
+    stmfd   sp!, {r4-r9}        ; save the work registers used below
+
+    ldr     r4, [r0, #vp8_block_base_src]   ; r4 = base_src field of be
+    ldr     r5, [r0, #vp8_block_src]        ; r5 = src offset field of be
+    ldr     r6, [r0, #vp8_block_src_diff]   ; r6 = diff output pointer
+
+    ldr     r3, [r4]            ; r3 = *base_src
+    ldr     r7, [r0, #vp8_block_src_stride] ; r7 = source stride
+    add     r3, r3, r5          ; src = *base_src + src
+    ldr     r8, [r1, #vp8_blockd_predictor] ; r8 = predictor pointer from bd
+
+    mov     r9, #4              ; loop count
+
+loop_block
+
+    ldr     r0, [r3], r7        ; src
+    ldr     r1, [r8], r2        ; pred
+
+    uxtb16  r4, r0              ; [s2 | s0]
+    uxtb16  r5, r1              ; [p2 | p0]
+    uxtb16  r0, r0, ror #8      ; [s3 | s1]
+    uxtb16  r1, r1, ror #8      ; [p3 | p1]
+
+    usub16  r4, r4, r5          ; [d2 | d0]
+    usub16  r5, r0, r1          ; [d3 | d1]
+
+    subs    r9, r9, #1          ; decrement loop counter
+
+    pkhbt   r0, r4, r5, lsl #16 ; [d1 | d0]
+    pkhtb   r1, r5, r4, asr #16 ; [d3 | d2]
+
+    str     r0, [r6, #0]        ; diff
+    str     r1, [r6, #4]        ; diff
+
+    add     r6, r6, r2, lsl #1  ; update diff pointer
+    bne     loop_block          ; flags still come from the subs above
+
+    ldmfd   sp!, {r4-r9}        ; restore the work registers
+    mov     pc, lr              ; return
+
+    ENDP
+
+
+; r0    short *diff
+; r1    unsigned char *usrc
+; r2    unsigned char *vsrc
+; r3    unsigned char *pred
+; stack int stride
+|vp8_subtract_mbuv_armv6| PROC
+    ; Writes 16-bit (src - pred) rows for 8 rows of 8 U pixels, then 8 rows of 8 V pixels.
+    stmfd   sp!, {r4-r12, lr}   ; 10 words pushed, so the stack argument sits at sp+40
+
+    add     r0, r0, #512        ; set *diff point to Cb
+    add     r3, r3, #256        ; set *pred point to Cb
+
+    mov     r4, #8              ; loop count
+    ldr     r5, [sp, #40]       ; stride
+
+    ; Subtract U block
+loop_u
+    ldr     r6, [r1]            ; src       (A)
+    ldr     r7, [r3], #4        ; pred      (A)
+
+    uxtb16  r8, r6              ; [s2 | s0] (A)
+    uxtb16  r9, r7              ; [p2 | p0] (A)
+    uxtb16  r10, r6, ror #8     ; [s3 | s1] (A)
+    uxtb16  r11, r7, ror #8     ; [p3 | p1] (A)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (A)
+    usub16  r7, r10, r11        ; [d3 | d1] (A)
+
+    ldr     r10, [r1, #4]       ; src       (B)
+    ldr     r11, [r3], #4       ; pred      (B)
+
+    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (A)
+    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (A)
+
+    str     r8, [r0], #4        ; diff      (A)
+    uxtb16  r8, r10             ; [s2 | s0] (B)
+    str     r9, [r0], #4        ; diff      (A)
+
+    uxtb16  r9, r11             ; [p2 | p0] (B)
+    uxtb16  r10, r10, ror #8    ; [s3 | s1] (B)
+    uxtb16  r11, r11, ror #8    ; [p3 | p1] (B)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (B)
+    usub16  r7, r10, r11        ; [d3 | d1] (B)
+
+    add     r1, r1, r5          ; update usrc pointer
+
+    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (B)
+    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (B)
+
+    str     r8, [r0], #4        ; diff      (B)
+    subs    r4, r4, #1          ; update loop counter
+    str     r9, [r0], #4        ; diff      (B)
+
+    bne     loop_u
+
+    mov     r4, #8              ; loop count
+
+    ; Subtract V block
+loop_v
+    ldr     r6, [r2]            ; src       (A)
+    ldr     r7, [r3], #4        ; pred      (A)
+
+    uxtb16  r8, r6              ; [s2 | s0] (A)
+    uxtb16  r9, r7              ; [p2 | p0] (A)
+    uxtb16  r10, r6, ror #8     ; [s3 | s1] (A)
+    uxtb16  r11, r7, ror #8     ; [p3 | p1] (A)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (A)
+    usub16  r7, r10, r11        ; [d3 | d1] (A)
+
+    ldr     r10, [r2, #4]       ; src       (B)
+    ldr     r11, [r3], #4       ; pred      (B)
+
+    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (A)
+    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (A)
+
+    str     r8, [r0], #4        ; diff      (A)
+    uxtb16  r8, r10             ; [s2 | s0] (B)
+    str     r9, [r0], #4        ; diff      (A)
+
+    uxtb16  r9, r11             ; [p2 | p0] (B)
+    uxtb16  r10, r10, ror #8    ; [s3 | s1] (B)
+    uxtb16  r11, r11, ror #8    ; [p3 | p1] (B)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (B)
+    usub16  r7, r10, r11        ; [d3 | d1] (B)
+
+    add     r2, r2, r5          ; update vsrc pointer
+
+    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (B)
+    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (B)
+
+    str     r8, [r0], #4        ; diff      (B)
+    subs    r4, r4, #1          ; update loop counter
+    str     r9, [r0], #4        ; diff      (B)
+
+    bne     loop_v
+
+    ldmfd   sp!, {r4-r12, pc}   ; restore registers and return
+
+    ENDP
+
+
+; r0    short *diff
+; r1    unsigned char *src
+; r2    unsigned char *pred
+; r3    int stride
+|vp8_subtract_mby_armv6| PROC
+    ; Writes 16-bit (src - pred) values for 16 rows of 16 pixels; only src advances by stride.
+    stmfd   sp!, {r4-r11}       ; save the work registers
+
+    mov     r4, #16             ; loop count (one iteration per row)
+loop
+    ldr     r6, [r1]            ; src       (A)
+    ldr     r7, [r2], #4        ; pred      (A)
+
+    uxtb16  r8, r6              ; [s2 | s0] (A)
+    uxtb16  r9, r7              ; [p2 | p0] (A)
+    uxtb16  r10, r6, ror #8     ; [s3 | s1] (A)
+    uxtb16  r11, r7, ror #8     ; [p3 | p1] (A)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (A)
+    usub16  r7, r10, r11        ; [d3 | d1] (A)
+
+    ldr     r10, [r1, #4]       ; src       (B)
+    ldr     r11, [r2], #4       ; pred      (B)
+
+    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (A)
+    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (A)
+
+    str     r8, [r0], #4        ; diff      (A)
+    uxtb16  r8, r10             ; [s2 | s0] (B)
+    str     r9, [r0], #4        ; diff      (A)
+
+    uxtb16  r9, r11             ; [p2 | p0] (B)
+    uxtb16  r10, r10, ror #8    ; [s3 | s1] (B)
+    uxtb16  r11, r11, ror #8    ; [p3 | p1] (B)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (B)
+    usub16  r7, r10, r11        ; [d3 | d1] (B)
+
+    ldr     r10, [r1, #8]       ; src       (C)
+    ldr     r11, [r2], #4       ; pred      (C)
+
+    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (B)
+    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (B)
+
+    str     r8, [r0], #4        ; diff      (B)
+    uxtb16  r8, r10             ; [s2 | s0] (C)
+    str     r9, [r0], #4        ; diff      (B)
+
+    uxtb16  r9, r11             ; [p2 | p0] (C)
+    uxtb16  r10, r10, ror #8    ; [s3 | s1] (C)
+    uxtb16  r11, r11, ror #8    ; [p3 | p1] (C)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (C)
+    usub16  r7, r10, r11        ; [d3 | d1] (C)
+
+    ldr     r10, [r1, #12]      ; src       (D)
+    ldr     r11, [r2], #4       ; pred      (D)
+
+    pkhbt   r8, r6, r7, lsl #16  ; [d1 | d0] (C)
+    pkhtb   r9, r7, r6, asr #16  ; [d3 | d2] (C)
+
+    str     r8, [r0], #4        ; diff      (C)
+    uxtb16  r8, r10             ; [s2 | s0] (D)
+    str     r9, [r0], #4        ; diff      (C)
+
+    uxtb16  r9, r11             ; [p2 | p0] (D)
+    uxtb16  r10, r10, ror #8    ; [s3 | s1] (D)
+    uxtb16  r11, r11, ror #8    ; [p3 | p1] (D)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (D)
+    usub16  r7, r10, r11        ; [d3 | d1] (D)
+
+    add     r1, r1, r3          ; update src pointer
+
+    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (D)
+    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (D)
+
+    str     r8, [r0], #4        ; diff      (D)
+    subs    r4, r4, #1          ; update loop counter
+    str     r9, [r0], #4        ; diff      (D)
+
+    bne     loop
+
+    ldmfd   sp!, {r4-r11}       ; restore the work registers
+    mov     pc, lr              ; return
+
+    ENDP
+
+    END
+
--- a/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
@@ -25,14 +25,14 @@
 |vp8_variance16x16_armv6| PROC

    stmfd   sp!, {r4-r12, lr}
-    mov     r12, #16            ; set loop counter to 16 (=block height)
    mov     r8, #0              ; initialize sum = 0
    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)

 loop
    ; 1st 4 pixels
-    ldr     r4, [r0, #0x0]      ; load 4 src pixels
-    ldr     r5, [r2, #0x0]      ; load 4 ref pixels
+    ldr     r4, [r0, #0]        ; load 4 src pixels
+    ldr     r5, [r2, #0]        ; load 4 ref pixels

    mov     lr, #0              ; constant zero

@@ -55,8 +55,8 @@ loop
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)

    ; 2nd 4 pixels
-    ldr     r4, [r0, #0x4]      ; load 4 src pixels
-    ldr     r5, [r2, #0x4]      ; load 4 ref pixels
+    ldr     r4, [r0, #4]        ; load 4 src pixels
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)

    usub8   r6, r4, r5          ; calculate difference
@@ -79,8 +79,8 @@ loop
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)

    ; 3rd 4 pixels
-    ldr     r4, [r0, #0x8]      ; load 4 src pixels
-    ldr     r5, [r2, #0x8]      ; load 4 ref pixels
+    ldr     r4, [r0, #8]        ; load 4 src pixels
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)

    usub8   r6, r4, r5          ; calculate difference
@@ -103,8 +103,8 @@ loop
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)

    ; 4th 4 pixels
-    ldr     r4, [r0, #0xc]      ; load 4 src pixels
-    ldr     r5, [r2, #0xc]      ; load 4 ref pixels
+    ldr     r4, [r0, #12]       ; load 4 src pixels
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)

    usub8   r6, r4, r5          ; calculate difference
@@ -135,13 +135,14 @@ loop
    bne     loop

    ; return stuff
-    ldr     r6, [sp, #0x28]     ; get address of sse
+    ldr     r6, [sp, #40]       ; get address of sse
    mul     r0, r8, r8          ; sum * sum
    str     r11, [r6]           ; store sse
-    sub     r0, r11, r0, ASR #8 ; return (sse - ((sum * sum) >> 8))
+    sub     r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))

    ldmfd   sp!, {r4-r12, pc}

    ENDP

    END
+
--- a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
@@ -0,0 +1,176 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_variance_halfpixvar16x16_h_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp8_variance_halfpixvar16x16_h_armv6| PROC
+    ; returns r0 = sse - ((sum * sum) >> 8); the raw sse is also stored to *sse
+    stmfd   sp!, {r4-r12, lr}
+    mov     r8, #0              ; initialize sum = 0
+    ldr     r10, c80808080      ; 0x80 in every byte lane (bias for the averaging trick)
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     lr, #0              ; constant zero
+loop
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load 4 src pixels (a)
+    ldr     r6, [r0, #1]        ; load 4 src pixels with 1 byte offset (b)
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+
+    ; bilinear interpolation: per byte (a + b + 1) >> 1
+    mvn     r6, r6              ; ~b = 255 - b
+    uhsub8  r4, r4, r6          ; (a - (255 - b)) >> 1 = ((a + b + 1) >> 1) - 128
+    eor     r4, r4, r10         ; flip bit 7 of each byte, i.e. add the 128 back
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load 4 src pixels
+    ldr     r6, [r0, #5]        ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load 4 src pixels
+    ldr     r6, [r0, #9]        ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load 4 src pixels
+    ldr     r6, [r0, #13]       ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    subs    r12, r12, #1
+
+    bne     loop
+
+    ; return stuff
+    ldr     r6, [sp, #40]       ; get address of sse (just above the 10 saved registers)
+    mul     r0, r8, r8          ; sum * sum (fits unsigned 32 bits, may exceed 2^31)
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)), unsigned shift
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+c80808080
+    DCD     0x80808080
+
+    END
+
--- a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
@@ -0,0 +1,216 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_variance_halfpixvar16x16_hv_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp8_variance_halfpixvar16x16_hv_armv6| PROC
+    ; returns r0 = sse - ((sum * sum) >> 8); the raw sse is also stored to *sse
+    stmfd   sp!, {r4-r12, lr}
+    mov     r8, #0              ; initialize sum = 0
+    ldr     r10, c80808080      ; 0x80 in every byte lane (bias for the averaging trick)
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     lr, #0              ; constant zero
+loop
+    add     r9, r0, r1          ; pointer to pixels on the next row
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load source pixels a, row N
+    ldr     r6, [r0, #1]        ; load source pixels b, row N
+    ldr     r5, [r9, #0]        ; load source pixels c, row N+1
+    ldr     r7, [r9, #1]        ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6              ; ~b = 255 - b
+    uhsub8  r4, r4, r6          ; (a - (255 - b)) >> 1 = ((a + b + 1) >> 1) - 128
+    eor     r4, r4, r10         ; flip bit 7 of each byte, i.e. add the 128 back
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load source pixels a, row N
+    ldr     r6, [r0, #5]        ; load source pixels b, row N
+    ldr     r5, [r9, #4]        ; load source pixels c, row N+1
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    ldr     r7, [r9, #5]        ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load source pixels a, row N
+    ldr     r6, [r0, #9]        ; load source pixels b, row N
+    ldr     r5, [r9, #8]        ; load source pixels c, row N+1
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    ldr     r7, [r9, #9]        ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load source pixels a, row N
+    ldr     r6, [r0, #13]       ; load source pixels b, row N
+    ldr     r5, [r9, #12]       ; load source pixels c, row N+1
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+    ldr     r7, [r9, #13]       ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    subs    r12, r12, #1
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    bne     loop
+
+    ; return stuff
+    ldr     r6, [sp, #40]       ; get address of sse (just above the 10 saved registers)
+    mul     r0, r8, r8          ; sum * sum (fits unsigned 32 bits, may exceed 2^31)
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)), unsigned shift
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+c80808080
+    DCD     0x80808080
+
+    END
--- a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
@@ -0,0 +1,178 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_variance_halfpixvar16x16_v_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp8_variance_halfpixvar16x16_v_armv6| PROC
+    ; returns r0 = sse - ((sum * sum) >> 8); the raw sse is also stored to *sse
+    stmfd   sp!, {r4-r12, lr}
+    mov     r8, #0              ; initialize sum = 0
+    ldr     r10, c80808080      ; 0x80 in every byte lane (bias for the averaging trick)
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     lr, #0              ; constant zero
+loop
+    add     r9, r0, r1          ; set src pointer to next row
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load 4 src pixels (a)
+    ldr     r6, [r9, #0]        ; load 4 src pixels from next row (b)
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+
+    ; bilinear interpolation: per byte (a + b + 1) >> 1
+    mvn     r6, r6              ; ~b = 255 - b
+    uhsub8  r4, r4, r6          ; (a - (255 - b)) >> 1 = ((a + b + 1) >> 1) - 128
+    eor     r4, r4, r10         ; flip bit 7 of each byte, i.e. add the 128 back
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load 4 src pixels
+    ldr     r6, [r9, #4]        ; load 4 src pixels from next row
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load 4 src pixels
+    ldr     r6, [r9, #8]        ; load 4 src pixels from next row
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load 4 src pixels
+    ldr     r6, [r9, #12]       ; load 4 src pixels from next row
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+
+    subs    r12, r12, #1
+
+    bne     loop
+
+    ; return stuff
+    ldr     r6, [sp, #40]       ; get address of sse (just above the 10 saved registers)
+    mul     r0, r8, r8          ; sum * sum (fits unsigned 32 bits, may exceed 2^31)
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)), unsigned shift
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+c80808080
+    DCD     0x80808080
+
+    END
+
--- a/vp8/encoder/arm/dct_arm.c
+++ b/vp8/encoder/arm/dct_arm.c
@@ -0,0 +1,24 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8/encoder/dct.h"
+
+#if HAVE_ARMV6
+
+void vp8_fast_fdct8x4_armv6(short *input, short *output, int pitch)
+{
+    vp8_fast_fdct4x4_armv6(input,   output,    pitch);     /* 8x4 is done as two 4x4 calls; first at input/output */
+    vp8_fast_fdct4x4_armv6(input + 4, output + 16, pitch); /* second: input + 4 shorts, output + 16 shorts, same pitch */
+}
+
+#endif /* HAVE_ARMV6 */
+
+
--- a/vp8/encoder/arm/dct_arm.h
+++ b/vp8/encoder/arm/dct_arm.h
@@ -14,12 +14,21 @@

 #if HAVE_ARMV6
 extern prototype_fdct(vp8_short_walsh4x4_armv6);
+extern prototype_fdct(vp8_fast_fdct4x4_armv6);
+extern prototype_fdct(vp8_fast_fdct8x4_armv6);

 #if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_fdct_walsh_short4x4
 #define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_armv6
+
+#undef  vp8_fdct_fast4x4
+#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_armv6
+
+#undef  vp8_fdct_fast8x4
+#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_armv6
 #endif
-#endif
+
+#endif /* HAVE_ARMV6 */

 #if HAVE_ARMV7
 extern prototype_fdct(vp8_short_fdct4x4_neon);
--- a/vp8/encoder/arm/encodemb_arm.c
+++ b/vp8/encoder/arm/encodemb_arm.c
@@ -1,31 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/encoder/encodemb.h"
-#include "vp8/common/reconinter.h"
-#include "vp8/encoder/quantize.h"
-#include "vp8/common/invtrans.h"
-#include "vp8/common/recon.h"
-#include "vp8/common/reconintra.h"
-#include "vp8/encoder/dct.h"
-#include "vpx_mem/vpx_mem.h"
-
-extern void vp8_subtract_b_neon_func(short *diff, unsigned char *src, unsigned char *pred, int stride, int pitch);
-
-void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch)
-{
-    unsigned char *src_ptr = (*(be->base_src) + be->src);
-    short *diff_ptr = be->src_diff;
-    unsigned char *pred_ptr = bd->predictor;
-    int src_stride = be->src_stride;
-
-    vp8_subtract_b_neon_func(diff_ptr, src_ptr, pred_ptr, src_stride, pitch);
-}
--- a/vp8/encoder/arm/encodemb_arm.h
+++ b/vp8/encoder/arm/encodemb_arm.h
@@ -12,6 +12,24 @@
 #ifndef ENCODEMB_ARM_H
 #define ENCODEMB_ARM_H

+#if HAVE_ARMV6
+extern prototype_subb(vp8_subtract_b_armv6);
+extern prototype_submby(vp8_subtract_mby_armv6);
+extern prototype_submbuv(vp8_subtract_mbuv_armv6);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp8_encodemb_subb
+#define vp8_encodemb_subb vp8_subtract_b_armv6
+
+#undef  vp8_encodemb_submby
+#define vp8_encodemb_submby vp8_subtract_mby_armv6
+
+#undef  vp8_encodemb_submbuv
+#define vp8_encodemb_submbuv vp8_subtract_mbuv_armv6
+#endif
+
+#endif /* HAVE_ARMV6 */
+
 #if HAVE_ARMV7
 //extern prototype_berr(vp8_block_error_c);
 //extern prototype_mberr(vp8_mbblock_error_c);
--- a/vp8/encoder/arm/neon/subtract_neon.asm
+++ b/vp8/encoder/arm/neon/subtract_neon.asm
@@ -8,45 +8,58 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;

-
-    EXPORT |vp8_subtract_b_neon_func|
+    EXPORT |vp8_subtract_b_neon|
    EXPORT |vp8_subtract_mby_neon|
    EXPORT |vp8_subtract_mbuv_neon|

+    INCLUDE asm_enc_offsets.asm
+
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2
-;=========================================
-;void vp8_subtract_b_neon_func(short *diff, unsigned char *src, unsigned char *pred, int stride, int pitch);
-|vp8_subtract_b_neon_func| PROC
-    ldr             r12, [sp]               ;load pitch

-    vld1.8          {d0}, [r1], r3          ;load src
-    vld1.8          {d1}, [r2], r12         ;load pred
-    vld1.8          {d2}, [r1], r3
-    vld1.8          {d3}, [r2], r12
-    vld1.8          {d4}, [r1], r3
-    vld1.8          {d5}, [r2], r12
-    vld1.8          {d6}, [r1], r3
-    vld1.8          {d7}, [r2], r12
+;void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch)
+|vp8_subtract_b_neon| PROC
+    stmfd   sp!, {r4-r7}
+    ; NOTE(review): assumes predictor_offset is a byte offset held in a 32-bit int - confirm against BLOCKD
+    ldr     r3, [r0, #vp8_block_base_src]
+    ldr     r4, [r0, #vp8_block_src]
+    ldr     r5, [r0, #vp8_block_src_diff]
+    ldr     r3, [r3]
+    ldr     r6, [r0, #vp8_block_src_stride]
+    add     r3, r3, r4                      ; src = *base_src + src
+    ldr     r7, [r1, #vp8_blockd_predictor_base]
+    ldr     r12, [r1, #vp8_blockd_predictor_offset]
+    add     r7, r7, r12                     ; pred = predictor_base + predictor_offset
+    vld1.8          {d0}, [r3], r6          ;load src
+    vld1.8          {d1}, [r7], r2          ;load pred
+    vld1.8          {d2}, [r3], r6
+    vld1.8          {d3}, [r7], r2
+    vld1.8          {d4}, [r3], r6
+    vld1.8          {d5}, [r7], r2
+    vld1.8          {d6}, [r3], r6
+    vld1.8          {d7}, [r7], r2

    vsubl.u8        q10, d0, d1
    vsubl.u8        q11, d2, d3
    vsubl.u8        q12, d4, d5
    vsubl.u8        q13, d6, d7

-    mov             r12, r12, lsl #1
+    mov             r2, r2, lsl #1          ; diff rows are pitch shorts apart = 2 * pitch bytes

-    vst1.16         {d20}, [r0], r12        ;store diff
-    vst1.16         {d22}, [r0], r12
-    vst1.16         {d24}, [r0], r12
-    vst1.16         {d26}, [r0], r12
+    vst1.16         {d20}, [r5], r2         ;store diff
+    vst1.16         {d22}, [r5], r2
+    vst1.16         {d24}, [r5], r2
+    vst1.16         {d26}, [r5], r2

+    ldmfd   sp!, {r4-r7}
    bx              lr
+
    ENDP

+
 ;==========================================
 ;void vp8_subtract_mby_neon(short *diff, unsigned char *src, unsigned char *pred, int stride)
 |vp8_subtract_mby_neon| PROC
--- a/vp8/encoder/arm/quantize_arm.c
+++ b/vp8/encoder/arm/quantize_arm.c
@@ -28,7 +28,7 @@ extern int vp8_fast_quantize_b_neon_func(short *coeff_ptr, short *zbin_ptr, shor

 void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d)
 {
-    d->eob = vp8_fast_quantize_b_neon_func(b->coeff, b->zbin, d->qcoeff, d->dqcoeff, d->dequant, vp8_rvsplus1_default_zig_zag1d, b->round, b->quant_fast);
+    d->eob = vp8_fast_quantize_b_neon_func(b->coeff, b->zbin, d->qcoeff_base + d->qcoeff_offset, d->dqcoeff_base + d->dqcoeff_offset, d->dequant, vp8_rvsplus1_default_zig_zag1d, b->round, b->quant_fast);
 }

 /*
@@ -42,8 +42,8 @@ void vp8_fast_quantize_b_neon(BLOCK *b,BLOCKD *d)
    short *zbin_ptr   = &b->Zbin[0][0];
    short *round_ptr  = &b->Round[0][0];
    short *quant_ptr  = &b->Quant[0][0];
-    short *qcoeff_ptr = d->qcoeff;
-    short *dqcoeff_ptr= d->dqcoeff;
+    short *qcoeff_ptr = d->qcoeff_base + d->qcoeff_offset;
+    short *dqcoeff_ptr= d->dqcoeff_base + d->dqcoeff_offset;
    short *dequant_ptr= &d->Dequant[0][0];

    eob = 0;
--- a/vp8/encoder/arm/variance_arm.c
+++ b/vp8/encoder/arm/variance_arm.c
@@ -57,51 +57,38 @@ unsigned int vp8_sub_pixel_variance16x16_armv6
    unsigned short first_pass[36*16];
    unsigned char  second_pass[20*16];
    const short *HFilter, *VFilter;
+    unsigned int var;

-    HFilter = vp8_bilinear_filters[xoffset];
-    VFilter = vp8_bilinear_filters[yoffset];
+    if (xoffset == 4 && yoffset == 0)
+    {
+        var = vp8_variance_halfpixvar16x16_h_armv6(src_ptr, src_pixels_per_line,
+                                                   dst_ptr, dst_pixels_per_line, sse);
+    }
+    else if (xoffset == 0 && yoffset == 4)
+    {
+        var = vp8_variance_halfpixvar16x16_v_armv6(src_ptr, src_pixels_per_line,
+                                                   dst_ptr, dst_pixels_per_line, sse);
+    }
+    else if (xoffset == 4 && yoffset == 4)
+    {
+        var = vp8_variance_halfpixvar16x16_hv_armv6(src_ptr, src_pixels_per_line,
+                                                   dst_ptr, dst_pixels_per_line, sse);
+    }
+    else
+    {
+        HFilter = vp8_bilinear_filters[xoffset];
+        VFilter = vp8_bilinear_filters[yoffset];

-    vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
-                                            src_pixels_per_line,
-                                            17, 16, HFilter);
-    vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
-                                             16, 16, 16, VFilter);
+        vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
+                                                src_pixels_per_line,
+                                                17, 16, HFilter);
+        vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
+                                                 16, 16, 16, VFilter);

-    return vp8_variance16x16_armv6(second_pass, 16, dst_ptr,
-                                   dst_pixels_per_line, sse);
-}
-
-unsigned int vp8_variance_halfpixvar16x16_h_armv6(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    return vp8_sub_pixel_variance16x16_armv6(src_ptr, source_stride, 4, 0,
-                                         ref_ptr, recon_stride, sse);
-}
-
-unsigned int vp8_variance_halfpixvar16x16_v_armv6(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    return vp8_sub_pixel_variance16x16_armv6(src_ptr, source_stride, 0, 4,
-                                         ref_ptr, recon_stride, sse);
-}
-
-unsigned int vp8_variance_halfpixvar16x16_hv_armv6(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    return vp8_sub_pixel_variance16x16_armv6(src_ptr, source_stride, 4, 4,
-                                         ref_ptr, recon_stride, sse);
+        var = vp8_variance16x16_armv6(second_pass, 16, dst_ptr,
+                                       dst_pixels_per_line, sse);
+    }
+    return var;
 }

 #endif /* HAVE_ARMV6 */
--- a/vp8/encoder/asm_enc_offsets.c
+++ b/vp8/encoder/asm_enc_offsets.c
@@ -12,9 +12,11 @@
 #include "vpx_ports/config.h"
 #include <stddef.h>

+#include "block.h"
+#include "vp8/common/blockd.h"
+#include "onyx_int.h"
 #include "treewriter.h"
 #include "tokenize.h"
-#include "onyx_int.h"

 #define ct_assert(name,cond) \
    static void assert_##name(void) UNUSED;\
@@ -31,6 +33,32 @@
 * {
 */

+//regular quantize
+DEFINE(vp8_block_coeff,                         offsetof(BLOCK, coeff));
+DEFINE(vp8_block_zbin,                          offsetof(BLOCK, zbin));
+DEFINE(vp8_block_round,                         offsetof(BLOCK, round));
+DEFINE(vp8_block_quant,                         offsetof(BLOCK, quant));
+DEFINE(vp8_block_quant_fast,                    offsetof(BLOCK, quant_fast));
+DEFINE(vp8_block_zbin_extra,                    offsetof(BLOCK, zbin_extra));
+DEFINE(vp8_block_zrun_zbin_boost,               offsetof(BLOCK, zrun_zbin_boost));
+DEFINE(vp8_block_quant_shift,                   offsetof(BLOCK, quant_shift));
+
+DEFINE(vp8_blockd_qcoeff_base,                  offsetof(BLOCKD, qcoeff_base));
+DEFINE(vp8_blockd_qcoeff_offset,                offsetof(BLOCKD, qcoeff_offset));
+DEFINE(vp8_blockd_dequant,                      offsetof(BLOCKD, dequant));
+DEFINE(vp8_blockd_dqcoeff_base,                 offsetof(BLOCKD, dqcoeff_base));
+DEFINE(vp8_blockd_dqcoeff_offset,               offsetof(BLOCKD, dqcoeff_offset));
+DEFINE(vp8_blockd_eob,                          offsetof(BLOCKD, eob));
+
+// subtract
+DEFINE(vp8_block_base_src,                      offsetof(BLOCK, base_src));
+DEFINE(vp8_block_src,                           offsetof(BLOCK, src));
+DEFINE(vp8_block_src_diff,                      offsetof(BLOCK, src_diff));
+DEFINE(vp8_block_src_stride,                    offsetof(BLOCK, src_stride));
+
+DEFINE(vp8_blockd_predictor_base,               offsetof(BLOCKD, predictor_base));
+DEFINE(vp8_blockd_predictor_offset,             offsetof(BLOCKD, predictor_offset));
+
 //pack tokens
 DEFINE(vp8_writer_lowvalue,                     offsetof(vp8_writer, lowvalue));
 DEFINE(vp8_writer_range,                        offsetof(vp8_writer, range));
@@ -65,17 +93,6 @@ DEFINE(TOKENLIST_SZ,                            sizeof(TOKENLIST));

 DEFINE(vp8_common_mb_rows,                      offsetof(VP8_COMMON, mb_rows));

-// offsets from BLOCK structure
-DEFINE(vp8_block_coeff,                         offsetof(BLOCK, coeff));
-DEFINE(vp8_block_quant_fast,                    offsetof(BLOCK, quant_fast));
-DEFINE(vp8_block_round,                         offsetof(BLOCK, round));
-
-// offsets from BLOCKD structure
-DEFINE(vp8_blockd_qcoeff,                       offsetof(BLOCKD, qcoeff));
-DEFINE(vp8_blockd_dqcoeff,                      offsetof(BLOCKD, dqcoeff));
-DEFINE(vp8_blockd_dequant,                      offsetof(BLOCKD, dequant));
-DEFINE(vp8_blockd_eob,                          offsetof(BLOCKD, eob));
-
 // These two sizes are used in vp8cx_pack_tokens.  They are hard coded
 // so if the size changes this will have to be adjusted.
 #if HAVE_ARMV5TE
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -34,7 +34,7 @@ typedef struct
    // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
    short *quant;
    short *quant_fast;
-    short *quant_shift;
+    unsigned char *quant_shift;
    short *zbin;
    short *zrun_zbin_boost;
    short *round;
@@ -86,7 +86,7 @@ typedef struct

    int mvcosts[2][MVvals+1];
    int *mvcost[2];
-    int mvsadcosts[2][MVvals+1];
+    int mvsadcosts[2][MVfpvals+1];
    int *mvsadcost[2];
    int mbmode_cost[2][MB_MODE_COUNT];
    int intra_uv_mode_cost[2][MB_MODE_COUNT];
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -147,7 +147,7 @@ static const int qzbin_factors_y2[129] =
 #define EXACT_QUANT
 #ifdef EXACT_QUANT
 static void vp8cx_invert_quant(int improved_quant, short *quant,
-                               short *shift, short d)
+                               unsigned char *shift, short d)
 {
    if(improved_quant)
    {
@@ -808,7 +808,7 @@ void vp8_encode_frame(VP8_COMP *cpi)
            vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei, 1,  cpi->encoding_thread_count);

            for (i = 0; i < cm->mb_rows; i++)
-                cpi->mt_current_mb_col[i] = 0;
+                cpi->mt_current_mb_col[i] = -1;

            for (i = 0; i < cpi->encoding_thread_count; i++)
            {
@@ -979,7 +979,7 @@ void vp8_encode_frame(VP8_COMP *cpi)
        }

        if (flag[0] || flag[1])
-            vp8_build_component_cost_table(cpi->mb.mvcost, cpi->mb.mvsadcost, (const MV_CONTEXT *) cm->fc.mvc, flag);
+            vp8_build_component_cost_table(cpi->mb.mvcost, (const MV_CONTEXT *) cm->fc.mvc, flag);
    }
 #endif

@@ -1147,7 +1147,7 @@ static void sum_intra_stats(VP8_COMP *cpi, MACROBLOCK *x)
 }
 int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
 {
-    int Error4x4, Error16x16, error_uv;
+    int Error4x4, Error16x16;
    int rate4x4, rate16x16, rateuv;
    int dist4x4, dist16x16, distuv;
    int rate = 0;
@@ -1157,10 +1157,9 @@ int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)

    x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;

-#if !(CONFIG_REALTIME_ONLY)
    if (cpi->sf.RD && cpi->compressor_speed != 2)
    {
-        error_uv = vp8_rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv);
+        vp8_rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv);
        rate += rateuv;

        Error16x16 = vp8_rd_pick_intra16x16mby_mode(cpi, x, &rate16x16, &rate16x16_tokenonly, &dist16x16);
@@ -1170,7 +1169,6 @@ int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
        rate += (Error4x4 < Error16x16) ? rate4x4 : rate16x16;
    }
    else
-#endif
    {
        int rate2, best_distortion;
        MB_PREDICTION_MODE mode, best_mode = DC_PRED;
@@ -1188,7 +1186,7 @@ int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
                (&x->e_mbd);
            distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, 0x7fffffff);
            rate2  = x->mbmode_cost[x->e_mbd.frame_type][mode];
-            this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2);
+            this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);

            if (Error16x16 > this_rd)
            {
@@ -1231,7 +1229,6 @@ int vp8cx_encode_inter_macroblock
 )
 {
    MACROBLOCKD *const xd = &x->e_mbd;
-    int inter_error;
    int intra_error = 0;
    int rate;
    int distortion;
@@ -1243,8 +1240,6 @@ int vp8cx_encode_inter_macroblock
    else
        x->encode_breakout = cpi->oxcf.encode_breakout;

-#if !(CONFIG_REALTIME_ONLY)
-
    if (cpi->sf.RD)
    {
        int zbin_mode_boost_enabled = cpi->zbin_mode_boost_enabled;
@@ -1258,7 +1253,7 @@ int vp8cx_encode_inter_macroblock
             * do not recalculate */
            cpi->zbin_mode_boost_enabled = 0;
        }
-        inter_error = vp8_rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate, &distortion, &intra_error);
+        vp8_rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate, &distortion, &intra_error);

        /* switch back to the regular quantizer for the encode */
        if (cpi->sf.improved_quant)
@@ -1271,11 +1266,9 @@ int vp8cx_encode_inter_macroblock

    }
    else
-#endif
-        inter_error = vp8_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate, &distortion, &intra_error);
+        vp8_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate, &distortion, &intra_error);

-
-    cpi->prediction_error += inter_error;
+    cpi->prediction_error += distortion;
    cpi->intra_error += intra_error;

 #if 0
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c
@@ -15,7 +15,7 @@
 #include "vp8/common/reconintra.h"
 #include "vp8/common/reconintra4x4.h"
 #include "encodemb.h"
-#include "vp8/common/invtrans.h"
+#include "invtrans.h"
 #include "vp8/common/recon.h"
 #include "dct.h"
 #include "vp8/common/g_common.h"
@@ -25,27 +25,15 @@
 #define intra4x4pbias_rate    256


-void vp8_update_mode_context(int *abmode, int *lbmode, int i, int best_mode)
-{
-    if (i < 12)
-    {
-        abmode[i+4] = best_mode;
-    }
-
-    if ((i & 3) != 3)
-    {
-        lbmode[i+1] = best_mode;
-    }
-
-}
 #if CONFIG_RUNTIME_CPU_DETECT
 #define IF_RTCD(x) (x)
 #else
 #define IF_RTCD(x) NULL
 #endif
+
 void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x, BLOCK *be, BLOCKD *b, int best_mode)
 {
-    vp8_predict_intra4x4(b, best_mode, b->predictor);
+    vp8_predict_intra4x4(b, best_mode, b->predictor_base + b->predictor_offset);

    ENCODEMB_INVOKE(&rtcd->encodemb, subb)(be, b, 16);

@@ -55,7 +43,7 @@ void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x, BLOCK

    vp8_inverse_transform_b(IF_RTCD(&rtcd->common->idct), b, 32);

-    RECON_INVOKE(&rtcd->common->recon, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+    RECON_INVOKE(&rtcd->common->recon, recon)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
 }

 void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb)
@@ -88,14 +76,9 @@ void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)

    vp8_quantize_mby(x);

-#if !(CONFIG_REALTIME_ONLY)
-#if 1
    if (x->optimize)
        vp8_optimize_mby(x, rtcd);

-#endif
-#endif
-
    vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);

    RECON_INVOKE(&rtcd->common->recon, recon_mby)
@@ -139,15 +122,9 @@ void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)

    vp8_quantize_mbuv(x);

-#if !(CONFIG_REALTIME_ONLY)
-#if 1
-
    if (x->optimize==2 ||(x->optimize && x->rddiv > 1))
        vp8_optimize_mbuv(x, rtcd);

-#endif
-#endif
-
    vp8_inverse_transform_mbuv(IF_RTCD(&rtcd->common->idct), &x->e_mbd);

    vp8_recon_intra_mbuv(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
--- a/vp8/encoder/encodeintra.h
+++ b/vp8/encoder/encodeintra.h
@@ -17,7 +17,6 @@ void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *, MACROBLOCK *x);
 void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *, MACROBLOCK *x);
 void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *, MACROBLOCK *mb);
 void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *, MACROBLOCK *x, BLOCK *be, BLOCKD *b, int best_mode);
-void vp8_update_mode_context(int *abmode, int *lbmode, int i, int best_mode);
 void vp8_encode_intra4x4block_rd(const VP8_ENCODER_RTCD *, MACROBLOCK *x, BLOCK *be, BLOCKD *b, int best_mode);

 #endif
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c
@@ -14,11 +14,12 @@
 #include "vp8/common/reconinter.h"
 #include "quantize.h"
 #include "tokenize.h"
-#include "vp8/common/invtrans.h"
+#include "invtrans.h"
 #include "vp8/common/recon.h"
 #include "vp8/common/reconintra.h"
 #include "dct.h"
 #include "vpx_mem/vpx_mem.h"
+#include "rdopt.h"

 #if CONFIG_RUNTIME_CPU_DETECT
 #define IF_RTCD(x) (x)
@@ -29,7 +30,7 @@ void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch)
 {
    unsigned char *src_ptr = (*(be->base_src) + be->src);
    short *diff_ptr = be->src_diff;
-    unsigned char *pred_ptr = bd->predictor;
+    unsigned char *pred_ptr = bd->predictor_base + bd->predictor_offset;
    int src_stride = be->src_stride;

    int r, c;
@@ -104,7 +105,7 @@ static void vp8_subtract_mb(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
    ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
 }

-void vp8_build_dcblock(MACROBLOCK *x)
+static void build_dcblock(MACROBLOCK *x)
 {
    short *src_diff_ptr = &x->src_diff[384];
    int i;
@@ -138,7 +139,7 @@ void vp8_transform_intra_mby(MACROBLOCK *x)
    }

    // build dc block from 16 y dc values
-    vp8_build_dcblock(x);
+    build_dcblock(x);

    // do 2nd order transform on the dc block
    x->short_walsh4x4(&x->block[24].src_diff[0],
@@ -147,7 +148,7 @@ void vp8_transform_intra_mby(MACROBLOCK *x)
 }


-void vp8_transform_mb(MACROBLOCK *x)
+static void transform_mb(MACROBLOCK *x)
 {
    int i;

@@ -159,7 +160,7 @@ void vp8_transform_mb(MACROBLOCK *x)

    // build dc block from 16 y dc values
    if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
-        vp8_build_dcblock(x);
+        build_dcblock(x);

    for (i = 16; i < 24; i += 2)
    {
@@ -174,7 +175,8 @@ void vp8_transform_mb(MACROBLOCK *x)

 }

-void vp8_transform_mby(MACROBLOCK *x)
+
+static void transform_mby(MACROBLOCK *x)
 {
    int i;

@@ -187,7 +189,7 @@ void vp8_transform_mby(MACROBLOCK *x)
    // build dc block from 16 y dc values
    if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
    {
-        vp8_build_dcblock(x);
+        build_dcblock(x);
        x->short_walsh4x4(&x->block[24].src_diff[0],
            &x->block[24].coeff[0], 8);
    }
@@ -201,7 +203,7 @@ void vp8_stuff_inter16x16(MACROBLOCK *x)
        // recon = copy from predictors to destination
        {
            BLOCKD *b = &x->e_mbd.block[0];
-            unsigned char *pred_ptr = b->predictor;
+            unsigned char *pred_ptr = b->predictor_base + b->predictor_offset;
            unsigned char *dst_ptr = *(b->base_dst) + b->dst;
            int stride = b->dst_stride;

@@ -210,7 +212,7 @@ void vp8_stuff_inter16x16(MACROBLOCK *x)
                vpx_memcpy(dst_ptr+i*stride,pred_ptr+16*i,16);

            b = &x->e_mbd.block[16];
-            pred_ptr = b->predictor;
+            pred_ptr = b->predictor_base + b->predictor_offset;
            dst_ptr = *(b->base_dst) + b->dst;
            stride = b->dst_stride;

@@ -218,7 +220,7 @@ void vp8_stuff_inter16x16(MACROBLOCK *x)
                vpx_memcpy(dst_ptr+i*stride,pred_ptr+8*i,8);

            b = &x->e_mbd.block[20];
-            pred_ptr = b->predictor;
+            pred_ptr = b->predictor_base + b->predictor_offset;
            dst_ptr = *(b->base_dst) + b->dst;
            stride = b->dst_stride;

@@ -228,8 +230,6 @@ void vp8_stuff_inter16x16(MACROBLOCK *x)
    */
 }

-#if !(CONFIG_REALTIME_ONLY)
-#define RDCOST(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) )
 #define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )

 typedef struct vp8_token_state vp8_token_state;
@@ -255,9 +255,9 @@ static const int plane_rd_mult[4]=
    Y1_RD_MULT
 };

-void vp8_optimize_b(MACROBLOCK *mb, int ib, int type,
-                    ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                    const VP8_ENCODER_RTCD *rtcd)
+static void optimize_b(MACROBLOCK *mb, int ib, int type,
+                       ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+                       const VP8_ENCODER_RTCD *rtcd)
 {
    BLOCK *b;
    BLOCKD *d;
@@ -302,8 +302,8 @@ void vp8_optimize_b(MACROBLOCK *mb, int ib, int type,

    dequant_ptr = d->dequant;
    coeff_ptr = b->coeff;
-    qcoeff_ptr = d->qcoeff;
-    dqcoeff_ptr = d->dqcoeff;
+    qcoeff_ptr = d->qcoeff_base + d->qcoeff_offset;
+    dqcoeff_ptr = d->dqcoeff_base + d->dqcoeff_offset; /* each buffer uses its own offset */
    i0 = !type;
    eob = d->eob;

@@ -501,7 +501,7 @@ void vp8_optimize_b(MACROBLOCK *mb, int ib, int type,
    *a = *l = (d->eob != !type);
 }

-void vp8_optimize_mb(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
+static void optimize_mb(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
 {
    int b;
    int type;
@@ -522,20 +522,20 @@ void vp8_optimize_mb(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)

    for (b = 0; b < 16; b++)
    {
-        vp8_optimize_b(x, b, type,
+        optimize_b(x, b, type,
            ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
    }

    for (b = 16; b < 24; b++)
    {
-        vp8_optimize_b(x, b, PLANE_TYPE_UV,
+        optimize_b(x, b, PLANE_TYPE_UV,
            ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
    }

    if (has_2nd_order)
    {
        b=24;
-        vp8_optimize_b(x, b, PLANE_TYPE_Y2,
+        optimize_b(x, b, PLANE_TYPE_Y2,
            ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
    }
 }
@@ -569,7 +569,7 @@ void vp8_optimize_mby(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)

    for (b = 0; b < 16; b++)
    {
-        vp8_optimize_b(x, b, type,
+        optimize_b(x, b, type,
        ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
    }

@@ -577,7 +577,7 @@ void vp8_optimize_mby(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
    if (has_2nd_order)
    {
        b=24;
-        vp8_optimize_b(x, b, PLANE_TYPE_Y2,
+        optimize_b(x, b, PLANE_TYPE_Y2,
            ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
    }
 }
@@ -603,11 +603,10 @@ void vp8_optimize_mbuv(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)

    for (b = 16; b < 24; b++)
    {
-        vp8_optimize_b(x, b, PLANE_TYPE_UV,
+        optimize_b(x, b, PLANE_TYPE_UV,
            ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
    }
 }
-#endif

 void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
 {
@@ -615,14 +614,12 @@ void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)

    vp8_subtract_mb(rtcd, x);

-    vp8_transform_mb(x);
+    transform_mb(x);

    vp8_quantize_mb(x);

-#if !(CONFIG_REALTIME_ONLY)
    if (x->optimize)
-        vp8_optimize_mb(x, rtcd);
-#endif
+        optimize_mb(x, rtcd);

    vp8_inverse_transform_mb(IF_RTCD(&rtcd->common->idct), &x->e_mbd);

@@ -638,7 +635,7 @@ void vp8_encode_inter16x16y(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)

    ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, x->src.y_stride);

-    vp8_transform_mby(x);
+    transform_mby(x);

    vp8_quantize_mby(x);

@@ -649,22 +646,6 @@ void vp8_encode_inter16x16y(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
 }


-void vp8_encode_inter16x16uv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
-{
-    vp8_build_inter_predictors_mbuv(&x->e_mbd);
-
-    ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
-
-    vp8_transform_mbuv(x);
-
-    vp8_quantize_mbuv(x);
-
-    vp8_inverse_transform_mbuv(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
-
-    vp8_recon_intra_mbuv(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
-}
-
-
 void vp8_encode_inter16x16uvrd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
 {
    vp8_build_inter_predictors_mbuv(&x->e_mbd);
--- a/vp8/encoder/encodemb.h
+++ b/vp8/encoder/encodemb.h
@@ -101,9 +101,6 @@ void vp8_build_dcblock(MACROBLOCK *b);
 void vp8_transform_mb(MACROBLOCK *mb);
 void vp8_transform_mbuv(MACROBLOCK *x);
 void vp8_transform_intra_mby(MACROBLOCK *x);
-void Encode16x16Y(MACROBLOCK *x);
-void Encode16x16UV(MACROBLOCK *x);
-void vp8_encode_inter16x16uv(const struct VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x);
 void vp8_encode_inter16x16uvrd(const struct VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x);
 void vp8_optimize_mby(MACROBLOCK *x, const struct VP8_ENCODER_RTCD *rtcd);
 void vp8_optimize_mbuv(MACROBLOCK *x, const struct VP8_ENCODER_RTCD *rtcd);
--- a/vp8/encoder/encodemv.c
+++ b/vp8/encoder/encodemv.c
@@ -134,31 +134,14 @@ static unsigned int cost_mvcomponent(const int v, const struct mv_context *mvc)

    return cost;   // + vp8_cost_bit( p [MVPsign], v < 0);
 }
-//#define M_LOG2_E 0.693147180559945309417
-//#define log2f(x) (log (x) / (float) M_LOG2_E)

-void vp8_build_component_cost_table(int *mvcost[2], int *mvsadcost[2], const MV_CONTEXT *mvc, int mvc_flag[2])
+void vp8_build_component_cost_table(int *mvcost[2], const MV_CONTEXT *mvc, int mvc_flag[2])
 {
    int i = 1;   //-mv_max;
    unsigned int cost0 = 0;
    unsigned int cost1 = 0;

    vp8_clear_system_state();
-#if 0
-    mvsadcost [0] [0] = 300;
-    mvsadcost [1] [0] = 300;
-
-    do
-    {
-        double z = 256 * (2 * (log2f(2 * i) + .6));
-        mvsadcost [0][i] = (int) z;
-        mvsadcost [1][i] = (int) z;
-        mvsadcost [0][-i] = (int) z;
-        mvsadcost [1][-i] = (int) z;
-    }
-    while (++i <= mv_max);
-
-#endif

    i = 1;

@@ -193,16 +176,6 @@ void vp8_build_component_cost_table(int *mvcost[2], int *mvsadcost[2], const MV_
        }
        while (++i <= mv_max);
    }
-
-    /*
-        i=-mv_max;
-        do
-        {
-            mvcost [0] [i] = cost_mvcomponent( i, mvc[0]);
-            mvcost [1] [i] = cost_mvcomponent( i, mvc[1]);
-        }
-        while( ++i <= mv_max);
-    */
 }


@@ -436,7 +409,7 @@ void vp8_write_mvprobs(VP8_COMP *cpi)
    );

    if (flags[0] || flags[1])
-        vp8_build_component_cost_table(cpi->mb.mvcost, cpi->mb.mvsadcost, (const MV_CONTEXT *) cpi->common.fc.mvc, flags);
+        vp8_build_component_cost_table(cpi->mb.mvcost, (const MV_CONTEXT *) cpi->common.fc.mvc, flags);

 #ifdef ENTROPY_STATS
    active_section = 5;
--- a/vp8/encoder/encodemv.h
+++ b/vp8/encoder/encodemv.h
@@ -16,6 +16,6 @@

 void vp8_write_mvprobs(VP8_COMP *);
 void vp8_encode_motion_vector(vp8_writer *, const MV *, const MV_CONTEXT *);
-void vp8_build_component_cost_table(int *mvcost[2], int *mvsadcost[2], const MV_CONTEXT *mvc, int mvc_flag[2]);
+void vp8_build_component_cost_table(int *mvcost[2], const MV_CONTEXT *mvc, int mvc_flag[2]);

 #endif
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -24,8 +24,6 @@ extern void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x);
 extern void vp8_build_block_offsets(MACROBLOCK *x);
 extern void vp8_setup_block_ptrs(MACROBLOCK *x);

-#if CONFIG_MULTITHREAD
-
 extern void loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm);

 static THREAD_FUNCTION loopfilter_thread(void *p_data)
@@ -51,7 +49,6 @@ static THREAD_FUNCTION loopfilter_thread(void *p_data)

    return 0;
 }
-#endif

 static
 THREAD_FUNCTION thread_encoding_proc(void *p_data)
@@ -322,8 +319,8 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
    vpx_memcpy(z->mvcosts,          x->mvcosts,         sizeof(x->mvcosts));
    z->mvcost[0] = &z->mvcosts[0][mv_max+1];
    z->mvcost[1] = &z->mvcosts[1][mv_max+1];
-    z->mvsadcost[0] = &z->mvsadcosts[0][mv_max+1];
-    z->mvsadcost[1] = &z->mvsadcosts[1][mv_max+1];
+    z->mvsadcost[0] = &z->mvsadcosts[0][mvfp_max+1];
+    z->mvsadcost[1] = &z->mvsadcosts[1][mvfp_max+1];


    vpx_memcpy(z->token_costs,       x->token_costs,      sizeof(x->token_costs));
@@ -458,53 +455,58 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi,

 void vp8cx_create_encoder_threads(VP8_COMP *cpi)
 {
-    cpi->b_multi_threaded = 0;
+    const VP8_COMMON * cm = &cpi->common;

+    cpi->b_multi_threaded = 0;
+    cpi->encoding_thread_count = 0;
    cpi->processor_core_count = 32; //vp8_get_proc_core_count();

    if (cpi->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1)
    {
        int ithread;
+        int th_count = cpi->oxcf.multi_threaded - 1;

        if (cpi->oxcf.multi_threaded > cpi->processor_core_count)
-            cpi->encoding_thread_count = cpi->processor_core_count - 1;
-        else
-            cpi->encoding_thread_count = cpi->oxcf.multi_threaded - 1;
+            th_count = cpi->processor_core_count - 1;

-        CHECK_MEM_ERROR(cpi->h_encoding_thread, vpx_malloc(sizeof(pthread_t) * cpi->encoding_thread_count));
-        CHECK_MEM_ERROR(cpi->h_event_start_encoding, vpx_malloc(sizeof(sem_t) * cpi->encoding_thread_count));
-        CHECK_MEM_ERROR(cpi->mb_row_ei, vpx_memalign(32, sizeof(MB_ROW_COMP) * cpi->encoding_thread_count));
-        vpx_memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * cpi->encoding_thread_count);
-        CHECK_MEM_ERROR(cpi->en_thread_data, vpx_malloc(sizeof(ENCODETHREAD_DATA) * cpi->encoding_thread_count));
-        CHECK_MEM_ERROR(cpi->mt_current_mb_col, vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cpi->common.mb_rows));
+        /* th_count worker threads plus the main thread process one row each; */
+        /* there is no point in having more threads than the sync range allows */
+        if(th_count > ((cm->mb_cols / cpi->mt_sync_range) - 1))
+        {
+            th_count = (cm->mb_cols / cpi->mt_sync_range) - 1;
+        }
+
+        if(th_count <= 0) /* the clamp yields -1 when mb_cols < mt_sync_range */
+            return;
+
+        CHECK_MEM_ERROR(cpi->h_encoding_thread, vpx_malloc(sizeof(pthread_t) * th_count));
+        CHECK_MEM_ERROR(cpi->h_event_start_encoding, vpx_malloc(sizeof(sem_t) * th_count));
+        CHECK_MEM_ERROR(cpi->mb_row_ei, vpx_memalign(32, sizeof(MB_ROW_COMP) * th_count));
+        vpx_memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * th_count);
+        CHECK_MEM_ERROR(cpi->en_thread_data,
+                        vpx_malloc(sizeof(ENCODETHREAD_DATA) * th_count));
+        CHECK_MEM_ERROR(cpi->mt_current_mb_col,
+                        vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows));

-        //cpi->h_event_main = CreateEvent(NULL, FALSE, FALSE, NULL);
        sem_init(&cpi->h_event_end_encoding, 0, 0);

        cpi->b_multi_threaded = 1;
+        cpi->encoding_thread_count = th_count;

-        //printf("[VP8:] multi_threaded encoding is enabled with %d threads\n\n", (cpi->encoding_thread_count +1));
+        /*
+        printf("[VP8:] multi_threaded encoding is enabled with %d threads\n\n",
+               (cpi->encoding_thread_count +1));
+        */

-        for (ithread = 0; ithread < cpi->encoding_thread_count; ithread++)
+        for (ithread = 0; ithread < th_count; ithread++)
        {
            ENCODETHREAD_DATA * ethd = &cpi->en_thread_data[ithread];

-            //cpi->h_event_mbrencoding[ithread] = CreateEvent(NULL, FALSE, FALSE, NULL);
            sem_init(&cpi->h_event_start_encoding[ithread], 0, 0);
            ethd->ithread = ithread;
            ethd->ptr1 = (void *)cpi;
            ethd->ptr2 = (void *)&cpi->mb_row_ei[ithread];

-            //printf(" call begin thread %d \n", ithread);
-
-            //cpi->h_encoding_thread[ithread] =   (HANDLE)_beginthreadex(
-            //  NULL,           // security
-            //  0,              // stksize
-            //  thread_encoding_proc,
-            //  (&cpi->en_thread_data[ithread]),          // Thread data
-            //  0,
-            //  NULL);
-
            pthread_create(&cpi->h_encoding_thread[ithread], 0, thread_encoding_proc, ethd);
        }

--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -67,7 +67,7 @@ static int vscale_lookup[7] = {0, 1, 1, 2, 2, 3, 3};
 static int hscale_lookup[7] = {0, 0, 1, 1, 2, 2, 3};


-const int cq_level[QINDEX_RANGE] =
+static const int cq_level[QINDEX_RANGE] =
 {
    0,0,1,1,2,3,3,4,4,5,6,6,7,8,8,9,
    9,10,11,11,12,13,13,14,15,15,16,17,17,18,19,20,
@@ -79,10 +79,9 @@ const int cq_level[QINDEX_RANGE] =
    86,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100
 };

-void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame);
-int vp8_input_stats(VP8_COMP *cpi, FIRSTPASS_STATS *fps);
+static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame);

-int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred)
+static int encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred)
 {

    int i;
@@ -146,7 +145,7 @@ static double calculate_modified_err(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
    /*start_pos = cpi->stats_in;
    sum_iiratio = 0.0;
    i = 0;
-    while ( (i < 1) && vp8_input_stats(cpi,&next_frame) != EOF )
+    while ( (i < 1) && input_stats(cpi,&next_frame) != EOF )
    {

        next_iiratio = next_frame.intra_error / DOUBLE_DIVIDE_CHECK(next_frame.coded_error);
@@ -212,7 +211,7 @@ static const double weight_table[256] = {
 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000
 };

-double vp8_simple_weight(YV12_BUFFER_CONFIG *source)
+static double simple_weight(YV12_BUFFER_CONFIG *source)
 {
    int i, j;

@@ -240,7 +239,7 @@ double vp8_simple_weight(YV12_BUFFER_CONFIG *source)


 // This function returns the current per frame maximum bitrate target
-int frame_max_bits(VP8_COMP *cpi)
+static int frame_max_bits(VP8_COMP *cpi)
 {
    // Max allocation for a single frame based on the max section guidelines passed in and how many bits are left
    int max_bits;
@@ -281,9 +280,9 @@ int frame_max_bits(VP8_COMP *cpi)
 }


-void vp8_output_stats(const VP8_COMP            *cpi,
-                      struct vpx_codec_pkt_list *pktlist,
-                      FIRSTPASS_STATS            *stats)
+static void output_stats(const VP8_COMP            *cpi,
+                         struct vpx_codec_pkt_list *pktlist,
+                         FIRSTPASS_STATS            *stats)
 {
    struct vpx_codec_cx_pkt pkt;
    pkt.kind = VPX_CODEC_STATS_PKT;
@@ -323,7 +322,7 @@ void vp8_output_stats(const VP8_COMP            *cpi,
 #endif
 }

-int vp8_input_stats(VP8_COMP *cpi, FIRSTPASS_STATS *fps)
+static int input_stats(VP8_COMP *cpi, FIRSTPASS_STATS *fps)
 {
    if (cpi->stats_in >= cpi->stats_in_end)
        return EOF;
@@ -333,7 +332,7 @@ int vp8_input_stats(VP8_COMP *cpi, FIRSTPASS_STATS *fps)
    return 1;
 }

-void vp8_zero_stats(FIRSTPASS_STATS *section)
+static void zero_stats(FIRSTPASS_STATS *section)
 {
    section->frame      = 0.0;
    section->intra_error = 0.0;
@@ -353,7 +352,7 @@ void vp8_zero_stats(FIRSTPASS_STATS *section)
    section->count      = 0.0;
    section->duration   = 1.0;
 }
-void vp8_accumulate_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame)
+static void accumulate_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame)
 {
    section->frame += frame->frame;
    section->intra_error += frame->intra_error;
@@ -373,7 +372,7 @@ void vp8_accumulate_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame)
    section->count      += frame->count;
    section->duration   += frame->duration;
 }
-void vp8_avg_stats(FIRSTPASS_STATS *section)
+static void avg_stats(FIRSTPASS_STATS *section)
 {
    if (section->count < 1.0)
        return;
@@ -397,15 +396,15 @@ void vp8_avg_stats(FIRSTPASS_STATS *section)

 void vp8_init_first_pass(VP8_COMP *cpi)
 {
-    vp8_zero_stats(cpi->total_stats);
+    zero_stats(cpi->total_stats);
 }

 void vp8_end_first_pass(VP8_COMP *cpi)
 {
-    vp8_output_stats(cpi, cpi->output_pkt_list, cpi->total_stats);
+    output_stats(cpi, cpi->output_pkt_list, cpi->total_stats);
 }

-void vp8_zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x, YV12_BUFFER_CONFIG * recon_buffer, int * best_motion_err, int recon_yoffset )
+static void zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x, YV12_BUFFER_CONFIG * recon_buffer, int * best_motion_err, int recon_yoffset )
 {
    MACROBLOCKD * const xd = & x->e_mbd;
    BLOCK *b = &x->block[0];
@@ -424,7 +423,7 @@ void vp8_zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x, YV12_BUFFER_CONFIG * r
    VARIANCE_INVOKE(IF_RTCD(&cpi->rtcd.variance), mse16x16) ( src_ptr, src_stride, ref_ptr, ref_stride, (unsigned int *)(best_motion_err));
 }

-void vp8_first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, MV *ref_mv, MV *best_mv, YV12_BUFFER_CONFIG *recon_buffer, int *best_motion_err, int recon_yoffset )
+static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, MV *ref_mv, MV *best_mv, YV12_BUFFER_CONFIG *recon_buffer, int *best_motion_err, int recon_yoffset )
 {
    MACROBLOCKD *const xd = & x->e_mbd;
    BLOCK *b = &x->block[0];
@@ -447,7 +446,7 @@ void vp8_first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, MV *ref_mv, MV *
    xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;

    // Initial step/diamond search centred on best mv
-    tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param, x->errorperbit, &num00, &v_fn_ptr, x->mvsadcost, x->mvcost, ref_mv);
+    tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param, x->errorperbit, &num00, &v_fn_ptr, x->mvcost, ref_mv);
    if ( tmp_err < INT_MAX-new_mv_mode_penalty )
        tmp_err += new_mv_mode_penalty;

@@ -470,7 +469,7 @@ void vp8_first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, MV *ref_mv, MV *
            num00--;
        else
        {
-            tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param + n, x->errorperbit, &num00, &v_fn_ptr, x->mvsadcost, x->mvcost, ref_mv);
+            tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param + n, x->errorperbit, &num00, &v_fn_ptr, x->mvcost, ref_mv);
            if ( tmp_err < INT_MAX-new_mv_mode_penalty )
                tmp_err += new_mv_mode_penalty;

@@ -491,7 +490,6 @@ void vp8_first_pass(VP8_COMP *cpi)
    VP8_COMMON *const cm = & cpi->common;
    MACROBLOCKD *const xd = & x->e_mbd;

-    int col_blocks = 4 * cm->mb_cols;
    int recon_yoffset, recon_uvoffset;
    YV12_BUFFER_CONFIG *lst_yv12 = &cm->yv12_fb[cm->lst_fb_idx];
    YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
@@ -541,7 +539,7 @@ void vp8_first_pass(VP8_COMP *cpi)
        int flag[2] = {1, 1};
        vp8_initialize_rd_consts(cpi, vp8_dc_quant(cm->base_qindex, cm->y1dc_delta_q));
        vpx_memcpy(cm->fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
-        vp8_build_component_cost_table(cpi->mb.mvcost, cpi->mb.mvsadcost, (const MV_CONTEXT *) cm->fc.mvc, flag);
+        vp8_build_component_cost_table(cpi->mb.mvcost, (const MV_CONTEXT *) cm->fc.mvc, flag);
    }

    // for each macroblock row in image
@@ -565,7 +563,6 @@ void vp8_first_pass(VP8_COMP *cpi)
        for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
        {
            int this_error;
-            int zz_to_best_ratio;
            int gf_motion_error = INT_MAX;
            int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);

@@ -575,7 +572,7 @@ void vp8_first_pass(VP8_COMP *cpi)
            xd->left_available = (mb_col != 0);

            // do intra 16x16 prediction
-            this_error = vp8_encode_intra(cpi, x, use_dc_pred);
+            this_error = encode_intra(cpi, x, use_dc_pred);

            // "intrapenalty" below deals with situations where the intra and inter error scores are very low (eg a plain black frame)
            // We do not have special cases in first pass for 0,0 and nearest etc so all inter modes carry an overhead cost estimate fot the mv.
@@ -593,20 +590,19 @@ void vp8_first_pass(VP8_COMP *cpi)
            // Other than for the first frame do a motion search
            if (cm->current_video_frame > 0)
            {
-                BLOCK *b = &x->block[0];
                BLOCKD *d = &x->e_mbd.block[0];
                MV tmp_mv = {0, 0};
                int tmp_err;
                int motion_error = INT_MAX;

                // Simple 0,0 motion with no mv overhead
-                vp8_zz_motion_search( cpi, x, lst_yv12, &motion_error, recon_yoffset );
+                zz_motion_search( cpi, x, lst_yv12, &motion_error, recon_yoffset );
                d->bmi.mv.as_mv.row = 0;
                d->bmi.mv.as_mv.col = 0;

                // Test last reference frame using the previous best mv as the
                // starting point (best reference) for the search
-                vp8_first_pass_motion_search(cpi, x, &best_ref_mv.as_mv,
+                first_pass_motion_search(cpi, x, &best_ref_mv.as_mv,
                                        &d->bmi.mv.as_mv, lst_yv12,
                                        &motion_error, recon_yoffset);

@@ -614,7 +610,7 @@ void vp8_first_pass(VP8_COMP *cpi)
                if (best_ref_mv.as_int)
                {
                   tmp_err = INT_MAX;
-                   vp8_first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv,
+                   first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv,
                                     lst_yv12, &tmp_err, recon_yoffset);

                   if ( tmp_err < motion_error )
@@ -628,7 +624,7 @@ void vp8_first_pass(VP8_COMP *cpi)
                // Experimental search in a second reference frame ((0,0) based only)
                if (cm->current_video_frame > 1)
                {
-                    vp8_first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv, gld_yv12, &gf_motion_error, recon_yoffset);
+                    first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv, gld_yv12, &gf_motion_error, recon_yoffset);

                    if ((gf_motion_error < motion_error) && (gf_motion_error < this_error))
                    {
@@ -752,7 +748,7 @@ void vp8_first_pass(VP8_COMP *cpi)
        fps.frame      = cm->current_video_frame ;
        fps.intra_error = intra_error >> 8;
        fps.coded_error = coded_error >> 8;
-        weight = vp8_simple_weight(cpi->Source);
+        weight = simple_weight(cpi->Source);


        if (weight < 0.1)
@@ -790,14 +786,15 @@ void vp8_first_pass(VP8_COMP *cpi)

        // TODO:  handle the case when duration is set to 0, or something less
        // than the full time between subsequent cpi->source_time_stamp s  .
-        fps.duration = cpi->source_end_time_stamp - cpi->source_time_stamp;
+        fps.duration = cpi->source->ts_end
+                       - cpi->source->ts_start;

        // don't want to do output stats with a stack variable!
        memcpy(cpi->this_frame_stats,
               &fps,
               sizeof(FIRSTPASS_STATS));
-        vp8_output_stats(cpi, cpi->output_pkt_list, cpi->this_frame_stats);
-        vp8_accumulate_stats(cpi->total_stats, &fps);
+        output_stats(cpi, cpi->output_pkt_list, cpi->this_frame_stats);
+        accumulate_stats(cpi->total_stats, &fps);
    }

    // Copy the previous Last Frame into the GF buffer if specific conditions for doing so are met
@@ -864,9 +861,6 @@ static int estimate_max_q(VP8_COMP *cpi, double section_err, int section_target_
    // Calculate a corrective factor based on a rolling ratio of bits spent vs target bits
    if ((cpi->rolling_target_bits > 0.0) && (cpi->active_worst_quality < cpi->worst_quality))
    {
-        //double adjustment_rate = 0.985 + (0.00005 * cpi->active_worst_quality);
-        double adjustment_rate = 0.99;
-
        rolling_ratio = (double)cpi->rolling_actual_bits / (double)cpi->rolling_target_bits;

        //if ( cpi->est_max_qcorrection_factor > rolling_ratio )
@@ -1168,7 +1162,7 @@ void vp8_init_second_pass(VP8_COMP *cpi)

    double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);

-    vp8_zero_stats(cpi->total_stats);
+    zero_stats(cpi->total_stats);

    if (!cpi->stats_in_end)
        return;
@@ -1202,7 +1196,7 @@ void vp8_init_second_pass(VP8_COMP *cpi)
    cpi->kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs;
    cpi->gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs;

-    vp8_avg_stats(cpi->total_stats);
+    avg_stats(cpi->total_stats);

    // Scan the first pass file and calculate an average Intra / Inter error score ratio for the sequence
    {
@@ -1211,7 +1205,7 @@ void vp8_init_second_pass(VP8_COMP *cpi)

        start_pos = cpi->stats_in;               // Note starting "file" position

-        while (vp8_input_stats(cpi, &this_frame) != EOF)
+        while (input_stats(cpi, &this_frame) != EOF)
        {
            IIRatio = this_frame.intra_error / DOUBLE_DIVIDE_CHECK(this_frame.coded_error);
            IIRatio = (IIRatio < 1.0) ? 1.0 : (IIRatio > 20.0) ? 20.0 : IIRatio;
@@ -1232,7 +1226,7 @@ void vp8_init_second_pass(VP8_COMP *cpi)
        cpi->modified_error_total = 0.0;
        cpi->modified_error_used = 0.0;

-        while (vp8_input_stats(cpi, &this_frame) != EOF)
+        while (input_stats(cpi, &this_frame) != EOF)
        {
            cpi->modified_error_total += calculate_modified_err(cpi, &this_frame);
        }
@@ -1255,7 +1249,7 @@ void vp8_end_second_pass(VP8_COMP *cpi)

 // This function gives and estimate of how badly we believe
 // the prediction quality is decaying from frame to frame.
-double get_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
+static double get_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
 {
    double prediction_decay_rate;
    double motion_decay;
@@ -1293,7 +1287,7 @@ double get_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
 // Function to test for a condition where a complex transition is followed
 // by a static section. For example in slide shows where there is a fade
 // between slides. This is to help with more optimal kf and gf positioning.
-BOOL detect_transition_to_still(
+static int detect_transition_to_still(
    VP8_COMP *cpi,
    int frame_interval,
    int still_interval,
@@ -1318,7 +1312,7 @@ BOOL detect_transition_to_still(
        // persists...
        for ( j = 0; j < still_interval; j++ )
        {
-            if (EOF == vp8_input_stats(cpi, &tmp_next_frame))
+            if (EOF == input_stats(cpi, &tmp_next_frame))
                break;

            decay_rate = get_prediction_decay_rate(cpi, &tmp_next_frame);
@@ -1342,9 +1336,6 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
    FIRSTPASS_STATS next_frame;
    FIRSTPASS_STATS *start_pos;
    int i;
-    int y_width  = cpi->common.yv12_fb[cpi->common.lst_fb_idx].y_width;
-    int y_height = cpi->common.yv12_fb[cpi->common.lst_fb_idx].y_height;
-    int image_size = y_width  * y_height;
    double boost_score = 0.0;
    double old_boost_score = 0.0;
    double gf_group_err = 0.0;
@@ -1403,7 +1394,6 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        double r;
        double this_frame_mvr_ratio;
        double this_frame_mvc_ratio;
-        double motion_decay;
        //double motion_pct = next_frame.pcnt_motion;
        double motion_pct;

@@ -1417,7 +1407,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        mod_err_per_mb_accumulator +=
            mod_frame_err / DOUBLE_DIVIDE_CHECK((double)cpi->common.MBs);

-        if (EOF == vp8_input_stats(cpi, &next_frame))
+        if (EOF == input_stats(cpi, &next_frame))
            break;

        // Accumulate motion stats.
@@ -1691,7 +1681,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        {
            while (cpi->baseline_gf_interval < cpi->frames_to_key)
            {
-                if (EOF == vp8_input_stats(cpi, this_frame))
+                if (EOF == input_stats(cpi, this_frame))
                    break;

                cpi->baseline_gf_interval++;
@@ -1870,16 +1860,16 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        FIRSTPASS_STATS sectionstats;
        double Ratio;

-        vp8_zero_stats(&sectionstats);
+        zero_stats(&sectionstats);
        reset_fpf_position(cpi, start_pos);

        for (i = 0 ; i < cpi->baseline_gf_interval ; i++)
        {
-            vp8_input_stats(cpi, &next_frame);
-            vp8_accumulate_stats(&sectionstats, &next_frame);
+            input_stats(cpi, &next_frame);
+            accumulate_stats(&sectionstats, &next_frame);
        }

-        vp8_avg_stats(&sectionstats);
+        avg_stats(&sectionstats);

        cpi->section_intra_rating =
            sectionstats.intra_error /
@@ -1962,8 +1952,6 @@ void vp8_second_pass(VP8_COMP *cpi)
    FIRSTPASS_STATS this_frame;
    FIRSTPASS_STATS this_frame_copy;

-    VP8_COMMON *cm = &cpi->common;
-
    double this_frame_error;
    double this_frame_intra_error;
    double this_frame_coded_error;
@@ -1977,7 +1965,7 @@ void vp8_second_pass(VP8_COMP *cpi)

    vp8_clear_system_state();

-    if (EOF == vp8_input_stats(cpi, &this_frame))
+    if (EOF == input_stats(cpi, &this_frame))
        return;

    this_frame_error = this_frame.ssim_weighted_pred_err;
@@ -1998,7 +1986,7 @@ void vp8_second_pass(VP8_COMP *cpi)
    {
        // Define next KF group and assign bits to it
        vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
-        vp8_find_next_key_frame(cpi, &this_frame_copy);
+        find_next_key_frame(cpi, &this_frame_copy);

        // Special case: Error error_resilient_mode mode does not make much sense for two pass but with its current meaning but this code is designed to stop
        // outlandish behaviour if someone does set it when using two pass. It effectively disables GF groups.
@@ -2239,7 +2227,7 @@ static BOOL test_candidate_kf(VP8_COMP *cpi,  FIRSTPASS_STATS *last_frame, FIRST
            old_boost_score = boost_score;

            // Get the next frame details
-            if (EOF == vp8_input_stats(cpi, &local_next_frame))
+            if (EOF == input_stats(cpi, &local_next_frame))
                break;
        }

@@ -2257,7 +2245,7 @@ static BOOL test_candidate_kf(VP8_COMP *cpi,  FIRSTPASS_STATS *last_frame, FIRST

    return is_viable_kf;
 }
-void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
+static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
 {
    int i,j;
    FIRSTPASS_STATS last_frame;
@@ -2274,7 +2262,6 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
    double kf_group_err = 0.0;
    double kf_group_intra_err = 0.0;
    double kf_group_coded_err = 0.0;
-    double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
    double recent_loop_decay[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};

    vpx_memset(&next_frame, 0, sizeof(next_frame)); // assure clean
@@ -2317,7 +2304,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)

        // load a the next frame's stats
        vpx_memcpy(&last_frame, this_frame, sizeof(*this_frame));
-        vp8_input_stats(cpi, this_frame);
+        input_stats(cpi, this_frame);

        // Provided that we are not at the end of the file...
        if (cpi->oxcf.auto_key
@@ -2395,7 +2382,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
            kf_group_coded_err += tmp_frame.coded_error;

            // Load a the next frame's stats
-            vp8_input_stats(cpi, &tmp_frame);
+            input_stats(cpi, &tmp_frame);
        }

        // Reset to the start of the group
@@ -2497,10 +2484,8 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
    for (i = 0 ; i < cpi->frames_to_key ; i++)
    {
        double r;
-        double motion_decay;
-        double motion_pct;

-        if (EOF == vp8_input_stats(cpi, &next_frame))
+        if (EOF == input_stats(cpi, &next_frame))
            break;

        if (next_frame.intra_error > cpi->kf_intra_err_min)
@@ -2535,16 +2520,16 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        FIRSTPASS_STATS sectionstats;
        double Ratio;

-        vp8_zero_stats(&sectionstats);
+        zero_stats(&sectionstats);
        reset_fpf_position(cpi, start_position);

        for (i = 0 ; i < cpi->frames_to_key ; i++)
        {
-            vp8_input_stats(cpi, &next_frame);
-            vp8_accumulate_stats(&sectionstats, &next_frame);
+            input_stats(cpi, &next_frame);
+            accumulate_stats(&sectionstats, &next_frame);
        }

-        vp8_avg_stats(&sectionstats);
+        avg_stats(&sectionstats);

         cpi->section_intra_rating = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);

@@ -2799,7 +2784,6 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        {
            long long clip_bits = (long long)(cpi->total_stats->count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->oxcf.frame_rate));
            long long over_spend = cpi->oxcf.starting_buffer_level - cpi->buffer_level;
-            long long over_spend2 = cpi->oxcf.starting_buffer_level - projected_buffer_level;

            if ((last_kf_resampled && (kf_q > cpi->worst_quality)) ||                                               // If triggered last time the threshold for triggering again is reduced
                ((kf_q > cpi->worst_quality) &&                                                                  // Projected Q higher than allowed and ...
--- a/vp8/encoder/generic/csystemdependent.c
+++ b/vp8/encoder/generic/csystemdependent.c
@@ -17,8 +17,6 @@
 void vp8_arch_x86_encoder_init(VP8_COMP *cpi);
 void vp8_arch_arm_encoder_init(VP8_COMP *cpi);

-
-void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d);
 extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d);

 void (*vp8_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
@@ -91,9 +89,7 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi)

    cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b;
    cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_c;
-#if !(CONFIG_REALTIME_ONLY)
    cpi->rtcd.search.full_search             = vp8_full_search_sad;
-#endif
    cpi->rtcd.search.diamond_search          = vp8_diamond_search_sad;
 #if !(CONFIG_REALTIME_ONLY)
    cpi->rtcd.temporal.apply                 = vp8_temporal_filter_apply_c;
--- a/vp8/encoder/invtrans.c
+++ b/vp8/encoder/invtrans.c
@@ -11,8 +11,6 @@

 #include "invtrans.h"

-
-
 static void recon_dcblock(MACROBLOCKD *x)
 {
    BLOCKD *b = &x->block[24];
@@ -20,7 +18,7 @@ static void recon_dcblock(MACROBLOCKD *x)

    for (i = 0; i < 16; i++)
    {
-        x->block[i].dqcoeff[0] = b->diff[i];
+        *(x->block[i].dqcoeff_base+x->block[i].dqcoeff_offset) = b->diff_base[b->diff_offset+i];
    }

 }
@@ -28,18 +26,18 @@ static void recon_dcblock(MACROBLOCKD *x)
 void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b, int pitch)
 {
    if (b->eob > 1)
-        IDCT_INVOKE(rtcd, idct16)(b->dqcoeff, b->diff, pitch);
+        IDCT_INVOKE(rtcd, idct16)(b->dqcoeff_base + b->dqcoeff_offset, &b->diff_base[b->diff_offset], pitch);
    else
-        IDCT_INVOKE(rtcd, idct1)(b->dqcoeff, b->diff, pitch);
+        IDCT_INVOKE(rtcd, idct1)(b->dqcoeff_base + b->dqcoeff_offset, &b->diff_base[b->diff_offset], pitch);
 }

-
+/* Only used in the encoder */
 void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
 {
    int i;

    /* do 2nd order transform on the dc block */
-    IDCT_INVOKE(rtcd, iwalsh16)(x->block[24].dqcoeff, x->block[24].diff);
+    IDCT_INVOKE(rtcd, iwalsh16)(x->block[24].dqcoeff_base + x->block[24].dqcoeff_offset, &x->block[24].diff_base[x->block[24].diff_offset]);

    recon_dcblock(x);

@@ -49,6 +47,8 @@ void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *
    }

 }
+
+/* Only used in encoder */
 void vp8_inverse_transform_mbuv(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
 {
    int i;
@@ -57,7 +57,6 @@ void vp8_inverse_transform_mbuv(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD
    {
        vp8_inverse_transform_b(rtcd, &x->block[i], 16);
    }
-
 }


@@ -69,8 +68,10 @@ void vp8_inverse_transform_mb(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x
        x->mode_info_context->mbmi.mode != SPLITMV)
    {
        /* do 2nd order transform on the dc block */
+        BLOCKD b = x->block[24];
+
+        IDCT_INVOKE(rtcd, iwalsh16)(b.dqcoeff_base+b.dqcoeff_offset, &b.diff_base[b.diff_offset]);

-        IDCT_INVOKE(rtcd, iwalsh16)(&x->block[24].dqcoeff[0], x->block[24].diff);
        recon_dcblock(x);
    }

--- a/vp8/encoder/invtrans.h
+++ b/vp8/encoder/invtrans.h
@@ -13,8 +13,8 @@
 #define __INC_INVTRANS_H

 #include "vpx_ports/config.h"
-#include "idct.h"
-#include "blockd.h"
+#include "vp8/common/idct.h"
+#include "vp8/common/blockd.h"
 extern void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b, int pitch);
 extern void vp8_inverse_transform_mb(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
 extern void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
--- a/vp8/encoder/lookahead.c
+++ b/vp8/encoder/lookahead.c
@@ -0,0 +1,157 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <assert.h>
+#include <stdlib.h>
+#include "vpx_config.h"
+#include "lookahead.h"
+#include "vp8/common/extend.h"
+
+#define MAX_LAG_BUFFERS (CONFIG_REALTIME_ONLY? 1 : 25)
+
+struct lookahead_ctx
+{
+    unsigned int max_sz;         /* Capacity of the queue, in buffers */
+    unsigned int sz;             /* Number of buffers currently in the queue */
+    unsigned int read_idx;       /* Ring position of the next buffer to pop */
+    unsigned int write_idx;      /* Ring position of the next buffer to push */
+    struct lookahead_entry *buf; /* Array of max_sz entries backing the queue */
+};
+
+
+/* Look up the entry at ring position *idx, then step *idx forward by one
+ * slot, reducing it by ctx->max_sz once it reaches the end of the ring.
+ * Returns the entry that was at the original position. */
+static struct lookahead_entry *
+pop(struct lookahead_ctx *ctx,
+    unsigned int         *idx)
+{
+    const unsigned int cur = *idx;
+    const unsigned int next = cur + 1;
+
+    assert(cur < ctx->max_sz);
+    *idx = (next >= ctx->max_sz) ? next - ctx->max_sz : next;
+    return ctx->buf + cur;
+}
+
+
+/* Free every queued frame buffer, the entry array and the context (NULL ok) */
+void
+vp8_lookahead_destroy(struct lookahead_ctx *ctx)
+{
+    unsigned int i;
+
+    if(!ctx)
+        return;
+    if(ctx->buf)
+    {
+        for(i = 0; i < ctx->max_sz; i++)
+            vp8_yv12_de_alloc_frame_buffer(&ctx->buf[i].img);
+        free(ctx->buf);
+    }
+    free(ctx);
+}
+
+
+/* Build a lookahead queue of up to depth frames; returns NULL on failure */
+struct lookahead_ctx*
+vp8_lookahead_init(unsigned int width,
+                   unsigned int height,
+                   unsigned int depth)
+{
+    struct lookahead_ctx *ctx = NULL;
+    unsigned int i;
+
+    /* Clamp the lookahead queue depth to [1, MAX_LAG_BUFFERS] */
+    if(depth < 1)
+        depth = 1;
+    else if(depth > MAX_LAG_BUFFERS)
+        depth = MAX_LAG_BUFFERS;
+
+    /* Round the buffer dimensions up to a multiple of 16 */
+    width = (width + 15) & ~15;
+    height = (height + 15) & ~15;
+
+    /* Allocate the context, the entry array, then one frame per entry */
+    ctx = calloc(1, sizeof(*ctx));
+    if(!ctx)
+        return NULL;
+    ctx->max_sz = depth;
+    ctx->buf = calloc(depth, sizeof(*ctx->buf));
+    if(!ctx->buf)
+        goto bail;
+    for(i = 0; i < depth; i++)
+        if(vp8_yv12_alloc_frame_buffer(&ctx->buf[i].img, width, height, 16))
+            goto bail;
+    return ctx;
+bail:
+    vp8_lookahead_destroy(ctx);
+    return NULL;
+}
+
+
+int
+vp8_lookahead_push(struct lookahead_ctx *ctx,
+                   YV12_BUFFER_CONFIG   *src,
+                   int64_t               ts_start,
+                   int64_t               ts_end,
+                   unsigned int          flags)
+{
+    struct lookahead_entry *slot;
+
+    if(ctx->sz + 1 > ctx->max_sz)
+        return 1;                       /* queue full: frame not accepted */
+    slot = pop(ctx, &ctx->write_idx);
+    ctx->sz++;
+    vp8_copy_and_extend_frame(src, &slot->img);
+    slot->ts_start = ts_start;
+    slot->ts_end = ts_end;
+    slot->flags = flags;
+    return 0;
+}
+
+
+struct lookahead_entry*
+vp8_lookahead_pop(struct lookahead_ctx *ctx,
+                  int                   drain)
+{
+    /* Nothing to hand out when the queue is empty, or when it is still
+     * filling up and the caller did not ask for it to be drained */
+    if(!ctx->sz || (!drain && ctx->sz != ctx->max_sz))
+        return NULL;
+
+    /* Remove the entry at the read position from the queue */
+    ctx->sz--;
+    return pop(ctx, &ctx->read_idx);
+}
+
+
+struct lookahead_entry*
+vp8_lookahead_peek(struct lookahead_ctx *ctx,
+                   int                   index)
+{
+    unsigned int slot;
+
+    assert(index < ctx->max_sz);
+    /* Only entries currently held in the queue can be inspected */
+    if(index >= ctx->sz)
+        return NULL;
+    /* Translate the queue-relative index into a ring position */
+    slot = index + ctx->read_idx;
+    if(slot >= ctx->max_sz)
+        slot -= ctx->max_sz;
+    return ctx->buf + slot;
+}
+
+
+unsigned int
+vp8_lookahead_depth(struct lookahead_ctx *ctx)
+{
+    return ctx->sz; /* number of buffers currently in the queue */
+}
--- a/vp8/encoder/lookahead.h
+++ b/vp8/encoder/lookahead.h
@@ -0,0 +1,101 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef LOOKAHEAD_H
+#define LOOKAHEAD_H
+#include "vpx_scale/yv12config.h"
+#include "vpx/vpx_integer.h"
+
+struct lookahead_entry              /* One queued frame plus its metadata */
+{
+    YV12_BUFFER_CONFIG  img;        /* Copy of the source image, made on push */
+    int64_t             ts_start;   /* Timestamp for the start of this frame */
+    int64_t             ts_end;     /* Timestamp for the end of this frame */
+    unsigned int        flags;      /* Flags set on this frame */
+};
+
+
+struct lookahead_ctx;
+
+/**\brief Initializes the lookahead stage
+ *
+ * The lookahead stage is a queue of frame buffers on which some analysis
+ * may be done when buffers are enqueued.
+ *
+ * The frame buffers are allocated with both dimensions rounded up to a
+ * multiple of 16.
+ *
+ * \param[in] width     Width of the frames to be queued
+ * \param[in] height    Height of the frames to be queued
+ * \param[in] depth     Maximum number of frames the queue can hold
+ *                      (NOTE(review): appears to be capped at
+ *                      MAX_LAG_BUFFERS -- confirm in lookahead.c)
+ *
+ * \retval NULL, if any of the allocations fails
+ */
+struct lookahead_ctx* vp8_lookahead_init(unsigned int width,
+                                         unsigned int height,
+                                         unsigned int depth
+                                         );
+
+
+/**\brief Destroys the lookahead stage
+ *
+ * Also used by vp8_lookahead_init() to clean up a context whose
+ * allocation failed part way through.
+ *
+ * \param[in] ctx       Pointer to the lookahead context
+ *
+ */
+void vp8_lookahead_destroy(struct lookahead_ctx *ctx);
+
+
+/**\brief Enqueue a source buffer
+ *
+ * This function will copy the source image into a new framebuffer with
+ * the expected stride/border.
+ *
+ * \param[in] ctx       Pointer to the lookahead context
+ * \param[in] src       Pointer to the image to enqueue
+ * \param[in] ts_start  Timestamp for the start of this frame
+ * \param[in] ts_end    Timestamp for the end of this frame
+ * \param[in] flags     Flags set on this frame
+ *
+ * \retval 0, if the frame was enqueued
+ * \retval 1, if the queue is already full (the frame is not enqueued)
+ */
+int
+vp8_lookahead_push(struct lookahead_ctx *ctx,
+                   YV12_BUFFER_CONFIG   *src,
+                   int64_t               ts_start,
+                   int64_t               ts_end,
+                   unsigned int          flags);
+
+
+/**\brief Get the next source buffer to encode
+ *
+ * On success the entry is removed from the queue, reducing the queue
+ * depth by one.
+ *
+ * \param[in] ctx       Pointer to the lookahead context
+ * \param[in] drain     Flag indicating the buffer should be drained
+ *                      (return a buffer regardless of the current queue depth)
+ *
+ * \retval NULL, if drain set and queue is empty
+ * \retval NULL, if drain not set and queue not of the configured depth
+ * \return Pointer to the dequeued entry in all other cases
+ *
+ */
+struct lookahead_entry*
+vp8_lookahead_pop(struct lookahead_ctx *ctx,
+                  int                   drain);
+
+
+/**\brief Get a future source buffer to encode
+ *
+ * The queue is left unchanged. The index is checked with an assert and
+ * must be less than the configured depth of the queue.
+ *
+ * \param[in] ctx       Pointer to the lookahead context
+ * \param[in] index     Index of the frame to be returned, 0 == next frame
+ *
+ * \retval NULL, if no buffer exists at the specified index
+ *
+ */
+struct lookahead_entry*
+vp8_lookahead_peek(struct lookahead_ctx *ctx,
+                   int                   index);
+
+
+/**\brief Get the number of frames currently in the lookahead queue
+ *
+ * \param[in] ctx       Pointer to the lookahead context
+ *
+ * \return Number of frames that have been pushed but not yet popped
+ */
+unsigned int
+vp8_lookahead_depth(struct lookahead_ctx *ctx);
+
+
+#endif
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Aaron Watry	84ae235450	Initial OpenCL implementation of the VP8 decoder. Change-Id: I74c334af09f13473ce07bbac74b0f9ea57573347 Note: very slow, but functional. Encoder is untested, but should still work.	2011-04-18 13:50:23 -04:00
Johann	cd103a5721	Merge "store quant_shift as an unsigned char"	2011-04-18 10:03:40 -07:00
Yaowu Xu	05d9421e8b	Merge "Add spin-wait pause intrinsic for Windows x64 platform."	2011-04-18 09:53:26 -07:00
Yaowu Xu	c619f6cb0f	Merge "fixed an overflow in ssim calculation"	2011-04-18 07:44:34 -07:00
Scott LaVarnway	e1a8b6c8d5	Removed unused timers Change-Id: I209803b9dbed2b2f6d02258fd7a3963a6645f4ab	2011-04-18 09:09:57 -04:00
John Koleszar	8fcb801d15	Merge "added -fomit-frame-pointer flag for gcc builds"	2011-04-18 06:07:57 -07:00
Johann	d889035fe6	Merge "remove dead code, add missing RESTORE_XMM"	2011-04-15 13:32:54 -07:00
Scott LaVarnway	9409e38050	added -fomit-frame-pointer flag for gcc builds According to the docs, this should have been enabled, but the disassembled output shows otherwise. This improved the encode/decode performance. Change-Id: I45ad7e6d299b89ac3166d7ef7da75b74994344c6	2011-04-15 15:59:21 -04:00
Johann	f64f425a50	remove executable bit source files are not executable Change-Id: Id2c7294695a22217468426423979f68f02d82340	2011-04-15 13:43:24 -04:00
Adrian Grange	0d2abe3084	Merge "Fix usage of value returned by vp8_pick_intra4x4mby_modes"	2011-04-15 08:37:19 -07:00
Yunqing Wang	1312a7a2e2	Merge "Reduce unnecessary distortion computation"	2011-04-15 08:17:03 -07:00
Johann	487c0299c9	remove dead code, add missing RESTORE_XMM vp8_filter_block1d16_h4_ssse3 was never called because UNSHADOW_ARGS moves the stack by 'mov rsp, rbp', the issue was masked. however, if/when win64 used those registers for persistent data, issues could/will arise. Change-Id: I56d6effca0aeba1f86082689771cb10145d39651	2011-04-15 10:11:53 -04:00
John Koleszar	a3399291ad	Fix off-by-one in copy_and_extend_plane Should only copy h lines, not h+1. Change-Id: I802a85686635900459c6dc79596189033e5298d8	2011-04-15 08:44:39 -04:00
Yunqing Wang	918fb5487e	Reduce unnecessary distortion computation In vp8_pick_inter_mode(), for NEWMV mode, use the error result got from motion search as distortion. This helps performance in real- time mode. Change-Id: I398c4e46cc5381f7d874e748cf78827ef0e0860c	2011-04-14 15:53:33 -04:00
John Koleszar	63f15987a5	Merge "Refactor lookahead ring buffer"	2011-04-14 12:35:01 -07:00
Fritz Koenig	e749ae510f	Merge "Use consistent delimiters."	2011-04-14 11:56:18 -07:00
Adrian Grange	8608de1c6f	Fix usage of value returned by vp8_pick_intra4x4mby_modes The value of distortion2 returned by vp8_pick_intra4x4mby_modes was being overwritten by the value returned by get16x16prederror before it was tested. Change-Id: If00e80332b272c5545c3a7e381c8041e8319b41a	2011-04-14 10:50:00 -07:00
Johann	ab48305fb6	Merge "update configure for ios sdk 4.3"	2011-04-14 08:55:22 -07:00
Joshua Bleecher Snyder	5e7a3bb69a	update configure for ios sdk 4.3 update for the latest version of the ios sdk. adding usr/lib/system fixes a missing libcache.dylib issue make isysroot path more DRY Change-Id: Ib748ef3dac3cac2e4848fbffa1e9a0112eac826b	2011-04-14 11:22:33 -04:00
Fritz Koenig	33cefd6f6e	Use consistent delimiters. opsnr.stt file was using \t for delimiters on everything except between VPXSSIM and Time. Change-Id: I6284c4e40c05ff642bf4b0170dca062c279a42df	2011-04-13 15:06:17 -07:00
Adrian Grange	8861174624	Fixed use of early breakout in vp8_pick_intra4x4mby_modes Index i is used to detect early breakout from the first loop, but its value is lost due to reuse in the second for loop. I moved the position of the second loop and did some format cleanup. Change-Id: I02780eae1bd89df4b6c000fb8a018b0837aac2e5	2011-04-13 12:56:46 -07:00
John Koleszar	88841f1059	Refactor lookahead ring buffer This patch cleans up the source buffer storage and copy mechanism to allow access through a standard push/pop/peek interface. This approach also avoids an extra copy in the case where the source is not a multiple of 16, fixing issue #102. Change-Id: I05808c39f5743625cb4c7af54cc841b9b10fdbd9	2011-04-13 14:26:45 -04:00
Johann	70f30aa95d	store quant_shift as an unsigned char in encodframe.c, quant_shift is set to 0 or 1 in vp8cx_invert_quant only use 8 bits to store this, instead of 16. will allow saving an xmm register in an updated version of the regular quantize Change-Id: Ie88c47fe2aff5af0283dab1147fb2791e4b12f90	2011-04-13 13:50:12 -04:00
John Koleszar	538f110407	Merge "Bugfix for error accumulator stats"	2011-04-12 06:59:00 -07:00
John Koleszar	e689a27d62	Bugfix for error accumulator stats Previous to commit `de4e9e3`, there was an early return in the alt-ref case that was inadvertently removed when the function was refactored to return void. This patch restores the prior behavior. Change-Id: I783ffd594a4690297e2742f99526fd7ad67698b2	2011-04-12 08:47:33 -04:00
John Koleszar	fd09009227	Merge "Fix encoder range check for frame width and height"	2011-04-12 05:34:12 -07:00
Attila Nagy	1aadcedcfb	Fix encoder range check for frame width and height 14 bits available in the bitstream => valid range [1..16383] Removed unused local vars. Change-Id: Icf3385e47a9fa13af70053129c2248671f285583	2011-04-12 15:07:37 +03:00
Yunqing Wang	4fd81a99f8	Set cpu_used range to [-16, 16] in real-time mode Remove encoding speed limitation in real-time mode. Change-Id: Ib5e35d8bb522b2a25f3e4ad5cfe2788ebebb3617	2011-04-11 15:55:04 -04:00
Yunqing Wang	d1abe62d1c	Define RDCOST only once Clean up the code. Change-Id: I7db048efa4d972b528d553a7921bc45979621129	2011-04-11 11:53:56 -04:00
John Koleszar	a9ce3e3834	Remove unused files Change-Id: I36ca3f2f4620358033da34daf764f0b388dacd08	2011-04-11 10:34:40 -04:00
Yunqing Wang	4b43167ad1	Fix input MV for full search Input MV needs to be modified to full-pixel precision. Change-Id: Ic5d78e41bf27077e325024332b9fe89f76c44f0c	2011-04-08 16:29:41 -04:00
Johann Koenig	6e156a4cd7	Merge "use asm_offsets with vp8_fast_quantize_b_sse3"	2011-04-08 10:05:47 -07:00
John Koleszar	921a32a306	Merge "Error accumulator stats bug."	2011-04-08 08:20:32 -07:00
Paul Wilkins	de4e9e3b44	Error accumulator stats bug. The error accumulator stats values cpi->prediction_error and cpi->intra_error were being populated with rd values not distortion values. These are only "currently" used in a limited way for RT compress key frame detection. Change-Id: I2702ba1cab6e49ab8dc096ba75b6b34ab3573021	2011-04-08 14:21:36 +01:00
Jim Bankoski	d4cdb683a4	fixed an overflow in ssim calculation This commit fixed an overflow in ssim calculation, added register save and restore to make sure assembly code working for x64 platform. It also changed the sampling points to every 4x4 instead of 8x8 and adjusted the constants in SSIM calculation to match the scale of previous VPXSSIM. Change-Id: Ia4dbb8c69eac55812f4662c88ab4653b6720537b	2011-04-07 14:25:25 -07:00
Johann Koenig	08702002e8	use asm_offsets with vp8_fast_quantize_b_sse3 on the same order as the sse2 fast quantize change: ~2% except for 32bit. only a slight improvement there. Change-Id: Iff80e5f1ce7e646eebfdc8871405458ff911986b	2011-04-07 16:40:05 -04:00
James Berry	aec5487cdd	Use correct 32 bit comparisons for SAD breakout. Rax updated to eax to avoid uninitialized memory usage. Change-Id: Iedb953f104329ede2a786fc648a47f1be2f3798a	2011-04-07 15:08:03 -04:00
Johann	2de858b9fc	Merge "use asm_offsets with vp8_fast_quantize_b_sse2"	2011-04-06 10:53:55 -07:00
Yunqing Wang	9e9f61a317	Merge "Minor modification"	2011-04-06 06:12:13 -07:00
Yunqing Wang	02423b2e92	Minor modification A small change. Change-Id: I2e7726e58370a95d0319361f4f6ad231138d1328	2011-04-06 09:08:47 -04:00
Johann	c32e0ecc59	use asm_offsets with vp8_fast_quantize_b_sse2 on the same order as the regular quantize change: ~2% Change-Id: I5c9eec18e89ae7345dd96945cb740e6f349cee86	2011-04-04 16:23:29 -04:00
Scott LaVarnway	f212a98ee7	Fixed unused variable warnings for firstpass.c Change-Id: I8378a9a541ade2f098359a7b20fa08e6c1596d80	2011-04-04 14:18:31 -04:00
John Koleszar	91036996ac	Merge "Slightly simplify vp8_decode_mb_tokens."	2011-04-04 08:58:25 -07:00
Johann	610dd90288	Merge "tweak vp8_regular_quantize_b_sse2"	2011-04-04 08:56:25 -07:00
Gaute Strokkenes	15f03c2f13	Slightly simplify vp8_decode_mb_tokens. Change-Id: I0058ba7dcfc50a3374b712197639ac337f8726be	2011-04-04 16:47:22 +01:00
Yunqing Wang	f5c0d95e8c	Merge "Use full-pixel MV in mvsadcost calculation"	2011-04-04 08:40:51 -07:00
John Koleszar	af1acc851b	Merge "support obj_int_extract on cygwin"	2011-04-04 08:29:50 -07:00
Yunqing Wang	3d6815817c	Use full-pixel MV in mvsadcost calculation MV sad cost error is only used in full-pixel motion search, which only needs full-pixel resolution instead of quarter-pixel resolution. This change reduced mvsadcost table size, and removed unnecessary parameter passing since this table is constant once it is generated. Change-Id: I9f931e55f6abc3c99011321f1dfb2f3562e6f6b0	2011-04-01 16:41:58 -04:00
Johann	fd7040d2b6	support obj_int_extract on cygwin cygwin doesn't support _sopen. drop down to the lowest common denominator and merge main for all platforms. this also opens the door for supporting multiple object formats with a single binary. Change-Id: I7cd45091639d447434e6d5db2e19cfc9988f8630	2011-04-01 13:23:44 -04:00
John Koleszar	82315be75d	Merge "vpxenc: die on realloc failures"	2011-04-01 07:55:55 -07:00
Johann	8520b5c785	tweak vp8_regular_quantize_b_sse2 rather than look up rc in the zig zag table, embed it in the macro. this also allows us to shuffle some values in the macro and keep *d in rsi gains of about the same order as the obj_int_extract implementation: ~2% Change-Id: Ib7252dd10eee66e0af8b0e567426122781dc053d	2011-04-01 09:58:23 -04:00
Johann	ba11e24d47	Merge "Wrapper function removed from vp8_subtract_b_neon function call"	2011-04-01 05:47:21 -07:00
Tero Rintaluoma	cec76a36d6	Wrapper function removed from vp8_subtract_b_neon function call Address calculations moved from encodemb_arm.c file to neon optimized assembly function to save cycles in function calls. - vp8_subtract_b_neon_func replaced with vp8_subtract_b_neon that contains all needed address calculations - unnecessary file encodemb_arm.c removed - consistent with ARMv6 optimized version Change-Id: I6cbc1a2670b56c2077f59995fcf8f70786b4990b	2011-04-01 10:06:44 +03:00
Johann	9d138379a2	Merge "ARMv6 optimized subtract functions"	2011-03-31 08:40:10 -07:00
John Koleszar	f56b9ee92e	Merge changes I4e32a8fb,Ic6a9d4c5 * changes: Generate a vpx.pc file for pkg-config. Export the version string as a makefile variable.	2011-03-31 06:21:11 -07:00
Ralph Giles	607f8420f3	Generate a vpx.pc file for pkg-config. Rules are added to libs.mk to generate a vpx.pc, which is installed as pkgconfig/vpx.pc under the target library directory. This also requires the install path prefix be exported directly in config.mk. Some systems use a tool called pkg-config to query information about installed libraries or other resources, based on database files provided by the packages themselves at install time. Providing such a file for libvpx simplifies integration with other build systems, and provides an easy avenue for developers to test against their own builds of the library. Change-Id: I4e32a8fbb53fc331aa95eb207c63dd70a76d18ed	2011-03-30 20:56:16 -07:00
Ralph Giles	53e9987b4d	Export the version string as a makefile variable. The configure script exports the major/minor/patch version numbers, but didn't make the full version string available to Makefile recipes and rules, the way it is available to C code from vpx_version.h. Change-Id: Ic6a9d4c574a6ea66a50c928f4eedeb91d7668eb5	2011-03-30 20:53:43 -07:00
Attila Nagy	7d335868df	Fix: lpf semaphore was signaled in single threaded run After picking filter level, post the loopfilter semaphore just when multiple threads are in use. Change-Id: If7bfb64601d906adef703f454dafc25e978b93c6	2011-03-30 15:55:29 +03:00
John Koleszar	26b6a3b088	vpxenc: die on realloc failures Identified as a possible cause of issue #308, the code was silently ignoring realloc failures, which would lead to corruption, memory leaks, and likely a crash. The best we can do in this case is die gracefully. Change-Id: Ie5f6a853d367015be5b9712bd742778f3baeefd9	2011-03-30 06:37:02 -04:00
Johann	0e43668546	Merge "Half pixel variance further optimized for ARMv6"	2011-03-29 12:14:54 -07:00
Yunqing Wang	534ea700bd	Merge "Fix a crash while enabling shared (--enable-shared)"	2011-03-29 09:04:22 -07:00
Yunqing Wang	b843aa4eda	Fix a crash while enabling shared (--enable-shared) Fixed a bug in SSSE3 sub-pixel filter functions. Change-Id: I2e2126652970eb78307ffcefcace1efd5966fb0a	2011-03-29 11:31:06 -04:00
Johann	f0c22a3f33	use GLOBAL correctly on 32bit shared libraries http://code.google.com/p/webm/issues/detail?id=309 Change-Id: I6fce9e2f74bc09a9f258df7f91ab599812324e8c	2011-03-29 11:27:03 -04:00
John Koleszar	49c31dc2b4	Merge "configure: enable unused variable warnings"	2011-03-29 07:38:04 -07:00
Tero Rintaluoma	6fdc9aa79f	ARMv6 optimized subtract functions Adds following ARMv6 optimized functions to encoder: - vp8_subtract_b_armv6 - vp8_subtract_mby_armv6 - vp8_subtract_mbuv_armv6 Gives 1-5% speed-up depending on input sequence and encoding parameters. Functions have one stall cycle inside the loop body on Cortex pipeline. Change-Id: I19cca5408b9861b96f378e818eefeb3855238639	2011-03-29 16:52:00 +03:00
Johann	4be062bbc3	add asm_enc_offsets.c for all targets now that we need asm_enc_offsets.c for x86 and arm and it is harmless to build it for other targets, add it unconditionally Change-Id: I320c5220afd94fee2b98bda9ff4e5e34c67062f3	2011-03-28 10:43:47 -04:00
Tero Rintaluoma	f5e433464b	Half pixel variance further optimized for ARMv6 Half pixel interpolations optimized in variance calculations. Separate function calls to vp8_filter_block2d_bil_x_pass_armv6 are avoided.On average, performance improvement is 6-7% for VGA@30fps sequences. Change-Id: Idb5f118a9d51548e824719d2cfe5be0fa6996628	2011-03-28 09:51:51 +03:00
Johann	beaafefcf1	Merge "use asm_offsets with vp8_regular_quantize_b_sse2"	2011-03-24 11:06:36 -07:00
Johann	8edaf6e2f2	use asm_offsets with vp8_regular_quantize_b_sse2 remove helper function and avoid shadowing all the arguments to the stack on 64bit systems when running with --good --cpu-used=0: ~2% on linux x86 and x86_64 ~2% on win32 x86 msys and visual studio more on darwin10 x86_64 significantly more on x86_64-win64-vs9 Change-Id: Ib7be12edf511fbf2922f191afd5b33b19a0c4ae6	2011-03-24 13:34:48 -04:00
Johann	4cde2ab765	Merge "ARMv6 optimized fdct4x4"	2011-03-23 07:52:51 -07:00
John Koleszar	edfc93aeba	Merge "Allow specifying --end-usage by enum name"	2011-03-21 12:29:11 -07:00
John Koleszar	577910b464	Merge "vpx_codec_dec_init: check that the iface is a decoder"	2011-03-21 09:12:58 -07:00
John Koleszar	2fced87e75	vpx_codec_dec_init: check that the iface is a decoder Make sure the given interface is actually a decoder interface before initializing it. Change-Id: Ie48d737f2956cc2f0891666de5ea87251e96bc49	2011-03-21 12:12:14 -04:00
Yunqing Wang	73065b67e4	Merge "Fix multithreaded encoding for 1 MB wide frame"	2011-03-21 07:41:31 -07:00
John Koleszar	2cbd962088	Remove unused vp8_get4x4sse_cs_mmx declaration This declaration did not match the prototype_sad() prototype, but was unused in this translation unit, so it is removed instead. Fixes issue 290. Change-Id: I168854f88a85f73ca9aaf61d1e5dc0f43fc3fdb3	2011-03-21 07:53:53 -04:00
John Koleszar	769c74c0ac	Merge "Increase static linkage, remove unused functions"	2011-03-21 04:51:51 -07:00
John Koleszar	500fec2d5f	Allow specifying --end-usage by enum name Map an enum to the --end-usage values, so you can specify --end-usage=cq instead of --end-usage=2. The numerical values still work for historical scripts, etc, but this is more user friendly. Change-Id: I445ecd9638f801f5924a71eabf449bee293cdd34	2011-03-21 07:50:42 -04:00
Tero Rintaluoma	a61785b6a1	ARMv6 optimized fdct4x4 Optimized fdct4x4 (8x4) for ARMv6 instruction set. - No interlocks in Cortex-A8 pipeline - One interlock cycle in ARM11 pipeline - About 2.16 times faster than current C-code compiled with -O3 Change-Id: I60484ecd144365da45bb68a960d30196b59952b8	2011-03-21 13:33:45 +02:00
Attila Nagy	bfe803bda3	Fix multithreaded encoding for 1 MB wide frame Thread synchronization was not correct when frame width was 1 MB. Number of allocated encoding threads is limited by the sync_range. There is no point having more because each thread lags sync_range MBs behind the thread processing the row above. http://code.google.com/p/webm/issues/detail?id=302 Change-Id: Icaf67a883beecc5ebf2f11e9be47b6997fdf6f26	2011-03-18 12:35:30 +02:00
John Koleszar	429dc676b1	Increase static linkage, remove unused functions A large number of functions were defined with external linkage, even though they were only used from within one file. This patch changes their linkage to static and removes the vp8_ prefix from their names, which should make it more obvious to the reader that the function is contained within the current translation unit. Functions that were not referenced were removed. These symbols were identified by: $ nm -A libvpx.a \| sort -k3 \| uniq -c -f2 \| grep ' [A-Z] ' \ \| sort \| grep '^ *1 ' Change-Id: I59609f58ab65312012c047036ae1e0634f795779	2011-03-17 20:53:47 -04:00
Aron Rosenberg	8e87d58712	Add spin-wait pause intrinsic for Windows x64 platform. Change-Id: I7504370c67a3c551627c6bb7e67c65f83d88b78e	2011-03-04 14:49:50 -08:00
John Koleszar	b601eb8cda	configure: enable unused variable warnings Only suppress unused function warnings, rather than suppressing all unused-* warnings. Unused functions can still be seen with --enable-extra-warnings. Change-Id: Ibca20d859dbffedd76bd082ffe0fa685c3ac198e	2011-02-04 11:53:11 -05:00