diff --git a/doc/APIchanges b/doc/APIchanges index 1f5e8aad39..aa2827af08 100644 --- a/doc/APIchanges +++ b/doc/APIchanges @@ -13,6 +13,12 @@ libavutil: 2011-04-18 API changes, most recent first: +2011-05-10 - xxxxxxx - lavc 53.3.0 - avcodec.h + Deprecate AVLPCType and the following fields in + AVCodecContext: lpc_coeff_precision, prediction_order_method, + min_partition_order, max_partition_order, lpc_type, lpc_passes. + Corresponding FLAC encoder options should be used instead. + 2011-05-07 - xxxxxxx - lavfi 2.5.0 - avcodec.h Add libavfilter/avcodec.h header and avfilter_copy_frame_props() function. diff --git a/ffmpeg.c b/ffmpeg.c index 74461d368f..2875d8aec8 100644 --- a/ffmpeg.c +++ b/ffmpeg.c @@ -292,7 +292,6 @@ typedef struct AVOutputStream { int resample_pix_fmt; float frame_aspect_ratio; - /* forced key frames */ int64_t *forced_kf_pts; int forced_kf_count; @@ -1505,7 +1504,7 @@ static int output_packet(AVInputStream *ist, int ist_index, AVFormatContext *os; AVOutputStream *ost; int ret, i; - int got_picture; + int got_output; AVFrame picture; void *buffer_to_free = NULL; static unsigned int samples_size= 0; @@ -1537,7 +1536,7 @@ static int output_packet(AVInputStream *ist, int ist_index, pkt_pts = av_rescale_q(pkt->pts, ist->st->time_base, AV_TIME_BASE_Q); //while we have more to decode or while the decoder did output something on EOF - while (avpkt.size > 0 || (!pkt && ist->next_pts != ist->pts)) { + while (avpkt.size > 0 || (!pkt && got_output)) { uint8_t *data_buf, *decoded_data_buf; int data_size, decoded_data_size; handle_eof: @@ -1573,9 +1572,10 @@ static int output_packet(AVInputStream *ist, int ist_index, avpkt.data += ret; avpkt.size -= ret; data_size = ret; + got_output = decoded_data_size > 0; /* Some bug in mpeg audio decoder gives */ /* decoded_data_size < 0, it seems they are overflows */ - if (decoded_data_size <= 0) { + if (!got_output) { /* no audio frame */ continue; } @@ -1592,11 +1592,11 @@ static int output_packet(AVInputStream *ist, int ist_index, pkt_pts = AV_NOPTS_VALUE; ret = avcodec_decode_video2(ist->st->codec, - &picture, &got_picture, &avpkt); + &picture, &got_output, &avpkt); ist->st->quality= picture.quality; if (ret < 0) goto fail_decode; - if (!got_picture) { + if (!got_output) { /* no picture yet */ goto discard_packet; } @@ -1613,10 +1613,10 @@ static int output_packet(AVInputStream *ist, int ist_index, break; case AVMEDIA_TYPE_SUBTITLE: ret = avcodec_decode_subtitle2(ist->st->codec, - &subtitle, &got_picture, &avpkt); + &subtitle, &got_output, &avpkt); if (ret < 0) goto fail_decode; - if (!got_picture) { + if (!got_output) { goto discard_packet; } subtitle_to_free = &subtitle; diff --git a/libavcodec/alacenc.c b/libavcodec/alacenc.c index 9d2865d51e..c3a1fdfa03 100644 --- a/libavcodec/alacenc.c +++ b/libavcodec/alacenc.c @@ -146,7 +146,7 @@ static void calc_predictor_params(AlacEncodeContext *s, int ch) s->min_prediction_order, s->max_prediction_order, ALAC_MAX_LPC_PRECISION, coefs, shift, - AV_LPC_TYPE_LEVINSON, 0, + FF_LPC_TYPE_LEVINSON, 0, ORDER_METHOD_EST, ALAC_MAX_LPC_SHIFT, 1); s->lpc[ch].lpc_order = opt_order; @@ -457,7 +457,7 @@ static av_cold int alac_encode_init(AVCodecContext *avctx) s->avctx = avctx; ret = ff_lpc_init(&s->lpc_ctx, avctx->frame_size, s->max_prediction_order, - AV_LPC_TYPE_LEVINSON); + FF_LPC_TYPE_LEVINSON); return ret; } diff --git a/libavcodec/alpha/dsputil_alpha.c b/libavcodec/alpha/dsputil_alpha.c index 96e7030e9d..6ce3f4bf15 100644 --- a/libavcodec/alpha/dsputil_alpha.c +++ b/libavcodec/alpha/dsputil_alpha.c @@ 
-270,9 +270,9 @@ static void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels, void dsputil_init_alpha(DSPContext* c, AVCodecContext *avctx) { - const int h264_high_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8; + const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8; - if (!h264_high_depth) { + if (!high_bit_depth) { c->put_pixels_tab[0][0] = put_pixels16_axp_asm; c->put_pixels_tab[0][1] = put_pixels16_x2_axp; c->put_pixels_tab[0][2] = put_pixels16_y2_axp; diff --git a/libavcodec/arm/dsputil_init_arm.c b/libavcodec/arm/dsputil_init_arm.c index 218d162687..0351412761 100644 --- a/libavcodec/arm/dsputil_init_arm.c +++ b/libavcodec/arm/dsputil_init_arm.c @@ -75,7 +75,7 @@ static void simple_idct_arm_add(uint8_t *dest, int line_size, DCTELEM *block) void dsputil_init_arm(DSPContext* c, AVCodecContext *avctx) { - const int h264_high_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8; + const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8; ff_put_pixels_clamped = c->put_pixels_clamped; ff_add_pixels_clamped = c->add_pixels_clamped; @@ -97,7 +97,7 @@ void dsputil_init_arm(DSPContext* c, AVCodecContext *avctx) c->add_pixels_clamped = ff_add_pixels_clamped_arm; - if (!h264_high_depth) { + if (!high_bit_depth) { c->put_pixels_tab[0][0] = ff_put_pixels16_arm; c->put_pixels_tab[0][1] = ff_put_pixels16_x2_arm; c->put_pixels_tab[0][2] = ff_put_pixels16_y2_arm; diff --git a/libavcodec/arm/dsputil_init_armv6.c b/libavcodec/arm/dsputil_init_armv6.c index fc0f7865f0..9acea4a1d6 100644 --- a/libavcodec/arm/dsputil_init_armv6.c +++ b/libavcodec/arm/dsputil_init_armv6.c @@ -72,7 +72,7 @@ int ff_pix_sum_armv6(uint8_t *pix, int line_size); void av_cold ff_dsputil_init_armv6(DSPContext* c, AVCodecContext *avctx) { - const int h264_high_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8; + const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8; if (!avctx->lowres && (avctx->idct_algo == FF_IDCT_AUTO || avctx->idct_algo == FF_IDCT_SIMPLEARMV6)) { @@ -82,7 +82,7 @@ void av_cold ff_dsputil_init_armv6(DSPContext* c, AVCodecContext *avctx) c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM; } - if (!h264_high_depth) { + if (!high_bit_depth) { c->put_pixels_tab[0][0] = ff_put_pixels16_armv6; c->put_pixels_tab[0][1] = ff_put_pixels16_x2_armv6; c->put_pixels_tab[0][2] = ff_put_pixels16_y2_armv6; diff --git a/libavcodec/arm/dsputil_init_neon.c b/libavcodec/arm/dsputil_init_neon.c index 9e456f32df..6faf3dc8d0 100644 --- a/libavcodec/arm/dsputil_init_neon.c +++ b/libavcodec/arm/dsputil_init_neon.c @@ -173,7 +173,7 @@ void ff_apply_window_int16_neon(int16_t *dst, const int16_t *src, void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) { - const int h264_high_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8; + const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8; if (!avctx->lowres) { if (avctx->idct_algo == FF_IDCT_AUTO || @@ -192,7 +192,7 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) } } - if (!h264_high_depth) { + if (!high_bit_depth) { c->clear_block = ff_clear_block_neon; c->clear_blocks = ff_clear_blocks_neon; @@ -223,7 +223,7 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon; if (CONFIG_H264_DECODER) { - if (!h264_high_depth) { + if 
(!high_bit_depth) { c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon; c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon; c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_neon; diff --git a/libavcodec/arm/dsputil_iwmmxt.c b/libavcodec/arm/dsputil_iwmmxt.c index 6db1837ba0..85be83148a 100644 --- a/libavcodec/arm/dsputil_iwmmxt.c +++ b/libavcodec/arm/dsputil_iwmmxt.c @@ -155,7 +155,7 @@ static void nop(uint8_t *block, const uint8_t *pixels, int line_size, int h) void ff_dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx) { int mm_flags = AV_CPU_FLAG_IWMMXT; /* multimedia extension flags */ - const int h264_high_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8; + const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8; if (avctx->dsp_mask) { if (avctx->dsp_mask & AV_CPU_FLAG_FORCE) @@ -168,7 +168,7 @@ void ff_dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx) c->add_pixels_clamped = add_pixels_clamped_iwmmxt; - if (!h264_high_depth) { + if (!high_bit_depth) { c->clear_blocks = clear_blocks_iwmmxt; c->put_pixels_tab[0][0] = put_pixels16_iwmmxt; diff --git a/libavcodec/arm/h264pred_init_arm.c b/libavcodec/arm/h264pred_init_arm.c index 5b11b7da88..b1d4f005e8 100644 --- a/libavcodec/arm/h264pred_init_arm.c +++ b/libavcodec/arm/h264pred_init_arm.c @@ -74,7 +74,7 @@ static void ff_h264_pred_init_neon(H264PredContext *h, int codec_id, const int b h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_neon; } -void ff_h264_pred_init_arm(H264PredContext *h, int codec_id, const int bit_depth) +void ff_h264_pred_init_arm(H264PredContext *h, int codec_id, int bit_depth) { if (HAVE_NEON) ff_h264_pred_init_neon(h, codec_id, bit_depth); } diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h index acdb183f90..00e9dd5e2f 100644 --- a/libavcodec/avcodec.h +++ b/libavcodec/avcodec.h @@ -516,10 +516,11 @@ enum AVChromaLocation{ AVCHROMA_LOC_NB , ///< Not part of ABI }; +#if FF_API_FLAC_GLOBAL_OPTS /** * LPC analysis type */ -enum AVLPCType { +attribute_deprecated enum AVLPCType { AV_LPC_TYPE_DEFAULT = -1, ///< use the codec default LPC type AV_LPC_TYPE_NONE = 0, ///< do not use LPC prediction or use all zero coefficients AV_LPC_TYPE_FIXED = 1, ///< fixed LPC coefficients @@ -527,6 +528,7 @@ enum AVLPCType { AV_LPC_TYPE_CHOLESKY = 3, ///< Cholesky factorization AV_LPC_TYPE_NB , ///< Not part of ABI }; +#endif enum AVAudioServiceType { AV_AUDIO_SERVICE_TYPE_MAIN = 0, @@ -2513,13 +2515,6 @@ typedef struct AVCodecContext { int compression_level; #define FF_COMPRESSION_DEFAULT -1 - /** - * LPC coefficient precision - used by FLAC encoder - * - encoding: Set by user. - * - decoding: unused - */ - int lpc_coeff_precision; - /** * - encoding: Set by user. * - decoding: unused @@ -2532,24 +2527,42 @@ typedef struct AVCodecContext { */ int max_prediction_order; +#if FF_API_FLAC_GLOBAL_OPTS + /** + * @defgroup flac_opts FLAC options + * @deprecated Use FLAC encoder private options instead. + * @{ + */ + + /** + * LPC coefficient precision - used by FLAC encoder + * - encoding: Set by user. + * - decoding: unused + */ + attribute_deprecated int lpc_coeff_precision; + /** * search method for selecting prediction order * - encoding: Set by user. * - decoding: unused */ - int prediction_order_method; + attribute_deprecated int prediction_order_method; /** * - encoding: Set by user. * - decoding: unused */ - int min_partition_order; + attribute_deprecated int min_partition_order; /** * - encoding: Set by user. 
* - decoding: unused */ - int max_partition_order; + attribute_deprecated int max_partition_order; + /** + * @} + */ +#endif /** * GOP timecode frame start number, in non drop frame format @@ -2767,19 +2780,21 @@ typedef struct AVCodecContext { int log_level_offset; +#if FF_API_FLAC_GLOBAL_OPTS /** * Determines which LPC analysis algorithm to use. * - encoding: Set by user * - decoding: unused */ - enum AVLPCType lpc_type; + attribute_deprecated enum AVLPCType lpc_type; /** * Number of passes to use for Cholesky factorization during LPC analysis * - encoding: Set by user * - decoding: unused */ - int lpc_passes; + attribute_deprecated int lpc_passes; +#endif /** * Number of slices. diff --git a/libavcodec/bfin/dsputil_bfin.c b/libavcodec/bfin/dsputil_bfin.c index 01d7ec6a44..5b94472326 100644 --- a/libavcodec/bfin/dsputil_bfin.c +++ b/libavcodec/bfin/dsputil_bfin.c @@ -197,14 +197,14 @@ static int bfin_pix_abs8_xy2 (void *c, uint8_t *blk1, uint8_t *blk2, int line_si void dsputil_init_bfin( DSPContext* c, AVCodecContext *avctx ) { - const int h264_high_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8; + const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8; c->get_pixels = ff_bfin_get_pixels; c->diff_pixels = ff_bfin_diff_pixels; c->put_pixels_clamped = ff_bfin_put_pixels_clamped; c->add_pixels_clamped = ff_bfin_add_pixels_clamped; - if (!h264_high_depth) + if (!high_bit_depth) c->clear_blocks = bfin_clear_blocks; c->pix_sum = ff_bfin_pix_sum; c->pix_norm1 = ff_bfin_pix_norm1; @@ -231,7 +231,7 @@ void dsputil_init_bfin( DSPContext* c, AVCodecContext *avctx ) c->sse[1] = ff_bfin_sse8; c->sse[2] = ff_bfin_sse4; - if (!h264_high_depth) { + if (!high_bit_depth) { c->put_pixels_tab[0][0] = bfin_put_pixels16; c->put_pixels_tab[0][1] = bfin_put_pixels16_x2; c->put_pixels_tab[0][2] = bfin_put_pixels16_y2; diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c index 162458a7b5..0e596b1b01 100644 --- a/libavcodec/dsputil.c +++ b/libavcodec/dsputil.c @@ -43,15 +43,15 @@ uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, }; uint32_t ff_squareTbl[512] = {0, }; #define BIT_DEPTH 9 -#include "dsputil_internal.h" +#include "dsputil_template.c" #undef BIT_DEPTH #define BIT_DEPTH 10 -#include "dsputil_internal.h" +#include "dsputil_template.c" #undef BIT_DEPTH #define BIT_DEPTH 8 -#include "dsputil_internal.h" +#include "dsputil_template.c" // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size #define pb_7f (~0UL/255 * 0x7f) diff --git a/libavcodec/dsputil_template.c b/libavcodec/dsputil_template.c new file mode 100644 index 0000000000..8ca6d3e414 --- /dev/null +++ b/libavcodec/dsputil_template.c @@ -0,0 +1,1391 @@ +/* + * DSP utils + * Copyright (c) 2000, 2001 Fabrice Bellard + * Copyright (c) 2002-2004 Michael Niedermayer + * + * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * DSP utils + */ + +#include "high_bit_depth.h" + +static inline void FUNC(copy_block2)(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h) +{ + int i; + for(i=0; i 8 + int j; + for (j = 0; j < w; j++) { + ptr[j-w] = ptr[0]; + ptr[j+width] = ptr[width-1]; + } +#else + memset(ptr - w, ptr[0], w); + memset(ptr + width, ptr[width-1], w); +#endif + ptr += wrap; + } + + /* top and bottom + corners */ + buf -= w; + last_line = buf + (height - 1) * wrap; + if (sides & EDGE_TOP) + for(i = 0; i < w; i++) + memcpy(buf - (i + 1) * wrap, buf, (width + w + w) * sizeof(pixel)); // top + if (sides & EDGE_BOTTOM) + for (i = 0; i < w; i++) + memcpy(last_line + (i + 1) * wrap, last_line, (width + w + w) * sizeof(pixel)); // bottom +} + +/** + * Copy a rectangular area of samples to a temporary buffer and replicate the border samples. + * @param buf destination buffer + * @param src source buffer + * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers + * @param block_w width of block + * @param block_h height of block + * @param src_x x coordinate of the top left sample of the block in the source buffer + * @param src_y y coordinate of the top left sample of the block in the source buffer + * @param w width of the source buffer + * @param h height of the source buffer + */ +void FUNC(ff_emulated_edge_mc)(uint8_t *buf, const uint8_t *src, int linesize, int block_w, int block_h, + int src_x, int src_y, int w, int h){ + int x, y; + int start_y, start_x, end_y, end_x; + + if(src_y>= h){ + src+= (h-1-src_y)*linesize; + src_y=h-1; + }else if(src_y<=-block_h){ + src+= (1-block_h-src_y)*linesize; + src_y=1-block_h; + } + if(src_x>= w){ + src+= (w-1-src_x)*sizeof(pixel); + src_x=w-1; + }else if(src_x<=-block_w){ + src+= (1-block_w-src_x)*sizeof(pixel); + src_x=1-block_w; + } + + start_y= FFMAX(0, -src_y); + start_x= FFMAX(0, -src_x); + end_y= FFMIN(block_h, h-src_y); + end_x= FFMIN(block_w, w-src_x); + assert(start_y < end_y && block_h); + assert(start_x < end_x && block_w); + + w = end_x - start_x; + src += start_y*linesize + start_x*sizeof(pixel); + buf += start_x*sizeof(pixel); + + //top + for(y=0; y>1));\ + pixels+=line_size;\ + block +=line_size;\ + }\ +}\ +\ +static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + int i;\ + for(i=0; i>1));\ + pixels+=line_size;\ + block +=line_size;\ + }\ +}\ +\ +static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + int i;\ + for(i=0; i>1));\ + pixels+=line_size;\ + block +=line_size;\ + }\ +}\ +\ +static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + int i;\ + for(i=0; i>1));\ + pixels+=line_size;\ + block +=line_size;\ + }\ +}\ +\ +static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + int i;\ + const uint64_t a= AV_RN64(pixels );\ + const uint64_t b= AV_RN64(pixels+1);\ + uint64_t l0= (a&0x0303030303030303ULL)\ + + (b&0x0303030303030303ULL)\ + + 0x0202020202020202ULL;\ + uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ + + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ + uint64_t l1,h1;\ +\ + pixels+=line_size;\ + for(i=0; i>2)\ + + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ + 
OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\ + pixels+=line_size;\ + block +=line_size;\ + a= AV_RN64(pixels );\ + b= AV_RN64(pixels+1);\ + l0= (a&0x0303030303030303ULL)\ + + (b&0x0303030303030303ULL)\ + + 0x0202020202020202ULL;\ + h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ + + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ + OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\ + pixels+=line_size;\ + block +=line_size;\ + }\ +}\ +\ +static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + int i;\ + const uint64_t a= AV_RN64(pixels );\ + const uint64_t b= AV_RN64(pixels+1);\ + uint64_t l0= (a&0x0303030303030303ULL)\ + + (b&0x0303030303030303ULL)\ + + 0x0101010101010101ULL;\ + uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ + + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ + uint64_t l1,h1;\ +\ + pixels+=line_size;\ + for(i=0; i>2)\ + + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ + OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\ + pixels+=line_size;\ + block +=line_size;\ + a= AV_RN64(pixels );\ + b= AV_RN64(pixels+1);\ + l0= (a&0x0303030303030303ULL)\ + + (b&0x0303030303030303ULL)\ + + 0x0101010101010101ULL;\ + h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ + + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ + OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\ + pixels+=line_size;\ + block +=line_size;\ + }\ +}\ +\ +CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8*sizeof(pixel))\ +CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8*sizeof(pixel))\ +CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8*sizeof(pixel))\ +CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8*sizeof(pixel))\ +CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8*sizeof(pixel))\ +CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8*sizeof(pixel))\ +CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8*sizeof(pixel)) + +#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) ) +#else // 64 bit variant + +#define PIXOP2(OPNAME, OP) \ +static void FUNCC(OPNAME ## _pixels2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ + int i;\ + for(i=0; i>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + l1= (c&0x03030303UL)\ + + (d&0x03030303UL);\ + h1= ((c&0xFCFCFCFCUL)>>2)\ + + ((d&0xFCFCFCFCUL)>>2);\ + OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ + a= AV_RN32(&src1[i*src_stride1+4]);\ + b= AV_RN32(&src2[i*src_stride2+4]);\ + c= AV_RN32(&src3[i*src_stride3+4]);\ + d= AV_RN32(&src4[i*src_stride4+4]);\ + l0= (a&0x03030303UL)\ + + (b&0x03030303UL)\ + + 0x02020202UL;\ + h0= ((a&0xFCFCFCFCUL)>>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + l1= (c&0x03030303UL)\ + + (d&0x03030303UL);\ + h1= ((c&0xFCFCFCFCUL)>>2)\ + + ((d&0xFCFCFCFCUL)>>2);\ + OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ + }\ +}\ +\ +static inline void FUNCC(OPNAME ## _pixels4_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ + FUNC(OPNAME ## _pixels4_l2)(block, pixels, pixels+sizeof(pixel), line_size, line_size, line_size, h);\ +}\ +\ +static inline void FUNCC(OPNAME ## _pixels4_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ + FUNC(OPNAME ## _pixels4_l2)(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\ +}\ +\ +static inline void FUNCC(OPNAME ## _pixels2_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ + 
FUNC(OPNAME ## _pixels2_l2)(block, pixels, pixels+sizeof(pixel), line_size, line_size, line_size, h);\ +}\ +\ +static inline void FUNCC(OPNAME ## _pixels2_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ + FUNC(OPNAME ## _pixels2_l2)(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\ +}\ +\ +static inline void FUNC(OPNAME ## _no_rnd_pixels8_l4)(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\ + int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ + /* FIXME HIGH BIT DEPTH*/\ + int i;\ + for(i=0; i>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + l1= (c&0x03030303UL)\ + + (d&0x03030303UL);\ + h1= ((c&0xFCFCFCFCUL)>>2)\ + + ((d&0xFCFCFCFCUL)>>2);\ + OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ + a= AV_RN32(&src1[i*src_stride1+4]);\ + b= AV_RN32(&src2[i*src_stride2+4]);\ + c= AV_RN32(&src3[i*src_stride3+4]);\ + d= AV_RN32(&src4[i*src_stride4+4]);\ + l0= (a&0x03030303UL)\ + + (b&0x03030303UL)\ + + 0x01010101UL;\ + h0= ((a&0xFCFCFCFCUL)>>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + l1= (c&0x03030303UL)\ + + (d&0x03030303UL);\ + h1= ((c&0xFCFCFCFCUL)>>2)\ + + ((d&0xFCFCFCFCUL)>>2);\ + OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ + }\ +}\ +static inline void FUNC(OPNAME ## _pixels16_l4)(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\ + int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ + FUNC(OPNAME ## _pixels8_l4)(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\ + FUNC(OPNAME ## _pixels8_l4)(dst+8*sizeof(pixel), src1+8*sizeof(pixel), src2+8*sizeof(pixel), src3+8*sizeof(pixel), src4+8*sizeof(pixel), dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\ +}\ +static inline void FUNC(OPNAME ## _no_rnd_pixels16_l4)(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\ + int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ + FUNC(OPNAME ## _no_rnd_pixels8_l4)(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\ + FUNC(OPNAME ## _no_rnd_pixels8_l4)(dst+8*sizeof(pixel), src1+8*sizeof(pixel), src2+8*sizeof(pixel), src3+8*sizeof(pixel), src4+8*sizeof(pixel), dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\ +}\ +\ +static inline void FUNCC(OPNAME ## _pixels2_xy2)(uint8_t *_block, const uint8_t *_pixels, int line_size, int h)\ +{\ + int i, a0, b0, a1, b1;\ + pixel *block = (pixel*)_block;\ + const pixel *pixels = (const pixel*)_pixels;\ + line_size /= sizeof(pixel);\ + a0= pixels[0];\ + b0= pixels[1] + 2;\ + a0 += b0;\ + b0 += pixels[2];\ +\ + pixels+=line_size;\ + for(i=0; i>2; /* FIXME non put */\ + block[1]= (b1+b0)>>2;\ +\ + pixels+=line_size;\ + block +=line_size;\ +\ + a0= pixels[0];\ + b0= pixels[1] + 2;\ + a0 += b0;\ + b0 += pixels[2];\ +\ + block[0]= (a1+a0)>>2;\ + block[1]= (b1+b0)>>2;\ + pixels+=line_size;\ + block +=line_size;\ + }\ +}\ +\ +static inline void FUNCC(OPNAME ## _pixels4_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + /* FIXME HIGH BIT DEPTH */\ + int i;\ + const uint32_t a= AV_RN32(pixels );\ + const uint32_t b= AV_RN32(pixels+1);\ + uint32_t l0= (a&0x03030303UL)\ + + (b&0x03030303UL)\ + + 0x02020202UL;\ + uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + uint32_t l1,h1;\ +\ + 
pixels+=line_size;\ + for(i=0; i>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ + pixels+=line_size;\ + block +=line_size;\ + a= AV_RN32(pixels );\ + b= AV_RN32(pixels+1);\ + l0= (a&0x03030303UL)\ + + (b&0x03030303UL)\ + + 0x02020202UL;\ + h0= ((a&0xFCFCFCFCUL)>>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ + pixels+=line_size;\ + block +=line_size;\ + }\ +}\ +\ +static inline void FUNCC(OPNAME ## _pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + /* FIXME HIGH BIT DEPTH */\ + int j;\ + for(j=0; j<2; j++){\ + int i;\ + const uint32_t a= AV_RN32(pixels );\ + const uint32_t b= AV_RN32(pixels+1);\ + uint32_t l0= (a&0x03030303UL)\ + + (b&0x03030303UL)\ + + 0x02020202UL;\ + uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + uint32_t l1,h1;\ +\ + pixels+=line_size;\ + for(i=0; i>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ + pixels+=line_size;\ + block +=line_size;\ + a= AV_RN32(pixels );\ + b= AV_RN32(pixels+1);\ + l0= (a&0x03030303UL)\ + + (b&0x03030303UL)\ + + 0x02020202UL;\ + h0= ((a&0xFCFCFCFCUL)>>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ + pixels+=line_size;\ + block +=line_size;\ + }\ + pixels+=4-line_size*(h+1);\ + block +=4-line_size*h;\ + }\ +}\ +\ +static inline void FUNCC(OPNAME ## _no_rnd_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + /* FIXME HIGH BIT DEPTH */\ + int j;\ + for(j=0; j<2; j++){\ + int i;\ + const uint32_t a= AV_RN32(pixels );\ + const uint32_t b= AV_RN32(pixels+1);\ + uint32_t l0= (a&0x03030303UL)\ + + (b&0x03030303UL)\ + + 0x01010101UL;\ + uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + uint32_t l1,h1;\ +\ + pixels+=line_size;\ + for(i=0; i>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ + pixels+=line_size;\ + block +=line_size;\ + a= AV_RN32(pixels );\ + b= AV_RN32(pixels+1);\ + l0= (a&0x03030303UL)\ + + (b&0x03030303UL)\ + + 0x01010101UL;\ + h0= ((a&0xFCFCFCFCUL)>>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ + pixels+=line_size;\ + block +=line_size;\ + }\ + pixels+=4-line_size*(h+1);\ + block +=4-line_size*h;\ + }\ +}\ +\ +CALL_2X_PIXELS(FUNCC(OPNAME ## _pixels16) , FUNCC(OPNAME ## _pixels8) , 8*sizeof(pixel))\ +CALL_2X_PIXELS(FUNCC(OPNAME ## _pixels16_x2) , FUNCC(OPNAME ## _pixels8_x2) , 8*sizeof(pixel))\ +CALL_2X_PIXELS(FUNCC(OPNAME ## _pixels16_y2) , FUNCC(OPNAME ## _pixels8_y2) , 8*sizeof(pixel))\ +CALL_2X_PIXELS(FUNCC(OPNAME ## _pixels16_xy2), FUNCC(OPNAME ## _pixels8_xy2), 8*sizeof(pixel))\ +av_unused CALL_2X_PIXELS(FUNCC(OPNAME ## _no_rnd_pixels16) , FUNCC(OPNAME ## _pixels8) , 8*sizeof(pixel))\ +CALL_2X_PIXELS(FUNCC(OPNAME ## _no_rnd_pixels16_x2) , FUNCC(OPNAME ## _no_rnd_pixels8_x2) , 8*sizeof(pixel))\ +CALL_2X_PIXELS(FUNCC(OPNAME ## _no_rnd_pixels16_y2) , FUNCC(OPNAME ## _no_rnd_pixels8_y2) , 8*sizeof(pixel))\ +CALL_2X_PIXELS(FUNCC(OPNAME ## _no_rnd_pixels16_xy2), FUNCC(OPNAME ## _no_rnd_pixels8_xy2), 8*sizeof(pixel))\ + +#define op_avg(a, b) a = rnd_avg_pixel4(a, b) +#endif +#define op_put(a, b) a = b + +PIXOP2(avg, op_avg) +PIXOP2(put, op_put) +#undef op_avg +#undef op_put + +#define put_no_rnd_pixels8_c put_pixels8_c +#define put_no_rnd_pixels16_c put_pixels16_c + +static void FUNCC(put_no_rnd_pixels16_l2)(uint8_t *dst, const uint8_t *a, const uint8_t *b, int 
stride, int h){ + FUNC(put_no_rnd_pixels16_l2)(dst, a, b, stride, stride, stride, h); +} + +static void FUNCC(put_no_rnd_pixels8_l2)(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){ + FUNC(put_no_rnd_pixels8_l2)(dst, a, b, stride, stride, stride, h); +} + +#define H264_CHROMA_MC(OPNAME, OP)\ +static void FUNCC(OPNAME ## h264_chroma_mc2)(uint8_t *_dst/*align 8*/, uint8_t *_src/*align 1*/, int stride, int h, int x, int y){\ + pixel *dst = (pixel*)_dst;\ + pixel *src = (pixel*)_src;\ + const int A=(8-x)*(8-y);\ + const int B=( x)*(8-y);\ + const int C=(8-x)*( y);\ + const int D=( x)*( y);\ + int i;\ + stride /= sizeof(pixel);\ + \ + assert(x<8 && y<8 && x>=0 && y>=0);\ +\ + if(D){\ + for(i=0; i=0 && y>=0);\ +\ + if(D){\ + for(i=0; i=0 && y>=0);\ +\ + if(D){\ + for(i=0; i>6)+1)>>1) +#define op_put(a, b) a = (((b) + 32)>>6) + +H264_CHROMA_MC(put_ , op_put) +H264_CHROMA_MC(avg_ , op_avg) +#undef op_avg +#undef op_put + +#define H264_LOWPASS(OPNAME, OP, OP2) \ +static av_unused void FUNC(OPNAME ## h264_qpel2_h_lowpass)(uint8_t *_dst, uint8_t *_src, int dstStride, int srcStride){\ + const int h=2;\ + INIT_CLIP\ + int i;\ + pixel *dst = (pixel*)_dst;\ + pixel *src = (pixel*)_src;\ + dstStride /= sizeof(pixel);\ + srcStride /= sizeof(pixel);\ + for(i=0; i 9) ? (-10 * ((1< 9) ? (-10 * ((1< 9) ? (-10 * ((1<>5)+1)>>1) +//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7) +#define op_put(a, b) a = CLIP(((b) + 16)>>5) +#define op2_avg(a, b) a = (((a)+CLIP(((b) + 512)>>10)+1)>>1) +#define op2_put(a, b) a = CLIP(((b) + 512)>>10) + +H264_LOWPASS(put_ , op_put, op2_put) +H264_LOWPASS(avg_ , op_avg, op2_avg) +H264_MC(put_, 2) +H264_MC(put_, 4) +H264_MC(put_, 8) +H264_MC(put_, 16) +H264_MC(avg_, 4) +H264_MC(avg_, 8) +H264_MC(avg_, 16) + +#undef op_avg +#undef op_put +#undef op2_avg +#undef op2_put + +#if BIT_DEPTH == 8 +# define put_h264_qpel8_mc00_8_c ff_put_pixels8x8_8_c +# define avg_h264_qpel8_mc00_8_c ff_avg_pixels8x8_8_c +# define put_h264_qpel16_mc00_8_c ff_put_pixels16x16_8_c +# define avg_h264_qpel16_mc00_8_c ff_avg_pixels16x16_8_c +#elif BIT_DEPTH == 9 +# define put_h264_qpel8_mc00_9_c ff_put_pixels8x8_9_c +# define avg_h264_qpel8_mc00_9_c ff_avg_pixels8x8_9_c +# define put_h264_qpel16_mc00_9_c ff_put_pixels16x16_9_c +# define avg_h264_qpel16_mc00_9_c ff_avg_pixels16x16_9_c +#elif BIT_DEPTH == 10 +# define put_h264_qpel8_mc00_10_c ff_put_pixels8x8_10_c +# define avg_h264_qpel8_mc00_10_c ff_avg_pixels8x8_10_c +# define put_h264_qpel16_mc00_10_c ff_put_pixels16x16_10_c +# define avg_h264_qpel16_mc00_10_c ff_avg_pixels16x16_10_c +#endif + +void FUNCC(ff_put_pixels8x8)(uint8_t *dst, uint8_t *src, int stride) { + FUNCC(put_pixels8)(dst, src, stride, 8); +} +void FUNCC(ff_avg_pixels8x8)(uint8_t *dst, uint8_t *src, int stride) { + FUNCC(avg_pixels8)(dst, src, stride, 8); +} +void FUNCC(ff_put_pixels16x16)(uint8_t *dst, uint8_t *src, int stride) { + FUNCC(put_pixels16)(dst, src, stride, 16); +} +void FUNCC(ff_avg_pixels16x16)(uint8_t *dst, uint8_t *src, int stride) { + FUNCC(avg_pixels16)(dst, src, stride, 16); +} + +static void FUNCC(clear_block)(DCTELEM *block) +{ + memset(block, 0, sizeof(dctcoef)*64); +} + +/** + * memset(blocks, 0, sizeof(DCTELEM)*6*64) + */ +static void FUNCC(clear_blocks)(DCTELEM *blocks) +{ + memset(blocks, 0, sizeof(dctcoef)*6*64); +} diff --git a/libavcodec/flacenc.c b/libavcodec/flacenc.c index 637d09dba5..811a75a06f 100644 --- a/libavcodec/flacenc.c +++ b/libavcodec/flacenc.c @@ -21,6 +21,7 @@ #include "libavutil/crc.h" #include 
"libavutil/md5.h" +#include "libavutil/opt.h" #include "avcodec.h" #include "get_bits.h" #include "golomb.h" @@ -43,7 +44,7 @@ typedef struct CompressionOptions { int compression_level; int block_time_ms; - enum AVLPCType lpc_type; + enum FFLPCType lpc_type; int lpc_passes; int lpc_coeff_precision; int min_prediction_order; @@ -80,6 +81,7 @@ typedef struct FlacFrame { } FlacFrame; typedef struct FlacEncodeContext { + AVClass *class; PutBitContext pb; int channels; int samplerate; @@ -156,16 +158,16 @@ static av_cold void dprint_compression_options(FlacEncodeContext *s) av_log(avctx, AV_LOG_DEBUG, " compression: %d\n", opt->compression_level); switch (opt->lpc_type) { - case AV_LPC_TYPE_NONE: + case FF_LPC_TYPE_NONE: av_log(avctx, AV_LOG_DEBUG, " lpc type: None\n"); break; - case AV_LPC_TYPE_FIXED: + case FF_LPC_TYPE_FIXED: av_log(avctx, AV_LOG_DEBUG, " lpc type: Fixed pre-defined coefficients\n"); break; - case AV_LPC_TYPE_LEVINSON: + case FF_LPC_TYPE_LEVINSON: av_log(avctx, AV_LOG_DEBUG, " lpc type: Levinson-Durbin recursion with Welch window\n"); break; - case AV_LPC_TYPE_CHOLESKY: + case FF_LPC_TYPE_CHOLESKY: av_log(avctx, AV_LOG_DEBUG, " lpc type: Cholesky factorization, %d pass%s\n", opt->lpc_passes, opt->lpc_passes == 1 ? "" : "es"); break; @@ -266,32 +268,42 @@ static av_cold int flac_encode_init(AVCodecContext *avctx) s->options.block_time_ms = ((int[]){ 27, 27, 27,105,105,105,105,105,105,105,105,105,105})[level]; - s->options.lpc_type = ((int[]){ AV_LPC_TYPE_FIXED, AV_LPC_TYPE_FIXED, AV_LPC_TYPE_FIXED, - AV_LPC_TYPE_LEVINSON, AV_LPC_TYPE_LEVINSON, AV_LPC_TYPE_LEVINSON, - AV_LPC_TYPE_LEVINSON, AV_LPC_TYPE_LEVINSON, AV_LPC_TYPE_LEVINSON, - AV_LPC_TYPE_LEVINSON, AV_LPC_TYPE_LEVINSON, AV_LPC_TYPE_LEVINSON, - AV_LPC_TYPE_LEVINSON})[level]; + if (s->options.lpc_type == FF_LPC_TYPE_DEFAULT) + s->options.lpc_type = ((int[]){ FF_LPC_TYPE_FIXED, FF_LPC_TYPE_FIXED, FF_LPC_TYPE_FIXED, + FF_LPC_TYPE_LEVINSON, FF_LPC_TYPE_LEVINSON, FF_LPC_TYPE_LEVINSON, + FF_LPC_TYPE_LEVINSON, FF_LPC_TYPE_LEVINSON, FF_LPC_TYPE_LEVINSON, + FF_LPC_TYPE_LEVINSON, FF_LPC_TYPE_LEVINSON, FF_LPC_TYPE_LEVINSON, + FF_LPC_TYPE_LEVINSON})[level]; s->options.min_prediction_order = ((int[]){ 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1})[level]; s->options.max_prediction_order = ((int[]){ 3, 4, 4, 6, 8, 8, 8, 8, 12, 12, 12, 32, 32})[level]; - s->options.prediction_order_method = ((int[]){ ORDER_METHOD_EST, ORDER_METHOD_EST, ORDER_METHOD_EST, - ORDER_METHOD_EST, ORDER_METHOD_EST, ORDER_METHOD_EST, - ORDER_METHOD_4LEVEL, ORDER_METHOD_LOG, ORDER_METHOD_4LEVEL, - ORDER_METHOD_LOG, ORDER_METHOD_SEARCH, ORDER_METHOD_LOG, - ORDER_METHOD_SEARCH})[level]; + if (s->options.prediction_order_method < 0) + s->options.prediction_order_method = ((int[]){ ORDER_METHOD_EST, ORDER_METHOD_EST, ORDER_METHOD_EST, + ORDER_METHOD_EST, ORDER_METHOD_EST, ORDER_METHOD_EST, + ORDER_METHOD_4LEVEL, ORDER_METHOD_LOG, ORDER_METHOD_4LEVEL, + ORDER_METHOD_LOG, ORDER_METHOD_SEARCH, ORDER_METHOD_LOG, + ORDER_METHOD_SEARCH})[level]; - s->options.min_partition_order = ((int[]){ 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0})[level]; - s->options.max_partition_order = ((int[]){ 2, 2, 3, 3, 3, 8, 8, 8, 8, 8, 8, 8, 8})[level]; + if (s->options.min_partition_order > s->options.max_partition_order) { + av_log(avctx, AV_LOG_ERROR, "invalid partition orders: min=%d max=%d\n", + s->options.min_partition_order, s->options.max_partition_order); + return AVERROR(EINVAL); + } + if (s->options.min_partition_order < 0) + s->options.min_partition_order = ((int[]){ 2, 2, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0})[level]; + if (s->options.max_partition_order < 0) + s->options.max_partition_order = ((int[]){ 2, 2, 3, 3, 3, 8, 8, 8, 8, 8, 8, 8, 8})[level]; /* set compression option overrides from AVCodecContext */ - if (avctx->lpc_type > AV_LPC_TYPE_DEFAULT) { - if (avctx->lpc_type > AV_LPC_TYPE_CHOLESKY) { +#if FF_API_FLAC_GLOBAL_OPTS + if (avctx->lpc_type > FF_LPC_TYPE_DEFAULT) { + if (avctx->lpc_type > FF_LPC_TYPE_CHOLESKY) { av_log(avctx, AV_LOG_ERROR, "unknown lpc type: %d\n", avctx->lpc_type); return -1; } s->options.lpc_type = avctx->lpc_type; - if (s->options.lpc_type == AV_LPC_TYPE_CHOLESKY) { + if (s->options.lpc_type == FF_LPC_TYPE_CHOLESKY) { if (avctx->lpc_passes < 0) { // default number of passes for Cholesky s->options.lpc_passes = 2; @@ -304,11 +316,12 @@ static av_cold int flac_encode_init(AVCodecContext *avctx) } } } +#endif - if (s->options.lpc_type == AV_LPC_TYPE_NONE) { + if (s->options.lpc_type == FF_LPC_TYPE_NONE) { s->options.min_prediction_order = 0; } else if (avctx->min_prediction_order >= 0) { - if (s->options.lpc_type == AV_LPC_TYPE_FIXED) { + if (s->options.lpc_type == FF_LPC_TYPE_FIXED) { if (avctx->min_prediction_order > MAX_FIXED_ORDER) { av_log(avctx, AV_LOG_ERROR, "invalid min prediction order: %d\n", avctx->min_prediction_order); @@ -322,10 +335,10 @@ static av_cold int flac_encode_init(AVCodecContext *avctx) } s->options.min_prediction_order = avctx->min_prediction_order; } - if (s->options.lpc_type == AV_LPC_TYPE_NONE) { + if (s->options.lpc_type == FF_LPC_TYPE_NONE) { s->options.max_prediction_order = 0; } else if (avctx->max_prediction_order >= 0) { - if (s->options.lpc_type == AV_LPC_TYPE_FIXED) { + if (s->options.lpc_type == FF_LPC_TYPE_FIXED) { if (avctx->max_prediction_order > MAX_FIXED_ORDER) { av_log(avctx, AV_LOG_ERROR, "invalid max prediction order: %d\n", avctx->max_prediction_order); @@ -345,6 +358,7 @@ static av_cold int flac_encode_init(AVCodecContext *avctx) return -1; } +#if FF_API_FLAC_GLOBAL_OPTS if (avctx->prediction_order_method >= 0) { if (avctx->prediction_order_method > ORDER_METHOD_LOG) { av_log(avctx, AV_LOG_ERROR, "invalid prediction order method: %d\n", @@ -375,6 +389,7 @@ static av_cold int flac_encode_init(AVCodecContext *avctx) s->options.min_partition_order, s->options.max_partition_order); return -1; } +#endif if (avctx->frame_size > 0) { if (avctx->frame_size < FLAC_MIN_BLOCKSIZE || @@ -388,6 +403,7 @@ static av_cold int flac_encode_init(AVCodecContext *avctx) } s->max_blocksize = s->avctx->frame_size; +#if FF_API_FLAC_GLOBAL_OPTS /* set LPC precision */ if (avctx->lpc_coeff_precision > 0) { if (avctx->lpc_coeff_precision > MAX_LPC_PRECISION) { @@ -396,10 +412,8 @@ static av_cold int flac_encode_init(AVCodecContext *avctx) return -1; } s->options.lpc_coeff_precision = avctx->lpc_coeff_precision; - } else { - /* default LPC precision */ - s->options.lpc_coeff_precision = 15; } +#endif /* set maximum encoded frame size in verbatim mode */ s->max_framesize = ff_flac_get_max_frame_size(s->avctx->frame_size, @@ -448,7 +462,7 @@ static av_cold int flac_encode_init(AVCodecContext *avctx) } ret = ff_lpc_init(&s->lpc_ctx, avctx->frame_size, - s->options.max_prediction_order, AV_LPC_TYPE_LEVINSON); + s->options.max_prediction_order, FF_LPC_TYPE_LEVINSON); dprint_compression_options(s); @@ -889,8 +903,8 @@ static int encode_residual_ch(FlacEncodeContext *s, int ch) /* FIXED */ sub->type = FLAC_SUBFRAME_FIXED; - if (s->options.lpc_type == AV_LPC_TYPE_NONE || - s->options.lpc_type == AV_LPC_TYPE_FIXED || n <= max_order) { + if 
(s->options.lpc_type == FF_LPC_TYPE_NONE || + s->options.lpc_type == FF_LPC_TYPE_FIXED || n <= max_order) { uint32_t bits[MAX_FIXED_ORDER+1]; if (max_order > MAX_FIXED_ORDER) max_order = MAX_FIXED_ORDER; @@ -1336,6 +1350,33 @@ static av_cold int flac_encode_close(AVCodecContext *avctx) return 0; } +#define FLAGS AV_OPT_FLAG_ENCODING_PARAM | AV_OPT_FLAG_AUDIO_PARAM +static const AVOption options[] = { +{ "lpc_coeff_precision", "LPC coefficient precision", offsetof(FlacEncodeContext, options.lpc_coeff_precision), FF_OPT_TYPE_INT, 15, 0, MAX_LPC_PRECISION, FLAGS }, +{ "lpc_type", "LPC algorithm", offsetof(FlacEncodeContext, options.lpc_type), FF_OPT_TYPE_INT, FF_LPC_TYPE_DEFAULT, FF_LPC_TYPE_DEFAULT, FF_LPC_TYPE_NB-1, FLAGS, "lpc_type" }, +{ "none", NULL, 0, FF_OPT_TYPE_CONST, FF_LPC_TYPE_NONE, INT_MIN, INT_MAX, FLAGS, "lpc_type" }, +{ "fixed", NULL, 0, FF_OPT_TYPE_CONST, FF_LPC_TYPE_FIXED, INT_MIN, INT_MAX, FLAGS, "lpc_type" }, +{ "levinson", NULL, 0, FF_OPT_TYPE_CONST, FF_LPC_TYPE_LEVINSON, INT_MIN, INT_MAX, FLAGS, "lpc_type" }, +{ "cholesky", NULL, 0, FF_OPT_TYPE_CONST, FF_LPC_TYPE_CHOLESKY, INT_MIN, INT_MAX, FLAGS, "lpc_type" }, +{ "lpc_passes", "Number of passes to use for Cholesky factorization during LPC analysis", offsetof(FlacEncodeContext, options.lpc_passes), FF_OPT_TYPE_INT, -1, INT_MIN, INT_MAX, FLAGS }, +{ "min_partition_order", NULL, offsetof(FlacEncodeContext, options.min_partition_order), FF_OPT_TYPE_INT, -1, -1, MAX_PARTITION_ORDER, FLAGS }, +{ "max_partition_order", NULL, offsetof(FlacEncodeContext, options.max_partition_order), FF_OPT_TYPE_INT, -1, -1, MAX_PARTITION_ORDER, FLAGS }, +{ "prediction_order_method", "Search method for selecting prediction order", offsetof(FlacEncodeContext, options.prediction_order_method), FF_OPT_TYPE_INT, -1, -1, ORDER_METHOD_LOG, FLAGS, "predm" }, +{ "estimation", NULL, 0, FF_OPT_TYPE_CONST, ORDER_METHOD_EST, INT_MIN, INT_MAX, FLAGS, "predm" }, +{ "2level", NULL, 0, FF_OPT_TYPE_CONST, ORDER_METHOD_2LEVEL, INT_MIN, INT_MAX, FLAGS, "predm" }, +{ "4level", NULL, 0, FF_OPT_TYPE_CONST, ORDER_METHOD_4LEVEL, INT_MIN, INT_MAX, FLAGS, "predm" }, +{ "8level", NULL, 0, FF_OPT_TYPE_CONST, ORDER_METHOD_8LEVEL, INT_MIN, INT_MAX, FLAGS, "predm" }, +{ "search", NULL, 0, FF_OPT_TYPE_CONST, ORDER_METHOD_SEARCH, INT_MIN, INT_MAX, FLAGS, "predm" }, +{ "log", NULL, 0, FF_OPT_TYPE_CONST, ORDER_METHOD_LOG, INT_MIN, INT_MAX, FLAGS, "predm" }, +{ NULL }, +}; + +static const AVClass flac_encoder_class = { + "FLAC encoder", + av_default_item_name, + options, + LIBAVUTIL_VERSION_INT, +}; AVCodec ff_flac_encoder = { "flac", @@ -1349,4 +1390,5 @@ AVCodec ff_flac_encoder = { .capabilities = CODEC_CAP_SMALL_LAST_FRAME | CODEC_CAP_DELAY, .sample_fmts = (const enum AVSampleFormat[]){AV_SAMPLE_FMT_S16,AV_SAMPLE_FMT_NONE}, .long_name = NULL_IF_CONFIG_SMALL("FLAC (Free Lossless Audio Codec)"), + .priv_class = &flac_encoder_class, }; diff --git a/libavcodec/h264.c b/libavcodec/h264.c index 353a0b343b..a843d21446 100644 --- a/libavcodec/h264.c +++ b/libavcodec/h264.c @@ -45,11 +45,11 @@ //#undef NDEBUG #include -static const uint8_t rem6[QP_MAX_MAX+1]={ +static const uint8_t rem6[QP_MAX_NUM+1]={ 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, }; -static const uint8_t div6[QP_MAX_MAX+1]={ +static const uint8_t div6[QP_MAX_NUM+1]={ 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 
6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9,10,10,10,10, }; @@ -586,6 +586,7 @@ av_cold int ff_h264_decode_init(AVCodecContext *avctx){ h->sps.bit_depth_luma = avctx->bits_per_raw_sample = 8; h->pixel_shift = 0; + h->sps.bit_depth_luma = avctx->bits_per_raw_sample = 8; h->thread_context[0] = h; h->outputed_poc = h->next_outputed_poc = INT_MIN; @@ -733,6 +734,7 @@ static int decode_update_thread_context(AVCodecContext *dst, const AVCodecContex int ff_h264_frame_start(H264Context *h){ MpegEncContext * const s = &h->s; int i; + const int pixel_shift = h->pixel_shift; if(MPV_frame_start(s, s->avctx) < 0) return -1; @@ -749,14 +751,14 @@ int ff_h264_frame_start(H264Context *h){ assert(s->linesize && s->uvlinesize); for(i=0; i<16; i++){ - h->block_offset[i]= (4*((scan8[i] - scan8[0])&7)<pixel_shift) + 4*s->linesize*((scan8[i] - scan8[0])>>3); - h->block_offset[24+i]= (4*((scan8[i] - scan8[0])&7)<pixel_shift) + 8*s->linesize*((scan8[i] - scan8[0])>>3); + h->block_offset[i]= (4*((scan8[i] - scan8[0])&7) << pixel_shift) + 4*s->linesize*((scan8[i] - scan8[0])>>3); + h->block_offset[24+i]= (4*((scan8[i] - scan8[0])&7) << pixel_shift) + 8*s->linesize*((scan8[i] - scan8[0])>>3); } for(i=0; i<4; i++){ h->block_offset[16+i]= - h->block_offset[20+i]= (4*((scan8[i] - scan8[0])&7)<pixel_shift) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3); + h->block_offset[20+i]= (4*((scan8[i] - scan8[0])&7) << pixel_shift) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3); h->block_offset[24+16+i]= - h->block_offset[24+20+i]= (4*((scan8[i] - scan8[0])&7)<pixel_shift) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3); + h->block_offset[24+20+i]= (4*((scan8[i] - scan8[0])&7) << pixel_shift) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3); } /* can't be in alloc_tables because linesize isn't known there. 
@@ -948,6 +950,7 @@ static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src MpegEncContext * const s = &h->s; uint8_t *top_border; int top_idx = 1; + const int pixel_shift = h->pixel_shift; src_y -= linesize; src_cb -= uvlinesize; @@ -958,10 +961,10 @@ static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src if(!MB_MBAFF){ top_border = h->top_borders[0][s->mb_x]; AV_COPY128(top_border, src_y + 15*linesize); - if (h->pixel_shift) + if (pixel_shift) AV_COPY128(top_border+16, src_y+15*linesize+16); if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){ - if (h->pixel_shift) { + if (pixel_shift) { AV_COPY128(top_border+32, src_cb+7*uvlinesize); AV_COPY128(top_border+48, src_cr+7*uvlinesize); } else { @@ -980,11 +983,11 @@ static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src // There are two lines saved, the line above the the top macroblock of a pair, // and the line above the bottom macroblock AV_COPY128(top_border, src_y + 16*linesize); - if (h->pixel_shift) + if (pixel_shift) AV_COPY128(top_border+16, src_y+16*linesize+16); if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){ - if (h->pixel_shift) { + if (pixel_shift) { AV_COPY128(top_border+32, src_cb+8*uvlinesize); AV_COPY128(top_border+48, src_cr+8*uvlinesize); } else { @@ -994,7 +997,10 @@ static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src } } -static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple, int pixel_shift){ +static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, + uint8_t *src_cb, uint8_t *src_cr, + int linesize, int uvlinesize, + int xchg, int simple, int pixel_shift){ MpegEncContext * const s = &h->s; int deblock_left; int deblock_top; @@ -1040,38 +1046,38 @@ else AV_COPY64(b,a); if(deblock_top){ if(deblock_left){ - XCHG(top_border_m1+(8<pixel_shift), 1); + XCHG(top_border_m1 + (8 << pixel_shift), src_y - (7 << pixel_shift), 1); } - XCHG(top_border+(0<mb_x+1 < s->mb_width){ - XCHG(h->top_borders[top_idx][s->mb_x+1], src_y +(17<top_borders[top_idx][s->mb_x+1], src_y + (17 << pixel_shift), 1); } } if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){ if(deblock_top){ if(deblock_left){ - XCHG(top_border_m1+(16<current_picture.data[0] + ((mb_x<linesize ) * 16; - dest_cb = s->current_picture.data[1] + ((mb_x<uvlinesize) * 8; - dest_cr = s->current_picture.data[2] + ((mb_x<uvlinesize) * 8; + dest_y = s->current_picture.data[0] + ((mb_x << pixel_shift) + mb_y * s->linesize ) * 16; + dest_cb = s->current_picture.data[1] + ((mb_x << pixel_shift) + mb_y * s->uvlinesize) * 8; + dest_cr = s->current_picture.data[2] + ((mb_x << pixel_shift) + mb_y * s->uvlinesize) * 8; - s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + (64<linesize, 4); - s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + (64<dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + (64 << pixel_shift), s->linesize, 4); + s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + (64 << pixel_shift), dest_cr - dest_cb, 2); h->list_counts[mb_xy]= h->list_count; @@ -1186,16 +1192,16 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i uint8_t * const ptr= dest_y + block_offset[i]; const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ]; if(transform_bypass && h->sps.profile_idc==244 && dir<=1){ - h->hpc.pred8x8l_add[dir](ptr, h->mb + (i*16<hpc.pred8x8l_add[dir](ptr, h->mb + (i*16 << pixel_shift), linesize); }else{ const int 
nnz = h->non_zero_count_cache[ scan8[i] ]; h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<topright_samples_available<mb, i*16, pixel_shift)) - idct_dc_add(ptr, h->mb + (i*16<mb, pixel_shift, i*16)) + idct_dc_add(ptr, h->mb + (i*16 << pixel_shift), linesize); else - idct_add (ptr, h->mb + (i*16<mb + (i*16 << pixel_shift), linesize); } } } @@ -1212,7 +1218,7 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ]; if(transform_bypass && h->sps.profile_idc==244 && dir<=1){ - h->hpc.pred4x4_add[dir](ptr, h->mb + (i*16<hpc.pred4x4_add[dir](ptr, h->mb + (i*16 << pixel_shift), linesize); }else{ uint8_t *topright; int nnz, tr; @@ -1229,7 +1235,7 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i topright= (uint8_t*) &tr; } }else - topright= ptr + (4<non_zero_count_cache[ scan8[i] ]; if(nnz){ if(is_h264){ - if(nnz == 1 && dctcoef_get(h, h->mb, i*16, pixel_shift)) - idct_dc_add(ptr, h->mb + (i*16<mb, pixel_shift, i*16)) + idct_dc_add(ptr, h->mb + (i*16 << pixel_shift), linesize); else idct_add (ptr, h->mb + (i*16<mb, dc_mapping[i], dctcoef_get(h, h->mb_luma_dc, i,pixel_shift),pixel_shift); + dctcoef_set(h->mb, pixel_shift, dc_mapping[i], dctcoef_get(h->mb_luma_dc, pixel_shift, i)); } } } @@ -1288,8 +1294,8 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, h->mb, linesize); }else{ for(i=0; i<16; i++){ - if(h->non_zero_count_cache[ scan8[i] ] || dctcoef_get(h, h->mb, i*16,pixel_shift)) - s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + (i*16<non_zero_count_cache[ scan8[i] ] || dctcoef_get(h->mb, pixel_shift, i*16)) + s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + (i*16 << pixel_shift), linesize); } } }else{ @@ -1301,7 +1307,7 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i idct_add= IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4; for(i=0; i<16; i+=di){ if(h->non_zero_count_cache[ scan8[i] ]){ - idct_add(dest_y + block_offset[i], h->mb + (i*16<mb + (i*16 << pixel_shift), linesize); } } }else{ @@ -1329,21 +1335,21 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i uint8_t *dest[2] = {dest_cb, dest_cr}; if(transform_bypass){ if(IS_INTRA(mb_type) && h->sps.profile_idc==244 && (h->chroma_pred_mode==VERT_PRED8x8 || h->chroma_pred_mode==HOR_PRED8x8)){ - h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + (16*16<hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 20, h->mb + (20*16<hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + (16*16 << pixel_shift), uvlinesize); + h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 20, h->mb + (20*16 << pixel_shift), uvlinesize); }else{ idct_add = s->dsp.add_pixels4; for(i=16; i<16+8; i++){ - if(h->non_zero_count_cache[ scan8[i] ] || dctcoef_get(h, h->mb, i*16,pixel_shift)) - idct_add (dest[(i&4)>>2] + block_offset[i], h->mb + (i*16<non_zero_count_cache[ scan8[i] ] || dctcoef_get(h->mb, pixel_shift, i*16)) + idct_add (dest[(i&4)>>2] + block_offset[i], h->mb + (i*16 << pixel_shift), uvlinesize); } } }else{ if(is_h264){ if(h->non_zero_count_cache[ scan8[CHROMA_DC_BLOCK_INDEX+0] ]) - h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + (16*16<dequant4_coeff[IS_INTRA(mb_type) ? 
1:4][h->chroma_qp[0]][0]); + h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + (16*16 << pixel_shift) , h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]); if(h->non_zero_count_cache[ scan8[CHROMA_DC_BLOCK_INDEX+1] ]) - h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + ((16*16+4*16)<dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]); + h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + ((16*16+4*16) << pixel_shift), h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]); h->h264dsp.h264_idct_add8(dest, block_offset, h->mb, uvlinesize, h->non_zero_count_cache); @@ -1370,9 +1376,12 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i /** * Process a macroblock; this case avoids checks for expensive uncommon cases. */ -static void hl_decode_mb_simple8(H264Context *h){ - hl_decode_mb_internal(h, 1, 0); +#define hl_decode_mb_simple(sh, bits) \ +static void hl_decode_mb_simple_ ## bits(H264Context *h){ \ + hl_decode_mb_internal(h, 1, sh); \ } +hl_decode_mb_simple(0, 8); +hl_decode_mb_simple(1, 16); /** * Process a macroblock; this handles edge cases, such as interlacing. @@ -1387,11 +1396,12 @@ void ff_h264_hl_decode_mb(H264Context *h){ const int mb_type= s->current_picture.mb_type[mb_xy]; int is_complex = CONFIG_SMALL || h->is_complex || IS_INTRA_PCM(mb_type) || s->qscale == 0; - if (is_complex || h->pixel_shift) + if (is_complex) { hl_decode_mb_complex(h); - else{ - hl_decode_mb_simple8(h); - } + } else if (h->pixel_shift) { + hl_decode_mb_simple_16(h); + } else + hl_decode_mb_simple_8(h); } static int pred_weight_table(H264Context *h){ @@ -2557,6 +2567,7 @@ static void loop_filter(H264Context *h){ const int end_mb_y= s->mb_y + FRAME_MBAFF; const int old_slice_type= h->slice_type; const int end_mb_x = s->mb_x; + const int pixel_shift = h->pixel_shift; if(h->deblocking_filter) { int start_x= s->resync_mb_y == s->mb_y ? 
s->resync_mb_x : 0; @@ -2573,9 +2584,9 @@ static void loop_filter(H264Context *h){ s->mb_x= mb_x; s->mb_y= mb_y; - dest_y = s->current_picture.data[0] + ((mb_x<pixel_shift) + mb_y * s->linesize ) * 16; - dest_cb = s->current_picture.data[1] + ((mb_x<pixel_shift) + mb_y * s->uvlinesize) * 8; - dest_cr = s->current_picture.data[2] + ((mb_x<pixel_shift) + mb_y * s->uvlinesize) * 8; + dest_y = s->current_picture.data[0] + ((mb_x << pixel_shift) + mb_y * s->linesize ) * 16; + dest_cb = s->current_picture.data[1] + ((mb_x << pixel_shift) + mb_y * s->uvlinesize) * 8; + dest_cr = s->current_picture.data[2] + ((mb_x << pixel_shift) + mb_y * s->uvlinesize) * 8; //FIXME simplify above if (MB_FIELD) { @@ -3060,7 +3071,7 @@ static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){ if (avctx->bits_per_raw_sample != h->sps.bit_depth_luma) { if (h->sps.bit_depth_luma >= 8 && h->sps.bit_depth_luma <= 10) { avctx->bits_per_raw_sample = h->sps.bit_depth_luma; - h->pixel_shift = h->sps.bit_depth_luma/9; + h->pixel_shift = h->sps.bit_depth_luma > 8; ff_h264dsp_init(&h->h264dsp, h->sps.bit_depth_luma); ff_h264_pred_init(&h->hpc, s->codec_id, h->sps.bit_depth_luma); diff --git a/libavcodec/h264.h b/libavcodec/h264.h index a1db628bdc..04da701750 100644 --- a/libavcodec/h264.h +++ b/libavcodec/h264.h @@ -108,7 +108,7 @@ */ #define DELAYED_PIC_REF 4 -#define QP_MAX_MAX (51 + 2*6) // The maximum supported qp +#define QP_MAX_NUM (51 + 2*6) // The maximum supported qp /* NAL unit types */ enum { @@ -266,7 +266,7 @@ typedef struct MMCO{ typedef struct H264Context{ MpegEncContext s; H264DSPContext h264dsp; - int pixel_shift; + int pixel_shift; ///< 0 for 8-bit H264, 1 for high-bit-depth H264 int chroma_qp[2]; //QPc int qp_thresh; ///< QP threshold to skip loopfilter @@ -355,8 +355,8 @@ typedef struct H264Context{ */ PPS pps; //FIXME move to Picture perhaps? (->no) do we need that? - uint32_t dequant4_buffer[6][QP_MAX_MAX+1][16]; //FIXME should these be moved down? - uint32_t dequant8_buffer[2][QP_MAX_MAX+1][64]; + uint32_t dequant4_buffer[6][QP_MAX_NUM+1][16]; //FIXME should these be moved down? + uint32_t dequant8_buffer[2][QP_MAX_NUM+1][64]; uint32_t (*dequant4_coeff[6])[16]; uint32_t (*dequant8_coeff[2])[64]; @@ -597,7 +597,7 @@ typedef struct H264Context{ }H264Context; -extern const uint8_t ff_h264_chroma_qp[3][QP_MAX_MAX+1]; ///< One chroma qp table for each supported bit depth (8, 9, 10). +extern const uint8_t ff_h264_chroma_qp[3][QP_MAX_NUM+1]; ///< One chroma qp table for each supported bit depth (8, 9, 10). 
/** * Decode SEI diff --git a/libavcodec/h264_cabac.c b/libavcodec/h264_cabac.c index 6017afc7aa..925ac44498 100644 --- a/libavcodec/h264_cabac.c +++ b/libavcodec/h264_cabac.c @@ -1103,10 +1103,11 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT #define STORE_BLOCK(type) \ - do {\ - uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;\ -\ - int j= scantable[index[--coeff_count]];\ + do { \ + uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base; \ + \ + int j= scantable[index[--coeff_count]]; \ + \ if( get_cabac( CC, ctx ) == 0 ) { \ node_ctx = coeff_abs_level_transition[0][node_ctx]; \ if( is_dc ) { \ @@ -1141,8 +1142,8 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT }else{ \ ((type*)block)[j] = ((int)(get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32)) >> 6; \ } \ - }\ - } while( coeff_count ); + } \ + } while ( coeff_count ); if (h->pixel_shift) { STORE_BLOCK(int32_t) @@ -1204,6 +1205,7 @@ int ff_h264_decode_mb_cabac(H264Context *h) { int mb_xy; int mb_type, partition_count, cbp = 0; int dct8x8_allowed= h->pps.transform_8x8_mode; + const int pixel_shift = h->pixel_shift; mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride; @@ -1312,7 +1314,7 @@ decode_intra_mb: h->slice_table[ mb_xy ]= h->slice_num; if(IS_INTRA_PCM(mb_type)) { - const int mb_size = 384*h->sps.bit_depth_luma/8; + const int mb_size = (384*h->sps.bit_depth_luma) >> 3; const uint8_t *ptr; // We assume these blocks are very rare so we do not optimize it. @@ -1670,7 +1672,7 @@ decode_intra_mb: qmul = h->dequant4_coeff[0][s->qscale]; for( i = 0; i < 16; i++ ) { //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i ); - decode_cabac_residual_nondc(h, h->mb + (16*i<pixel_shift), 1, i, scan + 1, qmul, 15); + decode_cabac_residual_nondc(h, h->mb + (16*i << pixel_shift), 1, i, scan + 1, qmul, 15); } } else { fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1); @@ -1680,7 +1682,7 @@ decode_intra_mb: for( i8x8 = 0; i8x8 < 4; i8x8++ ) { if( cbp & (1<mb + (64*i8x8<pixel_shift), 5, 4*i8x8, + decode_cabac_residual_nondc(h, h->mb + (64*i8x8 << pixel_shift), 5, 4*i8x8, scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64); } else { qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 
0:3][s->qscale]; @@ -1688,7 +1690,7 @@ decode_intra_mb: const int index = 4*i8x8 + i4x4; //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index ); //START_TIMER - decode_cabac_residual_nondc(h, h->mb + (16*index<<h->pixel_shift), 2, index, scan, qmul, 16); + decode_cabac_residual_nondc(h, h->mb + (16*index << pixel_shift), 2, index, scan, qmul, 16); //STOP_TIMER("decode_residual") } } @@ -1703,7 +1705,7 @@ decode_intra_mb: int c; for( c = 0; c < 2; c++ ) { //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c ); - decode_cabac_residual_dc(h, h->mb + ((256 + 16*4*c)<<h->pixel_shift), 3, CHROMA_DC_BLOCK_INDEX+c, chroma_dc_scan, 4); + decode_cabac_residual_dc(h, h->mb + ((256 + 16*4*c) << pixel_shift), 3, CHROMA_DC_BLOCK_INDEX+c, chroma_dc_scan, 4); } } @@ -1714,7 +1716,7 @@ decode_intra_mb: for( i = 0; i < 4; i++ ) { const int index = 16 + 4 * c + i; //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 ); - decode_cabac_residual_nondc(h, h->mb + (16*index<<h->pixel_shift), 4, index, scan + 1, qmul, 15); + decode_cabac_residual_nondc(h, h->mb + (16*index << pixel_shift), 4, index, scan + 1, qmul, 15); } } } else { diff --git a/libavcodec/h264_cavlc.c b/libavcodec/h264_cavlc.c index 61b7436739..62e30f1311 100644 --- a/libavcodec/h264_cavlc.c +++ b/libavcodec/h264_cavlc.c @@ -542,6 +542,7 @@ int ff_h264_decode_mb_cavlc(H264Context *h){ int partition_count; unsigned int mb_type, cbp; int dct8x8_allowed= h->pps.transform_8x8_mode; + const int pixel_shift = h->pixel_shift; mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride; @@ -961,7 +962,7 @@ decode_intra_mb: for(i8x8=0; i8x8<4; i8x8++){ for(i4x4=0; i4x4<4; i4x4++){ const int index= i4x4 + 4*i8x8; - if( decode_residual(h, h->intra_gb_ptr, h->mb + (16*index<<h->pixel_shift), index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){ + if( decode_residual(h, h->intra_gb_ptr, h->mb + (16*index << pixel_shift), index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){ return -1; } } @@ -973,7 +974,7 @@ decode_intra_mb: for(i8x8=0; i8x8<4; i8x8++){ if(cbp & (1<<i8x8)){ if(IS_8x8DCT(mb_type)){ - DCTELEM *buf = &h->mb[64*i8x8<<h->pixel_shift]; + DCTELEM *buf = &h->mb[64*i8x8 << pixel_shift]; uint8_t *nnz; for(i4x4=0; i4x4<4; i4x4++){ if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4, @@ -986,7 +987,7 @@ decode_intra_mb: for(i4x4=0; i4x4<4; i4x4++){ const int index= i4x4 + 4*i8x8; - if( decode_residual(h, gb, h->mb + (16*index<<h->pixel_shift), index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){ + if( decode_residual(h, gb, h->mb + (16*index << pixel_shift), index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){ return -1; } } @@ -1000,7 +1001,7 @@ decode_intra_mb: if(cbp&0x30){ for(chroma_idx=0; chroma_idx<2; chroma_idx++) - if( decode_residual(h, gb, h->mb + ((256 + 16*4*chroma_idx)<<h->pixel_shift), CHROMA_DC_BLOCK_INDEX+chroma_idx, chroma_dc_scan, NULL, 4) < 0){ + if( decode_residual(h, gb, h->mb + ((256 + 16*4*chroma_idx) << pixel_shift), CHROMA_DC_BLOCK_INDEX+chroma_idx, chroma_dc_scan, NULL, 4) < 0){ return -1; } } @@ -1010,7 +1011,7 @@ decode_intra_mb: const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ?
0:3)][h->chroma_qp[chroma_idx]]; for(i4x4=0; i4x4<4; i4x4++){ const int index= 16 + 4*chroma_idx + i4x4; - if( decode_residual(h, gb, h->mb + (16*index<<h->pixel_shift), index, scan + 1, qmul, 15) < 0){ + if( decode_residual(h, gb, h->mb + (16*index << pixel_shift), index, scan + 1, qmul, 15) < 0){ return -1; } } diff --git a/libavcodec/h264_loopfilter.c b/libavcodec/h264_loopfilter.c index 325fd3cc61..2e61a3110a 100644 --- a/libavcodec/h264_loopfilter.c +++ b/libavcodec/h264_loopfilter.c @@ -101,197 +101,92 @@ static const uint8_t tc0_table[52*3][4] = { }; static void av_always_inline filter_mb_edgev( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, H264Context *h) { - const int bit_depth = h->sps.bit_depth_luma; - const int qp_bd_offset = 6*(bit_depth-8); + const int qp_bd_offset = 6 * (h->sps.bit_depth_luma - 8); const unsigned int index_a = qp - qp_bd_offset + h->slice_alpha_c0_offset; - const int alpha = alpha_table[index_a] << (bit_depth-8); - const int beta = beta_table[qp - qp_bd_offset + h->slice_beta_offset] << (bit_depth-8); + const int alpha = alpha_table[index_a]; + const int beta = beta_table[qp - qp_bd_offset + h->slice_beta_offset]; if (alpha ==0 || beta == 0) return; if( bS[0] < 4 ) { int8_t tc[4]; - tc[0] = tc0_table[index_a][bS[0]] << (bit_depth-8); - tc[1] = tc0_table[index_a][bS[1]] << (bit_depth-8); - tc[2] = tc0_table[index_a][bS[2]] << (bit_depth-8); - tc[3] = tc0_table[index_a][bS[3]] << (bit_depth-8); + tc[0] = tc0_table[index_a][bS[0]]; + tc[1] = tc0_table[index_a][bS[1]]; + tc[2] = tc0_table[index_a][bS[2]]; + tc[3] = tc0_table[index_a][bS[3]]; h->h264dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc); } else { h->h264dsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta); } } static void av_always_inline filter_mb_edgecv( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, H264Context *h ) { - const int bit_depth = h->sps.bit_depth_luma; - const int qp_bd_offset = 6*(bit_depth-8); + const int qp_bd_offset = 6 * (h->sps.bit_depth_luma - 8); const unsigned int index_a = qp - qp_bd_offset + h->slice_alpha_c0_offset; - const int alpha = alpha_table[index_a] << (bit_depth-8); - const int beta = beta_table[qp - qp_bd_offset + h->slice_beta_offset] << (bit_depth-8); + const int alpha = alpha_table[index_a]; + const int beta = beta_table[qp - qp_bd_offset + h->slice_beta_offset]; if (alpha ==0 || beta == 0) return; if( bS[0] < 4 ) { int8_t tc[4]; - tc[0] = (tc0_table[index_a][bS[0]] << (bit_depth-8))+1; - tc[1] = (tc0_table[index_a][bS[1]] << (bit_depth-8))+1; - tc[2] = (tc0_table[index_a][bS[2]] << (bit_depth-8))+1; - tc[3] = (tc0_table[index_a][bS[3]] << (bit_depth-8))+1; + tc[0] = tc0_table[index_a][bS[0]]+1; + tc[1] = tc0_table[index_a][bS[1]]+1; + tc[2] = tc0_table[index_a][bS[2]]+1; + tc[3] = tc0_table[index_a][bS[3]]+1; h->h264dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc); } else { h->h264dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta); } } -static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int bsi, int qp ) { - int i; - const int bit_depth = h->sps.bit_depth_luma; - const int qp_bd_offset = 6*(bit_depth-8); +static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[7], int bsi, int qp ) { + const int qp_bd_offset = 6 * (h->sps.bit_depth_luma - 8); int index_a = qp - qp_bd_offset + h->slice_alpha_c0_offset; - int alpha = alpha_table[index_a] << (bit_depth-8); - int beta = beta_table[qp - qp_bd_offset + h->slice_beta_offset] << (bit_depth-8);
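The deleted lines above scaled alpha, beta and tc0 to the stream bit depth at every h264_loopfilter.c call site; after this change the 8-bit table values are passed through unchanged and the per-bit-depth templates do the scaling once internally (h264dsp_template.c further down shifts alpha and beta by BIT_DEPTH - 8 and rebuilds the chroma tc as ((tc0[i] - 1) << (BIT_DEPTH - 8)) + 1). A small equivalence sketch, with hypothetical helper names:

#include <stdint.h>

/* Both helpers are hypothetical; they only show that scaling the 8-bit table
 * value at the call site (removed code) and inside the per-bit-depth template
 * (new code) yields the same threshold. */
static int alpha_scaled_at_call_site(const uint8_t *alpha_table, int index_a, int bit_depth)
{
    return alpha_table[index_a] << (bit_depth - 8);
}

static int alpha_scaled_in_template(const uint8_t *alpha_table, int index_a, int bit_depth)
{
    int alpha = alpha_table[index_a];   /* handed to the DSP function unscaled */
    alpha <<= bit_depth - 8;            /* done once inside the BIT_DEPTH template */
    return alpha;
}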
- for( i = 0; i < 8; i++, pix += stride) { - const int bS_index = (i >> 1) * bsi; - - if( bS[bS_index] == 0 ) { - continue; - } - - if( bS[bS_index] < 4 ) { - const int tc0 = tc0_table[index_a][bS[bS_index]] << (bit_depth-8); - const int p0 = pix[-1]; - const int p1 = pix[-2]; - const int p2 = pix[-3]; - const int q0 = pix[0]; - const int q1 = pix[1]; - const int q2 = pix[2]; - - if( FFABS( p0 - q0 ) < alpha && - FFABS( p1 - p0 ) < beta && - FFABS( q1 - q0 ) < beta ) { - int tc = tc0; - int i_delta; - - if( FFABS( p2 - p0 ) < beta ) { - if(tc0) - pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 ); - tc++; - } - if( FFABS( q2 - q0 ) < beta ) { - if(tc0) - pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 ); - tc++; - } - - i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); - pix[-1] = av_clip_uint8( p0 + i_delta ); /* p0' */ - pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */ - tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1); - } - }else{ - const int p0 = pix[-1]; - const int p1 = pix[-2]; - const int p2 = pix[-3]; - - const int q0 = pix[0]; - const int q1 = pix[1]; - const int q2 = pix[2]; - - if( FFABS( p0 - q0 ) < alpha && - FFABS( p1 - p0 ) < beta && - FFABS( q1 - q0 ) < beta ) { - - if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){ - if( FFABS( p2 - p0 ) < beta) - { - const int p3 = pix[-4]; - /* p0', p1', p2' */ - pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3; - pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2; - pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3; - } else { - /* p0' */ - pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2; - } - if( FFABS( q2 - q0 ) < beta) - { - const int q3 = pix[3]; - /* q0', q1', q2' */ - pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3; - pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2; - pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3; - } else { - /* q0' */ - pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; - } - }else{ - /* p0', q0' */ - pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2; - pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; - } - tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]); - } - } - } -} -static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int bsi, int qp ) { - int i; - const int bit_depth = h->sps.bit_depth_luma; - const int qp_bd_offset = 6*(bit_depth-8); - int index_a = qp - qp_bd_offset + h->slice_alpha_c0_offset; - int alpha = alpha_table[index_a] << (bit_depth-8); - int beta = beta_table[qp - qp_bd_offset + h->slice_beta_offset] << (bit_depth-8); - for( i = 0; i < 4; i++, pix += stride) { - const int bS_index = i*bsi; - - if( bS[bS_index] == 0 ) { - continue; - } - - if( bS[bS_index] < 4 ) { - const int tc = (tc0_table[index_a][bS[bS_index]] << (bit_depth-8)) + 1; - const int p0 = pix[-1]; - const int p1 = pix[-2]; - const int q0 = pix[0]; - const int q1 = pix[1]; - - if( FFABS( p0 - q0 ) < alpha && - FFABS( p1 - p0 ) < beta && - FFABS( q1 - q0 ) < beta ) { - const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); - - pix[-1] = av_clip_uint8( 
p0 + i_delta ); /* p0' */ - pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */ - tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1); - } - }else{ - const int p0 = pix[-1]; - const int p1 = pix[-2]; - const int q0 = pix[0]; - const int q1 = pix[1]; - - if( FFABS( p0 - q0 ) < alpha && - FFABS( p1 - p0 ) < beta && - FFABS( q1 - q0 ) < beta ) { - - pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */ - pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */ - tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]); - } - } - } -} - -static void av_always_inline filter_mb_edgeh( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, H264Context *h ) { - const int bit_depth = h->sps.bit_depth_luma; - const int qp_bd_offset = 6*(bit_depth-8); - const unsigned int index_a = qp - qp_bd_offset + h->slice_alpha_c0_offset; - const int alpha = alpha_table[index_a] << (bit_depth-8); - const int beta = beta_table[qp - qp_bd_offset + h->slice_beta_offset] << (bit_depth-8); + int alpha = alpha_table[index_a]; + int beta = beta_table[qp - qp_bd_offset + h->slice_beta_offset]; if (alpha ==0 || beta == 0) return; if( bS[0] < 4 ) { int8_t tc[4]; - tc[0] = tc0_table[index_a][bS[0]] << (bit_depth-8); - tc[1] = tc0_table[index_a][bS[1]] << (bit_depth-8); - tc[2] = tc0_table[index_a][bS[2]] << (bit_depth-8); - tc[3] = tc0_table[index_a][bS[3]] << (bit_depth-8); + tc[0] = tc0_table[index_a][bS[0*bsi]]; + tc[1] = tc0_table[index_a][bS[1*bsi]]; + tc[2] = tc0_table[index_a][bS[2*bsi]]; + tc[3] = tc0_table[index_a][bS[3*bsi]]; + h->h264dsp.h264_h_loop_filter_luma_mbaff(pix, stride, alpha, beta, tc); + } else { + h->h264dsp.h264_h_loop_filter_luma_mbaff_intra(pix, stride, alpha, beta); + } +} +static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[7], int bsi, int qp ) { + const int qp_bd_offset = 6 * (h->sps.bit_depth_luma - 8); + int index_a = qp - qp_bd_offset + h->slice_alpha_c0_offset; + int alpha = alpha_table[index_a]; + int beta = beta_table[qp - qp_bd_offset + h->slice_beta_offset]; + if (alpha ==0 || beta == 0) return; + + if( bS[0] < 4 ) { + int8_t tc[4]; + tc[0] = tc0_table[index_a][bS[0*bsi]] + 1; + tc[1] = tc0_table[index_a][bS[1*bsi]] + 1; + tc[2] = tc0_table[index_a][bS[2*bsi]] + 1; + tc[3] = tc0_table[index_a][bS[3*bsi]] + 1; + h->h264dsp.h264_h_loop_filter_chroma_mbaff(pix, stride, alpha, beta, tc); + } else { + h->h264dsp.h264_h_loop_filter_chroma_mbaff_intra(pix, stride, alpha, beta); + } +} + +static void av_always_inline filter_mb_edgeh( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, H264Context *h ) { + const int qp_bd_offset = 6 * (h->sps.bit_depth_luma - 8); + const unsigned int index_a = qp - qp_bd_offset + h->slice_alpha_c0_offset; + const int alpha = alpha_table[index_a]; + const int beta = beta_table[qp - qp_bd_offset + h->slice_beta_offset]; + if (alpha ==0 || beta == 0) return; + + if( bS[0] < 4 ) { + int8_t tc[4]; + tc[0] = tc0_table[index_a][bS[0]]; + tc[1] = tc0_table[index_a][bS[1]]; + tc[2] = tc0_table[index_a][bS[2]]; + tc[3] = tc0_table[index_a][bS[3]]; h->h264dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc); } else { 
h->h264dsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta); @@ -299,19 +194,18 @@ static void av_always_inline filter_mb_edgeh( uint8_t *pix, int stride, int16_t } static void av_always_inline filter_mb_edgech( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, H264Context *h ) { - const int bit_depth = h->sps.bit_depth_luma; - const int qp_bd_offset = 6*(bit_depth-8); + const int qp_bd_offset = 6 * (h->sps.bit_depth_luma - 8); const unsigned int index_a = qp - qp_bd_offset + h->slice_alpha_c0_offset; - const int alpha = alpha_table[index_a] << (bit_depth-8); - const int beta = beta_table[qp - qp_bd_offset + h->slice_beta_offset] << (bit_depth-8); + const int alpha = alpha_table[index_a]; + const int beta = beta_table[qp - qp_bd_offset + h->slice_beta_offset]; if (alpha ==0 || beta == 0) return; if( bS[0] < 4 ) { int8_t tc[4]; - tc[0] = (tc0_table[index_a][bS[0]] << (bit_depth-8))+1; - tc[1] = (tc0_table[index_a][bS[1]] << (bit_depth-8))+1; - tc[2] = (tc0_table[index_a][bS[2]] << (bit_depth-8))+1; - tc[3] = (tc0_table[index_a][bS[3]] << (bit_depth-8))+1; + tc[0] = tc0_table[index_a][bS[0]]+1; + tc[1] = tc0_table[index_a][bS[1]]+1; + tc[2] = tc0_table[index_a][bS[2]]+1; + tc[3] = tc0_table[index_a][bS[3]]+1; h->h264dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc); } else { h->h264dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta); @@ -650,10 +544,10 @@ static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, u tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize); //{ int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); } if( dir == 0 ) { - filter_mb_edgev( &img_y[4*edge<<h->pixel_shift], linesize, bS, qp, h ); + filter_mb_edgev( &img_y[4*edge << h->pixel_shift], linesize, bS, qp, h ); if( (edge&1) == 0 ) { - filter_mb_edgecv( &img_cb[2*edge<<h->pixel_shift], uvlinesize, bS, h->chroma_qp[0], h); - filter_mb_edgecv( &img_cr[2*edge<<h->pixel_shift], uvlinesize, bS, h->chroma_qp[1], h); + filter_mb_edgecv( &img_cb[2*edge << h->pixel_shift], uvlinesize, bS, h->chroma_qp[0], h); + filter_mb_edgecv( &img_cr[2*edge << h->pixel_shift], uvlinesize, bS, h->chroma_qp[1], h); } } else { filter_mb_edgeh( &img_y[4*edge*linesize], linesize, bS, qp, h ); diff --git a/libavcodec/h264_ps.c b/libavcodec/h264_ps.c index ab20ecfeb7..f77a013112 100644 --- a/libavcodec/h264_ps.c +++ b/libavcodec/h264_ps.c @@ -70,7 +70,7 @@ static const AVRational pixel_aspect[17]={ QP(37,d), QP(37,d), QP(37,d), QP(38,d), QP(38,d), QP(38,d),\ QP(39,d), QP(39,d), QP(39,d), QP(39,d) -const uint8_t ff_h264_chroma_qp[3][QP_MAX_MAX+1] = { +const uint8_t ff_h264_chroma_qp[3][QP_MAX_NUM+1] = { { CHROMA_QP_TABLE_END(8) }, diff --git a/libavcodec/h264_refs.c b/libavcodec/h264_refs.c index a2058b5aec..9554201522 100644 --- a/libavcodec/h264_refs.c +++ b/libavcodec/h264_refs.c @@ -629,8 +629,9 @@ int ff_h264_execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){ * short_ref and long_ref buffers.
*/ av_log(h->s.avctx, AV_LOG_ERROR, - "number of reference frames exceeds max (probably " - "corrupt input), discarding one long:%d short:%d max:%d\n", h->long_ref_count, h->short_ref_count, h->sps.ref_frame_count); + "number of reference frames (%d+%d) exceeds max (%d; probably " + "corrupt input), discarding one\n", + h->long_ref_count, h->short_ref_count, h->sps.ref_frame_count); if (h->long_ref_count && !h->short_ref_count) { for (i = 0; i < 16; ++i) diff --git a/libavcodec/h264dsp.c b/libavcodec/h264dsp.c index 04c6ea6df4..96a38ff77d 100644 --- a/libavcodec/h264dsp.c +++ b/libavcodec/h264dsp.c @@ -30,15 +30,15 @@ #include "h264dsp.h" #define BIT_DEPTH 8 -#include "h264dsp_internal.h" +#include "h264dsp_template.c" #undef BIT_DEPTH #define BIT_DEPTH 9 -#include "h264dsp_internal.h" +#include "h264dsp_template.c" #undef BIT_DEPTH #define BIT_DEPTH 10 -#include "h264dsp_internal.h" +#include "h264dsp_template.c" #undef BIT_DEPTH void ff_h264dsp_init(H264DSPContext *c, const int bit_depth) @@ -47,58 +47,62 @@ void ff_h264dsp_init(H264DSPContext *c, const int bit_depth) #define FUNC(a, depth) a ## _ ## depth ## _c #define H264_DSP(depth) \ - c->h264_idct_add = FUNC(ff_h264_idct_add , depth);\ - c->h264_idct8_add = FUNC(ff_h264_idct8_add , depth);\ - c->h264_idct_dc_add = FUNC(ff_h264_idct_dc_add , depth);\ - c->h264_idct8_dc_add = FUNC(ff_h264_idct8_dc_add , depth);\ - c->h264_idct_add16 = FUNC(ff_h264_idct_add16 , depth);\ - c->h264_idct8_add4 = FUNC(ff_h264_idct8_add4 , depth);\ - c->h264_idct_add8 = FUNC(ff_h264_idct_add8 , depth);\ - c->h264_idct_add16intra = FUNC(ff_h264_idct_add16intra , depth);\ - c->h264_luma_dc_dequant_idct = FUNC(ff_h264_luma_dc_dequant_idct , depth);\ - c->h264_chroma_dc_dequant_idct = FUNC(ff_h264_chroma_dc_dequant_idct , depth);\ + c->h264_idct_add= FUNC(ff_h264_idct_add, depth);\ + c->h264_idct8_add= FUNC(ff_h264_idct8_add, depth);\ + c->h264_idct_dc_add= FUNC(ff_h264_idct_dc_add, depth);\ + c->h264_idct8_dc_add= FUNC(ff_h264_idct8_dc_add, depth);\ + c->h264_idct_add16 = FUNC(ff_h264_idct_add16, depth);\ + c->h264_idct8_add4 = FUNC(ff_h264_idct8_add4, depth);\ + c->h264_idct_add8 = FUNC(ff_h264_idct_add8, depth);\ + c->h264_idct_add16intra= FUNC(ff_h264_idct_add16intra, depth);\ + c->h264_luma_dc_dequant_idct= FUNC(ff_h264_luma_dc_dequant_idct, depth);\ + c->h264_chroma_dc_dequant_idct= FUNC(ff_h264_chroma_dc_dequant_idct, depth);\ \ - c->weight_h264_pixels_tab[0] = FUNC( weight_h264_pixels16x16 , depth);\ - c->weight_h264_pixels_tab[1] = FUNC( weight_h264_pixels16x8 , depth);\ - c->weight_h264_pixels_tab[2] = FUNC( weight_h264_pixels8x16 , depth);\ - c->weight_h264_pixels_tab[3] = FUNC( weight_h264_pixels8x8 , depth);\ - c->weight_h264_pixels_tab[4] = FUNC( weight_h264_pixels8x4 , depth);\ - c->weight_h264_pixels_tab[5] = FUNC( weight_h264_pixels4x8 , depth);\ - c->weight_h264_pixels_tab[6] = FUNC( weight_h264_pixels4x4 , depth);\ - c->weight_h264_pixels_tab[7] = FUNC( weight_h264_pixels4x2 , depth);\ - c->weight_h264_pixels_tab[8] = FUNC( weight_h264_pixels2x4 , depth);\ - c->weight_h264_pixels_tab[9] = FUNC( weight_h264_pixels2x2 , depth);\ - c->biweight_h264_pixels_tab[0] = FUNC(biweight_h264_pixels16x16 , depth);\ - c->biweight_h264_pixels_tab[1] = FUNC(biweight_h264_pixels16x8 , depth);\ - c->biweight_h264_pixels_tab[2] = FUNC(biweight_h264_pixels8x16 , depth);\ - c->biweight_h264_pixels_tab[3] = FUNC(biweight_h264_pixels8x8 , depth);\ - c->biweight_h264_pixels_tab[4] = FUNC(biweight_h264_pixels8x4 , depth);\ - c->biweight_h264_pixels_tab[5] = 
FUNC(biweight_h264_pixels4x8 , depth);\ - c->biweight_h264_pixels_tab[6] = FUNC(biweight_h264_pixels4x4 , depth);\ - c->biweight_h264_pixels_tab[7] = FUNC(biweight_h264_pixels4x2 , depth);\ - c->biweight_h264_pixels_tab[8] = FUNC(biweight_h264_pixels2x4 , depth);\ - c->biweight_h264_pixels_tab[9] = FUNC(biweight_h264_pixels2x2 , depth);\ + c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16x16, depth);\ + c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels16x8, depth);\ + c->weight_h264_pixels_tab[2]= FUNC(weight_h264_pixels8x16, depth);\ + c->weight_h264_pixels_tab[3]= FUNC(weight_h264_pixels8x8, depth);\ + c->weight_h264_pixels_tab[4]= FUNC(weight_h264_pixels8x4, depth);\ + c->weight_h264_pixels_tab[5]= FUNC(weight_h264_pixels4x8, depth);\ + c->weight_h264_pixels_tab[6]= FUNC(weight_h264_pixels4x4, depth);\ + c->weight_h264_pixels_tab[7]= FUNC(weight_h264_pixels4x2, depth);\ + c->weight_h264_pixels_tab[8]= FUNC(weight_h264_pixels2x4, depth);\ + c->weight_h264_pixels_tab[9]= FUNC(weight_h264_pixels2x2, depth);\ + c->biweight_h264_pixels_tab[0]= FUNC(biweight_h264_pixels16x16, depth);\ + c->biweight_h264_pixels_tab[1]= FUNC(biweight_h264_pixels16x8, depth);\ + c->biweight_h264_pixels_tab[2]= FUNC(biweight_h264_pixels8x16, depth);\ + c->biweight_h264_pixels_tab[3]= FUNC(biweight_h264_pixels8x8, depth);\ + c->biweight_h264_pixels_tab[4]= FUNC(biweight_h264_pixels8x4, depth);\ + c->biweight_h264_pixels_tab[5]= FUNC(biweight_h264_pixels4x8, depth);\ + c->biweight_h264_pixels_tab[6]= FUNC(biweight_h264_pixels4x4, depth);\ + c->biweight_h264_pixels_tab[7]= FUNC(biweight_h264_pixels4x2, depth);\ + c->biweight_h264_pixels_tab[8]= FUNC(biweight_h264_pixels2x4, depth);\ + c->biweight_h264_pixels_tab[9]= FUNC(biweight_h264_pixels2x2, depth);\ \ - c->h264_v_loop_filter_luma = FUNC(h264_v_loop_filter_luma , depth);\ - c->h264_h_loop_filter_luma = FUNC(h264_h_loop_filter_luma , depth);\ - c->h264_v_loop_filter_luma_intra = FUNC(h264_v_loop_filter_luma_intra , depth);\ - c->h264_h_loop_filter_luma_intra = FUNC(h264_h_loop_filter_luma_intra , depth);\ - c->h264_v_loop_filter_chroma = FUNC(h264_v_loop_filter_chroma , depth);\ - c->h264_h_loop_filter_chroma = FUNC(h264_h_loop_filter_chroma , depth);\ - c->h264_v_loop_filter_chroma_intra = FUNC(h264_v_loop_filter_chroma_intra, depth);\ - c->h264_h_loop_filter_chroma_intra = FUNC(h264_h_loop_filter_chroma_intra, depth);\ + c->h264_v_loop_filter_luma= FUNC(h264_v_loop_filter_luma, depth);\ + c->h264_h_loop_filter_luma= FUNC(h264_h_loop_filter_luma, depth);\ + c->h264_h_loop_filter_luma_mbaff= FUNC(h264_h_loop_filter_luma_mbaff, depth);\ + c->h264_v_loop_filter_luma_intra= FUNC(h264_v_loop_filter_luma_intra, depth);\ + c->h264_h_loop_filter_luma_intra= FUNC(h264_h_loop_filter_luma_intra, depth);\ + c->h264_h_loop_filter_luma_mbaff_intra= FUNC(h264_h_loop_filter_luma_mbaff_intra, depth);\ + c->h264_v_loop_filter_chroma= FUNC(h264_v_loop_filter_chroma, depth);\ + c->h264_h_loop_filter_chroma= FUNC(h264_h_loop_filter_chroma, depth);\ + c->h264_h_loop_filter_chroma_mbaff= FUNC(h264_h_loop_filter_chroma_mbaff, depth);\ + c->h264_v_loop_filter_chroma_intra= FUNC(h264_v_loop_filter_chroma_intra, depth);\ + c->h264_h_loop_filter_chroma_intra= FUNC(h264_h_loop_filter_chroma_intra, depth);\ + c->h264_h_loop_filter_chroma_mbaff_intra= FUNC(h264_h_loop_filter_chroma_mbaff_intra, depth);\ c->h264_loop_filter_strength= NULL; switch (bit_depth) { - case 9: - H264_DSP(9); - break; - case 10: - H264_DSP(10); - break; - default: - H264_DSP(8); - break; + case 9: + 
H264_DSP(9); + break; + case 10: + H264_DSP(10); + break; + default: + H264_DSP(8); + break; } if (ARCH_ARM) ff_h264dsp_init_arm(c, bit_depth); diff --git a/libavcodec/h264dsp.h b/libavcodec/h264dsp.h index 8a0b9ae72b..4b606efa17 100644 --- a/libavcodec/h264dsp.h +++ b/libavcodec/h264dsp.h @@ -45,13 +45,17 @@ typedef struct H264DSPContext{ /* loop filter */ void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0); void (*h264_h_loop_filter_luma)(uint8_t *pix/*align 4 */, int stride, int alpha, int beta, int8_t *tc0); + void (*h264_h_loop_filter_luma_mbaff)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0); /* v/h_loop_filter_luma_intra: align 16 */ void (*h264_v_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta); void (*h264_h_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta); + void (*h264_h_loop_filter_luma_mbaff_intra)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta); void (*h264_v_loop_filter_chroma)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta, int8_t *tc0); void (*h264_h_loop_filter_chroma)(uint8_t *pix/*align 4*/, int stride, int alpha, int beta, int8_t *tc0); + void (*h264_h_loop_filter_chroma_mbaff)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta, int8_t *tc0); void (*h264_v_loop_filter_chroma_intra)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta); void (*h264_h_loop_filter_chroma_intra)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta); + void (*h264_h_loop_filter_chroma_mbaff_intra)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta); // h264_loop_filter_strength: simd only. the C version is inlined in h264.c void (*h264_loop_filter_strength)(int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2], int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field); diff --git a/libavcodec/h264dsp_template.c b/libavcodec/h264dsp_template.c new file mode 100644 index 0000000000..91162ea900 --- /dev/null +++ b/libavcodec/h264dsp_template.c @@ -0,0 +1,313 @@ +/* + * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder + * Copyright (c) 2003-2010 Michael Niedermayer + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * H.264 / AVC / MPEG4 part10 DSP functions. 
+ * @author Michael Niedermayer + */ + +#include "high_bit_depth.h" + +#define op_scale1(x) block[x] = av_clip_pixel( (block[x]*weight + offset) >> log2_denom ) +#define op_scale2(x) dst[x] = av_clip_pixel( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1)) +#define H264_WEIGHT(W,H) \ +static void FUNCC(weight_h264_pixels ## W ## x ## H)(uint8_t *_block, int stride, int log2_denom, int weight, int offset){ \ + int y; \ + pixel *block = (pixel*)_block; \ + stride /= sizeof(pixel); \ + offset <<= (log2_denom + (BIT_DEPTH-8)); \ + if(log2_denom) offset += 1<<(log2_denom-1); \ + for(y=0; y> 1 ) ) >> 1) - p1, -tc_orig, tc_orig ); + tc++; + } + if( FFABS( q2 - q0 ) < beta ) { + if(tc_orig) + pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc_orig, tc_orig ); + tc++; + } + + i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); + pix[-xstride] = av_clip_pixel( p0 + i_delta ); /* p0' */ + pix[0] = av_clip_pixel( q0 - i_delta ); /* q0' */ + } + pix += ystride; + } + } +} +static void FUNCC(h264_v_loop_filter_luma)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) +{ + FUNCC(h264_loop_filter_luma)(pix, stride, sizeof(pixel), 4, alpha, beta, tc0); +} +static void FUNCC(h264_h_loop_filter_luma)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) +{ + FUNCC(h264_loop_filter_luma)(pix, sizeof(pixel), stride, 4, alpha, beta, tc0); +} +static void FUNCC(h264_h_loop_filter_luma_mbaff)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) +{ + FUNCC(h264_loop_filter_luma)(pix, sizeof(pixel), stride, 2, alpha, beta, tc0); +} + +static av_always_inline av_flatten void FUNCC(h264_loop_filter_luma_intra)(uint8_t *_pix, int xstride, int ystride, int inner_iters, int alpha, int beta) +{ + pixel *pix = (pixel*)_pix; + int d; + xstride /= sizeof(pixel); + ystride /= sizeof(pixel); + alpha <<= BIT_DEPTH - 8; + beta <<= BIT_DEPTH - 8; + for( d = 0; d < 4 * inner_iters; d++ ) { + const int p2 = pix[-3*xstride]; + const int p1 = pix[-2*xstride]; + const int p0 = pix[-1*xstride]; + + const int q0 = pix[ 0*xstride]; + const int q1 = pix[ 1*xstride]; + const int q2 = pix[ 2*xstride]; + + if( FFABS( p0 - q0 ) < alpha && + FFABS( p1 - p0 ) < beta && + FFABS( q1 - q0 ) < beta ) { + + if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){ + if( FFABS( p2 - p0 ) < beta) + { + const int p3 = pix[-4*xstride]; + /* p0', p1', p2' */ + pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3; + pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2; + pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3; + } else { + /* p0' */ + pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; + } + if( FFABS( q2 - q0 ) < beta) + { + const int q3 = pix[3*xstride]; + /* q0', q1', q2' */ + pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3; + pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2; + pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3; + } else { + /* q0' */ + pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2; + } + }else{ + /* p0', q0' */ + pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; + pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2; + } + } + pix += ystride; + } +} +static void FUNCC(h264_v_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta) +{ + FUNCC(h264_loop_filter_luma_intra)(pix, stride, sizeof(pixel), 4, alpha, beta); +} +static void FUNCC(h264_h_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta) +{ + FUNCC(h264_loop_filter_luma_intra)(pix, sizeof(pixel), stride, 4, alpha, beta); +} +static void 
FUNCC(h264_h_loop_filter_luma_mbaff_intra)(uint8_t *pix, int stride, int alpha, int beta) +{ + FUNCC(h264_loop_filter_luma_intra)(pix, sizeof(pixel), stride, 2, alpha, beta); +} + +static av_always_inline av_flatten void FUNCC(h264_loop_filter_chroma)(uint8_t *_pix, int xstride, int ystride, int inner_iters, int alpha, int beta, int8_t *tc0) +{ + pixel *pix = (pixel*)_pix; + int i, d; + xstride /= sizeof(pixel); + ystride /= sizeof(pixel); + alpha <<= BIT_DEPTH - 8; + beta <<= BIT_DEPTH - 8; + for( i = 0; i < 4; i++ ) { + const int tc = ((tc0[i] - 1) << (BIT_DEPTH - 8)) + 1; + if( tc <= 0 ) { + pix += inner_iters*ystride; + continue; + } + for( d = 0; d < inner_iters; d++ ) { + const int p0 = pix[-1*xstride]; + const int p1 = pix[-2*xstride]; + const int q0 = pix[0]; + const int q1 = pix[1*xstride]; + + if( FFABS( p0 - q0 ) < alpha && + FFABS( p1 - p0 ) < beta && + FFABS( q1 - q0 ) < beta ) { + + int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); + + pix[-xstride] = av_clip_pixel( p0 + delta ); /* p0' */ + pix[0] = av_clip_pixel( q0 - delta ); /* q0' */ + } + pix += ystride; + } + } +} +static void FUNCC(h264_v_loop_filter_chroma)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) +{ + FUNCC(h264_loop_filter_chroma)(pix, stride, sizeof(pixel), 2, alpha, beta, tc0); +} +static void FUNCC(h264_h_loop_filter_chroma)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) +{ + FUNCC(h264_loop_filter_chroma)(pix, sizeof(pixel), stride, 2, alpha, beta, tc0); +} +static void FUNCC(h264_h_loop_filter_chroma_mbaff)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) +{ + FUNCC(h264_loop_filter_chroma)(pix, sizeof(pixel), stride, 1, alpha, beta, tc0); +} + +static av_always_inline av_flatten void FUNCC(h264_loop_filter_chroma_intra)(uint8_t *_pix, int xstride, int ystride, int inner_iters, int alpha, int beta) +{ + pixel *pix = (pixel*)_pix; + int d; + xstride /= sizeof(pixel); + ystride /= sizeof(pixel); + alpha <<= BIT_DEPTH - 8; + beta <<= BIT_DEPTH - 8; + for( d = 0; d < 4 * inner_iters; d++ ) { + const int p0 = pix[-1*xstride]; + const int p1 = pix[-2*xstride]; + const int q0 = pix[0]; + const int q1 = pix[1*xstride]; + + if( FFABS( p0 - q0 ) < alpha && + FFABS( p1 - p0 ) < beta && + FFABS( q1 - q0 ) < beta ) { + + pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */ + pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */ + } + pix += ystride; + } +} +static void FUNCC(h264_v_loop_filter_chroma_intra)(uint8_t *pix, int stride, int alpha, int beta) +{ + FUNCC(h264_loop_filter_chroma_intra)(pix, stride, sizeof(pixel), 2, alpha, beta); +} +static void FUNCC(h264_h_loop_filter_chroma_intra)(uint8_t *pix, int stride, int alpha, int beta) +{ + FUNCC(h264_loop_filter_chroma_intra)(pix, sizeof(pixel), stride, 2, alpha, beta); +} +static void FUNCC(h264_h_loop_filter_chroma_mbaff_intra)(uint8_t *pix, int stride, int alpha, int beta) +{ + FUNCC(h264_loop_filter_chroma_intra)(pix, sizeof(pixel), stride, 1, alpha, beta); +} diff --git a/libavcodec/h264idct.c b/libavcodec/h264idct.c index 920356d01f..7d1ee007bc 100644 --- a/libavcodec/h264idct.c +++ b/libavcodec/h264idct.c @@ -26,13 +26,13 @@ */ #define BIT_DEPTH 8 -#include "h264idct_internal.h" +#include "h264idct_template.c" #undef BIT_DEPTH #define BIT_DEPTH 9 -#include "h264idct_internal.h" +#include "h264idct_template.c" #undef BIT_DEPTH #define BIT_DEPTH 10 -#include "h264idct_internal.h" +#include "h264idct_template.c" #undef BIT_DEPTH diff --git a/libavcodec/h264idct_template.c 
b/libavcodec/h264idct_template.c new file mode 100644 index 0000000000..39c9a1c9eb --- /dev/null +++ b/libavcodec/h264idct_template.c @@ -0,0 +1,291 @@ +/* + * H.264 IDCT + * Copyright (c) 2004-2011 Michael Niedermayer + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * H.264 IDCT. + * @author Michael Niedermayer + */ + +#include "high_bit_depth.h" + +#ifndef AVCODEC_H264IDCT_INTERNAL_H +#define AVCODEC_H264IDCT_INTERNAL_H +//FIXME this table is a duplicate from h264data.h, and will be removed once the tables from, h264 have been split +static const uint8_t scan8[16 + 2*4]={ + 4+1*8, 5+1*8, 4+2*8, 5+2*8, + 6+1*8, 7+1*8, 6+2*8, 7+2*8, + 4+3*8, 5+3*8, 4+4*8, 5+4*8, + 6+3*8, 7+3*8, 6+4*8, 7+4*8, + 1+1*8, 2+1*8, + 1+2*8, 2+2*8, + 1+4*8, 2+4*8, + 1+5*8, 2+5*8, +}; +#endif + +static av_always_inline void FUNCC(idct_internal)(uint8_t *_dst, DCTELEM *_block, int stride, int block_stride, int shift, int add){ + int i; + INIT_CLIP + pixel *dst = (pixel*)_dst; + dctcoef *block = (dctcoef*)_block; + stride /= sizeof(pixel); + + block[0] += 1<<(shift-1); + + for(i=0; i<4; i++){ + const int z0= block[i + block_stride*0] + block[i + block_stride*2]; + const int z1= block[i + block_stride*0] - block[i + block_stride*2]; + const int z2= (block[i + block_stride*1]>>1) - block[i + block_stride*3]; + const int z3= block[i + block_stride*1] + (block[i + block_stride*3]>>1); + + block[i + block_stride*0]= z0 + z3; + block[i + block_stride*1]= z1 + z2; + block[i + block_stride*2]= z1 - z2; + block[i + block_stride*3]= z0 - z3; + } + + for(i=0; i<4; i++){ + const int z0= block[0 + block_stride*i] + block[2 + block_stride*i]; + const int z1= block[0 + block_stride*i] - block[2 + block_stride*i]; + const int z2= (block[1 + block_stride*i]>>1) - block[3 + block_stride*i]; + const int z3= block[1 + block_stride*i] + (block[3 + block_stride*i]>>1); + + dst[i + 0*stride]= CLIP(add*dst[i + 0*stride] + ((z0 + z3) >> shift)); + dst[i + 1*stride]= CLIP(add*dst[i + 1*stride] + ((z1 + z2) >> shift)); + dst[i + 2*stride]= CLIP(add*dst[i + 2*stride] + ((z1 - z2) >> shift)); + dst[i + 3*stride]= CLIP(add*dst[i + 3*stride] + ((z0 - z3) >> shift)); + } +} + +void FUNCC(ff_h264_idct_add)(uint8_t *dst, DCTELEM *block, int stride){ + FUNCC(idct_internal)(dst, block, stride, 4, 6, 1); +} + +void FUNCC(ff_h264_lowres_idct_add)(uint8_t *dst, int stride, DCTELEM *block){ + FUNCC(idct_internal)(dst, block, stride, 8, 3, 1); +} + +void FUNCC(ff_h264_lowres_idct_put)(uint8_t *dst, int stride, DCTELEM *block){ + FUNCC(idct_internal)(dst, block, stride, 8, 3, 0); +} + +void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, DCTELEM *_block, int stride){ + int i; + INIT_CLIP + pixel *dst = (pixel*)_dst; + dctcoef *block = (dctcoef*)_block; + stride /= sizeof(pixel); + + block[0] += 32; + + for( i = 0; i < 8; i++ ) + { + 
const int a0 = block[i+0*8] + block[i+4*8]; + const int a2 = block[i+0*8] - block[i+4*8]; + const int a4 = (block[i+2*8]>>1) - block[i+6*8]; + const int a6 = (block[i+6*8]>>1) + block[i+2*8]; + + const int b0 = a0 + a6; + const int b2 = a2 + a4; + const int b4 = a2 - a4; + const int b6 = a0 - a6; + + const int a1 = -block[i+3*8] + block[i+5*8] - block[i+7*8] - (block[i+7*8]>>1); + const int a3 = block[i+1*8] + block[i+7*8] - block[i+3*8] - (block[i+3*8]>>1); + const int a5 = -block[i+1*8] + block[i+7*8] + block[i+5*8] + (block[i+5*8]>>1); + const int a7 = block[i+3*8] + block[i+5*8] + block[i+1*8] + (block[i+1*8]>>1); + + const int b1 = (a7>>2) + a1; + const int b3 = a3 + (a5>>2); + const int b5 = (a3>>2) - a5; + const int b7 = a7 - (a1>>2); + + block[i+0*8] = b0 + b7; + block[i+7*8] = b0 - b7; + block[i+1*8] = b2 + b5; + block[i+6*8] = b2 - b5; + block[i+2*8] = b4 + b3; + block[i+5*8] = b4 - b3; + block[i+3*8] = b6 + b1; + block[i+4*8] = b6 - b1; + } + for( i = 0; i < 8; i++ ) + { + const int a0 = block[0+i*8] + block[4+i*8]; + const int a2 = block[0+i*8] - block[4+i*8]; + const int a4 = (block[2+i*8]>>1) - block[6+i*8]; + const int a6 = (block[6+i*8]>>1) + block[2+i*8]; + + const int b0 = a0 + a6; + const int b2 = a2 + a4; + const int b4 = a2 - a4; + const int b6 = a0 - a6; + + const int a1 = -block[3+i*8] + block[5+i*8] - block[7+i*8] - (block[7+i*8]>>1); + const int a3 = block[1+i*8] + block[7+i*8] - block[3+i*8] - (block[3+i*8]>>1); + const int a5 = -block[1+i*8] + block[7+i*8] + block[5+i*8] + (block[5+i*8]>>1); + const int a7 = block[3+i*8] + block[5+i*8] + block[1+i*8] + (block[1+i*8]>>1); + + const int b1 = (a7>>2) + a1; + const int b3 = a3 + (a5>>2); + const int b5 = (a3>>2) - a5; + const int b7 = a7 - (a1>>2); + + dst[i + 0*stride] = CLIP( dst[i + 0*stride] + ((b0 + b7) >> 6) ); + dst[i + 1*stride] = CLIP( dst[i + 1*stride] + ((b2 + b5) >> 6) ); + dst[i + 2*stride] = CLIP( dst[i + 2*stride] + ((b4 + b3) >> 6) ); + dst[i + 3*stride] = CLIP( dst[i + 3*stride] + ((b6 + b1) >> 6) ); + dst[i + 4*stride] = CLIP( dst[i + 4*stride] + ((b6 - b1) >> 6) ); + dst[i + 5*stride] = CLIP( dst[i + 5*stride] + ((b4 - b3) >> 6) ); + dst[i + 6*stride] = CLIP( dst[i + 6*stride] + ((b2 - b5) >> 6) ); + dst[i + 7*stride] = CLIP( dst[i + 7*stride] + ((b0 - b7) >> 6) ); + } +} + +// assumes all AC coefs are 0 +void FUNCC(ff_h264_idct_dc_add)(uint8_t *_dst, DCTELEM *block, int stride){ + int i, j; + int dc = (((dctcoef*)block)[0] + 32) >> 6; + INIT_CLIP + pixel *dst = (pixel*)_dst; + stride /= sizeof(pixel); + for( j = 0; j < 4; j++ ) + { + for( i = 0; i < 4; i++ ) + dst[i] = CLIP( dst[i] + dc ); + dst += stride; + } +} + +void FUNCC(ff_h264_idct8_dc_add)(uint8_t *_dst, DCTELEM *block, int stride){ + int i, j; + int dc = (((dctcoef*)block)[0] + 32) >> 6; + INIT_CLIP + pixel *dst = (pixel*)_dst; + stride /= sizeof(pixel); + for( j = 0; j < 8; j++ ) + { + for( i = 0; i < 8; i++ ) + dst[i] = CLIP( dst[i] + dc ); + dst += stride; + } +} + +void FUNCC(ff_h264_idct_add16)(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ + int i; + for(i=0; i<16; i++){ + int nnz = nnzc[ scan8[i] ]; + if(nnz){ + if(nnz==1 && ((dctcoef*)block)[i*16]) FUNCC(ff_h264_idct_dc_add)(dst + block_offset[i], block + i*16*sizeof(pixel), stride); + else FUNCC(idct_internal )(dst + block_offset[i], block + i*16*sizeof(pixel), stride, 4, 6, 1); + } + } +} + +void FUNCC(ff_h264_idct_add16intra)(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ + int i; + 
for(i=0; i<16; i++){ + if(nnzc[ scan8[i] ]) FUNCC(idct_internal )(dst + block_offset[i], block + i*16*sizeof(pixel), stride, 4, 6, 1); + else if(((dctcoef*)block)[i*16]) FUNCC(ff_h264_idct_dc_add)(dst + block_offset[i], block + i*16*sizeof(pixel), stride); + } +} + +void FUNCC(ff_h264_idct8_add4)(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ + int i; + for(i=0; i<16; i+=4){ + int nnz = nnzc[ scan8[i] ]; + if(nnz){ + if(nnz==1 && ((dctcoef*)block)[i*16]) FUNCC(ff_h264_idct8_dc_add)(dst + block_offset[i], block + i*16*sizeof(pixel), stride); + else FUNCC(ff_h264_idct8_add )(dst + block_offset[i], block + i*16*sizeof(pixel), stride); + } + } +} + +void FUNCC(ff_h264_idct_add8)(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ + int i; + for(i=16; i<16+8; i++){ + if(nnzc[ scan8[i] ]) + FUNCC(ff_h264_idct_add )(dest[(i&4)>>2] + block_offset[i], block + i*16*sizeof(pixel), stride); + else if(((dctcoef*)block)[i*16]) + FUNCC(ff_h264_idct_dc_add)(dest[(i&4)>>2] + block_offset[i], block + i*16*sizeof(pixel), stride); + } +} +/** + * IDCT transforms the 16 dc values and dequantizes them. + * @param qp quantization parameter + */ +void FUNCC(ff_h264_luma_dc_dequant_idct)(DCTELEM *_output, DCTELEM *_input, int qmul){ +#define stride 16 + int i; + int temp[16]; + static const uint8_t x_offset[4]={0, 2*stride, 8*stride, 10*stride}; + dctcoef *input = (dctcoef*)_input; + dctcoef *output = (dctcoef*)_output; + + for(i=0; i<4; i++){ + const int z0= input[4*i+0] + input[4*i+1]; + const int z1= input[4*i+0] - input[4*i+1]; + const int z2= input[4*i+2] - input[4*i+3]; + const int z3= input[4*i+2] + input[4*i+3]; + + temp[4*i+0]= z0+z3; + temp[4*i+1]= z0-z3; + temp[4*i+2]= z1-z2; + temp[4*i+3]= z1+z2; + } + + for(i=0; i<4; i++){ + const int offset= x_offset[i]; + const int z0= temp[4*0+i] + temp[4*2+i]; + const int z1= temp[4*0+i] - temp[4*2+i]; + const int z2= temp[4*1+i] - temp[4*3+i]; + const int z3= temp[4*1+i] + temp[4*3+i]; + + output[stride* 0+offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); + output[stride* 1+offset]= ((((z1 + z2)*qmul + 128 ) >> 8)); + output[stride* 4+offset]= ((((z1 - z2)*qmul + 128 ) >> 8)); + output[stride* 5+offset]= ((((z0 - z3)*qmul + 128 ) >> 8)); + } +#undef stride +} + +void FUNCC(ff_h264_chroma_dc_dequant_idct)(DCTELEM *_block, int qmul){ + const int stride= 16*2; + const int xStride= 16; + int a,b,c,d,e; + dctcoef *block = (dctcoef*)_block; + + a= block[stride*0 + xStride*0]; + b= block[stride*0 + xStride*1]; + c= block[stride*1 + xStride*0]; + d= block[stride*1 + xStride*1]; + + e= a-b; + a= a+b; + b= c-d; + c= c+d; + + block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7; + block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7; + block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7; + block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7; +} diff --git a/libavcodec/h264pred.c b/libavcodec/h264pred.c index 8d2c6f0355..f6533cf9ba 100644 --- a/libavcodec/h264pred.c +++ b/libavcodec/h264pred.c @@ -28,17 +28,338 @@ #include "h264pred.h" #define BIT_DEPTH 8 -#include "h264pred_internal.h" +#include "h264pred_template.c" #undef BIT_DEPTH #define BIT_DEPTH 9 -#include "h264pred_internal.h" +#include "h264pred_template.c" #undef BIT_DEPTH #define BIT_DEPTH 10 -#include "h264pred_internal.h" +#include "h264pred_template.c" #undef BIT_DEPTH +static void pred4x4_vertical_vp8_c(uint8_t *src, const uint8_t *topright, int stride){ + const int lt= src[-1-1*stride]; + LOAD_TOP_EDGE + LOAD_TOP_RIGHT_EDGE + uint32_t v = 
PACK_4U8((lt + 2*t0 + t1 + 2) >> 2, + (t0 + 2*t1 + t2 + 2) >> 2, + (t1 + 2*t2 + t3 + 2) >> 2, + (t2 + 2*t3 + t4 + 2) >> 2); + + AV_WN32A(src+0*stride, v); + AV_WN32A(src+1*stride, v); + AV_WN32A(src+2*stride, v); + AV_WN32A(src+3*stride, v); +} + +static void pred4x4_horizontal_vp8_c(uint8_t *src, const uint8_t *topright, int stride){ + const int lt= src[-1-1*stride]; + LOAD_LEFT_EDGE + + AV_WN32A(src+0*stride, ((lt + 2*l0 + l1 + 2) >> 2)*0x01010101); + AV_WN32A(src+1*stride, ((l0 + 2*l1 + l2 + 2) >> 2)*0x01010101); + AV_WN32A(src+2*stride, ((l1 + 2*l2 + l3 + 2) >> 2)*0x01010101); + AV_WN32A(src+3*stride, ((l2 + 2*l3 + l3 + 2) >> 2)*0x01010101); +} + +static void pred4x4_down_left_svq3_c(uint8_t *src, const uint8_t *topright, int stride){ + LOAD_TOP_EDGE + LOAD_LEFT_EDGE + const av_unused int unu0= t0; + const av_unused int unu1= l0; + + src[0+0*stride]=(l1 + t1)>>1; + src[1+0*stride]= + src[0+1*stride]=(l2 + t2)>>1; + src[2+0*stride]= + src[1+1*stride]= + src[0+2*stride]= + src[3+0*stride]= + src[2+1*stride]= + src[1+2*stride]= + src[0+3*stride]= + src[3+1*stride]= + src[2+2*stride]= + src[1+3*stride]= + src[3+2*stride]= + src[2+3*stride]= + src[3+3*stride]=(l3 + t3)>>1; +} + +static void pred4x4_down_left_rv40_c(uint8_t *src, const uint8_t *topright, int stride){ + LOAD_TOP_EDGE + LOAD_TOP_RIGHT_EDGE + LOAD_LEFT_EDGE + LOAD_DOWN_LEFT_EDGE + + src[0+0*stride]=(t0 + t2 + 2*t1 + 2 + l0 + l2 + 2*l1 + 2)>>3; + src[1+0*stride]= + src[0+1*stride]=(t1 + t3 + 2*t2 + 2 + l1 + l3 + 2*l2 + 2)>>3; + src[2+0*stride]= + src[1+1*stride]= + src[0+2*stride]=(t2 + t4 + 2*t3 + 2 + l2 + l4 + 2*l3 + 2)>>3; + src[3+0*stride]= + src[2+1*stride]= + src[1+2*stride]= + src[0+3*stride]=(t3 + t5 + 2*t4 + 2 + l3 + l5 + 2*l4 + 2)>>3; + src[3+1*stride]= + src[2+2*stride]= + src[1+3*stride]=(t4 + t6 + 2*t5 + 2 + l4 + l6 + 2*l5 + 2)>>3; + src[3+2*stride]= + src[2+3*stride]=(t5 + t7 + 2*t6 + 2 + l5 + l7 + 2*l6 + 2)>>3; + src[3+3*stride]=(t6 + t7 + 1 + l6 + l7 + 1)>>2; +} + +static void pred4x4_down_left_rv40_nodown_c(uint8_t *src, const uint8_t *topright, int stride){ + LOAD_TOP_EDGE + LOAD_TOP_RIGHT_EDGE + LOAD_LEFT_EDGE + + src[0+0*stride]=(t0 + t2 + 2*t1 + 2 + l0 + l2 + 2*l1 + 2)>>3; + src[1+0*stride]= + src[0+1*stride]=(t1 + t3 + 2*t2 + 2 + l1 + l3 + 2*l2 + 2)>>3; + src[2+0*stride]= + src[1+1*stride]= + src[0+2*stride]=(t2 + t4 + 2*t3 + 2 + l2 + 3*l3 + 2)>>3; + src[3+0*stride]= + src[2+1*stride]= + src[1+2*stride]= + src[0+3*stride]=(t3 + t5 + 2*t4 + 2 + l3*4 + 2)>>3; + src[3+1*stride]= + src[2+2*stride]= + src[1+3*stride]=(t4 + t6 + 2*t5 + 2 + l3*4 + 2)>>3; + src[3+2*stride]= + src[2+3*stride]=(t5 + t7 + 2*t6 + 2 + l3*4 + 2)>>3; + src[3+3*stride]=(t6 + t7 + 1 + 2*l3 + 1)>>2; +} + +static void pred4x4_vertical_left_rv40(uint8_t *src, const uint8_t *topright, int stride, + const int l0, const int l1, const int l2, const int l3, const int l4){ + LOAD_TOP_EDGE + LOAD_TOP_RIGHT_EDGE + + src[0+0*stride]=(2*t0 + 2*t1 + l1 + 2*l2 + l3 + 4)>>3; + src[1+0*stride]= + src[0+2*stride]=(t1 + t2 + 1)>>1; + src[2+0*stride]= + src[1+2*stride]=(t2 + t3 + 1)>>1; + src[3+0*stride]= + src[2+2*stride]=(t3 + t4+ 1)>>1; + src[3+2*stride]=(t4 + t5+ 1)>>1; + src[0+1*stride]=(t0 + 2*t1 + t2 + l2 + 2*l3 + l4 + 4)>>3; + src[1+1*stride]= + src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2; + src[2+1*stride]= + src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2; + src[3+1*stride]= + src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2; + src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2; +} + +static void pred4x4_vertical_left_rv40_c(uint8_t *src, const uint8_t *topright, int stride){ + 
LOAD_LEFT_EDGE + LOAD_DOWN_LEFT_EDGE + + pred4x4_vertical_left_rv40(src, topright, stride, l0, l1, l2, l3, l4); +} + +static void pred4x4_vertical_left_rv40_nodown_c(uint8_t *src, const uint8_t *topright, int stride){ + LOAD_LEFT_EDGE + + pred4x4_vertical_left_rv40(src, topright, stride, l0, l1, l2, l3, l3); +} + +static void pred4x4_vertical_left_vp8_c(uint8_t *src, const uint8_t *topright, int stride){ + LOAD_TOP_EDGE + LOAD_TOP_RIGHT_EDGE + + src[0+0*stride]=(t0 + t1 + 1)>>1; + src[1+0*stride]= + src[0+2*stride]=(t1 + t2 + 1)>>1; + src[2+0*stride]= + src[1+2*stride]=(t2 + t3 + 1)>>1; + src[3+0*stride]= + src[2+2*stride]=(t3 + t4 + 1)>>1; + src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2; + src[1+1*stride]= + src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2; + src[2+1*stride]= + src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2; + src[3+1*stride]= + src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2; + src[3+2*stride]=(t4 + 2*t5 + t6 + 2)>>2; + src[3+3*stride]=(t5 + 2*t6 + t7 + 2)>>2; +} + +static void pred4x4_horizontal_up_rv40_c(uint8_t *src, const uint8_t *topright, int stride){ + LOAD_LEFT_EDGE + LOAD_DOWN_LEFT_EDGE + LOAD_TOP_EDGE + LOAD_TOP_RIGHT_EDGE + + src[0+0*stride]=(t1 + 2*t2 + t3 + 2*l0 + 2*l1 + 4)>>3; + src[1+0*stride]=(t2 + 2*t3 + t4 + l0 + 2*l1 + l2 + 4)>>3; + src[2+0*stride]= + src[0+1*stride]=(t3 + 2*t4 + t5 + 2*l1 + 2*l2 + 4)>>3; + src[3+0*stride]= + src[1+1*stride]=(t4 + 2*t5 + t6 + l1 + 2*l2 + l3 + 4)>>3; + src[2+1*stride]= + src[0+2*stride]=(t5 + 2*t6 + t7 + 2*l2 + 2*l3 + 4)>>3; + src[3+1*stride]= + src[1+2*stride]=(t6 + 3*t7 + l2 + 3*l3 + 4)>>3; + src[3+2*stride]= + src[1+3*stride]=(l3 + 2*l4 + l5 + 2)>>2; + src[0+3*stride]= + src[2+2*stride]=(t6 + t7 + l3 + l4 + 2)>>2; + src[2+3*stride]=(l4 + l5 + 1)>>1; + src[3+3*stride]=(l4 + 2*l5 + l6 + 2)>>2; +} + +static void pred4x4_horizontal_up_rv40_nodown_c(uint8_t *src, const uint8_t *topright, int stride){ + LOAD_LEFT_EDGE + LOAD_TOP_EDGE + LOAD_TOP_RIGHT_EDGE + + src[0+0*stride]=(t1 + 2*t2 + t3 + 2*l0 + 2*l1 + 4)>>3; + src[1+0*stride]=(t2 + 2*t3 + t4 + l0 + 2*l1 + l2 + 4)>>3; + src[2+0*stride]= + src[0+1*stride]=(t3 + 2*t4 + t5 + 2*l1 + 2*l2 + 4)>>3; + src[3+0*stride]= + src[1+1*stride]=(t4 + 2*t5 + t6 + l1 + 2*l2 + l3 + 4)>>3; + src[2+1*stride]= + src[0+2*stride]=(t5 + 2*t6 + t7 + 2*l2 + 2*l3 + 4)>>3; + src[3+1*stride]= + src[1+2*stride]=(t6 + 3*t7 + l2 + 3*l3 + 4)>>3; + src[3+2*stride]= + src[1+3*stride]=l3; + src[0+3*stride]= + src[2+2*stride]=(t6 + t7 + 2*l3 + 2)>>2; + src[2+3*stride]= + src[3+3*stride]=l3; +} + +static void pred4x4_tm_vp8_c(uint8_t *src, const uint8_t *topright, int stride){ + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP - src[-1-stride]; + uint8_t *top = src-stride; + int y; + + for (y = 0; y < 4; y++) { + uint8_t *cm_in = cm + src[-1]; + src[0] = cm_in[top[0]]; + src[1] = cm_in[top[1]]; + src[2] = cm_in[top[2]]; + src[3] = cm_in[top[3]]; + src += stride; + } +} + +static void pred16x16_plane_svq3_c(uint8_t *src, int stride){ + pred16x16_plane_compat_8_c(src, stride, 1, 0); +} + +static void pred16x16_plane_rv40_c(uint8_t *src, int stride){ + pred16x16_plane_compat_8_c(src, stride, 0, 1); +} + +static void pred16x16_tm_vp8_c(uint8_t *src, int stride){ + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP - src[-1-stride]; + uint8_t *top = src-stride; + int y; + + for (y = 0; y < 16; y++) { + uint8_t *cm_in = cm + src[-1]; + src[0] = cm_in[top[0]]; + src[1] = cm_in[top[1]]; + src[2] = cm_in[top[2]]; + src[3] = cm_in[top[3]]; + src[4] = cm_in[top[4]]; + src[5] = cm_in[top[5]]; + src[6] = cm_in[top[6]]; + src[7] = cm_in[top[7]]; + src[8] = 
cm_in[top[8]]; + src[9] = cm_in[top[9]]; + src[10] = cm_in[top[10]]; + src[11] = cm_in[top[11]]; + src[12] = cm_in[top[12]]; + src[13] = cm_in[top[13]]; + src[14] = cm_in[top[14]]; + src[15] = cm_in[top[15]]; + src += stride; + } +} + +static void pred8x8_left_dc_rv40_c(uint8_t *src, int stride){ + int i; + int dc0; + + dc0=0; + for(i=0;i<8; i++) + dc0+= src[-1+i*stride]; + dc0= 0x01010101*((dc0 + 4)>>3); + + for(i=0; i<8; i++){ + ((uint32_t*)(src+i*stride))[0]= + ((uint32_t*)(src+i*stride))[1]= dc0; + } +} + +static void pred8x8_top_dc_rv40_c(uint8_t *src, int stride){ + int i; + int dc0; + + dc0=0; + for(i=0;i<8; i++) + dc0+= src[i-stride]; + dc0= 0x01010101*((dc0 + 4)>>3); + + for(i=0; i<8; i++){ + ((uint32_t*)(src+i*stride))[0]= + ((uint32_t*)(src+i*stride))[1]= dc0; + } +} + +static void pred8x8_dc_rv40_c(uint8_t *src, int stride){ + int i; + int dc0=0; + + for(i=0;i<4; i++){ + dc0+= src[-1+i*stride] + src[i-stride]; + dc0+= src[4+i-stride]; + dc0+= src[-1+(i+4)*stride]; + } + dc0= 0x01010101*((dc0 + 8)>>4); + + for(i=0; i<4; i++){ + ((uint32_t*)(src+i*stride))[0]= dc0; + ((uint32_t*)(src+i*stride))[1]= dc0; + } + for(i=4; i<8; i++){ + ((uint32_t*)(src+i*stride))[0]= dc0; + ((uint32_t*)(src+i*stride))[1]= dc0; + } +} + +static void pred8x8_tm_vp8_c(uint8_t *src, int stride){ + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP - src[-1-stride]; + uint8_t *top = src-stride; + int y; + + for (y = 0; y < 8; y++) { + uint8_t *cm_in = cm + src[-1]; + src[0] = cm_in[top[0]]; + src[1] = cm_in[top[1]]; + src[2] = cm_in[top[2]]; + src[3] = cm_in[top[3]]; + src[4] = cm_in[top[4]]; + src[5] = cm_in[top[5]]; + src[6] = cm_in[top[6]]; + src[7] = cm_in[top[7]]; + src += stride; + } +} + /** * Set the intra prediction function pointers. */ @@ -49,26 +370,27 @@ void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth){ #undef FUNCC #define FUNC(a, depth) a ## _ ## depth #define FUNCC(a, depth) a ## _ ## depth ## _c +#define FUNCD(a) a ## _c #define H264_PRED(depth) \ if(codec_id != CODEC_ID_RV40){\ if(codec_id == CODEC_ID_VP8) {\ - h->pred4x4[VERT_PRED ]= FUNCC(pred4x4_vertical_vp8 , depth);\ - h->pred4x4[HOR_PRED ]= FUNCC(pred4x4_horizontal_vp8 , depth);\ + h->pred4x4[VERT_PRED ]= FUNCD(pred4x4_vertical_vp8);\ + h->pred4x4[HOR_PRED ]= FUNCD(pred4x4_horizontal_vp8);\ } else {\ h->pred4x4[VERT_PRED ]= FUNCC(pred4x4_vertical , depth);\ h->pred4x4[HOR_PRED ]= FUNCC(pred4x4_horizontal , depth);\ }\ h->pred4x4[DC_PRED ]= FUNCC(pred4x4_dc , depth);\ if(codec_id == CODEC_ID_SVQ3)\ - h->pred4x4[DIAG_DOWN_LEFT_PRED ]= FUNCC(pred4x4_down_left_svq3, depth);\ + h->pred4x4[DIAG_DOWN_LEFT_PRED ]= FUNCD(pred4x4_down_left_svq3);\ else\ h->pred4x4[DIAG_DOWN_LEFT_PRED ]= FUNCC(pred4x4_down_left , depth);\ h->pred4x4[DIAG_DOWN_RIGHT_PRED]= FUNCC(pred4x4_down_right , depth);\ h->pred4x4[VERT_RIGHT_PRED ]= FUNCC(pred4x4_vertical_right , depth);\ h->pred4x4[HOR_DOWN_PRED ]= FUNCC(pred4x4_horizontal_down , depth);\ if (codec_id == CODEC_ID_VP8) {\ - h->pred4x4[VERT_LEFT_PRED ]= FUNCC(pred4x4_vertical_left_vp8 , depth);\ + h->pred4x4[VERT_LEFT_PRED ]= FUNCD(pred4x4_vertical_left_vp8);\ } else\ h->pred4x4[VERT_LEFT_PRED ]= FUNCC(pred4x4_vertical_left , depth);\ h->pred4x4[HOR_UP_PRED ]= FUNCC(pred4x4_horizontal_up , depth);\ @@ -77,7 +399,7 @@ void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth){ h->pred4x4[TOP_DC_PRED ]= FUNCC(pred4x4_top_dc , depth);\ h->pred4x4[DC_128_PRED ]= FUNCC(pred4x4_128_dc , depth);\ } else {\ - h->pred4x4[TM_VP8_PRED ]= FUNCC(pred4x4_tm_vp8 , depth);\ + 
h->pred4x4[TM_VP8_PRED ]= FUNCD(pred4x4_tm_vp8);\ h->pred4x4[DC_127_PRED ]= FUNCC(pred4x4_127_dc , depth);\ h->pred4x4[DC_129_PRED ]= FUNCC(pred4x4_129_dc , depth);\ h->pred4x4[VERT_VP8_PRED ]= FUNCC(pred4x4_vertical , depth);\ @@ -87,18 +409,18 @@ void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth){ h->pred4x4[VERT_PRED ]= FUNCC(pred4x4_vertical , depth);\ h->pred4x4[HOR_PRED ]= FUNCC(pred4x4_horizontal , depth);\ h->pred4x4[DC_PRED ]= FUNCC(pred4x4_dc , depth);\ - h->pred4x4[DIAG_DOWN_LEFT_PRED ]= FUNCC(pred4x4_down_left_rv40 , depth);\ + h->pred4x4[DIAG_DOWN_LEFT_PRED ]= FUNCD(pred4x4_down_left_rv40);\ h->pred4x4[DIAG_DOWN_RIGHT_PRED]= FUNCC(pred4x4_down_right , depth);\ h->pred4x4[VERT_RIGHT_PRED ]= FUNCC(pred4x4_vertical_right , depth);\ h->pred4x4[HOR_DOWN_PRED ]= FUNCC(pred4x4_horizontal_down , depth);\ - h->pred4x4[VERT_LEFT_PRED ]= FUNCC(pred4x4_vertical_left_rv40, depth);\ - h->pred4x4[HOR_UP_PRED ]= FUNCC(pred4x4_horizontal_up_rv40, depth);\ + h->pred4x4[VERT_LEFT_PRED ]= FUNCD(pred4x4_vertical_left_rv40);\ + h->pred4x4[HOR_UP_PRED ]= FUNCD(pred4x4_horizontal_up_rv40);\ h->pred4x4[LEFT_DC_PRED ]= FUNCC(pred4x4_left_dc , depth);\ h->pred4x4[TOP_DC_PRED ]= FUNCC(pred4x4_top_dc , depth);\ h->pred4x4[DC_128_PRED ]= FUNCC(pred4x4_128_dc , depth);\ - h->pred4x4[DIAG_DOWN_LEFT_PRED_RV40_NODOWN]= FUNCC(pred4x4_down_left_rv40_nodown, depth);\ - h->pred4x4[HOR_UP_PRED_RV40_NODOWN]= FUNCC(pred4x4_horizontal_up_rv40_nodown , depth);\ - h->pred4x4[VERT_LEFT_PRED_RV40_NODOWN]= FUNCC(pred4x4_vertical_left_rv40_nodown , depth);\ + h->pred4x4[DIAG_DOWN_LEFT_PRED_RV40_NODOWN]= FUNCD(pred4x4_down_left_rv40_nodown);\ + h->pred4x4[HOR_UP_PRED_RV40_NODOWN]= FUNCD(pred4x4_horizontal_up_rv40_nodown);\ + h->pred4x4[VERT_LEFT_PRED_RV40_NODOWN]= FUNCD(pred4x4_vertical_left_rv40_nodown);\ }\ \ h->pred8x8l[VERT_PRED ]= FUNCC(pred8x8l_vertical , depth);\ @@ -119,7 +441,7 @@ void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth){ if (codec_id != CODEC_ID_VP8) {\ h->pred8x8[PLANE_PRED8x8]= FUNCC(pred8x8_plane , depth);\ } else\ - h->pred8x8[PLANE_PRED8x8]= FUNCC(pred8x8_tm_vp8 , depth);\ + h->pred8x8[PLANE_PRED8x8]= FUNCD(pred8x8_tm_vp8);\ if(codec_id != CODEC_ID_RV40 && codec_id != CODEC_ID_VP8){\ h->pred8x8[DC_PRED8x8 ]= FUNCC(pred8x8_dc , depth);\ h->pred8x8[LEFT_DC_PRED8x8]= FUNCC(pred8x8_left_dc , depth);\ @@ -129,9 +451,9 @@ void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth){ h->pred8x8[ALZHEIMER_DC_L00_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_l00, depth);\ h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_0l0, depth);\ }else{\ - h->pred8x8[DC_PRED8x8 ]= FUNCC(pred8x8_dc_rv40 , depth);\ - h->pred8x8[LEFT_DC_PRED8x8]= FUNCC(pred8x8_left_dc_rv40 , depth);\ - h->pred8x8[TOP_DC_PRED8x8 ]= FUNCC(pred8x8_top_dc_rv40 , depth);\ + h->pred8x8[DC_PRED8x8 ]= FUNCD(pred8x8_dc_rv40);\ + h->pred8x8[LEFT_DC_PRED8x8]= FUNCD(pred8x8_left_dc_rv40);\ + h->pred8x8[TOP_DC_PRED8x8 ]= FUNCD(pred8x8_top_dc_rv40);\ if (codec_id == CODEC_ID_VP8) {\ h->pred8x8[DC_127_PRED8x8]= FUNCC(pred8x8_127_dc , depth);\ h->pred8x8[DC_129_PRED8x8]= FUNCC(pred8x8_129_dc , depth);\ @@ -144,13 +466,13 @@ void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth){ h->pred16x16[HOR_PRED8x8 ]= FUNCC(pred16x16_horizontal , depth);\ switch(codec_id){\ case CODEC_ID_SVQ3:\ - h->pred16x16[PLANE_PRED8x8 ]= FUNCC(pred16x16_plane_svq3 , depth);\ + h->pred16x16[PLANE_PRED8x8 ]= FUNCD(pred16x16_plane_svq3);\ break;\ case CODEC_ID_RV40:\ - 
h->pred16x16[PLANE_PRED8x8 ]= FUNCC(pred16x16_plane_rv40 , depth);\ + h->pred16x16[PLANE_PRED8x8 ]= FUNCD(pred16x16_plane_rv40);\ break;\ case CODEC_ID_VP8:\ - h->pred16x16[PLANE_PRED8x8 ]= FUNCC(pred16x16_tm_vp8 , depth);\ + h->pred16x16[PLANE_PRED8x8 ]= FUNCD(pred16x16_tm_vp8);\ h->pred16x16[DC_127_PRED8x8]= FUNCC(pred16x16_127_dc , depth);\ h->pred16x16[DC_129_PRED8x8]= FUNCC(pred16x16_129_dc , depth);\ break;\ diff --git a/libavcodec/h264pred_template.c b/libavcodec/h264pred_template.c new file mode 100644 index 0000000000..066e837cdf --- /dev/null +++ b/libavcodec/h264pred_template.c @@ -0,0 +1,975 @@ +/* + * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder + * Copyright (c) 2003-2011 Michael Niedermayer + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * H.264 / AVC / MPEG4 part10 prediction functions. + * @author Michael Niedermayer + */ + +#include "mathops.h" +#include "high_bit_depth.h" + +static void FUNCC(pred4x4_vertical)(uint8_t *_src, const uint8_t *topright, int _stride){ + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + const pixel4 a= ((pixel4*)(src-stride))[0]; + ((pixel4*)(src+0*stride))[0]= a; + ((pixel4*)(src+1*stride))[0]= a; + ((pixel4*)(src+2*stride))[0]= a; + ((pixel4*)(src+3*stride))[0]= a; +} + +static void FUNCC(pred4x4_horizontal)(uint8_t *_src, const uint8_t *topright, int _stride){ + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + ((pixel4*)(src+0*stride))[0]= PIXEL_SPLAT_X4(src[-1+0*stride]); + ((pixel4*)(src+1*stride))[0]= PIXEL_SPLAT_X4(src[-1+1*stride]); + ((pixel4*)(src+2*stride))[0]= PIXEL_SPLAT_X4(src[-1+2*stride]); + ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(src[-1+3*stride]); +} + +static void FUNCC(pred4x4_dc)(uint8_t *_src, const uint8_t *topright, int _stride){ + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + const int dc= ( src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3; + + ((pixel4*)(src+0*stride))[0]= + ((pixel4*)(src+1*stride))[0]= + ((pixel4*)(src+2*stride))[0]= + ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc); +} + +static void FUNCC(pred4x4_left_dc)(uint8_t *_src, const uint8_t *topright, int _stride){ + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + const int dc= ( src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2; + + ((pixel4*)(src+0*stride))[0]= + ((pixel4*)(src+1*stride))[0]= + ((pixel4*)(src+2*stride))[0]= + ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc); +} + +static void FUNCC(pred4x4_top_dc)(uint8_t *_src, const uint8_t *topright, int _stride){ + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + const int dc= ( src[-stride] + src[1-stride] + src[2-stride] + 
src[3-stride] + 2) >>2; + + ((pixel4*)(src+0*stride))[0]= + ((pixel4*)(src+1*stride))[0]= + ((pixel4*)(src+2*stride))[0]= + ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc); +} + +static void FUNCC(pred4x4_128_dc)(uint8_t *_src, const uint8_t *topright, int _stride){ + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + ((pixel4*)(src+0*stride))[0]= + ((pixel4*)(src+1*stride))[0]= + ((pixel4*)(src+2*stride))[0]= + ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1)); +} + +static void FUNCC(pred4x4_127_dc)(uint8_t *_src, const uint8_t *topright, int _stride){ + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + ((pixel4*)(src+0*stride))[0]= + ((pixel4*)(src+1*stride))[0]= + ((pixel4*)(src+2*stride))[0]= + ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))-1); +} + +static void FUNCC(pred4x4_129_dc)(uint8_t *_src, const uint8_t *topright, int _stride){ + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + ((pixel4*)(src+0*stride))[0]= + ((pixel4*)(src+1*stride))[0]= + ((pixel4*)(src+2*stride))[0]= + ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))+1); +} + + +#define LOAD_TOP_RIGHT_EDGE\ + const int av_unused t4= topright[0];\ + const int av_unused t5= topright[1];\ + const int av_unused t6= topright[2];\ + const int av_unused t7= topright[3];\ + +#define LOAD_DOWN_LEFT_EDGE\ + const int av_unused l4= src[-1+4*stride];\ + const int av_unused l5= src[-1+5*stride];\ + const int av_unused l6= src[-1+6*stride];\ + const int av_unused l7= src[-1+7*stride];\ + +#define LOAD_LEFT_EDGE\ + const int av_unused l0= src[-1+0*stride];\ + const int av_unused l1= src[-1+1*stride];\ + const int av_unused l2= src[-1+2*stride];\ + const int av_unused l3= src[-1+3*stride];\ + +#define LOAD_TOP_EDGE\ + const int av_unused t0= src[ 0-1*stride];\ + const int av_unused t1= src[ 1-1*stride];\ + const int av_unused t2= src[ 2-1*stride];\ + const int av_unused t3= src[ 3-1*stride];\ + +static void FUNCC(pred4x4_down_right)(uint8_t *_src, const uint8_t *topright, int _stride){ + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + const int lt= src[-1-1*stride]; + LOAD_TOP_EDGE + LOAD_LEFT_EDGE + + src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2; + src[0+2*stride]= + src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2; + src[0+1*stride]= + src[1+2*stride]= + src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2; + src[0+0*stride]= + src[1+1*stride]= + src[2+2*stride]= + src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2; + src[1+0*stride]= + src[2+1*stride]= + src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2; + src[2+0*stride]= + src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2; + src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2; +} + +static void FUNCC(pred4x4_down_left)(uint8_t *_src, const uint8_t *_topright, int _stride){ + pixel *src = (pixel*)_src; + const pixel *topright = (const pixel*)_topright; + int stride = _stride/sizeof(pixel); + LOAD_TOP_EDGE + LOAD_TOP_RIGHT_EDGE +// LOAD_LEFT_EDGE + + src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2; + src[1+0*stride]= + src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2; + src[2+0*stride]= + src[1+1*stride]= + src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2; + src[3+0*stride]= + src[2+1*stride]= + src[1+2*stride]= + src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2; + src[3+1*stride]= + src[2+2*stride]= + src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2; + src[3+2*stride]= + src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2; + src[3+3*stride]=(t6 + 3*t7 + 2)>>2; +} + +static void FUNCC(pred4x4_vertical_right)(uint8_t *_src, const uint8_t *topright, int _stride){ + pixel *src = (pixel*)_src; + int 
stride = _stride/sizeof(pixel); + const int lt= src[-1-1*stride]; + LOAD_TOP_EDGE + LOAD_LEFT_EDGE + + src[0+0*stride]= + src[1+2*stride]=(lt + t0 + 1)>>1; + src[1+0*stride]= + src[2+2*stride]=(t0 + t1 + 1)>>1; + src[2+0*stride]= + src[3+2*stride]=(t1 + t2 + 1)>>1; + src[3+0*stride]=(t2 + t3 + 1)>>1; + src[0+1*stride]= + src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2; + src[1+1*stride]= + src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2; + src[2+1*stride]= + src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2; + src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2; + src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2; + src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2; +} + +static void FUNCC(pred4x4_vertical_left)(uint8_t *_src, const uint8_t *_topright, int _stride){ + pixel *src = (pixel*)_src; + const pixel *topright = (const pixel*)_topright; + int stride = _stride/sizeof(pixel); + LOAD_TOP_EDGE + LOAD_TOP_RIGHT_EDGE + + src[0+0*stride]=(t0 + t1 + 1)>>1; + src[1+0*stride]= + src[0+2*stride]=(t1 + t2 + 1)>>1; + src[2+0*stride]= + src[1+2*stride]=(t2 + t3 + 1)>>1; + src[3+0*stride]= + src[2+2*stride]=(t3 + t4+ 1)>>1; + src[3+2*stride]=(t4 + t5+ 1)>>1; + src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2; + src[1+1*stride]= + src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2; + src[2+1*stride]= + src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2; + src[3+1*stride]= + src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2; + src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2; +} + +static void FUNCC(pred4x4_horizontal_up)(uint8_t *_src, const uint8_t *topright, int _stride){ + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + LOAD_LEFT_EDGE + + src[0+0*stride]=(l0 + l1 + 1)>>1; + src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2; + src[2+0*stride]= + src[0+1*stride]=(l1 + l2 + 1)>>1; + src[3+0*stride]= + src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2; + src[2+1*stride]= + src[0+2*stride]=(l2 + l3 + 1)>>1; + src[3+1*stride]= + src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2; + src[3+2*stride]= + src[1+3*stride]= + src[0+3*stride]= + src[2+2*stride]= + src[2+3*stride]= + src[3+3*stride]=l3; +} + +static void FUNCC(pred4x4_horizontal_down)(uint8_t *_src, const uint8_t *topright, int _stride){ + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + const int lt= src[-1-1*stride]; + LOAD_TOP_EDGE + LOAD_LEFT_EDGE + + src[0+0*stride]= + src[2+1*stride]=(lt + l0 + 1)>>1; + src[1+0*stride]= + src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2; + src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2; + src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2; + src[0+1*stride]= + src[2+2*stride]=(l0 + l1 + 1)>>1; + src[1+1*stride]= + src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2; + src[0+2*stride]= + src[2+3*stride]=(l1 + l2+ 1)>>1; + src[1+2*stride]= + src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2; + src[0+3*stride]=(l2 + l3 + 1)>>1; + src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2; +} + +static void FUNCC(pred16x16_vertical)(uint8_t *_src, int _stride){ + int i; + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + const pixel4 a = ((pixel4*)(src-stride))[0]; + const pixel4 b = ((pixel4*)(src-stride))[1]; + const pixel4 c = ((pixel4*)(src-stride))[2]; + const pixel4 d = ((pixel4*)(src-stride))[3]; + + for(i=0; i<16; i++){ + ((pixel4*)(src+i*stride))[0] = a; + ((pixel4*)(src+i*stride))[1] = b; + ((pixel4*)(src+i*stride))[2] = c; + ((pixel4*)(src+i*stride))[3] = d; + } +} + +static void FUNCC(pred16x16_horizontal)(uint8_t *_src, int stride){ + int i; + pixel *src = (pixel*)_src; + stride /= sizeof(pixel); + + for(i=0; i<16; i++){ + ((pixel4*)(src+i*stride))[0] = + ((pixel4*)(src+i*stride))[1] = + ((pixel4*)(src+i*stride))[2] = + 
((pixel4*)(src+i*stride))[3] = PIXEL_SPLAT_X4(src[-1+i*stride]); + } +} + +#define PREDICT_16x16_DC(v)\ + for(i=0; i<16; i++){\ + AV_WN4P(src+ 0, v);\ + AV_WN4P(src+ 4, v);\ + AV_WN4P(src+ 8, v);\ + AV_WN4P(src+12, v);\ + src += stride;\ + } + +static void FUNCC(pred16x16_dc)(uint8_t *_src, int stride){ + int i, dc=0; + pixel *src = (pixel*)_src; + pixel4 dcsplat; + stride /= sizeof(pixel); + + for(i=0;i<16; i++){ + dc+= src[-1+i*stride]; + } + + for(i=0;i<16; i++){ + dc+= src[i-stride]; + } + + dcsplat = PIXEL_SPLAT_X4((dc+16)>>5); + PREDICT_16x16_DC(dcsplat); +} + +static void FUNCC(pred16x16_left_dc)(uint8_t *_src, int stride){ + int i, dc=0; + pixel *src = (pixel*)_src; + pixel4 dcsplat; + stride /= sizeof(pixel); + + for(i=0;i<16; i++){ + dc+= src[-1+i*stride]; + } + + dcsplat = PIXEL_SPLAT_X4((dc+8)>>4); + PREDICT_16x16_DC(dcsplat); +} + +static void FUNCC(pred16x16_top_dc)(uint8_t *_src, int stride){ + int i, dc=0; + pixel *src = (pixel*)_src; + pixel4 dcsplat; + stride /= sizeof(pixel); + + for(i=0;i<16; i++){ + dc+= src[i-stride]; + } + + dcsplat = PIXEL_SPLAT_X4((dc+8)>>4); + PREDICT_16x16_DC(dcsplat); +} + +#define PRED16x16_X(n, v) \ +static void FUNCC(pred16x16_##n##_dc)(uint8_t *_src, int stride){\ + int i;\ + pixel *src = (pixel*)_src;\ + stride /= sizeof(pixel);\ + PREDICT_16x16_DC(PIXEL_SPLAT_X4(v));\ +} + +PRED16x16_X(127, (1<<(BIT_DEPTH-1))-1); +PRED16x16_X(128, (1<<(BIT_DEPTH-1))+0); +PRED16x16_X(129, (1<<(BIT_DEPTH-1))+1); + +static inline void FUNCC(pred16x16_plane_compat)(uint8_t *_src, int _stride, const int svq3, const int rv40){ + int i, j, k; + int a; + INIT_CLIP + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + const pixel * const src0 = src +7-stride; + const pixel * src1 = src +8*stride-1; + const pixel * src2 = src1-2*stride; // == src+6*stride-1; + int H = src0[1] - src0[-1]; + int V = src1[0] - src2[ 0]; + for(k=2; k<=8; ++k) { + src1 += stride; src2 -= stride; + H += k*(src0[k] - src0[-k]); + V += k*(src1[0] - src2[ 0]); + } + if(svq3){ + H = ( 5*(H/4) ) / 16; + V = ( 5*(V/4) ) / 16; + + /* required for 100% accuracy */ + i = H; H = V; V = i; + }else if(rv40){ + H = ( H + (H>>2) ) >> 4; + V = ( V + (V>>2) ) >> 4; + }else{ + H = ( 5*H+32 ) >> 6; + V = ( 5*V+32 ) >> 6; + } + + a = 16*(src1[0] + src2[16] + 1) - 7*(V+H); + for(j=16; j>0; --j) { + int b = a; + a += V; + for(i=-16; i<0; i+=4) { + src[16+i] = CLIP((b ) >> 5); + src[17+i] = CLIP((b+ H) >> 5); + src[18+i] = CLIP((b+2*H) >> 5); + src[19+i] = CLIP((b+3*H) >> 5); + b += 4*H; + } + src += stride; + } +} + +static void FUNCC(pred16x16_plane)(uint8_t *src, int stride){ + FUNCC(pred16x16_plane_compat)(src, stride, 0, 0); +} + +static void FUNCC(pred8x8_vertical)(uint8_t *_src, int _stride){ + int i; + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + const pixel4 a= ((pixel4*)(src-stride))[0]; + const pixel4 b= ((pixel4*)(src-stride))[1]; + + for(i=0; i<8; i++){ + ((pixel4*)(src+i*stride))[0]= a; + ((pixel4*)(src+i*stride))[1]= b; + } +} + +static void FUNCC(pred8x8_horizontal)(uint8_t *_src, int stride){ + int i; + pixel *src = (pixel*)_src; + stride /= sizeof(pixel); + + for(i=0; i<8; i++){ + ((pixel4*)(src+i*stride))[0]= + ((pixel4*)(src+i*stride))[1]= PIXEL_SPLAT_X4(src[-1+i*stride]); + } +} + +#define PRED8x8_X(n, v)\ +static void FUNCC(pred8x8_##n##_dc)(uint8_t *_src, int stride){\ + int i;\ + pixel *src = (pixel*)_src;\ + stride /= sizeof(pixel);\ + for(i=0; i<8; i++){\ + ((pixel4*)(src+i*stride))[0]=\ + ((pixel4*)(src+i*stride))[1]= PIXEL_SPLAT_X4(v);\ + }\ +} + 
+PRED8x8_X(127, (1<<(BIT_DEPTH-1))-1); +PRED8x8_X(128, (1<<(BIT_DEPTH-1))+0); +PRED8x8_X(129, (1<<(BIT_DEPTH-1))+1); + +static void FUNCC(pred8x8_left_dc)(uint8_t *_src, int stride){ + int i; + int dc0, dc2; + pixel4 dc0splat, dc2splat; + pixel *src = (pixel*)_src; + stride /= sizeof(pixel); + + dc0=dc2=0; + for(i=0;i<4; i++){ + dc0+= src[-1+i*stride]; + dc2+= src[-1+(i+4)*stride]; + } + dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2); + dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2); + + for(i=0; i<4; i++){ + ((pixel4*)(src+i*stride))[0]= + ((pixel4*)(src+i*stride))[1]= dc0splat; + } + for(i=4; i<8; i++){ + ((pixel4*)(src+i*stride))[0]= + ((pixel4*)(src+i*stride))[1]= dc2splat; + } +} + +static void FUNCC(pred8x8_top_dc)(uint8_t *_src, int stride){ + int i; + int dc0, dc1; + pixel4 dc0splat, dc1splat; + pixel *src = (pixel*)_src; + stride /= sizeof(pixel); + + dc0=dc1=0; + for(i=0;i<4; i++){ + dc0+= src[i-stride]; + dc1+= src[4+i-stride]; + } + dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2); + dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2); + + for(i=0; i<4; i++){ + ((pixel4*)(src+i*stride))[0]= dc0splat; + ((pixel4*)(src+i*stride))[1]= dc1splat; + } + for(i=4; i<8; i++){ + ((pixel4*)(src+i*stride))[0]= dc0splat; + ((pixel4*)(src+i*stride))[1]= dc1splat; + } +} + +static void FUNCC(pred8x8_dc)(uint8_t *_src, int stride){ + int i; + int dc0, dc1, dc2; + pixel4 dc0splat, dc1splat, dc2splat, dc3splat; + pixel *src = (pixel*)_src; + stride /= sizeof(pixel); + + dc0=dc1=dc2=0; + for(i=0;i<4; i++){ + dc0+= src[-1+i*stride] + src[i-stride]; + dc1+= src[4+i-stride]; + dc2+= src[-1+(i+4)*stride]; + } + dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3); + dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2); + dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2); + dc3splat = PIXEL_SPLAT_X4((dc1 + dc2 + 4)>>3); + + for(i=0; i<4; i++){ + ((pixel4*)(src+i*stride))[0]= dc0splat; + ((pixel4*)(src+i*stride))[1]= dc1splat; + } + for(i=4; i<8; i++){ + ((pixel4*)(src+i*stride))[0]= dc2splat; + ((pixel4*)(src+i*stride))[1]= dc3splat; + } +} + +//the following 4 function should not be optimized! 
+static void FUNC(pred8x8_mad_cow_dc_l0t)(uint8_t *src, int stride){ + FUNCC(pred8x8_top_dc)(src, stride); + FUNCC(pred4x4_dc)(src, NULL, stride); +} + +static void FUNC(pred8x8_mad_cow_dc_0lt)(uint8_t *src, int stride){ + FUNCC(pred8x8_dc)(src, stride); + FUNCC(pred4x4_top_dc)(src, NULL, stride); +} + +static void FUNC(pred8x8_mad_cow_dc_l00)(uint8_t *src, int stride){ + FUNCC(pred8x8_left_dc)(src, stride); + FUNCC(pred4x4_128_dc)(src + 4*stride , NULL, stride); + FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride); +} + +static void FUNC(pred8x8_mad_cow_dc_0l0)(uint8_t *src, int stride){ + FUNCC(pred8x8_left_dc)(src, stride); + FUNCC(pred4x4_128_dc)(src , NULL, stride); + FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride); +} + +static void FUNCC(pred8x8_plane)(uint8_t *_src, int _stride){ + int j, k; + int a; + INIT_CLIP + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + const pixel * const src0 = src +3-stride; + const pixel * src1 = src +4*stride-1; + const pixel * src2 = src1-2*stride; // == src+2*stride-1; + int H = src0[1] - src0[-1]; + int V = src1[0] - src2[ 0]; + for(k=2; k<=4; ++k) { + src1 += stride; src2 -= stride; + H += k*(src0[k] - src0[-k]); + V += k*(src1[0] - src2[ 0]); + } + H = ( 17*H+16 ) >> 5; + V = ( 17*V+16 ) >> 5; + + a = 16*(src1[0] + src2[8]+1) - 3*(V+H); + for(j=8; j>0; --j) { + int b = a; + a += V; + src[0] = CLIP((b ) >> 5); + src[1] = CLIP((b+ H) >> 5); + src[2] = CLIP((b+2*H) >> 5); + src[3] = CLIP((b+3*H) >> 5); + src[4] = CLIP((b+4*H) >> 5); + src[5] = CLIP((b+5*H) >> 5); + src[6] = CLIP((b+6*H) >> 5); + src[7] = CLIP((b+7*H) >> 5); + src += stride; + } +} + +#define SRC(x,y) src[(x)+(y)*stride] +#define PL(y) \ + const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2; +#define PREDICT_8x8_LOAD_LEFT \ + const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \ + + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \ + PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \ + const int l7 av_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2 + +#define PT(x) \ + const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2; +#define PREDICT_8x8_LOAD_TOP \ + const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \ + + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \ + PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \ + const int t7 av_unused = ((has_topright ? 
SRC(8,-1) : SRC(7,-1)) \ + + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2 + +#define PTR(x) \ + t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2; +#define PREDICT_8x8_LOAD_TOPRIGHT \ + int t8, t9, t10, t11, t12, t13, t14, t15; \ + if(has_topright) { \ + PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \ + t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \ + } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1); + +#define PREDICT_8x8_LOAD_TOPLEFT \ + const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2 + +#define PREDICT_8x8_DC(v) \ + int y; \ + for( y = 0; y < 8; y++ ) { \ + ((pixel4*)src)[0] = \ + ((pixel4*)src)[1] = v; \ + src += stride; \ + } + +static void FUNCC(pred8x8l_128_dc)(uint8_t *_src, int has_topleft, int has_topright, int _stride) +{ + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + + PREDICT_8x8_DC(PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1))); +} +static void FUNCC(pred8x8l_left_dc)(uint8_t *_src, int has_topleft, int has_topright, int _stride) +{ + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + + PREDICT_8x8_LOAD_LEFT; + const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3); + PREDICT_8x8_DC(dc); +} +static void FUNCC(pred8x8l_top_dc)(uint8_t *_src, int has_topleft, int has_topright, int _stride) +{ + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + + PREDICT_8x8_LOAD_TOP; + const pixel4 dc = PIXEL_SPLAT_X4((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3); + PREDICT_8x8_DC(dc); +} +static void FUNCC(pred8x8l_dc)(uint8_t *_src, int has_topleft, int has_topright, int _stride) +{ + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + + PREDICT_8x8_LOAD_LEFT; + PREDICT_8x8_LOAD_TOP; + const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7 + +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4); + PREDICT_8x8_DC(dc); +} +static void FUNCC(pred8x8l_horizontal)(uint8_t *_src, int has_topleft, int has_topright, int _stride) +{ + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + + PREDICT_8x8_LOAD_LEFT; +#define ROW(y) ((pixel4*)(src+y*stride))[0] =\ + ((pixel4*)(src+y*stride))[1] = PIXEL_SPLAT_X4(l##y) + ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7); +#undef ROW +} +static void FUNCC(pred8x8l_vertical)(uint8_t *_src, int has_topleft, int has_topright, int _stride) +{ + int y; + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + + PREDICT_8x8_LOAD_TOP; + src[0] = t0; + src[1] = t1; + src[2] = t2; + src[3] = t3; + src[4] = t4; + src[5] = t5; + src[6] = t6; + src[7] = t7; + for( y = 1; y < 8; y++ ) { + ((pixel4*)(src+y*stride))[0] = ((pixel4*)src)[0]; + ((pixel4*)(src+y*stride))[1] = ((pixel4*)src)[1]; + } +} +static void FUNCC(pred8x8l_down_left)(uint8_t *_src, int has_topleft, int has_topright, int _stride) +{ + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + PREDICT_8x8_LOAD_TOP; + PREDICT_8x8_LOAD_TOPRIGHT; + SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2; + SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2; + SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2; + SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2; + SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2; + SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2; + SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2; + SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2; + SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2; + 
SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2; + SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2; + SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2; + SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2; + SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2; + SRC(7,7)= (t14 + 3*t15 + 2) >> 2; +} +static void FUNCC(pred8x8l_down_right)(uint8_t *_src, int has_topleft, int has_topright, int _stride) +{ + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + PREDICT_8x8_LOAD_TOP; + PREDICT_8x8_LOAD_LEFT; + PREDICT_8x8_LOAD_TOPLEFT; + SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2; + SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2; + SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2; + SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2; + SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2; + SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2; + SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2; + SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2; + SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2; + SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2; + SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2; + SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2; + SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2; + SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2; + SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2; +} +static void FUNCC(pred8x8l_vertical_right)(uint8_t *_src, int has_topleft, int has_topright, int _stride) +{ + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + PREDICT_8x8_LOAD_TOP; + PREDICT_8x8_LOAD_LEFT; + PREDICT_8x8_LOAD_TOPLEFT; + SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2; + SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2; + SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2; + SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2; + SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2; + SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2; + SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2; + SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1; + SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2; + SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1; + SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2; + SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1; + SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2; + SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1; + SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2; + SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1; + SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2; + SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1; + SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2; + SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1; + SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2; + SRC(7,0)= (t6 + t7 + 1) >> 1; +} +static void FUNCC(pred8x8l_horizontal_down)(uint8_t *_src, int has_topleft, int has_topright, int _stride) +{ + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + PREDICT_8x8_LOAD_TOP; + PREDICT_8x8_LOAD_LEFT; + PREDICT_8x8_LOAD_TOPLEFT; + SRC(0,7)= (l6 + l7 + 1) >> 1; + SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2; + SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1; + SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2; + SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 
1) >> 1; + SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2; + SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1; + SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2; + SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1; + SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2; + SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1; + SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2; + SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1; + SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2; + SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1; + SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2; + SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2; + SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2; + SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2; + SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2; + SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2; + SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2; +} +static void FUNCC(pred8x8l_vertical_left)(uint8_t *_src, int has_topleft, int has_topright, int _stride) +{ + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + PREDICT_8x8_LOAD_TOP; + PREDICT_8x8_LOAD_TOPRIGHT; + SRC(0,0)= (t0 + t1 + 1) >> 1; + SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2; + SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1; + SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2; + SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1; + SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2; + SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1; + SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2; + SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1; + SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2; + SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1; + SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2; + SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1; + SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2; + SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1; + SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2; + SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1; + SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2; + SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1; + SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2; + SRC(7,6)= (t10 + t11 + 1) >> 1; + SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2; +} +static void FUNCC(pred8x8l_horizontal_up)(uint8_t *_src, int has_topleft, int has_topright, int _stride) +{ + pixel *src = (pixel*)_src; + int stride = _stride/sizeof(pixel); + PREDICT_8x8_LOAD_LEFT; + SRC(0,0)= (l0 + l1 + 1) >> 1; + SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2; + SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1; + SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2; + SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1; + SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2; + SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1; + SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2; + SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1; + SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2; + SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1; + SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2; + SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1; + SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2; + SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)= + SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)= + SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)= + SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7; +} +#undef 
PREDICT_8x8_LOAD_LEFT +#undef PREDICT_8x8_LOAD_TOP +#undef PREDICT_8x8_LOAD_TOPLEFT +#undef PREDICT_8x8_LOAD_TOPRIGHT +#undef PREDICT_8x8_DC +#undef PTR +#undef PT +#undef PL +#undef SRC + +static void FUNCC(pred4x4_vertical_add)(uint8_t *_pix, const DCTELEM *_block, int stride){ + int i; + pixel *pix = (pixel*)_pix; + const dctcoef *block = (const dctcoef*)_block; + stride /= sizeof(pixel); + pix -= stride; + for(i=0; i<4; i++){ + pixel v = pix[0]; + pix[1*stride]= v += block[0]; + pix[2*stride]= v += block[4]; + pix[3*stride]= v += block[8]; + pix[4*stride]= v + block[12]; + pix++; + block++; + } +} + +static void FUNCC(pred4x4_horizontal_add)(uint8_t *_pix, const DCTELEM *_block, int stride){ + int i; + pixel *pix = (pixel*)_pix; + const dctcoef *block = (const dctcoef*)_block; + stride /= sizeof(pixel); + for(i=0; i<4; i++){ + pixel v = pix[-1]; + pix[0]= v += block[0]; + pix[1]= v += block[1]; + pix[2]= v += block[2]; + pix[3]= v + block[3]; + pix+= stride; + block+= 4; + } +} + +static void FUNCC(pred8x8l_vertical_add)(uint8_t *_pix, const DCTELEM *_block, int stride){ + int i; + pixel *pix = (pixel*)_pix; + const dctcoef *block = (const dctcoef*)_block; + stride /= sizeof(pixel); + pix -= stride; + for(i=0; i<8; i++){ + pixel v = pix[0]; + pix[1*stride]= v += block[0]; + pix[2*stride]= v += block[8]; + pix[3*stride]= v += block[16]; + pix[4*stride]= v += block[24]; + pix[5*stride]= v += block[32]; + pix[6*stride]= v += block[40]; + pix[7*stride]= v += block[48]; + pix[8*stride]= v + block[56]; + pix++; + block++; + } +} + +static void FUNCC(pred8x8l_horizontal_add)(uint8_t *_pix, const DCTELEM *_block, int stride){ + int i; + pixel *pix = (pixel*)_pix; + const dctcoef *block = (const dctcoef*)_block; + stride /= sizeof(pixel); + for(i=0; i<8; i++){ + pixel v = pix[-1]; + pix[0]= v += block[0]; + pix[1]= v += block[1]; + pix[2]= v += block[2]; + pix[3]= v += block[3]; + pix[4]= v += block[4]; + pix[5]= v += block[5]; + pix[6]= v += block[6]; + pix[7]= v + block[7]; + pix+= stride; + block+= 8; + } +} + +static void FUNCC(pred16x16_vertical_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){ + int i; + for(i=0; i<16; i++) + FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride); +} + +static void FUNCC(pred16x16_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){ + int i; + for(i=0; i<16; i++) + FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride); +} + +static void FUNCC(pred8x8_vertical_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){ + int i; + for(i=0; i<4; i++) + FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride); +} + +static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){ + int i; + for(i=0; i<4; i++) + FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride); +} diff --git a/libavcodec/high_bit_depth.h b/libavcodec/high_bit_depth.h new file mode 100644 index 0000000000..6f2b6a74f4 --- /dev/null +++ b/libavcodec/high_bit_depth.h @@ -0,0 +1,85 @@ +#include "dsputil.h" + +#ifndef BIT_DEPTH +#define BIT_DEPTH 8 +#endif + +#ifdef AVCODEC_H264_HIGH_DEPTH_H +# undef pixel +# undef pixel2 +# undef pixel4 +# undef dctcoef +# undef INIT_CLIP +# undef no_rnd_avg_pixel4 +# undef rnd_avg_pixel4 +# undef AV_RN2P +# undef AV_RN4P +# undef AV_WN2P +# undef AV_WN4P +# undef AV_WN4PA +# undef CLIP +# undef 
FUNC +# undef FUNCC +# undef av_clip_pixel +# undef PIXEL_SPLAT_X4 +#else +# define AVCODEC_H264_HIGH_DEPTH_H +# define CLIP_PIXEL(depth)\ + static inline uint16_t av_clip_pixel_ ## depth (int p)\ + {\ + const int pixel_max = (1 << depth)-1;\ + return (p & ~pixel_max) ? (-p)>>31 & pixel_max : p;\ + } + +CLIP_PIXEL( 9) +CLIP_PIXEL(10) +#endif + +#if BIT_DEPTH > 8 +# define pixel uint16_t +# define pixel2 uint32_t +# define pixel4 uint64_t +# define dctcoef int32_t + +# define INIT_CLIP +# define no_rnd_avg_pixel4 no_rnd_avg64 +# define rnd_avg_pixel4 rnd_avg64 +# define AV_RN2P AV_RN32 +# define AV_RN4P AV_RN64 +# define AV_WN2P AV_WN32 +# define AV_WN4P AV_WN64 +# define AV_WN4PA AV_WN64A +# define PIXEL_SPLAT_X4(x) ((x)*0x0001000100010001ULL) +#else +# define pixel uint8_t +# define pixel2 uint16_t +# define pixel4 uint32_t +# define dctcoef int16_t + +# define INIT_CLIP uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; +# define no_rnd_avg_pixel4 no_rnd_avg32 +# define rnd_avg_pixel4 rnd_avg32 +# define AV_RN2P AV_RN16 +# define AV_RN4P AV_RN32 +# define AV_WN2P AV_WN16 +# define AV_WN4P AV_WN32 +# define AV_WN4PA AV_WN32A +# define PIXEL_SPLAT_X4(x) ((x)*0x01010101U) +#endif + +#if BIT_DEPTH == 8 +# define av_clip_pixel(a) av_clip_uint8(a) +# define CLIP(a) cm[a] +# define FUNC(a) a ## _8 +# define FUNCC(a) a ## _8_c +#elif BIT_DEPTH == 9 +# define av_clip_pixel(a) av_clip_pixel_9(a) +# define CLIP(a) av_clip_pixel_9(a) +# define FUNC(a) a ## _9 +# define FUNCC(a) a ## _9_c +#elif BIT_DEPTH == 10 +# define av_clip_pixel(a) av_clip_pixel_10(a) +# define CLIP(a) av_clip_pixel_10(a) +# define FUNC(a) a ## _10 +# define FUNCC(a) a ## _10_c +#endif diff --git a/libavcodec/lpc.c b/libavcodec/lpc.c index 6d7671c81f..d041cafe85 100644 --- a/libavcodec/lpc.c +++ b/libavcodec/lpc.c @@ -158,7 +158,7 @@ int ff_lpc_calc_coefs(LPCContext *s, const int32_t *samples, int blocksize, int min_order, int max_order, int precision, int32_t coefs[][MAX_LPC_ORDER], int *shift, - enum AVLPCType lpc_type, int lpc_passes, + enum FFLPCType lpc_type, int lpc_passes, int omethod, int max_shift, int zero_shift) { double autoc[MAX_LPC_ORDER+1]; @@ -168,7 +168,7 @@ int ff_lpc_calc_coefs(LPCContext *s, int opt_order; assert(max_order >= MIN_LPC_ORDER && max_order <= MAX_LPC_ORDER && - lpc_type > AV_LPC_TYPE_FIXED); + lpc_type > FF_LPC_TYPE_FIXED); /* reinit LPC context if parameters have changed */ if (blocksize != s->blocksize || max_order != s->max_order || @@ -177,7 +177,7 @@ int ff_lpc_calc_coefs(LPCContext *s, ff_lpc_init(s, blocksize, max_order, lpc_type); } - if (lpc_type == AV_LPC_TYPE_LEVINSON) { + if (lpc_type == FF_LPC_TYPE_LEVINSON) { double *windowed_samples = s->windowed_samples + max_order; s->lpc_apply_welch_window(samples, blocksize, windowed_samples); @@ -188,7 +188,7 @@ int ff_lpc_calc_coefs(LPCContext *s, for(i=0; iblocksize = blocksize; s->max_order = max_order; s->lpc_type = lpc_type; - if (lpc_type == AV_LPC_TYPE_LEVINSON) { + if (lpc_type == FF_LPC_TYPE_LEVINSON) { s->windowed_samples = av_mallocz((blocksize + max_order + 2) * sizeof(*s->windowed_samples)); if (!s->windowed_samples) diff --git a/libavcodec/lpc.h b/libavcodec/lpc.h index 96b66df909..9db5dbac30 100644 --- a/libavcodec/lpc.h +++ b/libavcodec/lpc.h @@ -35,11 +35,22 @@ #define MIN_LPC_ORDER 1 #define MAX_LPC_ORDER 32 +/** + * LPC analysis type + */ +enum FFLPCType { + FF_LPC_TYPE_DEFAULT = -1, ///< use the codec default LPC type + FF_LPC_TYPE_NONE = 0, ///< do not use LPC prediction or use all zero coefficients + FF_LPC_TYPE_FIXED = 1, ///< 
fixed LPC coefficients + FF_LPC_TYPE_LEVINSON = 2, ///< Levinson-Durbin recursion + FF_LPC_TYPE_CHOLESKY = 3, ///< Cholesky factorization + FF_LPC_TYPE_NB , ///< Not part of ABI +}; typedef struct LPCContext { int blocksize; int max_order; - enum AVLPCType lpc_type; + enum FFLPCType lpc_type; double *windowed_samples; /** @@ -77,14 +88,14 @@ int ff_lpc_calc_coefs(LPCContext *s, const int32_t *samples, int blocksize, int min_order, int max_order, int precision, int32_t coefs[][MAX_LPC_ORDER], int *shift, - enum AVLPCType lpc_type, int lpc_passes, + enum FFLPCType lpc_type, int lpc_passes, int omethod, int max_shift, int zero_shift); /** * Initialize LPCContext. */ int ff_lpc_init(LPCContext *s, int blocksize, int max_order, - enum AVLPCType lpc_type); + enum FFLPCType lpc_type); void ff_lpc_init_x86(LPCContext *s); /** diff --git a/libavcodec/mlib/dsputil_mlib.c b/libavcodec/mlib/dsputil_mlib.c index 3b2d693d88..1a18a8a223 100644 --- a/libavcodec/mlib/dsputil_mlib.c +++ b/libavcodec/mlib/dsputil_mlib.c @@ -421,13 +421,13 @@ static void ff_fdct_mlib(DCTELEM *data) void dsputil_init_mlib(DSPContext* c, AVCodecContext *avctx) { - const int h264_high_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8; + const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8; c->get_pixels = get_pixels_mlib; c->diff_pixels = diff_pixels_mlib; c->add_pixels_clamped = add_pixels_clamped_mlib; - if (!h264_high_depth) { + if (!high_bit_depth) { c->put_pixels_tab[0][0] = put_pixels16_mlib; c->put_pixels_tab[0][1] = put_pixels16_x2_mlib; c->put_pixels_tab[0][2] = put_pixels16_y2_mlib; diff --git a/libavcodec/options.c b/libavcodec/options.c index c0eb790b59..de25635161 100644 --- a/libavcodec/options.c +++ b/libavcodec/options.c @@ -103,7 +103,7 @@ static const AVOption options[]={ {"umh", "umh motion estimation", 0, FF_OPT_TYPE_CONST, {.dbl = ME_UMH }, INT_MIN, INT_MAX, V|E, "me_method" }, {"iter", "iter motion estimation", 0, FF_OPT_TYPE_CONST, {.dbl = ME_ITER }, INT_MIN, INT_MAX, V|E, "me_method" }, {"extradata_size", NULL, OFFSET(extradata_size), FF_OPT_TYPE_INT, {.dbl = DEFAULT }, INT_MIN, INT_MAX}, -{"time_base", NULL, OFFSET(time_base), FF_OPT_TYPE_RATIONAL, {.dbl=0}, INT_MIN, INT_MAX}, +{"time_base", NULL, OFFSET(time_base), FF_OPT_TYPE_RATIONAL, {.dbl = 0}, INT_MIN, INT_MAX}, {"g", "set the group of picture size", OFFSET(gop_size), FF_OPT_TYPE_INT, {.dbl = 12 }, INT_MIN, INT_MAX, V|E}, {"ar", "set audio sampling rate (in Hz)", OFFSET(sample_rate), FF_OPT_TYPE_INT, {.dbl = DEFAULT }, INT_MIN, INT_MAX}, {"ac", "set number of audio channels", OFFSET(channels), FF_OPT_TYPE_INT, {.dbl = DEFAULT }, INT_MIN, INT_MAX}, @@ -224,7 +224,7 @@ static const AVOption options[]={ {"left", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_PRED_LEFT }, INT_MIN, INT_MAX, V|E, "pred"}, {"plane", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_PRED_PLANE }, INT_MIN, INT_MAX, V|E, "pred"}, {"median", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_PRED_MEDIAN }, INT_MIN, INT_MAX, V|E, "pred"}, -{"aspect", "sample aspect ratio", OFFSET(sample_aspect_ratio), FF_OPT_TYPE_RATIONAL, {.dbl=0}, 0, 10, V|E}, +{"aspect", "sample aspect ratio", OFFSET(sample_aspect_ratio), FF_OPT_TYPE_RATIONAL, {.dbl = 0}, 0, 10, V|E}, {"debug", "print specific debug info", OFFSET(debug), FF_OPT_TYPE_FLAGS, {.dbl = DEFAULT }, 0, INT_MAX, V|A|S|E|D, "debug"}, {"pict", "picture info", 0, FF_OPT_TYPE_CONST, {.dbl = FF_DEBUG_PICT_INFO }, INT_MIN, INT_MAX, V|D, "debug"}, {"rc", "rate control", 0, FF_OPT_TYPE_CONST, {.dbl = 
FF_DEBUG_RC }, INT_MIN, INT_MAX, V|E, "debug"}, @@ -380,12 +380,14 @@ static const AVOption options[]={ {"ivlc", "intra vlc table", 0, FF_OPT_TYPE_CONST, {.dbl = CODEC_FLAG2_INTRA_VLC }, INT_MIN, INT_MAX, V|E, "flags2"}, {"b_sensitivity", "adjusts sensitivity of b_frame_strategy 1", OFFSET(b_sensitivity), FF_OPT_TYPE_INT, {.dbl = 40 }, 1, INT_MAX, V|E}, {"compression_level", NULL, OFFSET(compression_level), FF_OPT_TYPE_INT, {.dbl = FF_COMPRESSION_DEFAULT }, INT_MIN, INT_MAX, V|A|E}, -{"lpc_coeff_precision", "LPC coefficient precision (FLAC)", OFFSET(lpc_coeff_precision), FF_OPT_TYPE_INT, {.dbl = DEFAULT }, 0, INT_MAX, A|E}, {"min_prediction_order", NULL, OFFSET(min_prediction_order), FF_OPT_TYPE_INT, {.dbl = -1 }, INT_MIN, INT_MAX, A|E}, {"max_prediction_order", NULL, OFFSET(max_prediction_order), FF_OPT_TYPE_INT, {.dbl = -1 }, INT_MIN, INT_MAX, A|E}, -{"prediction_order_method", "search method for selecting prediction order", OFFSET(prediction_order_method), FF_OPT_TYPE_INT, {.dbl = -1 }, INT_MIN, INT_MAX, A|E}, -{"min_partition_order", NULL, OFFSET(min_partition_order), FF_OPT_TYPE_INT, {.dbl = -1 }, INT_MIN, INT_MAX, A|E}, -{"max_partition_order", NULL, OFFSET(max_partition_order), FF_OPT_TYPE_INT, {.dbl = -1 }, INT_MIN, INT_MAX, A|E}, +#if FF_API_FLAC_GLOBAL_OPTS +{"lpc_coeff_precision", "deprecated, use flac-specific options", OFFSET(lpc_coeff_precision), FF_OPT_TYPE_INT, {.dbl = DEFAULT }, 0, INT_MAX, A|E}, +{"prediction_order_method", "deprecated, use flac-specific options", OFFSET(prediction_order_method), FF_OPT_TYPE_INT, {.dbl = -1 }, INT_MIN, INT_MAX, A|E}, +{"min_partition_order", "deprecated, use flac-specific options", OFFSET(min_partition_order), FF_OPT_TYPE_INT, {.dbl = -1 }, INT_MIN, INT_MAX, A|E}, +{"max_partition_order", "deprecated, use flac-specific options", OFFSET(max_partition_order), FF_OPT_TYPE_INT, {.dbl = -1 }, INT_MIN, INT_MAX, A|E}, +#endif {"timecode_frame_start", "GOP timecode frame start number, in non drop frame format", OFFSET(timecode_frame_start), FF_OPT_TYPE_INT64, {.dbl = 0 }, 0, INT64_MAX, V|E}, {"drop_frame_timecode", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = CODEC_FLAG2_DROP_FRAME_TIMECODE }, INT_MIN, INT_MAX, V|E, "flags2"}, {"non_linear_q", "use non linear quantizer", 0, FF_OPT_TYPE_CONST, {.dbl = CODEC_FLAG2_NON_LINEAR_QUANT }, INT_MIN, INT_MAX, V|E, "flags2"}, @@ -416,12 +418,14 @@ static const AVOption options[]={ {"intra_refresh", "use periodic insertion of intra blocks instead of keyframes", 0, FF_OPT_TYPE_CONST, {.dbl = CODEC_FLAG2_INTRA_REFRESH }, INT_MIN, INT_MAX, V|E, "flags2"}, {"crf_max", "in crf mode, prevents vbv from lowering quality beyond this point", OFFSET(crf_max), FF_OPT_TYPE_FLOAT, {.dbl = DEFAULT }, 0, 51, V|E}, {"log_level_offset", "set the log level offset", OFFSET(log_level_offset), FF_OPT_TYPE_INT, {.dbl = 0 }, INT_MIN, INT_MAX }, -{"lpc_type", "specify LPC algorithm", OFFSET(lpc_type), FF_OPT_TYPE_INT, {.dbl = AV_LPC_TYPE_DEFAULT }, AV_LPC_TYPE_DEFAULT, AV_LPC_TYPE_NB-1, A|E}, +#if FF_API_FLAC_GLOBAL_OPTS +{"lpc_type", "deprecated, use flac-specific options", OFFSET(lpc_type), FF_OPT_TYPE_INT, {.dbl = AV_LPC_TYPE_DEFAULT }, AV_LPC_TYPE_DEFAULT, AV_LPC_TYPE_NB-1, A|E}, {"none", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = AV_LPC_TYPE_NONE }, INT_MIN, INT_MAX, A|E, "lpc_type"}, {"fixed", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = AV_LPC_TYPE_FIXED }, INT_MIN, INT_MAX, A|E, "lpc_type"}, {"levinson", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = AV_LPC_TYPE_LEVINSON }, INT_MIN, INT_MAX, A|E, "lpc_type"}, {"cholesky", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = 
AV_LPC_TYPE_CHOLESKY }, INT_MIN, INT_MAX, A|E, "lpc_type"}, -{"lpc_passes", "number of passes to use for Cholesky factorization during LPC analysis", OFFSET(lpc_passes), FF_OPT_TYPE_INT, {.dbl = -1 }, INT_MIN, INT_MAX, A|E}, +{"lpc_passes", "deprecated, use flac-specific options", OFFSET(lpc_passes), FF_OPT_TYPE_INT, {.dbl = -1 }, INT_MIN, INT_MAX, A|E}, +#endif {"slices", "number of slices, used in parallelized decoding", OFFSET(slices), FF_OPT_TYPE_INT, {.dbl = 0 }, 0, INT_MAX, V|E}, {"thread_type", "select multithreading type", OFFSET(thread_type), FF_OPT_TYPE_INT, {.dbl = FF_THREAD_SLICE|FF_THREAD_FRAME }, 0, INT_MAX, V|E|D, "thread_type"}, {"slice", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_THREAD_SLICE }, INT_MIN, INT_MAX, V|E|D, "thread_type"}, diff --git a/libavcodec/ppc/dsputil_altivec.c b/libavcodec/ppc/dsputil_altivec.c index a8d0a61a85..bd432beb87 100644 --- a/libavcodec/ppc/dsputil_altivec.c +++ b/libavcodec/ppc/dsputil_altivec.c @@ -1384,7 +1384,7 @@ static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int l void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx) { - const int h264_high_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8; + const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8; c->pix_abs[0][1] = sad16_x2_altivec; c->pix_abs[0][2] = sad16_y2_altivec; @@ -1399,10 +1399,10 @@ void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx) c->pix_sum = pix_sum_altivec; c->diff_pixels = diff_pixels_altivec; c->get_pixels = get_pixels_altivec; - if (!h264_high_depth) + if (!high_bit_depth) c->clear_block = clear_block_altivec; c->add_bytes= add_bytes_altivec; - if (!h264_high_depth) { + if (!high_bit_depth) { c->put_pixels_tab[0][0] = put_pixels16_altivec; /* the two functions do the same thing, so use the same code */ c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec; diff --git a/libavcodec/ppc/dsputil_ppc.c b/libavcodec/ppc/dsputil_ppc.c index 57f30ef703..327fe2c72f 100644 --- a/libavcodec/ppc/dsputil_ppc.c +++ b/libavcodec/ppc/dsputil_ppc.c @@ -153,11 +153,11 @@ static void prefetch_ppc(void *mem, int stride, int h) void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx) { - const int h264_high_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8; + const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8; // Common optimizations whether AltiVec is available or not c->prefetch = prefetch_ppc; - if (!h264_high_depth) { + if (!high_bit_depth) { switch (check_dcbzl_effect()) { case 32: c->clear_blocks = clear_blocks_dcbz32_ppc; diff --git a/libavcodec/ppc/h264_altivec.c b/libavcodec/ppc/h264_altivec.c index 0e58846f51..9df18888ad 100644 --- a/libavcodec/ppc/h264_altivec.c +++ b/libavcodec/ppc/h264_altivec.c @@ -965,10 +965,10 @@ H264_WEIGHT( 8, 8) H264_WEIGHT( 8, 4) void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) { - const int h264_high_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8; + const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8; if (av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC) { - if (!h264_high_depth) { + if (!high_bit_depth) { c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_altivec; c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec; diff --git a/libavcodec/ps2/dsputil_mmi.c b/libavcodec/ps2/dsputil_mmi.c index 4190f9da10..349583f1ba 100644 --- a/libavcodec/ps2/dsputil_mmi.c +++ 
b/libavcodec/ps2/dsputil_mmi.c @@ -142,9 +142,9 @@ static void put_pixels16_mmi(uint8_t *block, const uint8_t *pixels, int line_siz void dsputil_init_mmi(DSPContext* c, AVCodecContext *avctx) { const int idct_algo= avctx->idct_algo; - const int h264_high_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8; + const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8; - if (!h264_high_depth) { + if (!high_bit_depth) { c->clear_blocks = clear_blocks_mmi; c->put_pixels_tab[1][0] = put_pixels8_mmi; diff --git a/libavcodec/ra144enc.c b/libavcodec/ra144enc.c index a0912056d7..351ba9e871 100644 --- a/libavcodec/ra144enc.c +++ b/libavcodec/ra144enc.c @@ -54,7 +54,7 @@ static av_cold int ra144_encode_init(AVCodecContext * avctx) ractx->lpc_coef[1] = ractx->lpc_tables[1]; ractx->avctx = avctx; ret = ff_lpc_init(&ractx->lpc_ctx, avctx->frame_size, LPC_ORDER, - AV_LPC_TYPE_LEVINSON); + FF_LPC_TYPE_LEVINSON); return ret; } @@ -461,7 +461,7 @@ static int ra144_encode_frame(AVCodecContext *avctx, uint8_t *frame, 32)]; ff_lpc_calc_coefs(&ractx->lpc_ctx, lpc_data, NBLOCKS * BLOCKSIZE, LPC_ORDER, - LPC_ORDER, 16, lpc_coefs, shift, AV_LPC_TYPE_LEVINSON, + LPC_ORDER, 16, lpc_coefs, shift, FF_LPC_TYPE_LEVINSON, 0, ORDER_METHOD_EST, 12, 0); for (i = 0; i < LPC_ORDER; i++) block_coefs[NBLOCKS - 1][i] = -(lpc_coefs[LPC_ORDER - 1][i] << diff --git a/libavcodec/sh4/dsputil_align.c b/libavcodec/sh4/dsputil_align.c index 93b663894a..8be9318cdb 100644 --- a/libavcodec/sh4/dsputil_align.c +++ b/libavcodec/sh4/dsputil_align.c @@ -333,9 +333,9 @@ DEFFUNC(avg,no_rnd,xy,16,OP_XY,PACK) void dsputil_init_align(DSPContext* c, AVCodecContext *avctx) { - const int h264_high_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8; + const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8; - if (!h264_high_depth) { + if (!high_bit_depth) { c->put_pixels_tab[0][0] = put_rnd_pixels16_o; c->put_pixels_tab[0][1] = put_rnd_pixels16_x; c->put_pixels_tab[0][2] = put_rnd_pixels16_y; @@ -405,7 +405,7 @@ void dsputil_init_align(DSPContext* c, AVCodecContext *avctx) dspfunc(avg_qpel, 1, 8); /* dspfunc(avg_no_rnd_qpel, 1, 8); */ - if (!h264_high_depth) { + if (!high_bit_depth) { dspfunc(put_h264_qpel, 0, 16); dspfunc(put_h264_qpel, 1, 8); dspfunc(put_h264_qpel, 2, 4); @@ -415,7 +415,7 @@ void dsputil_init_align(DSPContext* c, AVCodecContext *avctx) } #undef dspfunc - if (!h264_high_depth) { + if (!high_bit_depth) { c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_sh4; c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_sh4; c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_sh4; diff --git a/libavcodec/sh4/dsputil_sh4.c b/libavcodec/sh4/dsputil_sh4.c index 219bb4c353..d254e1db6b 100644 --- a/libavcodec/sh4/dsputil_sh4.c +++ b/libavcodec/sh4/dsputil_sh4.c @@ -92,10 +92,10 @@ static void idct_add(uint8_t *dest, int line_size, DCTELEM *block) void dsputil_init_sh4(DSPContext* c, AVCodecContext *avctx) { const int idct_algo= avctx->idct_algo; - const int h264_high_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8; + const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8; dsputil_init_align(c,avctx); - if (!h264_high_depth) + if (!high_bit_depth) c->clear_blocks = clear_blocks_sh4; if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SH4){ c->idct_put = idct_put; diff --git a/libavcodec/sparc/dsputil_vis.c b/libavcodec/sparc/dsputil_vis.c index 
ba921ad772..e4236602f6 100644 --- a/libavcodec/sparc/dsputil_vis.c +++ b/libavcodec/sparc/dsputil_vis.c @@ -3953,7 +3953,7 @@ void dsputil_init_vis(DSPContext* c, AVCodecContext *avctx) { /* VIS-specific optimizations */ int accel = vis_level (); - const int h264_high_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8; + const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8; if (accel & ACCEL_SPARC_VIS) { if(avctx->idct_algo==FF_IDCT_SIMPLEVIS){ @@ -3963,7 +3963,7 @@ void dsputil_init_vis(DSPContext* c, AVCodecContext *avctx) c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; } - if (!h264_high_depth) { + if (!high_bit_depth) { c->put_pixels_tab[0][0] = MC_put_o_16_vis; c->put_pixels_tab[0][1] = MC_put_x_16_vis; c->put_pixels_tab[0][2] = MC_put_y_16_vis; diff --git a/libavcodec/utils.c b/libavcodec/utils.c index c5d932d399..86a1b4957a 100644 --- a/libavcodec/utils.c +++ b/libavcodec/utils.c @@ -1297,9 +1297,9 @@ int av_lockmgr_register(int (*cb)(void **mutex, enum AVLockOp op)) unsigned int ff_toupper4(unsigned int x) { return toupper( x &0xFF) - + (toupper((x>>8 )&0xFF)<<8 ) - + (toupper((x>>16)&0xFF)<<16) - + (toupper((x>>24)&0xFF)<<24); + + (toupper((x>>8 )&0xFF)<<8 ) + + (toupper((x>>16)&0xFF)<<16) + + (toupper((x>>24)&0xFF)<<24); } #if !HAVE_PTHREADS diff --git a/libavcodec/version.h b/libavcodec/version.h index 56d1eac52c..067cf4af89 100644 --- a/libavcodec/version.h +++ b/libavcodec/version.h @@ -62,5 +62,8 @@ #ifndef FF_API_OLD_FF_PICT_TYPES #define FF_API_OLD_FF_PICT_TYPES (LIBAVCODEC_VERSION_MAJOR < 54) #endif +#ifndef FF_API_FLAC_GLOBAL_OPTS +#define FF_API_FLAC_GLOBAL_OPTS (LIBAVCODEC_VERSION_MAJOR < 54) +#endif #endif /* AVCODEC_VERSION_H */ diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 985a15d2f1..1a5413bda4 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -2322,7 +2322,7 @@ float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order); void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) { int mm_flags = av_get_cpu_flags(); - const int h264_high_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8; + const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8; if (avctx->dsp_mask) { if (avctx->dsp_mask & AV_CPU_FLAG_FORCE) @@ -2404,7 +2404,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->put_pixels_clamped = ff_put_pixels_clamped_mmx; c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx; c->add_pixels_clamped = ff_add_pixels_clamped_mmx; - if (!h264_high_depth) { + if (!high_bit_depth) { c->clear_block = clear_block_mmx; c->clear_blocks = clear_blocks_mmx; if ((mm_flags & AV_CPU_FLAG_SSE) && @@ -2421,7 +2421,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \ c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU - if (!h264_high_depth) { + if (!high_bit_depth) { SET_HPEL_FUNCS(put, 0, 16, mmx); SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx); SET_HPEL_FUNCS(avg, 0, 16, mmx); @@ -2436,13 +2436,13 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->gmc= gmc_mmx; #endif #if ARCH_X86_32 && HAVE_YASM - if (!h264_high_depth) + if (!high_bit_depth) c->emulated_edge_mc = emulated_edge_mc_mmx; #endif c->add_bytes= add_bytes_mmx; - if (!h264_high_depth) + if (!high_bit_depth) c->draw_edges = draw_edges_mmx; if (CONFIG_H263_DECODER || 
CONFIG_H263_ENCODER) { @@ -2451,7 +2451,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) } #if HAVE_YASM - if (!h264_high_depth) { + if (!high_bit_depth) { c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_mmx_rnd; c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_mmx; } @@ -2463,7 +2463,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) if (mm_flags & AV_CPU_FLAG_MMX2) { c->prefetch = prefetch_mmx2; - if (!h264_high_depth) { + if (!high_bit_depth) { c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2; c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2; @@ -2480,7 +2480,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) } if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ - if (!h264_high_depth) { + if (!high_bit_depth) { c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2; c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2; c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2; @@ -2529,7 +2529,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2); SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2); - if (!h264_high_depth) { + if (!high_bit_depth) { SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2); SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2); SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2); @@ -2547,7 +2547,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->avg_rv40_chroma_pixels_tab[0]= ff_avg_rv40_chroma_mc8_mmx2; c->avg_rv40_chroma_pixels_tab[1]= ff_avg_rv40_chroma_mc4_mmx2; - if (!h264_high_depth) { + if (!high_bit_depth) { c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_mmx2_rnd; c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_mmx2; c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_mmx2; @@ -2564,7 +2564,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) } else if (mm_flags & AV_CPU_FLAG_3DNOW) { c->prefetch = prefetch_3dnow; - if (!h264_high_depth) { + if (!high_bit_depth) { c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow; c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow; @@ -2602,7 +2602,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow); SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow); - if (!h264_high_depth) { + if (!high_bit_depth) { SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow); SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow); SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow); @@ -2617,7 +2617,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow); #if HAVE_YASM - if (!h264_high_depth) { + if (!high_bit_depth) { c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_3dnow_rnd; c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_3dnow; } @@ -2635,7 +2635,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU; if((mm_flags & AV_CPU_FLAG_SSE2) && !(mm_flags & AV_CPU_FLAG_3DNOW)){ // these functions are slower than mmx on AMD, but faster on Intel - if (!h264_high_depth) { + if (!high_bit_depth) { c->put_pixels_tab[0][0] = put_pixels16_sse2; c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2; c->avg_pixels_tab[0][0] = avg_pixels16_sse2; @@ -2643,7 +2643,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) } } if(mm_flags & AV_CPU_FLAG_SSE2){ - if (!h264_high_depth) { + if (!high_bit_depth) { H264_QPEL_FUNCS(0, 1, sse2); H264_QPEL_FUNCS(0, 2, sse2); H264_QPEL_FUNCS(0, 3, sse2); @@ -2660,7 +2660,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) } #if 
diff --git a/libavformat/utils.c b/libavformat/utils.c
index b4c51d135a..7624cb4f67 100644
--- a/libavformat/utils.c
+++ b/libavformat/utils.c
@@ -3048,12 +3048,12 @@ static int ff_interleave_compare_dts(AVFormatContext *s, AVPacket *next, AVPacke
 {
     AVStream *st = s->streams[ pkt ->stream_index];
     AVStream *st2= s->streams[ next->stream_index];
-    int64_t a= st2->time_base.num * (int64_t)st ->time_base.den;
-    int64_t b= st ->time_base.num * (int64_t)st2->time_base.den;
-    int64_t dts1 = av_rescale_rnd(pkt->dts, b, a, AV_ROUND_DOWN);
-    if (dts1==next->dts && dts1==av_rescale_rnd(pkt->dts, b, a, AV_ROUND_UP))
+    int comp = av_compare_ts(next->dts, st2->time_base, pkt->dts,
+                             st->time_base);
+
+    if (comp == 0)
         return pkt->stream_index < next->stream_index;
-    return dts1 < next->dts;
+    return comp > 0;
 }

 int av_interleave_packet_per_dts(AVFormatContext *s, AVPacket *out, AVPacket *pkt, int flush){
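
The ff_interleave_compare_dts() change above replaces the hand-rolled rescale-and-compare with av_compare_ts(), which takes each timestamp together with its own time base and returns -1, 0 or 1. A minimal standalone sketch of that behaviour, with example values chosen purely for illustration:

    /* Illustration only: compare two DTS values that live in different time bases. */
    #include <stdio.h>
    #include <stdint.h>
    #include <libavutil/mathematics.h>

    int main(void)
    {
        AVRational tb_a = { 1, 48000 };  /* e.g. an audio stream time base   */
        AVRational tb_b = { 1, 90000 };  /* e.g. an MPEG-TS video time base  */
        int64_t    ts_a = 48001;         /* just after the 1 s mark          */
        int64_t    ts_b = 90000;         /* exactly the 1 s mark             */

        /* -1 if ts_a is earlier than ts_b, 1 if later, 0 if they represent
         * the same instant. */
        int comp = av_compare_ts(ts_a, tb_a, ts_b, tb_b);
        printf("comp = %d\n", comp);     /* prints 1 for these values */
        return 0;
    }

When av_compare_ts() reports equality, the new code still falls back to comparing stream indices, just as the old rounded-equality branch did.
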
diff --git a/libavutil/opt.h b/libavutil/opt.h
index 018d8a2b4f..b04c7905d6 100644
--- a/libavutil/opt.h
+++ b/libavutil/opt.h
@@ -67,6 +67,9 @@ typedef struct AVOption {
     union {
         double dbl;
         const char *str;
+        /* TODO those are unused now */
+        int64_t i64;
+        AVRational q;
     } default_val;
     double min;                 ///< minimum valid value for the option
     double max;                 ///< maximum valid value for the option
diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h
index 6713da23f6..69d28dd4cc 100644
--- a/libavutil/pixfmt.h
+++ b/libavutil/pixfmt.h
@@ -136,7 +136,7 @@ enum PixelFormat {
     PIX_FMT_BGR48BE,   ///< packed RGB 16:16:16, 48bpp, 16B, 16G, 16R, the 2-byte value for each R/G/B component is stored as big-endian
     PIX_FMT_BGR48LE,   ///< packed RGB 16:16:16, 48bpp, 16B, 16G, 16R, the 2-byte value for each R/G/B component is stored as little-endian

-    //the following 4 formats are deprecated and should be replaced by PIX_FMT_YUV420P16* with the bpp stored seperately
+    //the following 6 formats are deprecated and should be replaced by PIX_FMT_YUV420P16* with the bpp stored seperately
     PIX_FMT_YUV420P9BE, ///< planar YUV 4:2:0, 13.5bpp, (1 Cr & Cb sample per 2x2 Y samples), big-endian
     PIX_FMT_YUV420P9LE, ///< planar YUV 4:2:0, 13.5bpp, (1 Cr & Cb sample per 2x2 Y samples), little-endian
     PIX_FMT_YUV420P10BE,///< planar YUV 4:2:0, 15bpp, (1 Cr & Cb sample per 2x2 Y samples), big-endian
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 2c6269487a..f00a45cf38 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -107,9 +107,13 @@ const char *swscale_license(void)
         || (x)==PIX_FMT_YUV440P     \
         || (x)==PIX_FMT_MONOWHITE   \
         || (x)==PIX_FMT_MONOBLACK   \
+        || (x)==PIX_FMT_YUV420P9LE  \
+        || (x)==PIX_FMT_YUV420P10LE \
         || (x)==PIX_FMT_YUV420P16LE \
         || (x)==PIX_FMT_YUV422P16LE \
         || (x)==PIX_FMT_YUV444P16LE \
+        || (x)==PIX_FMT_YUV420P9BE  \
+        || (x)==PIX_FMT_YUV420P10BE \
         || (x)==PIX_FMT_YUV420P16BE \
         || (x)==PIX_FMT_YUV422P16BE \
         || (x)==PIX_FMT_YUV444P16BE \
diff --git a/tests/fate/h264.mak b/tests/fate/h264.mak
index 259e1e0ea0..9cb669c1a0 100644
--- a/tests/fate/h264.mak
+++ b/tests/fate/h264.mak
@@ -127,6 +127,13 @@ FATE_H264 = aud_mw_e \
             frext-hpcvflnl_bcrm_a \
             frext-hpcvmolq_brcm_b \
             frext-hpcvnl_brcm_a \
+            frext-pph10i1_panasonic_a \
+            frext-pph10i2_panasonic_a \
+            frext-pph10i3_panasonic_a \
+            frext-pph10i4_panasonic_a \
+            frext-pph10i5_panasonic_a \
+            frext-pph10i6_panasonic_a \
+            frext-pph10i7_panasonic_a \
             hcbp2_hhi_a \
             hcmp1_hhi_a \
             ls_sva_d \
@@ -301,6 +308,13 @@ fate-h264-conformance-frext-hpcvfl_bcrm_a: CMD = framecrc -i $(SAMPLES)/h264-co
 fate-h264-conformance-frext-hpcvflnl_bcrm_a: CMD = framecrc -i $(SAMPLES)/h264-conformance/FRext/HPCVFLNL_BRCM_A.264 -vsync 0
 fate-h264-conformance-frext-hpcvmolq_brcm_b: CMD = framecrc -i $(SAMPLES)/h264-conformance/FRext/HPCVMOLQ_BRCM_B.264
 fate-h264-conformance-frext-hpcvnl_brcm_a: CMD = framecrc -i $(SAMPLES)/h264-conformance/FRext/HPCVNL_BRCM_A.264
+fate-h264-conformance-frext-pph10i1_panasonic_a: CMD = framecrc -i $(SAMPLES)/h264-conformance/FRext/PPH10I1_Panasonic_A.264
+fate-h264-conformance-frext-pph10i2_panasonic_a: CMD = framecrc -i $(SAMPLES)/h264-conformance/FRext/PPH10I2_Panasonic_A.264
+fate-h264-conformance-frext-pph10i3_panasonic_a: CMD = framecrc -i $(SAMPLES)/h264-conformance/FRext/PPH10I3_Panasonic_A.264
+fate-h264-conformance-frext-pph10i4_panasonic_a: CMD = framecrc -i $(SAMPLES)/h264-conformance/FRext/PPH10I4_Panasonic_A.264
+fate-h264-conformance-frext-pph10i5_panasonic_a: CMD = framecrc -i $(SAMPLES)/h264-conformance/FRext/PPH10I5_Panasonic_A.264
+fate-h264-conformance-frext-pph10i6_panasonic_a: CMD = framecrc -i $(SAMPLES)/h264-conformance/FRext/PPH10I6_Panasonic_A.264
+fate-h264-conformance-frext-pph10i7_panasonic_a: CMD = framecrc -i $(SAMPLES)/h264-conformance/FRext/PPH10I7_Panasonic_A.264
 fate-h264-conformance-hcbp2_hhi_a: CMD = framecrc -vsync 0 -strict 1 -i $(SAMPLES)/h264-conformance/HCBP2_HHI_A.264
 fate-h264-conformance-hcmp1_hhi_a: CMD = framecrc -vsync 0 -strict 1 -i $(SAMPLES)/h264-conformance/HCMP1_HHI_A.264
 fate-h264-conformance-ls_sva_d: CMD = framecrc -i $(SAMPLES)/h264-conformance/LS_SVA_D.264
diff --git a/tests/ref/fate/h264-conformance-frext-pph10i1_panasonic_a b/tests/ref/fate/h264-conformance-frext-pph10i1_panasonic_a
new file mode 100644
index 0000000000..1cfc313e32
--- /dev/null
+++ b/tests/ref/fate/h264-conformance-frext-pph10i1_panasonic_a
@@ -0,0 +1,10 @@
+0, 0, 2764800, 0xcc4df07d
+0, 3600, 2764800, 0x85f9e6d4
+0, 7200, 2764800, 0x23ffe90d
+0, 10800, 2764800, 0xf0a6d453
+0, 14400, 2764800, 0x913a6392
+0, 18000, 2764800, 0xcc5f9736
+0, 21600, 2764800, 0x43f9f9ce
+0, 25200, 2764800, 0xc874b44e
+0, 28800, 2764800, 0x83b665e6
+0, 32400, 2764800, 0x5ea2e31e
diff --git a/tests/ref/fate/h264-conformance-frext-pph10i2_panasonic_a b/tests/ref/fate/h264-conformance-frext-pph10i2_panasonic_a
new file mode 100644
index 0000000000..274bdaf2b1
--- /dev/null
+++ b/tests/ref/fate/h264-conformance-frext-pph10i2_panasonic_a
@@ -0,0 +1,10 @@
+0, 0, 2764800, 0x4f710132
+0, 3600, 2764800, 0x57e5b713
+0, 7200, 2764800, 0xcca01477
+0, 10800, 2764800, 0xa19a95cd
+0, 14400, 2764800, 0x700a757d
+0, 18000, 2764800, 0xd8c6f60f
+0, 21600, 2764800, 0x95a1bbc7
+0, 25200, 2764800, 0x0582077a
+0, 28800, 2764800, 0x91595f91
+0, 32400, 2764800, 0xf5fe034a
diff --git a/tests/ref/fate/h264-conformance-frext-pph10i3_panasonic_a b/tests/ref/fate/h264-conformance-frext-pph10i3_panasonic_a
new file mode 100644
index 0000000000..195e45a67b
--- /dev/null
+++ b/tests/ref/fate/h264-conformance-frext-pph10i3_panasonic_a
@@ -0,0 +1,10 @@
+0, 0, 2764800, 0xda69f69e
+0, 3600, 2764800, 0x29ed832f
+0, 7200, 2764800, 0xb3244cc4
+0, 10800, 2764800, 0xe41a312c
+0, 14400, 2764800, 0xac0b344b
+0, 18000, 2764800, 0xc585aa20
+0, 21600, 2764800, 0x0952054c
+0, 25200, 2764800, 0xd1a02f87
+0, 28800, 2764800, 0xfcbfe87c
+0, 32400, 2764800, 0xe4e9b8a2
diff --git a/tests/ref/fate/h264-conformance-frext-pph10i4_panasonic_a b/tests/ref/fate/h264-conformance-frext-pph10i4_panasonic_a
new file mode 100644
index 0000000000..d351a7eb1f
--- /dev/null
+++ b/tests/ref/fate/h264-conformance-frext-pph10i4_panasonic_a
@@ -0,0 +1,19 @@
+0, 0, 6220800, 0xca2a2a5e
+0, 3600, 6220800, 0x8009a65e
+0, 7200, 6220800, 0x63e72b3b
+0, 10800, 6220800, 0x7459a1cc
+0, 14400, 6220800, 0x02191aa9
+0, 18000, 6220800, 0x88dca590
+0, 21600, 6220800, 0x56dd150a
+0, 25200, 6220800, 0x5f56a56f
+0, 28800, 6220800, 0x67ada4b7
+0, 32400, 6220800, 0x88dca590
+0, 36000, 6220800, 0xd3b09fe5
+0, 39600, 6220800, 0x2223998c
+0, 43200, 6220800, 0x5e5b2da5
+0, 46800, 6220800, 0x88dca590
+0, 50400, 6220800, 0x5e5b2da5
+0, 54000, 6220800, 0x88dca590
+0, 57600, 6220800, 0x5e5b2da5
+0, 61200, 6220800, 0x88dca590
+0, 64800, 6220800, 0x26e1ec8b
diff --git a/tests/ref/fate/h264-conformance-frext-pph10i5_panasonic_a b/tests/ref/fate/h264-conformance-frext-pph10i5_panasonic_a
new file mode 100644
index 0000000000..1afbac01e7
--- /dev/null
+++ b/tests/ref/fate/h264-conformance-frext-pph10i5_panasonic_a
@@ -0,0 +1,10 @@
+0, 0, 6220800, 0x1df58ce9
+0, 3600, 6220800, 0x8f2859ce
+0, 7200, 6220800, 0x229cc7ff
+0, 10800, 6220800, 0x73e86984
+0, 14400, 6220800, 0xb6d4504b
+0, 18000, 6220800, 0x4e7d4883
+0, 21600, 6220800, 0xbec3f0f7
+0, 25200, 6220800, 0x1d9af065
+0, 28800, 6220800, 0x44851549
+0, 32400, 6220800, 0xfcf8728e
diff --git a/tests/ref/fate/h264-conformance-frext-pph10i6_panasonic_a b/tests/ref/fate/h264-conformance-frext-pph10i6_panasonic_a
new file mode 100644
index 0000000000..6d105466c9
--- /dev/null
+++ b/tests/ref/fate/h264-conformance-frext-pph10i6_panasonic_a
@@ -0,0 +1,10 @@
+0, 0, 6220800, 0x408daf70
+0, 3600, 6220800, 0x59b254a3
+0, 7200, 6220800, 0x4cf4279c
+0, 10800, 6220800, 0x5c9437ae
+0, 14400, 6220800, 0x986c3eb8
+0, 18000, 6220800, 0x23fd883e
+0, 21600, 6220800, 0x84f222fe
+0, 25200, 6220800, 0xe7f91107
+0, 28800, 6220800, 0xb544b31e
+0, 32400, 6220800, 0x1ebdde56
diff --git a/tests/ref/fate/h264-conformance-frext-pph10i7_panasonic_a b/tests/ref/fate/h264-conformance-frext-pph10i7_panasonic_a
new file mode 100644
index 0000000000..28825446f9
--- /dev/null
+++ b/tests/ref/fate/h264-conformance-frext-pph10i7_panasonic_a
@@ -0,0 +1,10 @@
+0, 0, 6220800, 0xf81873fe
+0, 3600, 6220800, 0x7b96fbdc
+0, 7200, 6220800, 0x75dbafc4
+0, 10800, 6220800, 0x7524301e
+0, 14400, 6220800, 0x0f3621ab
+0, 18000, 6220800, 0xa5e25b35
+0, 21600, 6220800, 0x063a8116
+0, 25200, 6220800, 0x48ebc8ff
+0, 28800, 6220800, 0x1f635df8
+0, 32400, 6220800, 0xe282c8bd
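
For context on the new reference files: each line is one decoded frame as recorded by the framecrc output used in the CMD lines above, giving the stream index, the timestamp (DTS) in stream time-base units, the frame size in bytes and a checksum of the frame data. The sizes are what planar 4:2:0 video stored with two bytes per sample works out to; the 1280x720 and 1920x1080 resolutions below are assumptions inferred from those sizes, shown only as an arithmetic cross-check:

    /* Illustration only: 4:2:0 with 2 bytes per sample reproduces the sizes
     * seen in the reference files (the resolutions are inferred, not taken
     * from the patch). */
    #include <stdio.h>

    static long frame_bytes_420_16(long w, long h)
    {
        long luma   = w * h * 2;                  /* one full-size 16-bit plane */
        long chroma = 2 * (w / 2) * (h / 2) * 2;  /* two quarter-size planes    */
        return luma + chroma;
    }

    int main(void)
    {
        printf("%ld\n", frame_bytes_420_16(1280, 720));   /* 2764800 */
        printf("%ld\n", frame_bytes_420_16(1920, 1080));  /* 6220800 */
        return 0;
    }
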