Preparatory patch for high bit depth h264 decoding support.

Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
This commit is contained in:
Oskar Arvidsson 2011-05-09 11:18:37 -04:00 committed by Ronald S. Bultje
parent 325eefa2ca
commit de3e760720
2 changed files with 402 additions and 294 deletions

View File

@ -25,12 +25,19 @@
* @author Michael Niedermayer <michaelni@gmx.at> * @author Michael Niedermayer <michaelni@gmx.at>
*/ */
#define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom ) #define BIT_DEPTH 8
#define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1)) #define pixel uint8_t
#define av_clip_pixel av_clip_uint8
#define FUNCC(a) a ## _c
#define op_scale1(x) block[x] = av_clip_pixel( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x) dst[x] = av_clip_pixel( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \ #define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \ static void FUNCC(weight_h264_pixels ## W ## x ## H)(uint8_t *_block, int stride, int log2_denom, int weight, int offset){ \
int y; \ int y; \
offset <<= log2_denom; \ pixel *block = (pixel*)_block; \
stride /= sizeof(pixel); \
offset <<= (log2_denom + (BIT_DEPTH-8)); \
if(log2_denom) offset += 1<<(log2_denom-1); \ if(log2_denom) offset += 1<<(log2_denom-1); \
for(y=0; y<H; y++, block += stride){ \ for(y=0; y<H; y++, block += stride){ \
op_scale1(0); \ op_scale1(0); \
@ -54,8 +61,11 @@ static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride,
op_scale1(15); \ op_scale1(15); \
} \ } \
} \ } \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \ static void FUNCC(biweight_h264_pixels ## W ## x ## H)(uint8_t *_dst, uint8_t *_src, int stride, int log2_denom, int weightd, int weights, int offset){ \
int y; \ int y; \
pixel *dst = (pixel*)_dst; \
pixel *src = (pixel*)_src; \
stride /= sizeof(pixel); \
offset = ((offset + 1) | 1) << log2_denom; \ offset = ((offset + 1) | 1) << log2_denom; \
for(y=0; y<H; y++, dst += stride, src += stride){ \ for(y=0; y<H; y++, dst += stride, src += stride){ \
op_scale2(0); \ op_scale2(0); \
@ -95,11 +105,17 @@ H264_WEIGHT(2,2)
#undef op_scale2 #undef op_scale2
#undef H264_WEIGHT #undef H264_WEIGHT
static av_always_inline av_flatten void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int inner_iters, int alpha, int beta, int8_t *tc0) static av_always_inline av_flatten void FUNCC(h264_loop_filter_luma)(uint8_t *_pix, int xstride, int ystride, int inner_iters, int alpha, int beta, int8_t *tc0)
{ {
pixel *pix = (pixel*)_pix;
int i, d; int i, d;
xstride /= sizeof(pixel);
ystride /= sizeof(pixel);
alpha <<= BIT_DEPTH - 8;
beta <<= BIT_DEPTH - 8;
for( i = 0; i < 4; i++ ) { for( i = 0; i < 4; i++ ) {
if( tc0[i] < 0 ) { const int tc_orig = tc0[i] << (BIT_DEPTH - 8);
if( tc_orig < 0 ) {
pix += inner_iters*ystride; pix += inner_iters*ystride;
continue; continue;
} }
@ -115,44 +131,49 @@ static av_always_inline av_flatten void h264_loop_filter_luma_c(uint8_t *pix, in
FFABS( p1 - p0 ) < beta && FFABS( p1 - p0 ) < beta &&
FFABS( q1 - q0 ) < beta ) { FFABS( q1 - q0 ) < beta ) {
int tc = tc0[i]; int tc = tc_orig;
int i_delta; int i_delta;
if( FFABS( p2 - p0 ) < beta ) { if( FFABS( p2 - p0 ) < beta ) {
if(tc0[i]) if(tc_orig)
pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] ); pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc_orig, tc_orig );
tc++; tc++;
} }
if( FFABS( q2 - q0 ) < beta ) { if( FFABS( q2 - q0 ) < beta ) {
if(tc0[i]) if(tc_orig)
pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] ); pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc_orig, tc_orig );
tc++; tc++;
} }
i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */ pix[-xstride] = av_clip_pixel( p0 + i_delta ); /* p0' */
pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */ pix[0] = av_clip_pixel( q0 - i_delta ); /* q0' */
} }
pix += ystride; pix += ystride;
} }
} }
} }
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) static void FUNCC(h264_v_loop_filter_luma)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{ {
h264_loop_filter_luma_c(pix, stride, 1, 4, alpha, beta, tc0); FUNCC(h264_loop_filter_luma)(pix, stride, sizeof(pixel), 4, alpha, beta, tc0);
} }
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) static void FUNCC(h264_h_loop_filter_luma)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{ {
h264_loop_filter_luma_c(pix, 1, stride, 4, alpha, beta, tc0); FUNCC(h264_loop_filter_luma)(pix, sizeof(pixel), stride, 4, alpha, beta, tc0);
} }
static void h264_h_loop_filter_luma_mbaff_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) static void FUNCC(h264_h_loop_filter_luma_mbaff)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{ {
h264_loop_filter_luma_c(pix, 1, stride, 2, alpha, beta, tc0); FUNCC(h264_loop_filter_luma)(pix, sizeof(pixel), stride, 2, alpha, beta, tc0);
} }
static av_always_inline av_flatten void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int inner_iters, int alpha, int beta) static av_always_inline av_flatten void FUNCC(h264_loop_filter_luma_intra)(uint8_t *_pix, int xstride, int ystride, int inner_iters, int alpha, int beta)
{ {
pixel *pix = (pixel*)_pix;
int d; int d;
xstride /= sizeof(pixel);
ystride /= sizeof(pixel);
alpha <<= BIT_DEPTH - 8;
beta <<= BIT_DEPTH - 8;
for( d = 0; d < 4 * inner_iters; d++ ) { for( d = 0; d < 4 * inner_iters; d++ ) {
const int p2 = pix[-3*xstride]; const int p2 = pix[-3*xstride];
const int p1 = pix[-2*xstride]; const int p1 = pix[-2*xstride];
@ -198,24 +219,29 @@ static av_always_inline av_flatten void h264_loop_filter_luma_intra_c(uint8_t *p
pix += ystride; pix += ystride;
} }
} }
static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta) static void FUNCC(h264_v_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta)
{ {
h264_loop_filter_luma_intra_c(pix, stride, 1, 4, alpha, beta); FUNCC(h264_loop_filter_luma_intra)(pix, stride, sizeof(pixel), 4, alpha, beta);
} }
static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta) static void FUNCC(h264_h_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta)
{ {
h264_loop_filter_luma_intra_c(pix, 1, stride, 4, alpha, beta); FUNCC(h264_loop_filter_luma_intra)(pix, sizeof(pixel), stride, 4, alpha, beta);
} }
static void h264_h_loop_filter_luma_mbaff_intra_c(uint8_t *pix, int stride, int alpha, int beta) static void FUNCC(h264_h_loop_filter_luma_mbaff_intra)(uint8_t *pix, int stride, int alpha, int beta)
{ {
h264_loop_filter_luma_intra_c(pix, 1, stride, 2, alpha, beta); FUNCC(h264_loop_filter_luma_intra)(pix, sizeof(pixel), stride, 2, alpha, beta);
} }
static av_always_inline av_flatten void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int inner_iters, int alpha, int beta, int8_t *tc0) static av_always_inline av_flatten void FUNCC(h264_loop_filter_chroma)(uint8_t *_pix, int xstride, int ystride, int inner_iters, int alpha, int beta, int8_t *tc0)
{ {
pixel *pix = (pixel*)_pix;
int i, d; int i, d;
xstride /= sizeof(pixel);
ystride /= sizeof(pixel);
alpha <<= BIT_DEPTH - 8;
beta <<= BIT_DEPTH - 8;
for( i = 0; i < 4; i++ ) { for( i = 0; i < 4; i++ ) {
const int tc = tc0[i]; const int tc = ((tc0[i] - 1) << (BIT_DEPTH - 8)) + 1;
if( tc <= 0 ) { if( tc <= 0 ) {
pix += inner_iters*ystride; pix += inner_iters*ystride;
continue; continue;
@ -232,29 +258,34 @@ static av_always_inline av_flatten void h264_loop_filter_chroma_c(uint8_t *pix,
int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
pix[-xstride] = av_clip_uint8( p0 + delta ); /* p0' */ pix[-xstride] = av_clip_pixel( p0 + delta ); /* p0' */
pix[0] = av_clip_uint8( q0 - delta ); /* q0' */ pix[0] = av_clip_pixel( q0 - delta ); /* q0' */
} }
pix += ystride; pix += ystride;
} }
} }
} }
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) static void FUNCC(h264_v_loop_filter_chroma)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{ {
h264_loop_filter_chroma_c(pix, stride, 1, 2, alpha, beta, tc0); FUNCC(h264_loop_filter_chroma)(pix, stride, sizeof(pixel), 2, alpha, beta, tc0);
} }
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) static void FUNCC(h264_h_loop_filter_chroma)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{ {
h264_loop_filter_chroma_c(pix, 1, stride, 2, alpha, beta, tc0); FUNCC(h264_loop_filter_chroma)(pix, sizeof(pixel), stride, 2, alpha, beta, tc0);
} }
static void h264_h_loop_filter_chroma_mbaff_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) static void FUNCC(h264_h_loop_filter_chroma_mbaff)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{ {
h264_loop_filter_chroma_c(pix, 1, stride, 1, alpha, beta, tc0); FUNCC(h264_loop_filter_chroma)(pix, sizeof(pixel), stride, 1, alpha, beta, tc0);
} }
static av_always_inline av_flatten void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int inner_iters, int alpha, int beta) static av_always_inline av_flatten void FUNCC(h264_loop_filter_chroma_intra)(uint8_t *_pix, int xstride, int ystride, int inner_iters, int alpha, int beta)
{ {
pixel *pix = (pixel*)_pix;
int d; int d;
xstride /= sizeof(pixel);
ystride /= sizeof(pixel);
alpha <<= BIT_DEPTH - 8;
beta <<= BIT_DEPTH - 8;
for( d = 0; d < 4 * inner_iters; d++ ) { for( d = 0; d < 4 * inner_iters; d++ ) {
const int p0 = pix[-1*xstride]; const int p0 = pix[-1*xstride];
const int p1 = pix[-2*xstride]; const int p1 = pix[-2*xstride];
@ -271,15 +302,15 @@ static av_always_inline av_flatten void h264_loop_filter_chroma_intra_c(uint8_t
pix += ystride; pix += ystride;
} }
} }
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta) static void FUNCC(h264_v_loop_filter_chroma_intra)(uint8_t *pix, int stride, int alpha, int beta)
{ {
h264_loop_filter_chroma_intra_c(pix, stride, 1, 2, alpha, beta); FUNCC(h264_loop_filter_chroma_intra)(pix, stride, sizeof(pixel), 2, alpha, beta);
} }
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta) static void FUNCC(h264_h_loop_filter_chroma_intra)(uint8_t *pix, int stride, int alpha, int beta)
{ {
h264_loop_filter_chroma_intra_c(pix, 1, stride, 2, alpha, beta); FUNCC(h264_loop_filter_chroma_intra)(pix, sizeof(pixel), stride, 2, alpha, beta);
} }
static void h264_h_loop_filter_chroma_mbaff_intra_c(uint8_t *pix, int stride, int alpha, int beta) static void FUNCC(h264_h_loop_filter_chroma_mbaff_intra)(uint8_t *pix, int stride, int alpha, int beta)
{ {
h264_loop_filter_chroma_intra_c(pix, 1, stride, 1, alpha, beta); FUNCC(h264_loop_filter_chroma_intra)(pix, sizeof(pixel), stride, 1, alpha, beta);
} }

View File

@ -28,68 +28,98 @@
#include "mathops.h" #include "mathops.h"
#include "dsputil.h" #include "dsputil.h"
static void pred4x4_vertical_c(uint8_t *src, const uint8_t *topright, int stride){ #define BIT_DEPTH 8
const uint32_t a= ((uint32_t*)(src-stride))[0];
((uint32_t*)(src+0*stride))[0]= a; #define pixel uint8_t
((uint32_t*)(src+1*stride))[0]= a; #define pixel4 uint32_t
((uint32_t*)(src+2*stride))[0]= a; #define dctcoef DCTELEM
((uint32_t*)(src+3*stride))[0]= a;
#define INIT_CLIP uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
#define CLIP(a) cm[a]
#define FUNC(a) a
#define FUNCC(a) a ## _c
#define PIXEL_SPLAT_X4(x) ((x)*0x01010101U)
#define AV_WN4P AV_WN32
#define AV_WN4PA AV_WN32A
static void FUNCC(pred4x4_vertical)(uint8_t *_src, const uint8_t *topright, int _stride){
pixel *src = (pixel*)_src;
int stride = _stride/sizeof(pixel);
const pixel4 a= ((pixel4*)(src-stride))[0];
((pixel4*)(src+0*stride))[0]= a;
((pixel4*)(src+1*stride))[0]= a;
((pixel4*)(src+2*stride))[0]= a;
((pixel4*)(src+3*stride))[0]= a;
} }
static void pred4x4_horizontal_c(uint8_t *src, const uint8_t *topright, int stride){ static void FUNCC(pred4x4_horizontal)(uint8_t *_src, const uint8_t *topright, int _stride){
((uint32_t*)(src+0*stride))[0]= src[-1+0*stride]*0x01010101; pixel *src = (pixel*)_src;
((uint32_t*)(src+1*stride))[0]= src[-1+1*stride]*0x01010101; int stride = _stride/sizeof(pixel);
((uint32_t*)(src+2*stride))[0]= src[-1+2*stride]*0x01010101; ((pixel4*)(src+0*stride))[0]= PIXEL_SPLAT_X4(src[-1+0*stride]);
((uint32_t*)(src+3*stride))[0]= src[-1+3*stride]*0x01010101; ((pixel4*)(src+1*stride))[0]= PIXEL_SPLAT_X4(src[-1+1*stride]);
((pixel4*)(src+2*stride))[0]= PIXEL_SPLAT_X4(src[-1+2*stride]);
((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(src[-1+3*stride]);
} }
static void pred4x4_dc_c(uint8_t *src, const uint8_t *topright, int stride){ static void FUNCC(pred4x4_dc)(uint8_t *_src, const uint8_t *topright, int _stride){
pixel *src = (pixel*)_src;
int stride = _stride/sizeof(pixel);
const int dc= ( src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] const int dc= ( src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
+ src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3; + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
((uint32_t*)(src+0*stride))[0]= ((pixel4*)(src+0*stride))[0]=
((uint32_t*)(src+1*stride))[0]= ((pixel4*)(src+1*stride))[0]=
((uint32_t*)(src+2*stride))[0]= ((pixel4*)(src+2*stride))[0]=
((uint32_t*)(src+3*stride))[0]= dc* 0x01010101; ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc);
} }
static void pred4x4_left_dc_c(uint8_t *src, const uint8_t *topright, int stride){ static void FUNCC(pred4x4_left_dc)(uint8_t *_src, const uint8_t *topright, int _stride){
pixel *src = (pixel*)_src;
int stride = _stride/sizeof(pixel);
const int dc= ( src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2; const int dc= ( src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
((uint32_t*)(src+0*stride))[0]= ((pixel4*)(src+0*stride))[0]=
((uint32_t*)(src+1*stride))[0]= ((pixel4*)(src+1*stride))[0]=
((uint32_t*)(src+2*stride))[0]= ((pixel4*)(src+2*stride))[0]=
((uint32_t*)(src+3*stride))[0]= dc* 0x01010101; ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc);
} }
static void pred4x4_top_dc_c(uint8_t *src, const uint8_t *topright, int stride){ static void FUNCC(pred4x4_top_dc)(uint8_t *_src, const uint8_t *topright, int _stride){
pixel *src = (pixel*)_src;
int stride = _stride/sizeof(pixel);
const int dc= ( src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2; const int dc= ( src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
((uint32_t*)(src+0*stride))[0]= ((pixel4*)(src+0*stride))[0]=
((uint32_t*)(src+1*stride))[0]= ((pixel4*)(src+1*stride))[0]=
((uint32_t*)(src+2*stride))[0]= ((pixel4*)(src+2*stride))[0]=
((uint32_t*)(src+3*stride))[0]= dc* 0x01010101; ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc);
} }
static void pred4x4_128_dc_c(uint8_t *src, const uint8_t *topright, int stride){ static void FUNCC(pred4x4_128_dc)(uint8_t *_src, const uint8_t *topright, int _stride){
((uint32_t*)(src+0*stride))[0]= pixel *src = (pixel*)_src;
((uint32_t*)(src+1*stride))[0]= int stride = _stride/sizeof(pixel);
((uint32_t*)(src+2*stride))[0]= ((pixel4*)(src+0*stride))[0]=
((uint32_t*)(src+3*stride))[0]= 128U*0x01010101U; ((pixel4*)(src+1*stride))[0]=
((pixel4*)(src+2*stride))[0]=
((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1));
} }
static void pred4x4_127_dc_c(uint8_t *src, const uint8_t *topright, int stride){ static void FUNCC(pred4x4_127_dc)(uint8_t *_src, const uint8_t *topright, int _stride){
((uint32_t*)(src+0*stride))[0]= pixel *src = (pixel*)_src;
((uint32_t*)(src+1*stride))[0]= int stride = _stride/sizeof(pixel);
((uint32_t*)(src+2*stride))[0]= ((pixel4*)(src+0*stride))[0]=
((uint32_t*)(src+3*stride))[0]= 127U*0x01010101U; ((pixel4*)(src+1*stride))[0]=
((pixel4*)(src+2*stride))[0]=
((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))-1);
} }
static void pred4x4_129_dc_c(uint8_t *src, const uint8_t *topright, int stride){ static void FUNCC(pred4x4_129_dc)(uint8_t *_src, const uint8_t *topright, int _stride){
((uint32_t*)(src+0*stride))[0]= pixel *src = (pixel*)_src;
((uint32_t*)(src+1*stride))[0]= int stride = _stride/sizeof(pixel);
((uint32_t*)(src+2*stride))[0]= ((pixel4*)(src+0*stride))[0]=
((uint32_t*)(src+3*stride))[0]= 129U*0x01010101U; ((pixel4*)(src+1*stride))[0]=
((pixel4*)(src+2*stride))[0]=
((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))+1);
} }
@ -117,7 +147,9 @@ static void pred4x4_129_dc_c(uint8_t *src, const uint8_t *topright, int stride){
const int av_unused t2= src[ 2-1*stride];\ const int av_unused t2= src[ 2-1*stride];\
const int av_unused t3= src[ 3-1*stride];\ const int av_unused t3= src[ 3-1*stride];\
static void pred4x4_down_right_c(uint8_t *src, const uint8_t *topright, int stride){ static void FUNCC(pred4x4_down_right)(uint8_t *_src, const uint8_t *topright, int _stride){
pixel *src = (pixel*)_src;
int stride = _stride/sizeof(pixel);
const int lt= src[-1-1*stride]; const int lt= src[-1-1*stride];
LOAD_TOP_EDGE LOAD_TOP_EDGE
LOAD_LEFT_EDGE LOAD_LEFT_EDGE
@ -140,7 +172,10 @@ static void pred4x4_down_right_c(uint8_t *src, const uint8_t *topright, int stri
src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2; src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2;
} }
static void pred4x4_down_left_c(uint8_t *src, const uint8_t *topright, int stride){ static void FUNCC(pred4x4_down_left)(uint8_t *_src, const uint8_t *_topright, int _stride){
pixel *src = (pixel*)_src;
const pixel *topright = (const pixel*)_topright;
int stride = _stride/sizeof(pixel);
LOAD_TOP_EDGE LOAD_TOP_EDGE
LOAD_TOP_RIGHT_EDGE LOAD_TOP_RIGHT_EDGE
// LOAD_LEFT_EDGE // LOAD_LEFT_EDGE
@ -163,7 +198,9 @@ static void pred4x4_down_left_c(uint8_t *src, const uint8_t *topright, int strid
src[3+3*stride]=(t6 + 3*t7 + 2)>>2; src[3+3*stride]=(t6 + 3*t7 + 2)>>2;
} }
static void pred4x4_vertical_right_c(uint8_t *src, const uint8_t *topright, int stride){ static void FUNCC(pred4x4_vertical_right)(uint8_t *_src, const uint8_t *topright, int _stride){
pixel *src = (pixel*)_src;
int stride = _stride/sizeof(pixel);
const int lt= src[-1-1*stride]; const int lt= src[-1-1*stride];
LOAD_TOP_EDGE LOAD_TOP_EDGE
LOAD_LEFT_EDGE LOAD_LEFT_EDGE
@ -186,7 +223,10 @@ static void pred4x4_vertical_right_c(uint8_t *src, const uint8_t *topright, int
src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2; src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
} }
static void pred4x4_vertical_left_c(uint8_t *src, const uint8_t *topright, int stride){ static void FUNCC(pred4x4_vertical_left)(uint8_t *_src, const uint8_t *_topright, int _stride){
pixel *src = (pixel*)_src;
const pixel *topright = (const pixel*)_topright;
int stride = _stride/sizeof(pixel);
LOAD_TOP_EDGE LOAD_TOP_EDGE
LOAD_TOP_RIGHT_EDGE LOAD_TOP_RIGHT_EDGE
@ -208,7 +248,9 @@ static void pred4x4_vertical_left_c(uint8_t *src, const uint8_t *topright, int s
src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2; src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
} }
static void pred4x4_horizontal_up_c(uint8_t *src, const uint8_t *topright, int stride){ static void FUNCC(pred4x4_horizontal_up)(uint8_t *_src, const uint8_t *topright, int _stride){
pixel *src = (pixel*)_src;
int stride = _stride/sizeof(pixel);
LOAD_LEFT_EDGE LOAD_LEFT_EDGE
src[0+0*stride]=(l0 + l1 + 1)>>1; src[0+0*stride]=(l0 + l1 + 1)>>1;
@ -229,7 +271,9 @@ static void pred4x4_horizontal_up_c(uint8_t *src, const uint8_t *topright, int s
src[3+3*stride]=l3; src[3+3*stride]=l3;
} }
static void pred4x4_horizontal_down_c(uint8_t *src, const uint8_t *topright, int stride){ static void FUNCC(pred4x4_horizontal_down)(uint8_t *_src, const uint8_t *topright, int _stride){
pixel *src = (pixel*)_src;
int stride = _stride/sizeof(pixel);
const int lt= src[-1-1*stride]; const int lt= src[-1-1*stride];
LOAD_TOP_EDGE LOAD_TOP_EDGE
LOAD_LEFT_EDGE LOAD_LEFT_EDGE
@ -252,34 +296,50 @@ static void pred4x4_horizontal_down_c(uint8_t *src, const uint8_t *topright, int
src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2; src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
} }
static void pred16x16_vertical_c(uint8_t *src, int stride){ static void FUNCC(pred16x16_vertical)(uint8_t *_src, int _stride){
int i; int i;
const uint32_t a= ((uint32_t*)(src-stride))[0]; pixel *src = (pixel*)_src;
const uint32_t b= ((uint32_t*)(src-stride))[1]; int stride = _stride/sizeof(pixel);
const uint32_t c= ((uint32_t*)(src-stride))[2]; const pixel4 a = ((pixel4*)(src-stride))[0];
const uint32_t d= ((uint32_t*)(src-stride))[3]; const pixel4 b = ((pixel4*)(src-stride))[1];
const pixel4 c = ((pixel4*)(src-stride))[2];
const pixel4 d = ((pixel4*)(src-stride))[3];
for(i=0; i<16; i++){ for(i=0; i<16; i++){
((uint32_t*)(src+i*stride))[0]= a; ((pixel4*)(src+i*stride))[0] = a;
((uint32_t*)(src+i*stride))[1]= b; ((pixel4*)(src+i*stride))[1] = b;
((uint32_t*)(src+i*stride))[2]= c; ((pixel4*)(src+i*stride))[2] = c;
((uint32_t*)(src+i*stride))[3]= d; ((pixel4*)(src+i*stride))[3] = d;
} }
} }
static void pred16x16_horizontal_c(uint8_t *src, int stride){ static void FUNCC(pred16x16_horizontal)(uint8_t *_src, int stride){
int i; int i;
pixel *src = (pixel*)_src;
stride /= sizeof(pixel);
for(i=0; i<16; i++){ for(i=0; i<16; i++){
((uint32_t*)(src+i*stride))[0]= ((pixel4*)(src+i*stride))[0] =
((uint32_t*)(src+i*stride))[1]= ((pixel4*)(src+i*stride))[1] =
((uint32_t*)(src+i*stride))[2]= ((pixel4*)(src+i*stride))[2] =
((uint32_t*)(src+i*stride))[3]= src[-1+i*stride]*0x01010101; ((pixel4*)(src+i*stride))[3] = PIXEL_SPLAT_X4(src[-1+i*stride]);
} }
} }
static void pred16x16_dc_c(uint8_t *src, int stride){ #define PREDICT_16x16_DC(v)\
for(i=0; i<16; i++){\
AV_WN4P(src+ 0, v);\
AV_WN4P(src+ 4, v);\
AV_WN4P(src+ 8, v);\
AV_WN4P(src+12, v);\
src += stride;\
}
static void FUNCC(pred16x16_dc)(uint8_t *_src, int stride){
int i, dc=0; int i, dc=0;
pixel *src = (pixel*)_src;
pixel4 dcsplat;
stride /= sizeof(pixel);
for(i=0;i<16; i++){ for(i=0;i<16; i++){
dc+= src[-1+i*stride]; dc+= src[-1+i*stride];
@ -289,89 +349,59 @@ static void pred16x16_dc_c(uint8_t *src, int stride){
dc+= src[i-stride]; dc+= src[i-stride];
} }
dc= 0x01010101*((dc + 16)>>5); dcsplat = PIXEL_SPLAT_X4((dc+16)>>5);
PREDICT_16x16_DC(dcsplat);
for(i=0; i<16; i++){
((uint32_t*)(src+i*stride))[0]=
((uint32_t*)(src+i*stride))[1]=
((uint32_t*)(src+i*stride))[2]=
((uint32_t*)(src+i*stride))[3]= dc;
}
} }
static void pred16x16_left_dc_c(uint8_t *src, int stride){ static void FUNCC(pred16x16_left_dc)(uint8_t *_src, int stride){
int i, dc=0; int i, dc=0;
pixel *src = (pixel*)_src;
pixel4 dcsplat;
stride /= sizeof(pixel);
for(i=0;i<16; i++){ for(i=0;i<16; i++){
dc+= src[-1+i*stride]; dc+= src[-1+i*stride];
} }
dc= 0x01010101*((dc + 8)>>4); dcsplat = PIXEL_SPLAT_X4((dc+8)>>4);
PREDICT_16x16_DC(dcsplat);
for(i=0; i<16; i++){
((uint32_t*)(src+i*stride))[0]=
((uint32_t*)(src+i*stride))[1]=
((uint32_t*)(src+i*stride))[2]=
((uint32_t*)(src+i*stride))[3]= dc;
}
} }
static void pred16x16_top_dc_c(uint8_t *src, int stride){ static void FUNCC(pred16x16_top_dc)(uint8_t *_src, int stride){
int i, dc=0; int i, dc=0;
pixel *src = (pixel*)_src;
pixel4 dcsplat;
stride /= sizeof(pixel);
for(i=0;i<16; i++){ for(i=0;i<16; i++){
dc+= src[i-stride]; dc+= src[i-stride];
} }
dc= 0x01010101*((dc + 8)>>4);
for(i=0; i<16; i++){ dcsplat = PIXEL_SPLAT_X4((dc+8)>>4);
((uint32_t*)(src+i*stride))[0]= PREDICT_16x16_DC(dcsplat);
((uint32_t*)(src+i*stride))[1]=
((uint32_t*)(src+i*stride))[2]=
((uint32_t*)(src+i*stride))[3]= dc;
}
} }
static void pred16x16_128_dc_c(uint8_t *src, int stride){ #define PRED16x16_X(n, v) \
int i; static void FUNCC(pred16x16_##n##_dc)(uint8_t *_src, int stride){\
int i;\
for(i=0; i<16; i++){ pixel *src = (pixel*)_src;\
((uint32_t*)(src+i*stride))[0]= stride /= sizeof(pixel);\
((uint32_t*)(src+i*stride))[1]= PREDICT_16x16_DC(PIXEL_SPLAT_X4(v));\
((uint32_t*)(src+i*stride))[2]=
((uint32_t*)(src+i*stride))[3]= 0x01010101U*128U;
}
} }
static void pred16x16_127_dc_c(uint8_t *src, int stride){ PRED16x16_X(127, (1<<(BIT_DEPTH-1))-1);
int i; PRED16x16_X(128, (1<<(BIT_DEPTH-1))+0);
PRED16x16_X(129, (1<<(BIT_DEPTH-1))+1);
for(i=0; i<16; i++){ static inline void FUNCC(pred16x16_plane_compat)(uint8_t *_src, int _stride, const int svq3, const int rv40){
((uint32_t*)(src+i*stride))[0]=
((uint32_t*)(src+i*stride))[1]=
((uint32_t*)(src+i*stride))[2]=
((uint32_t*)(src+i*stride))[3]= 0x01010101U*127U;
}
}
static void pred16x16_129_dc_c(uint8_t *src, int stride){
int i;
for(i=0; i<16; i++){
((uint32_t*)(src+i*stride))[0]=
((uint32_t*)(src+i*stride))[1]=
((uint32_t*)(src+i*stride))[2]=
((uint32_t*)(src+i*stride))[3]= 0x01010101U*129U;
}
}
static inline void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3, const int rv40){
int i, j, k; int i, j, k;
int a; int a;
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; INIT_CLIP
const uint8_t * const src0 = src+7-stride; pixel *src = (pixel*)_src;
const uint8_t *src1 = src+8*stride-1; int stride = _stride/sizeof(pixel);
const uint8_t *src2 = src1-2*stride; // == src+6*stride-1; const pixel * const src0 = src +7-stride;
const pixel * src1 = src +8*stride-1;
const pixel * src2 = src1-2*stride; // == src+6*stride-1;
int H = src0[1] - src0[-1]; int H = src0[1] - src0[-1];
int V = src1[0] - src2[ 0]; int V = src1[0] - src2[ 0];
for(k=2; k<=8; ++k) { for(k=2; k<=8; ++k) {
@ -398,113 +428,115 @@ static inline void pred16x16_plane_compat_c(uint8_t *src, int stride, const int
int b = a; int b = a;
a += V; a += V;
for(i=-16; i<0; i+=4) { for(i=-16; i<0; i+=4) {
src[16+i] = cm[ (b ) >> 5 ]; src[16+i] = CLIP((b ) >> 5);
src[17+i] = cm[ (b+ H) >> 5 ]; src[17+i] = CLIP((b+ H) >> 5);
src[18+i] = cm[ (b+2*H) >> 5 ]; src[18+i] = CLIP((b+2*H) >> 5);
src[19+i] = cm[ (b+3*H) >> 5 ]; src[19+i] = CLIP((b+3*H) >> 5);
b += 4*H; b += 4*H;
} }
src += stride; src += stride;
} }
} }
static void pred16x16_plane_c(uint8_t *src, int stride){ static void FUNCC(pred16x16_plane)(uint8_t *src, int stride){
pred16x16_plane_compat_c(src, stride, 0, 0); FUNCC(pred16x16_plane_compat)(src, stride, 0, 0);
} }
static void pred8x8_vertical_c(uint8_t *src, int stride){ static void FUNCC(pred8x8_vertical)(uint8_t *_src, int _stride){
int i; int i;
const uint32_t a= ((uint32_t*)(src-stride))[0]; pixel *src = (pixel*)_src;
const uint32_t b= ((uint32_t*)(src-stride))[1]; int stride = _stride/sizeof(pixel);
const pixel4 a= ((pixel4*)(src-stride))[0];
const pixel4 b= ((pixel4*)(src-stride))[1];
for(i=0; i<8; i++){ for(i=0; i<8; i++){
((uint32_t*)(src+i*stride))[0]= a; ((pixel4*)(src+i*stride))[0]= a;
((uint32_t*)(src+i*stride))[1]= b; ((pixel4*)(src+i*stride))[1]= b;
} }
} }
static void pred8x8_horizontal_c(uint8_t *src, int stride){ static void FUNCC(pred8x8_horizontal)(uint8_t *_src, int stride){
int i; int i;
pixel *src = (pixel*)_src;
stride /= sizeof(pixel);
for(i=0; i<8; i++){ for(i=0; i<8; i++){
((uint32_t*)(src+i*stride))[0]= ((pixel4*)(src+i*stride))[0]=
((uint32_t*)(src+i*stride))[1]= src[-1+i*stride]*0x01010101; ((pixel4*)(src+i*stride))[1]= PIXEL_SPLAT_X4(src[-1+i*stride]);
} }
} }
static void pred8x8_128_dc_c(uint8_t *src, int stride){ #define PRED8x8_X(n, v)\
int i; static void FUNCC(pred8x8_##n##_dc)(uint8_t *_src, int stride){\
int i;\
for(i=0; i<8; i++){ pixel *src = (pixel*)_src;\
((uint32_t*)(src+i*stride))[0]= stride /= sizeof(pixel);\
((uint32_t*)(src+i*stride))[1]= 0x01010101U*128U; for(i=0; i<8; i++){\
} ((pixel4*)(src+i*stride))[0]=\
((pixel4*)(src+i*stride))[1]= PIXEL_SPLAT_X4(v);\
}\
} }
static void pred8x8_127_dc_c(uint8_t *src, int stride){ PRED8x8_X(127, (1<<(BIT_DEPTH-1))-1);
int i; PRED8x8_X(128, (1<<(BIT_DEPTH-1))+0);
PRED8x8_X(129, (1<<(BIT_DEPTH-1))+1);
for(i=0; i<8; i++){ static void FUNCC(pred8x8_left_dc)(uint8_t *_src, int stride){
((uint32_t*)(src+i*stride))[0]=
((uint32_t*)(src+i*stride))[1]= 0x01010101U*127U;
}
}
static void pred8x8_129_dc_c(uint8_t *src, int stride){
int i;
for(i=0; i<8; i++){
((uint32_t*)(src+i*stride))[0]=
((uint32_t*)(src+i*stride))[1]= 0x01010101U*129U;
}
}
static void pred8x8_left_dc_c(uint8_t *src, int stride){
int i; int i;
int dc0, dc2; int dc0, dc2;
pixel4 dc0splat, dc2splat;
pixel *src = (pixel*)_src;
stride /= sizeof(pixel);
dc0=dc2=0; dc0=dc2=0;
for(i=0;i<4; i++){ for(i=0;i<4; i++){
dc0+= src[-1+i*stride]; dc0+= src[-1+i*stride];
dc2+= src[-1+(i+4)*stride]; dc2+= src[-1+(i+4)*stride];
} }
dc0= 0x01010101*((dc0 + 2)>>2); dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2);
dc2= 0x01010101*((dc2 + 2)>>2); dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
for(i=0; i<4; i++){ for(i=0; i<4; i++){
((uint32_t*)(src+i*stride))[0]= ((pixel4*)(src+i*stride))[0]=
((uint32_t*)(src+i*stride))[1]= dc0; ((pixel4*)(src+i*stride))[1]= dc0splat;
} }
for(i=4; i<8; i++){ for(i=4; i<8; i++){
((uint32_t*)(src+i*stride))[0]= ((pixel4*)(src+i*stride))[0]=
((uint32_t*)(src+i*stride))[1]= dc2; ((pixel4*)(src+i*stride))[1]= dc2splat;
} }
} }
static void pred8x8_top_dc_c(uint8_t *src, int stride){ static void FUNCC(pred8x8_top_dc)(uint8_t *_src, int stride){
int i; int i;
int dc0, dc1; int dc0, dc1;
pixel4 dc0splat, dc1splat;
pixel *src = (pixel*)_src;
stride /= sizeof(pixel);
dc0=dc1=0; dc0=dc1=0;
for(i=0;i<4; i++){ for(i=0;i<4; i++){
dc0+= src[i-stride]; dc0+= src[i-stride];
dc1+= src[4+i-stride]; dc1+= src[4+i-stride];
} }
dc0= 0x01010101*((dc0 + 2)>>2); dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2);
dc1= 0x01010101*((dc1 + 2)>>2); dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
for(i=0; i<4; i++){ for(i=0; i<4; i++){
((uint32_t*)(src+i*stride))[0]= dc0; ((pixel4*)(src+i*stride))[0]= dc0splat;
((uint32_t*)(src+i*stride))[1]= dc1; ((pixel4*)(src+i*stride))[1]= dc1splat;
} }
for(i=4; i<8; i++){ for(i=4; i<8; i++){
((uint32_t*)(src+i*stride))[0]= dc0; ((pixel4*)(src+i*stride))[0]= dc0splat;
((uint32_t*)(src+i*stride))[1]= dc1; ((pixel4*)(src+i*stride))[1]= dc1splat;
} }
} }
static void pred8x8_dc_c(uint8_t *src, int stride){ static void FUNCC(pred8x8_dc)(uint8_t *_src, int stride){
int i; int i;
int dc0, dc1, dc2, dc3; int dc0, dc1, dc2;
pixel4 dc0splat, dc1splat, dc2splat, dc3splat;
pixel *src = (pixel*)_src;
stride /= sizeof(pixel);
dc0=dc1=dc2=0; dc0=dc1=dc2=0;
for(i=0;i<4; i++){ for(i=0;i<4; i++){
@ -512,51 +544,53 @@ static void pred8x8_dc_c(uint8_t *src, int stride){
dc1+= src[4+i-stride]; dc1+= src[4+i-stride];
dc2+= src[-1+(i+4)*stride]; dc2+= src[-1+(i+4)*stride];
} }
dc3= 0x01010101*((dc1 + dc2 + 4)>>3); dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3);
dc0= 0x01010101*((dc0 + 4)>>3); dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
dc1= 0x01010101*((dc1 + 2)>>2); dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
dc2= 0x01010101*((dc2 + 2)>>2); dc3splat = PIXEL_SPLAT_X4((dc1 + dc2 + 4)>>3);
for(i=0; i<4; i++){ for(i=0; i<4; i++){
((uint32_t*)(src+i*stride))[0]= dc0; ((pixel4*)(src+i*stride))[0]= dc0splat;
((uint32_t*)(src+i*stride))[1]= dc1; ((pixel4*)(src+i*stride))[1]= dc1splat;
} }
for(i=4; i<8; i++){ for(i=4; i<8; i++){
((uint32_t*)(src+i*stride))[0]= dc2; ((pixel4*)(src+i*stride))[0]= dc2splat;
((uint32_t*)(src+i*stride))[1]= dc3; ((pixel4*)(src+i*stride))[1]= dc3splat;
} }
} }
//the following 4 function should not be optimized! //the following 4 function should not be optimized!
static void pred8x8_mad_cow_dc_l0t(uint8_t *src, int stride){ static void FUNC(pred8x8_mad_cow_dc_l0t)(uint8_t *src, int stride){
pred8x8_top_dc_c(src, stride); FUNCC(pred8x8_top_dc)(src, stride);
pred4x4_dc_c(src, NULL, stride); FUNCC(pred4x4_dc)(src, NULL, stride);
} }
static void pred8x8_mad_cow_dc_0lt(uint8_t *src, int stride){ static void FUNC(pred8x8_mad_cow_dc_0lt)(uint8_t *src, int stride){
pred8x8_dc_c(src, stride); FUNCC(pred8x8_dc)(src, stride);
pred4x4_top_dc_c(src, NULL, stride); FUNCC(pred4x4_top_dc)(src, NULL, stride);
} }
static void pred8x8_mad_cow_dc_l00(uint8_t *src, int stride){ static void FUNC(pred8x8_mad_cow_dc_l00)(uint8_t *src, int stride){
pred8x8_left_dc_c(src, stride); FUNCC(pred8x8_left_dc)(src, stride);
pred4x4_128_dc_c(src + 4*stride , NULL, stride); FUNCC(pred4x4_128_dc)(src + 4*stride , NULL, stride);
pred4x4_128_dc_c(src + 4*stride + 4, NULL, stride); FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride);
} }
static void pred8x8_mad_cow_dc_0l0(uint8_t *src, int stride){ static void FUNC(pred8x8_mad_cow_dc_0l0)(uint8_t *src, int stride){
pred8x8_left_dc_c(src, stride); FUNCC(pred8x8_left_dc)(src, stride);
pred4x4_128_dc_c(src , NULL, stride); FUNCC(pred4x4_128_dc)(src , NULL, stride);
pred4x4_128_dc_c(src + 4, NULL, stride); FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride);
} }
static void pred8x8_plane_c(uint8_t *src, int stride){ static void FUNCC(pred8x8_plane)(uint8_t *_src, int _stride){
int j, k; int j, k;
int a; int a;
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; INIT_CLIP
const uint8_t * const src0 = src+3-stride; pixel *src = (pixel*)_src;
const uint8_t *src1 = src+4*stride-1; int stride = _stride/sizeof(pixel);
const uint8_t *src2 = src1-2*stride; // == src+2*stride-1; const pixel * const src0 = src +3-stride;
const pixel * src1 = src +4*stride-1;
const pixel * src2 = src1-2*stride; // == src+2*stride-1;
int H = src0[1] - src0[-1]; int H = src0[1] - src0[-1];
int V = src1[0] - src2[ 0]; int V = src1[0] - src2[ 0];
for(k=2; k<=4; ++k) { for(k=2; k<=4; ++k) {
@ -571,14 +605,14 @@ static void pred8x8_plane_c(uint8_t *src, int stride){
for(j=8; j>0; --j) { for(j=8; j>0; --j) {
int b = a; int b = a;
a += V; a += V;
src[0] = cm[ (b ) >> 5 ]; src[0] = CLIP((b ) >> 5);
src[1] = cm[ (b+ H) >> 5 ]; src[1] = CLIP((b+ H) >> 5);
src[2] = cm[ (b+2*H) >> 5 ]; src[2] = CLIP((b+2*H) >> 5);
src[3] = cm[ (b+3*H) >> 5 ]; src[3] = CLIP((b+3*H) >> 5);
src[4] = cm[ (b+4*H) >> 5 ]; src[4] = CLIP((b+4*H) >> 5);
src[5] = cm[ (b+5*H) >> 5 ]; src[5] = CLIP((b+5*H) >> 5);
src[6] = cm[ (b+6*H) >> 5 ]; src[6] = CLIP((b+6*H) >> 5);
src[7] = cm[ (b+7*H) >> 5 ]; src[7] = CLIP((b+7*H) >> 5);
src += stride; src += stride;
} }
} }
@ -616,46 +650,64 @@ static void pred8x8_plane_c(uint8_t *src, int stride){
#define PREDICT_8x8_DC(v) \ #define PREDICT_8x8_DC(v) \
int y; \ int y; \
for( y = 0; y < 8; y++ ) { \ for( y = 0; y < 8; y++ ) { \
((uint32_t*)src)[0] = \ ((pixel4*)src)[0] = \
((uint32_t*)src)[1] = v; \ ((pixel4*)src)[1] = v; \
src += stride; \ src += stride; \
} }
static void pred8x8l_128_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride) static void FUNCC(pred8x8l_128_dc)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
{ {
PREDICT_8x8_DC(0x80808080); pixel *src = (pixel*)_src;
int stride = _stride/sizeof(pixel);
PREDICT_8x8_DC(PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1)));
} }
static void pred8x8l_left_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride) static void FUNCC(pred8x8l_left_dc)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
{ {
pixel *src = (pixel*)_src;
int stride = _stride/sizeof(pixel);
PREDICT_8x8_LOAD_LEFT; PREDICT_8x8_LOAD_LEFT;
const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101; const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3);
PREDICT_8x8_DC(dc); PREDICT_8x8_DC(dc);
} }
static void pred8x8l_top_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride) static void FUNCC(pred8x8l_top_dc)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
{ {
pixel *src = (pixel*)_src;
int stride = _stride/sizeof(pixel);
PREDICT_8x8_LOAD_TOP; PREDICT_8x8_LOAD_TOP;
const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101; const pixel4 dc = PIXEL_SPLAT_X4((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3);
PREDICT_8x8_DC(dc); PREDICT_8x8_DC(dc);
} }
static void pred8x8l_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride) static void FUNCC(pred8x8l_dc)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
{ {
pixel *src = (pixel*)_src;
int stride = _stride/sizeof(pixel);
PREDICT_8x8_LOAD_LEFT; PREDICT_8x8_LOAD_LEFT;
PREDICT_8x8_LOAD_TOP; PREDICT_8x8_LOAD_TOP;
const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7 const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7
+t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101; +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4);
PREDICT_8x8_DC(dc); PREDICT_8x8_DC(dc);
} }
static void pred8x8l_horizontal_c(uint8_t *src, int has_topleft, int has_topright, int stride) static void FUNCC(pred8x8l_horizontal)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
{ {
pixel *src = (pixel*)_src;
int stride = _stride/sizeof(pixel);
PREDICT_8x8_LOAD_LEFT; PREDICT_8x8_LOAD_LEFT;
#define ROW(y) ((uint32_t*)(src+y*stride))[0] =\ #define ROW(y) ((pixel4*)(src+y*stride))[0] =\
((uint32_t*)(src+y*stride))[1] = 0x01010101 * l##y ((pixel4*)(src+y*stride))[1] = PIXEL_SPLAT_X4(l##y)
ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7); ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
#undef ROW #undef ROW
} }
static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright, int stride) static void FUNCC(pred8x8l_vertical)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
{ {
int y; int y;
pixel *src = (pixel*)_src;
int stride = _stride/sizeof(pixel);
PREDICT_8x8_LOAD_TOP; PREDICT_8x8_LOAD_TOP;
src[0] = t0; src[0] = t0;
src[1] = t1; src[1] = t1;
@ -665,11 +717,15 @@ static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright,
src[5] = t5; src[5] = t5;
src[6] = t6; src[6] = t6;
src[7] = t7; src[7] = t7;
for( y = 1; y < 8; y++ ) for( y = 1; y < 8; y++ ) {
*(uint64_t*)(src+y*stride) = *(uint64_t*)src; ((pixel4*)(src+y*stride))[0] = ((pixel4*)src)[0];
((pixel4*)(src+y*stride))[1] = ((pixel4*)src)[1];
}
} }
static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright, int stride) static void FUNCC(pred8x8l_down_left)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
{ {
pixel *src = (pixel*)_src;
int stride = _stride/sizeof(pixel);
PREDICT_8x8_LOAD_TOP; PREDICT_8x8_LOAD_TOP;
PREDICT_8x8_LOAD_TOPRIGHT; PREDICT_8x8_LOAD_TOPRIGHT;
SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2; SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
@ -688,8 +744,10 @@ static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright
SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2; SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
SRC(7,7)= (t14 + 3*t15 + 2) >> 2; SRC(7,7)= (t14 + 3*t15 + 2) >> 2;
} }
static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int has_topright, int stride) static void FUNCC(pred8x8l_down_right)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
{ {
pixel *src = (pixel*)_src;
int stride = _stride/sizeof(pixel);
PREDICT_8x8_LOAD_TOP; PREDICT_8x8_LOAD_TOP;
PREDICT_8x8_LOAD_LEFT; PREDICT_8x8_LOAD_LEFT;
PREDICT_8x8_LOAD_TOPLEFT; PREDICT_8x8_LOAD_TOPLEFT;
@ -708,10 +766,11 @@ static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int has_toprigh
SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2; SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2; SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2; SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
} }
static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_topright, int stride) static void FUNCC(pred8x8l_vertical_right)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
{ {
pixel *src = (pixel*)_src;
int stride = _stride/sizeof(pixel);
PREDICT_8x8_LOAD_TOP; PREDICT_8x8_LOAD_TOP;
PREDICT_8x8_LOAD_LEFT; PREDICT_8x8_LOAD_LEFT;
PREDICT_8x8_LOAD_TOPLEFT; PREDICT_8x8_LOAD_TOPLEFT;
@ -738,8 +797,10 @@ static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_top
SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2; SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
SRC(7,0)= (t6 + t7 + 1) >> 1; SRC(7,0)= (t6 + t7 + 1) >> 1;
} }
static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_topright, int stride) static void FUNCC(pred8x8l_horizontal_down)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
{ {
pixel *src = (pixel*)_src;
int stride = _stride/sizeof(pixel);
PREDICT_8x8_LOAD_TOP; PREDICT_8x8_LOAD_TOP;
PREDICT_8x8_LOAD_LEFT; PREDICT_8x8_LOAD_LEFT;
PREDICT_8x8_LOAD_TOPLEFT; PREDICT_8x8_LOAD_TOPLEFT;
@ -766,8 +827,10 @@ static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_to
SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2; SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2; SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
} }
static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topright, int stride) static void FUNCC(pred8x8l_vertical_left)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
{ {
pixel *src = (pixel*)_src;
int stride = _stride/sizeof(pixel);
PREDICT_8x8_LOAD_TOP; PREDICT_8x8_LOAD_TOP;
PREDICT_8x8_LOAD_TOPRIGHT; PREDICT_8x8_LOAD_TOPRIGHT;
SRC(0,0)= (t0 + t1 + 1) >> 1; SRC(0,0)= (t0 + t1 + 1) >> 1;
@ -793,8 +856,10 @@ static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topr
SRC(7,6)= (t10 + t11 + 1) >> 1; SRC(7,6)= (t10 + t11 + 1) >> 1;
SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2; SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
} }
static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topright, int stride) static void FUNCC(pred8x8l_horizontal_up)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
{ {
pixel *src = (pixel*)_src;
int stride = _stride/sizeof(pixel);
PREDICT_8x8_LOAD_LEFT; PREDICT_8x8_LOAD_LEFT;
SRC(0,0)= (l0 + l1 + 1) >> 1; SRC(0,0)= (l0 + l1 + 1) >> 1;
SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2; SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
@ -825,11 +890,14 @@ static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topr
#undef PL #undef PL
#undef SRC #undef SRC
static void pred4x4_vertical_add_c(uint8_t *pix, const DCTELEM *block, int stride){ static void FUNCC(pred4x4_vertical_add)(uint8_t *_pix, const DCTELEM *_block, int stride){
int i; int i;
pixel *pix = (pixel*)_pix;
const dctcoef *block = (const dctcoef*)_block;
stride /= sizeof(pixel);
pix -= stride; pix -= stride;
for(i=0; i<4; i++){ for(i=0; i<4; i++){
uint8_t v = pix[0]; pixel v = pix[0];
pix[1*stride]= v += block[0]; pix[1*stride]= v += block[0];
pix[2*stride]= v += block[4]; pix[2*stride]= v += block[4];
pix[3*stride]= v += block[8]; pix[3*stride]= v += block[8];
@ -839,10 +907,13 @@ static void pred4x4_vertical_add_c(uint8_t *pix, const DCTELEM *block, int strid
} }
} }
static void pred4x4_horizontal_add_c(uint8_t *pix, const DCTELEM *block, int stride){ static void FUNCC(pred4x4_horizontal_add)(uint8_t *_pix, const DCTELEM *_block, int stride){
int i; int i;
pixel *pix = (pixel*)_pix;
const dctcoef *block = (const dctcoef*)_block;
stride /= sizeof(pixel);
for(i=0; i<4; i++){ for(i=0; i<4; i++){
uint8_t v = pix[-1]; pixel v = pix[-1];
pix[0]= v += block[0]; pix[0]= v += block[0];
pix[1]= v += block[1]; pix[1]= v += block[1];
pix[2]= v += block[2]; pix[2]= v += block[2];
@ -852,11 +923,14 @@ static void pred4x4_horizontal_add_c(uint8_t *pix, const DCTELEM *block, int str
} }
} }
static void pred8x8l_vertical_add_c(uint8_t *pix, const DCTELEM *block, int stride){ static void FUNCC(pred8x8l_vertical_add)(uint8_t *_pix, const DCTELEM *_block, int stride){
int i; int i;
pixel *pix = (pixel*)_pix;
const dctcoef *block = (const dctcoef*)_block;
stride /= sizeof(pixel);
pix -= stride; pix -= stride;
for(i=0; i<8; i++){ for(i=0; i<8; i++){
uint8_t v = pix[0]; pixel v = pix[0];
pix[1*stride]= v += block[0]; pix[1*stride]= v += block[0];
pix[2*stride]= v += block[8]; pix[2*stride]= v += block[8];
pix[3*stride]= v += block[16]; pix[3*stride]= v += block[16];
@ -870,10 +944,13 @@ static void pred8x8l_vertical_add_c(uint8_t *pix, const DCTELEM *block, int stri
} }
} }
static void pred8x8l_horizontal_add_c(uint8_t *pix, const DCTELEM *block, int stride){ static void FUNCC(pred8x8l_horizontal_add)(uint8_t *_pix, const DCTELEM *_block, int stride){
int i; int i;
pixel *pix = (pixel*)_pix;
const dctcoef *block = (const dctcoef*)_block;
stride /= sizeof(pixel);
for(i=0; i<8; i++){ for(i=0; i<8; i++){
uint8_t v = pix[-1]; pixel v = pix[-1];
pix[0]= v += block[0]; pix[0]= v += block[0];
pix[1]= v += block[1]; pix[1]= v += block[1];
pix[2]= v += block[2]; pix[2]= v += block[2];
@ -887,26 +964,26 @@ static void pred8x8l_horizontal_add_c(uint8_t *pix, const DCTELEM *block, int st
} }
} }
static void pred16x16_vertical_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){ static void FUNCC(pred16x16_vertical_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
int i; int i;
for(i=0; i<16; i++) for(i=0; i<16; i++)
pred4x4_vertical_add_c(pix + block_offset[i], block + i*16, stride); FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
} }
static void pred16x16_horizontal_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){ static void FUNCC(pred16x16_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
int i; int i;
for(i=0; i<16; i++) for(i=0; i<16; i++)
pred4x4_horizontal_add_c(pix + block_offset[i], block + i*16, stride); FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
} }
static void pred8x8_vertical_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){ static void FUNCC(pred8x8_vertical_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
int i; int i;
for(i=0; i<4; i++) for(i=0; i<4; i++)
pred4x4_vertical_add_c(pix + block_offset[i], block + i*16, stride); FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
} }
static void pred8x8_horizontal_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){ static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
int i; int i;
for(i=0; i<4; i++) for(i=0; i<4; i++)
pred4x4_horizontal_add_c(pix + block_offset[i], block + i*16, stride); FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
} }