From 622958449b9388cca0f4a4e287b3e94422e4a573 Mon Sep 17 00:00:00 2001 From: Attila Nagy Date: Fri, 10 Jun 2011 14:10:21 +0300 Subject: [PATCH 1/8] New loop filter interface Separate simple filter with reduced no. of parameters. MB filter level picking based on precalculated table. Level table updated for each frame. Inside and edge limits precalculated and updated just when sharpness changes. HEV threshold is constant. ARM targets use scalars and others vectors. Change works only with --target=generic-gnu. All other targets have to be updated! Change-Id: I6b73aca6b525075b20129a371699b2561bd4d51c --- vp8/common/generic/systemdependent.c | 4 +- vp8/common/loopfilter.c | 694 ++++++++++++++------------- vp8/common/loopfilter.h | 67 ++- vp8/common/loopfilter_filters.c | 80 ++- vp8/common/onyxc_int.h | 21 +- vp8/decoder/onyxd_if.c | 2 +- vp8/decoder/threading.c | 218 +++++---- vp8/encoder/onyx_if.c | 2 +- 8 files changed, 578 insertions(+), 510 deletions(-) diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c index 133938097..c61629407 100644 --- a/vp8/common/generic/systemdependent.c +++ b/vp8/common/generic/systemdependent.c @@ -108,9 +108,9 @@ void vp8_machine_specific_config(VP8_COMMON *ctx) rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_c; rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_c; rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_c; - rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_c; + rtcd->loopfilter.simple_mb_v = vp8_loop_filter_simple_vertical_edge_c; rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_c; - rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_c; + rtcd->loopfilter.simple_mb_h = vp8_loop_filter_simple_horizontal_edge_c; rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_c; #if CONFIG_POSTPROC || (CONFIG_VP8_ENCODER && CONFIG_INTERNAL_STATS) diff --git a/vp8/common/loopfilter.c b/vp8/common/loopfilter.c index a3242716f..be3f53593 100644 --- a/vp8/common/loopfilter.c +++ b/vp8/common/loopfilter.c 
@@ -9,152 +9,149 @@ */ -#include "vpx_ports/config.h" +#include "vpx_config.h" #include "loopfilter.h" #include "onyxc_int.h" +#include "vpx_mem/vpx_mem.h" typedef unsigned char uc; - prototype_loopfilter(vp8_loop_filter_horizontal_edge_c); prototype_loopfilter(vp8_loop_filter_vertical_edge_c); prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_c); prototype_loopfilter(vp8_mbloop_filter_vertical_edge_c); -prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_c); -prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_c); + +prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_c); +prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_c); /* Horizontal MB filtering */ -void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { - vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1); + vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); if (v_ptr) - vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1); -} - -void vp8_loop_filter_mbhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); } /* Vertical MB Filtering */ -void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned 
char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { - vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1); + vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); if (v_ptr) - vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1); -} - -void vp8_loop_filter_mbvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); } /* Horizontal B Filtering */ -void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { - vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + 
vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); if (v_ptr) - vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); } -void vp8_loop_filter_bhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, blimit); } /* Vertical B Filtering */ -void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { - vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_vertical_edge_c(y_ptr + 12, 
y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); if (v_ptr) - vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); } -void vp8_loop_filter_bvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, blimit); } -void vp8_init_loop_filter(VP8_COMMON *cm) +static void lf_init_lut(loop_filter_info_n *lfi) { - loop_filter_info *lfi = cm->lf_info; - LOOPFILTERTYPE lft = cm->filter_type; - int sharpness_lvl = cm->sharpness_level; - int frame_type = cm->frame_type; - int i, j; + int filt_lvl; - int block_inside_limit = 0; - int HEVThresh; - - /* For each possible value for the loop filter fill out a "loop_filter_info" entry. 
*/ - for (i = 0; i <= MAX_LOOP_FILTER; i++) + for (filt_lvl = 0; filt_lvl <= MAX_LOOP_FILTER; filt_lvl++) { - int filt_lvl = i; - - if (frame_type == KEY_FRAME) + if (filt_lvl >= 40) { - if (filt_lvl >= 40) - HEVThresh = 2; - else if (filt_lvl >= 15) - HEVThresh = 1; - else - HEVThresh = 0; + lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 2; + lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 3; + } + else if (filt_lvl >= 20) + { + lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1; + lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 2; + } + else if (filt_lvl >= 15) + { + lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1; + lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 1; } else { - if (filt_lvl >= 40) - HEVThresh = 3; - else if (filt_lvl >= 20) - HEVThresh = 2; - else if (filt_lvl >= 15) - HEVThresh = 1; - else - HEVThresh = 0; + lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 0; + lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 0; } + } + + lfi->mode_lf_lut[DC_PRED] = 1; + lfi->mode_lf_lut[V_PRED] = 1; + lfi->mode_lf_lut[H_PRED] = 1; + lfi->mode_lf_lut[TM_PRED] = 1; + lfi->mode_lf_lut[B_PRED] = 0; + + lfi->mode_lf_lut[ZEROMV] = 1; + lfi->mode_lf_lut[NEARESTMV] = 2; + lfi->mode_lf_lut[NEARMV] = 2; + lfi->mode_lf_lut[NEWMV] = 2; + lfi->mode_lf_lut[SPLITMV] = 3; + +} + +void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi, + int sharpness_lvl) +{ + int i; + + /* For each possible value for the loop filter fill out limits */ + for (i = 0; i <= MAX_LOOP_FILTER; i++) + { + int filt_lvl = i; + int block_inside_limit = 0; /* Set loop filter paramaeters that control sharpness. 
*/ block_inside_limit = filt_lvl >> (sharpness_lvl > 0); @@ -169,119 +166,120 @@ void vp8_init_loop_filter(VP8_COMMON *cm) if (block_inside_limit < 1) block_inside_limit = 1; - for (j = 0; j < 16; j++) - { - lfi[i].lim[j] = block_inside_limit; - lfi[i].mbflim[j] = filt_lvl + 2; - lfi[i].flim[j] = filt_lvl; - lfi[i].thr[j] = HEVThresh; - } - - } - - /* Set up the function pointers depending on the type of loop filtering selected */ - if (lft == NORMAL_LOOPFILTER) - { - cm->lf_mbv = LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_v); - cm->lf_bv = LF_INVOKE(&cm->rtcd.loopfilter, normal_b_v); - cm->lf_mbh = LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_h); - cm->lf_bh = LF_INVOKE(&cm->rtcd.loopfilter, normal_b_h); - } - else - { - cm->lf_mbv = LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v); - cm->lf_bv = LF_INVOKE(&cm->rtcd.loopfilter, simple_b_v); - cm->lf_mbh = LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_h); - cm->lf_bh = LF_INVOKE(&cm->rtcd.loopfilter, simple_b_h); + vpx_memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH); + vpx_memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit), + SIMD_WIDTH); + vpx_memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit), + SIMD_WIDTH); } } -/* Put vp8_init_loop_filter() in vp8dx_create_decompressor(). Only call vp8_frame_init_loop_filter() while decoding - * each frame. Check last_frame_type to skip the function most of times. - */ -void vp8_frame_init_loop_filter(loop_filter_info *lfi, int frame_type) +void vp8_loop_filter_init(VP8_COMMON *cm) { - int HEVThresh; - int i, j; + loop_filter_info_n *lfi = &cm->lf_info; + int i; - /* For each possible value for the loop filter fill out a "loop_filter_info" entry. 
*/ - for (i = 0; i <= MAX_LOOP_FILTER; i++) + /* init limits for given sharpness*/ + vp8_loop_filter_update_sharpness(lfi, cm->sharpness_level); + cm->last_sharpness_level = cm->sharpness_level; + + /* init LUT for lvl and hev thr picking */ + lf_init_lut(lfi); + + /* init hev threshold const vectors */ + for(i = 0; i < 4 ; i++) { - int filt_lvl = i; - - if (frame_type == KEY_FRAME) - { - if (filt_lvl >= 40) - HEVThresh = 2; - else if (filt_lvl >= 15) - HEVThresh = 1; - else - HEVThresh = 0; - } - else - { - if (filt_lvl >= 40) - HEVThresh = 3; - else if (filt_lvl >= 20) - HEVThresh = 2; - else if (filt_lvl >= 15) - HEVThresh = 1; - else - HEVThresh = 0; - } - - for (j = 0; j < 16; j++) - { - /*lfi[i].lim[j] = block_inside_limit; - lfi[i].mbflim[j] = filt_lvl+2;*/ - /*lfi[i].flim[j] = filt_lvl;*/ - lfi[i].thr[j] = HEVThresh; - } + vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH); } } - -int vp8_adjust_mb_lf_value(MACROBLOCKD *mbd, int filter_level) +void vp8_loop_filter_frame_init(VP8_COMMON *cm, + MACROBLOCKD *mbd, + int default_filt_lvl, + int sharpness_lvl) { - MB_MODE_INFO *mbmi = &mbd->mode_info_context->mbmi; + int seg, /* segment number */ + ref, /* index in ref_lf_deltas */ + mode; /* index in mode_lf_deltas */ - if (mbd->mode_ref_lf_delta_enabled) + loop_filter_info_n *lfi = &cm->lf_info; + + /* update limits if sharpness has changed */ + if(cm->last_sharpness_level != sharpness_lvl) { + vp8_loop_filter_update_sharpness(lfi, sharpness_lvl); + cm->last_sharpness_level = sharpness_lvl; + } + + for(seg = 0; seg < MAX_MB_SEGMENTS; seg++) + { + int lvl_seg = default_filt_lvl; + int lvl_ref, lvl_mode; + + /* Note the baseline filter values for each segment */ + if (mbd->segmentation_enabled) + { + /* Abs value */ + if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA) + { + lvl_seg = mbd->segment_feature_data[MB_LVL_ALT_LF][seg]; + } + else /* Delta Value */ + { + lvl_seg += mbd->segment_feature_data[MB_LVL_ALT_LF][seg]; + lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 
63: lvl_seg) : 0; + } + } + + if (!mbd->mode_ref_lf_delta_enabled) + { + /* we could get rid of this if we assume that deltas are set to + * zero when not in use; encoder always uses deltas + */ + vpx_memset(lfi->lvl[seg][0], lvl_seg, 4 * 4 ); + continue; + } + + lvl_ref = lvl_seg; + + /* INTRA_FRAME */ + ref = INTRA_FRAME; + /* Apply delta for reference frame */ - filter_level += mbd->ref_lf_deltas[mbmi->ref_frame]; + lvl_ref += mbd->ref_lf_deltas[ref]; - /* Apply delta for mode */ - if (mbmi->ref_frame == INTRA_FRAME) + /* Apply delta for Intra modes */ + mode = 0; /* B_PRED */ + /* Only the split mode BPRED has a further special case */ + lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode]; + lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */ + + lfi->lvl[seg][ref][mode] = lvl_mode; + + mode = 1; /* all the rest of Intra modes */ + lvl_mode = (lvl_ref > 0) ? (lvl_ref > 63 ? 63 : lvl_ref) : 0; /* clamp */ + lfi->lvl[seg][ref][mode] = lvl_mode; + + /* LAST, GOLDEN, ALT */ + for(ref = 1; ref < MAX_REF_FRAMES; ref++) { - /* Only the split mode BPRED has a further special case */ - if (mbmi->mode == B_PRED) - filter_level += mbd->mode_lf_deltas[0]; + int lvl_ref = lvl_seg; + + /* Apply delta for reference frame */ + lvl_ref += mbd->ref_lf_deltas[ref]; + + /* Apply delta for Inter modes */ + for (mode = 1; mode < 4; mode++) + { + lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode]; + lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 
63 : lvl_mode) : 0; /* clamp */ + + lfi->lvl[seg][ref][mode] = lvl_mode; + } } - else - { - /* Zero motion mode */ - if (mbmi->mode == ZEROMV) - filter_level += mbd->mode_lf_deltas[1]; - - /* Split MB motion mode */ - else if (mbmi->mode == SPLITMV) - filter_level += mbd->mode_lf_deltas[3]; - - /* All other inter motion modes (Nearest, Near, New) */ - else - filter_level += mbd->mode_lf_deltas[2]; - } - - /* Range check */ - if (filter_level > MAX_LOOP_FILTER) - filter_level = MAX_LOOP_FILTER; - else if (filter_level < 0) - filter_level = 0; } - return filter_level; } - void vp8_loop_filter_frame ( VP8_COMMON *cm, @@ -290,49 +288,23 @@ void vp8_loop_filter_frame ) { YV12_BUFFER_CONFIG *post = cm->frame_to_show; - loop_filter_info *lfi = cm->lf_info; + loop_filter_info_n *lfi_n = &cm->lf_info; + loop_filter_info lfi; + FRAME_TYPE frame_type = cm->frame_type; int mb_row; int mb_col; - - int baseline_filter_level[MAX_MB_SEGMENTS]; int filter_level; - int alt_flt_enabled = mbd->segmentation_enabled; - int i; unsigned char *y_ptr, *u_ptr, *v_ptr; - mbd->mode_info_context = cm->mi; /* Point at base of Mb MODE_INFO list */ - - /* Note the baseline filter values for each segment */ - if (alt_flt_enabled) - { - for (i = 0; i < MAX_MB_SEGMENTS; i++) - { - /* Abs value */ - if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA) - baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i]; - /* Delta Value */ - else - { - baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i]; - baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; /* Clamp to valid range */ - } - } - } - else - { - for (i = 0; i < MAX_MB_SEGMENTS; i++) - baseline_filter_level[i] = default_filt_lvl; - } + /* Point at base of Mb MODE_INFO list */ + const MODE_INFO *mode_info_context = cm->mi; /* Initialize the loop filter for this frame. 
*/ - if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level)) - vp8_init_loop_filter(cm); - else if (frame_type != cm->last_frame_type) - vp8_frame_init_loop_filter(lfi, frame_type); + vp8_loop_filter_frame_init( cm, mbd, default_filt_lvl, cm->sharpness_level); /* Set up the buffer pointers */ y_ptr = post->y_buffer; @@ -344,51 +316,79 @@ void vp8_loop_filter_frame { for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { - int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0; - int skip_lf = (mbd->mode_info_context->mbmi.mode != B_PRED && - mbd->mode_info_context->mbmi.mode != SPLITMV && - mbd->mode_info_context->mbmi.mb_skip_coeff); + int skip_lf = (mode_info_context->mbmi.mode != B_PRED && + mode_info_context->mbmi.mode != SPLITMV && + mode_info_context->mbmi.mb_skip_coeff); - filter_level = baseline_filter_level[Segment]; + const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode]; + const int seg = mode_info_context->mbmi.segment_id; + const int ref_frame = mode_info_context->mbmi.ref_frame; - /* Distance of Mb to the various image edges. 
- * These specified to 8th pel as they are always compared to values that are in 1/8th pel units - * Apply any context driven MB level adjustment - */ - filter_level = vp8_adjust_mb_lf_value(mbd, filter_level); + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; if (filter_level) { - if (mb_col > 0) - cm->lf_mbv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level]); + if (cm->filter_type == NORMAL_LOOPFILTER) + { + const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level]; + lfi.mblim = lfi_n->mblim[filter_level]; + lfi.blim = lfi_n->blim[filter_level]; + lfi.lim = lfi_n->lim[filter_level]; + lfi.hev_thr = lfi_n->hev_thr[hev_index]; - if (!skip_lf) - cm->lf_bv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level]); + if (mb_col > 0) + LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_v) + (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi); - /* don't apply across umv border */ - if (mb_row > 0) - cm->lf_mbh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level]); + if (!skip_lf) + LF_INVOKE(&cm->rtcd.loopfilter, normal_b_v) + (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi); - if (!skip_lf) - cm->lf_bh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level]); + /* don't apply across umv border */ + if (mb_row > 0) + LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_h) + (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi); + + if (!skip_lf) + LF_INVOKE(&cm->rtcd.loopfilter, normal_b_h) + (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi); + } + else + { + if (mb_col > 0) + LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v) + (y_ptr, post->y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + LF_INVOKE(&cm->rtcd.loopfilter, simple_b_v) + (y_ptr, post->y_stride, lfi_n->blim[filter_level]); + + /* don't apply across umv border */ + if (mb_row > 0) + LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_h) + (y_ptr, post->y_stride, lfi_n->mblim[filter_level]); + + if 
(!skip_lf) + LF_INVOKE(&cm->rtcd.loopfilter, simple_b_h) + (y_ptr, post->y_stride, lfi_n->blim[filter_level]); + } } y_ptr += 16; u_ptr += 8; v_ptr += 8; - mbd->mode_info_context++; /* step to next MB */ + mode_info_context++; /* step to next MB */ } y_ptr += post->y_stride * 16 - post->y_width; u_ptr += post->uv_stride * 8 - post->uv_width; v_ptr += post->uv_stride * 8 - post->uv_width; - mbd->mode_info_context++; /* Skip border mb */ + mode_info_context++; /* Skip border mb */ } } - void vp8_loop_filter_frame_yonly ( VP8_COMMON *cm, @@ -399,49 +399,28 @@ void vp8_loop_filter_frame_yonly { YV12_BUFFER_CONFIG *post = cm->frame_to_show; - int i; unsigned char *y_ptr; int mb_row; int mb_col; - loop_filter_info *lfi = cm->lf_info; - int baseline_filter_level[MAX_MB_SEGMENTS]; + loop_filter_info_n *lfi_n = &cm->lf_info; + loop_filter_info lfi; + int filter_level; - int alt_flt_enabled = mbd->segmentation_enabled; FRAME_TYPE frame_type = cm->frame_type; - (void) sharpness_lvl; + /* Point at base of Mb MODE_INFO list */ + const MODE_INFO *mode_info_context = cm->mi; - /*MODE_INFO * this_mb_mode_info = cm->mi;*/ /* Point at base of Mb MODE_INFO list */ - mbd->mode_info_context = cm->mi; /* Point at base of Mb MODE_INFO list */ + sharpness_lvl = cm->sharpness_level; - /* Note the baseline filter values for each segment */ - if (alt_flt_enabled) - { - for (i = 0; i < MAX_MB_SEGMENTS; i++) - { - /* Abs value */ - if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA) - baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i]; - /* Delta Value */ - else - { - baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i]; - baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? 
baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; /* Clamp to valid range */ - } - } - } - else - { - for (i = 0; i < MAX_MB_SEGMENTS; i++) - baseline_filter_level[i] = default_filt_lvl; - } +#if 0 + if(default_filt_lvl == 0) /* no filter applied */ + return; +#endif /* Initialize the loop filter for this frame. */ - if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level)) - vp8_init_loop_filter(cm); - else if (frame_type != cm->last_frame_type) - vp8_frame_init_loop_filter(lfi, frame_type); + vp8_loop_filter_frame_init( cm, mbd, default_filt_lvl, sharpness_lvl); /* Set up the buffer pointers */ y_ptr = post->y_buffer; @@ -451,44 +430,75 @@ void vp8_loop_filter_frame_yonly { for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { - int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0; - int skip_lf = (mbd->mode_info_context->mbmi.mode != B_PRED && - mbd->mode_info_context->mbmi.mode != SPLITMV && - mbd->mode_info_context->mbmi.mb_skip_coeff); + int skip_lf = (mode_info_context->mbmi.mode != B_PRED && + mode_info_context->mbmi.mode != SPLITMV && + mode_info_context->mbmi.mb_skip_coeff); - filter_level = baseline_filter_level[Segment]; + const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode]; + const int seg = mode_info_context->mbmi.segment_id; + const int ref_frame = mode_info_context->mbmi.ref_frame; - /* Apply any context driven MB level adjustment */ - filter_level = vp8_adjust_mb_lf_value(mbd, filter_level); + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; if (filter_level) { - if (mb_col > 0) - cm->lf_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]); + if (cm->filter_type == NORMAL_LOOPFILTER) + { + const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level]; + lfi.mblim = lfi_n->mblim[filter_level]; + lfi.blim = lfi_n->blim[filter_level]; + lfi.lim = lfi_n->lim[filter_level]; + lfi.hev_thr = lfi_n->hev_thr[hev_index]; - if (!skip_lf) - cm->lf_bv(y_ptr, 0, 0, 
post->y_stride, 0, &lfi[filter_level]); + if (mb_col > 0) + LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_v) + (y_ptr, 0, 0, post->y_stride, 0, &lfi); - /* don't apply across umv border */ - if (mb_row > 0) - cm->lf_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]); + if (!skip_lf) + LF_INVOKE(&cm->rtcd.loopfilter, normal_b_v) + (y_ptr, 0, 0, post->y_stride, 0, &lfi); - if (!skip_lf) - cm->lf_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]); + /* don't apply across umv border */ + if (mb_row > 0) + LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_h) + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + + if (!skip_lf) + LF_INVOKE(&cm->rtcd.loopfilter, normal_b_h) + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + } + else + { + if (mb_col > 0) + LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v) + (y_ptr, post->y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + LF_INVOKE(&cm->rtcd.loopfilter, simple_b_v) + (y_ptr, post->y_stride, lfi_n->blim[filter_level]); + + /* don't apply across umv border */ + if (mb_row > 0) + LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_h) + (y_ptr, post->y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + LF_INVOKE(&cm->rtcd.loopfilter, simple_b_h) + (y_ptr, post->y_stride, lfi_n->blim[filter_level]); + } } y_ptr += 16; - mbd->mode_info_context ++; /* step to next MB */ + mode_info_context ++; /* step to next MB */ } y_ptr += post->y_stride * 16 - post->y_width; - mbd->mode_info_context ++; /* Skip border mb */ + mode_info_context ++; /* Skip border mb */ } } - void vp8_loop_filter_partial_frame ( VP8_COMMON *cm, @@ -500,25 +510,32 @@ void vp8_loop_filter_partial_frame { YV12_BUFFER_CONFIG *post = cm->frame_to_show; - int i; unsigned char *y_ptr; int mb_row; int mb_col; - /*int mb_rows = post->y_height >> 4;*/ int mb_cols = post->y_width >> 4; - int linestocopy; + int linestocopy, i; + + loop_filter_info_n *lfi_n = &cm->lf_info; + loop_filter_info lfi; - loop_filter_info *lfi = cm->lf_info; - int baseline_filter_level[MAX_MB_SEGMENTS]; int 
filter_level; int alt_flt_enabled = mbd->segmentation_enabled; FRAME_TYPE frame_type = cm->frame_type; - (void) sharpness_lvl; + const MODE_INFO *mode_info_context; - /*MODE_INFO * this_mb_mode_info = cm->mi + (post->y_height>>5) * (mb_cols + 1);*/ /* Point at base of Mb MODE_INFO list */ - mbd->mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1); /* Point at base of Mb MODE_INFO list */ + int lvl_seg[MAX_MB_SEGMENTS]; + + sharpness_lvl = cm->sharpness_level; + +#if 0 + if(default_filt_lvl == 0) /* no filter applied */ + return; +#endif + + mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1); linestocopy = (post->y_height >> (4 + Fraction)); @@ -531,29 +548,24 @@ void vp8_loop_filter_partial_frame if (alt_flt_enabled) { for (i = 0; i < MAX_MB_SEGMENTS; i++) - { - /* Abs value */ + { /* Abs value */ if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA) - baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i]; + { + lvl_seg[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i]; + } /* Delta Value */ else { - baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i]; - baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; /* Clamp to valid range */ + lvl_seg[i] = default_filt_lvl + + mbd->segment_feature_data[MB_LVL_ALT_LF][i]; + lvl_seg[i] = (lvl_seg[i] > 0) ? + ((lvl_seg[i] > 63) ? 63: lvl_seg[i]) : 0; } } } else - { - for (i = 0; i < MAX_MB_SEGMENTS; i++) - baseline_filter_level[i] = default_filt_lvl; - } + lvl_seg[0] = default_filt_lvl; - /* Initialize the loop filter for this frame. 
*/ - if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level)) - vp8_init_loop_filter(cm); - else if (frame_type != cm->last_frame_type) - vp8_frame_init_loop_filter(lfi, frame_type); /* Set up the buffer pointers */ y_ptr = post->y_buffer + (post->y_height >> 5) * 16 * post->y_stride; @@ -563,32 +575,64 @@ void vp8_loop_filter_partial_frame { for (mb_col = 0; mb_col < mb_cols; mb_col++) { - int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0; - int skip_lf = (mbd->mode_info_context->mbmi.mode != B_PRED && - mbd->mode_info_context->mbmi.mode != SPLITMV && - mbd->mode_info_context->mbmi.mb_skip_coeff); + int skip_lf = (mode_info_context->mbmi.mode != B_PRED && + mode_info_context->mbmi.mode != SPLITMV && + mode_info_context->mbmi.mb_skip_coeff); - filter_level = baseline_filter_level[Segment]; + if (alt_flt_enabled) + filter_level = lvl_seg[mode_info_context->mbmi.segment_id]; + else + filter_level = lvl_seg[0]; if (filter_level) { - if (mb_col > 0) - cm->lf_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]); + if (cm->filter_type == NORMAL_LOOPFILTER) + { + const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level]; + lfi.mblim = lfi_n->mblim[filter_level]; + lfi.blim = lfi_n->blim[filter_level]; + lfi.lim = lfi_n->lim[filter_level]; + lfi.hev_thr = lfi_n->hev_thr[hev_index]; - if (!skip_lf) - cm->lf_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]); + if (mb_col > 0) + LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_v) + (y_ptr, 0, 0, post->y_stride, 0, &lfi); - cm->lf_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]); + if (!skip_lf) + LF_INVOKE(&cm->rtcd.loopfilter, normal_b_v) + (y_ptr, 0, 0, post->y_stride, 0, &lfi); - if (!skip_lf) - cm->lf_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]); + LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_h) + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + + if (!skip_lf) + LF_INVOKE(&cm->rtcd.loopfilter, normal_b_h) + (y_ptr, 0, 0, 
post->y_stride, 0, &lfi); + } + else + { + if (mb_col > 0) + LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v) + (y_ptr, post->y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + LF_INVOKE(&cm->rtcd.loopfilter, simple_b_v) + (y_ptr, post->y_stride, lfi_n->blim[filter_level]); + + LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_h) + (y_ptr, post->y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + LF_INVOKE(&cm->rtcd.loopfilter, simple_b_h) + (y_ptr, post->y_stride, lfi_n->blim[filter_level]); + } } y_ptr += 16; - mbd->mode_info_context += 1; /* step to next MB */ + mode_info_context += 1; /* step to next MB */ } y_ptr += post->y_stride * 16 - post->y_width; - mbd->mode_info_context += 1; /* Skip border mb */ + mode_info_context += 1; /* Skip border mb */ } } diff --git a/vp8/common/loopfilter.h b/vp8/common/loopfilter.h index ca136b3a4..2d6dad306 100644 --- a/vp8/common/loopfilter.h +++ b/vp8/common/loopfilter.h @@ -13,6 +13,7 @@ #define loopfilter_h #include "vpx_ports/mem.h" +#include "vpx_config.h" #define MAX_LOOP_FILTER 63 @@ -22,27 +23,46 @@ typedef enum SIMPLE_LOOPFILTER = 1 } LOOPFILTERTYPE; -/* FRK - * Need to align this structure so when it is declared and +#if ARCH_ARM +#define SIMD_WIDTH 1 +#else +#define SIMD_WIDTH 16 +#endif + +/* Need to align this structure so when it is declared and * passed it can be loaded into vector registers. 
*/ typedef struct { - DECLARE_ALIGNED(16, signed char, lim[16]); - DECLARE_ALIGNED(16, signed char, flim[16]); - DECLARE_ALIGNED(16, signed char, thr[16]); - DECLARE_ALIGNED(16, signed char, mbflim[16]); + DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, blim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, hev_thr[4][SIMD_WIDTH]); + unsigned char lvl[4][4][4]; + unsigned char hev_thr_lut[2][MAX_LOOP_FILTER + 1]; + unsigned char mode_lf_lut[10]; +} loop_filter_info_n; + +typedef struct +{ + const unsigned char * mblim; + const unsigned char * blim; + const unsigned char * lim; + const unsigned char * hev_thr; } loop_filter_info; #define prototype_loopfilter(sym) \ - void sym(unsigned char *src, int pitch, const signed char *flimit,\ - const signed char *limit, const signed char *thresh, int count) + void sym(unsigned char *src, int pitch, const unsigned char *blimit,\ + const unsigned char *limit, const unsigned char *thresh, int count) #define prototype_loopfilter_block(sym) \ - void sym(unsigned char *y, unsigned char *u, unsigned char *v,\ + void sym(unsigned char *y, unsigned char *u, unsigned char *v, \ int ystride, int uv_stride, loop_filter_info *lfi) +#define prototype_simple_loopfilter(sym) \ + void sym(unsigned char *y, int ystride, const unsigned char *blimit) + #if ARCH_X86 || ARCH_X86_64 #include "x86/loopfilter_x86.h" #endif @@ -71,38 +91,39 @@ extern prototype_loopfilter_block(vp8_lf_normal_mb_h); #endif extern prototype_loopfilter_block(vp8_lf_normal_b_h); - #ifndef vp8_lf_simple_mb_v -#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_c +#define vp8_lf_simple_mb_v vp8_loop_filter_simple_vertical_edge_c #endif -extern prototype_loopfilter_block(vp8_lf_simple_mb_v); +extern prototype_simple_loopfilter(vp8_lf_simple_mb_v); #ifndef vp8_lf_simple_b_v #define 
vp8_lf_simple_b_v vp8_loop_filter_bvs_c #endif -extern prototype_loopfilter_block(vp8_lf_simple_b_v); +extern prototype_simple_loopfilter(vp8_lf_simple_b_v); #ifndef vp8_lf_simple_mb_h -#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_c +#define vp8_lf_simple_mb_h vp8_loop_filter_simple_horizontal_edge_c #endif -extern prototype_loopfilter_block(vp8_lf_simple_mb_h); +extern prototype_simple_loopfilter(vp8_lf_simple_mb_h); #ifndef vp8_lf_simple_b_h #define vp8_lf_simple_b_h vp8_loop_filter_bhs_c #endif -extern prototype_loopfilter_block(vp8_lf_simple_b_h); +extern prototype_simple_loopfilter(vp8_lf_simple_b_h); typedef prototype_loopfilter_block((*vp8_lf_block_fn_t)); +typedef prototype_simple_loopfilter((*vp8_slf_block_fn_t)); + typedef struct { vp8_lf_block_fn_t normal_mb_v; vp8_lf_block_fn_t normal_b_v; vp8_lf_block_fn_t normal_mb_h; vp8_lf_block_fn_t normal_b_h; - vp8_lf_block_fn_t simple_mb_v; - vp8_lf_block_fn_t simple_b_v; - vp8_lf_block_fn_t simple_mb_h; - vp8_lf_block_fn_t simple_b_h; + vp8_slf_block_fn_t simple_mb_v; + vp8_slf_block_fn_t simple_b_v; + vp8_slf_block_fn_t simple_mb_h; + vp8_slf_block_fn_t simple_b_h; } vp8_loopfilter_rtcd_vtable_t; #if CONFIG_RUNTIME_CPU_DETECT @@ -115,9 +136,9 @@ typedef void loop_filter_uvfunction ( unsigned char *u, /* source pointer */ int p, /* pitch */ - const signed char *flimit, - const signed char *limit, - const signed char *thresh, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, unsigned char *v ); diff --git a/vp8/common/loopfilter_filters.c b/vp8/common/loopfilter_filters.c index 694052924..10228ae09 100644 --- a/vp8/common/loopfilter_filters.c +++ b/vp8/common/loopfilter_filters.c @@ -24,8 +24,9 @@ static __inline signed char vp8_signed_char_clamp(int t) /* should we apply any filter at all ( 11111111 yes, 00000000 no) */ -static __inline signed char vp8_filter_mask(signed char limit, signed char flimit, - uc p3, uc p2, uc p1, uc p0, uc q0, uc q1, uc q2, uc q3) +static 
__inline signed char vp8_filter_mask(uc limit, uc blimit, + uc p3, uc p2, uc p1, uc p0, + uc q0, uc q1, uc q2, uc q3) { signed char mask = 0; mask |= (abs(p3 - p2) > limit) * -1; @@ -34,13 +35,13 @@ static __inline signed char vp8_filter_mask(signed char limit, signed char flimi mask |= (abs(q1 - q0) > limit) * -1; mask |= (abs(q2 - q1) > limit) * -1; mask |= (abs(q3 - q2) > limit) * -1; - mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit) * -1; + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = ~mask; return mask; } /* is there high variance internal edge ( 11111111 yes, 00000000 no) */ -static __inline signed char vp8_hevmask(signed char thresh, uc p1, uc p0, uc q0, uc q1) +static __inline signed char vp8_hevmask(uc thresh, uc p1, uc p0, uc q0, uc q1) { signed char hev = 0; hev |= (abs(p1 - p0) > thresh) * -1; @@ -48,7 +49,8 @@ static __inline signed char vp8_hevmask(signed char thresh, uc p1, uc p0, uc q0, return hev; } -static __inline void vp8_filter(signed char mask, signed char hev, uc *op1, uc *op0, uc *oq0, uc *oq1) +static __inline void vp8_filter(signed char mask, uc hev, uc *op1, + uc *op0, uc *oq0, uc *oq1) { signed char ps0, qs0; @@ -98,9 +100,9 @@ void vp8_loop_filter_horizontal_edge_c ( unsigned char *s, int p, /* pitch */ - const signed char *flimit, - const signed char *limit, - const signed char *thresh, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, int count ) { @@ -113,11 +115,11 @@ void vp8_loop_filter_horizontal_edge_c */ do { - mask = vp8_filter_mask(limit[i], flimit[i], + mask = vp8_filter_mask(limit[0], blimit[0], s[-4*p], s[-3*p], s[-2*p], s[-1*p], s[0*p], s[1*p], s[2*p], s[3*p]); - hev = vp8_hevmask(thresh[i], s[-2*p], s[-1*p], s[0*p], s[1*p]); + hev = vp8_hevmask(thresh[0], s[-2*p], s[-1*p], s[0*p], s[1*p]); vp8_filter(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p); @@ -130,9 +132,9 @@ void vp8_loop_filter_vertical_edge_c ( unsigned char *s, int p, - const 
signed char *flimit, - const signed char *limit, - const signed char *thresh, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, int count ) { @@ -145,10 +147,10 @@ void vp8_loop_filter_vertical_edge_c */ do { - mask = vp8_filter_mask(limit[i], flimit[i], + mask = vp8_filter_mask(limit[0], blimit[0], s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]); - hev = vp8_hevmask(thresh[i], s[-2], s[-1], s[0], s[1]); + hev = vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]); vp8_filter(mask, hev, s - 2, s - 1, s, s + 1); @@ -157,7 +159,7 @@ void vp8_loop_filter_vertical_edge_c while (++i < count * 8); } -static __inline void vp8_mbfilter(signed char mask, signed char hev, +static __inline void vp8_mbfilter(signed char mask, uc hev, uc *op2, uc *op1, uc *op0, uc *oq0, uc *oq1, uc *oq2) { signed char s, u; @@ -216,9 +218,9 @@ void vp8_mbloop_filter_horizontal_edge_c ( unsigned char *s, int p, - const signed char *flimit, - const signed char *limit, - const signed char *thresh, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, int count ) { @@ -232,11 +234,11 @@ void vp8_mbloop_filter_horizontal_edge_c do { - mask = vp8_filter_mask(limit[i], flimit[i], + mask = vp8_filter_mask(limit[0], blimit[0], s[-4*p], s[-3*p], s[-2*p], s[-1*p], s[0*p], s[1*p], s[2*p], s[3*p]); - hev = vp8_hevmask(thresh[i], s[-2*p], s[-1*p], s[0*p], s[1*p]); + hev = vp8_hevmask(thresh[0], s[-2*p], s[-1*p], s[0*p], s[1*p]); vp8_mbfilter(mask, hev, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p); @@ -251,9 +253,9 @@ void vp8_mbloop_filter_vertical_edge_c ( unsigned char *s, int p, - const signed char *flimit, - const signed char *limit, - const signed char *thresh, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, int count ) { @@ -264,10 +266,10 @@ void vp8_mbloop_filter_vertical_edge_c do { - mask = vp8_filter_mask(limit[i], flimit[i], + mask = vp8_filter_mask(limit[0], 
blimit[0], s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]); - hev = vp8_hevmask(thresh[i], s[-2], s[-1], s[0], s[1]); + hev = vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]); vp8_mbfilter(mask, hev, s - 3, s - 2, s - 1, s, s + 1, s + 2); @@ -278,13 +280,13 @@ void vp8_mbloop_filter_vertical_edge_c } /* should we apply any filter at all ( 11111111 yes, 00000000 no) */ -static __inline signed char vp8_simple_filter_mask(signed char limit, signed char flimit, uc p1, uc p0, uc q0, uc q1) +static __inline signed char vp8_simple_filter_mask(uc blimit, uc p1, uc p0, uc q0, uc q1) { /* Why does this cause problems for win32? * error C2143: syntax error : missing ';' before 'type' * (void) limit; */ - signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= flimit * 2 + limit) * -1; + signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit) * -1; return mask; } @@ -317,47 +319,37 @@ void vp8_loop_filter_simple_horizontal_edge_c ( unsigned char *s, int p, - const signed char *flimit, - const signed char *limit, - const signed char *thresh, - int count + const unsigned char *blimit ) { signed char mask = 0; int i = 0; - (void) thresh; do { - /*mask = vp8_simple_filter_mask( limit[i], flimit[i],s[-1*p],s[0*p]);*/ - mask = vp8_simple_filter_mask(limit[i], flimit[i], s[-2*p], s[-1*p], s[0*p], s[1*p]); + mask = vp8_simple_filter_mask(blimit[0], s[-2*p], s[-1*p], s[0*p], s[1*p]); vp8_simple_filter(mask, s - 2 * p, s - 1 * p, s, s + 1 * p); ++s; } - while (++i < count * 8); + while (++i < 16); } void vp8_loop_filter_simple_vertical_edge_c ( unsigned char *s, int p, - const signed char *flimit, - const signed char *limit, - const signed char *thresh, - int count + const unsigned char *blimit ) { signed char mask = 0; int i = 0; - (void) thresh; do { - /*mask = vp8_simple_filter_mask( limit[i], flimit[i],s[-1],s[0]);*/ - mask = vp8_simple_filter_mask(limit[i], flimit[i], s[-2], s[-1], s[0], s[1]); + mask = vp8_simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]); 
vp8_simple_filter(mask, s - 2, s - 1, s, s + 1); s += p; } - while (++i < count * 8); + while (++i < 16); } diff --git a/vp8/common/onyxc_int.h b/vp8/common/onyxc_int.h index a381dfe87..4356b5133 100644 --- a/vp8/common/onyxc_int.h +++ b/vp8/common/onyxc_int.h @@ -83,6 +83,7 @@ typedef struct VP8_COMMON_RTCD } VP8_COMMON_RTCD; typedef struct VP8Common + { struct vpx_internal_error_info error; @@ -107,7 +108,8 @@ typedef struct VP8Common YV12_BUFFER_CONFIG post_proc_buffer; YV12_BUFFER_CONFIG temp_scale_frame; - FRAME_TYPE last_frame_type; /* Save last frame's frame type for loopfilter init checking and motion search. */ + + FRAME_TYPE last_frame_type; /* Save last frame's frame type for motion search. */ FRAME_TYPE frame_type; int show_frame; @@ -148,11 +150,9 @@ typedef struct VP8Common INTERPOLATIONFILTERTYPE mcomp_filter_type; LOOPFILTERTYPE last_filter_type; LOOPFILTERTYPE filter_type; - loop_filter_info lf_info[MAX_LOOP_FILTER+1]; - prototype_loopfilter_block((*lf_mbv)); - prototype_loopfilter_block((*lf_mbh)); - prototype_loopfilter_block((*lf_bv)); - prototype_loopfilter_block((*lf_bh)); + + loop_filter_info_n lf_info; + int filter_level; int last_sharpness_level; int sharpness_level; @@ -205,10 +205,9 @@ typedef struct VP8Common struct postproc_state postproc_state; } VP8_COMMON; - -int vp8_adjust_mb_lf_value(MACROBLOCKD *mbd, int filter_level); -void vp8_init_loop_filter(VP8_COMMON *cm); -void vp8_frame_init_loop_filter(loop_filter_info *lfi, int frame_type); -extern void vp8_loop_filter_frame(VP8_COMMON *cm, MACROBLOCKD *mbd, int filt_val); +void vp8_loop_filter_init(VP8_COMMON *cm); +void vp8_loop_filter_frame_init(VP8_COMMON *cm, MACROBLOCKD *mbd, + int default_filt_lvl, int sharpness_lvl); +void vp8_loop_filter_frame(VP8_COMMON *cm, MACROBLOCKD *mbd, int filt_val); #endif diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c index 5f81ee638..aeb1607b5 100644 --- a/vp8/decoder/onyxd_if.c +++ b/vp8/decoder/onyxd_if.c @@ -95,7 +95,7 @@ VP8D_PTR 
vp8dx_create_decompressor(VP8D_CONFIG *oxcf) { VP8_COMMON *cm = &pbi->common; - vp8_init_loop_filter(cm); + vp8_loop_filter_init(cm); cm->last_frame_type = KEY_FRAME; cm->last_filter_type = cm->filter_type; cm->last_sharpness_level = cm->sharpness_level; diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c index a7af9acfb..0c21689c0 100644 --- a/vp8/decoder/threading.c +++ b/vp8/decoder/threading.c @@ -274,9 +274,7 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data) int recon_uv_stride = pc->yv12_fb[ref_fb_idx].uv_stride; int filter_level; - loop_filter_info *lfi = pc->lf_info; - int alt_flt_enabled = xd->segmentation_enabled; - int Segment; + loop_filter_info_n *lfi_n = &pc->lf_info; pbi->mb_row_di[ithread].mb_row = mb_row; pbi->mb_row_di[ithread].mbd.current_bc = &pbi->mbc[mb_row%num_part]; @@ -362,7 +360,16 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data) if (pbi->common.filter_level) { - int skip_lf; + int skip_lf = (xd->mode_info_context->mbmi.mode != B_PRED && + xd->mode_info_context->mbmi.mode != SPLITMV && + xd->mode_info_context->mbmi.mb_skip_coeff); + + const int mode_index = lfi_n->mode_lf_lut[xd->mode_info_context->mbmi.mode]; + const int seg = xd->mode_info_context->mbmi.segment_id; + const int ref_frame = xd->mode_info_context->mbmi.ref_frame; + + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; + if( mb_row != pc->mb_rows-1 ) { /* Save decoded MB last row data for next-row decoding */ @@ -388,35 +395,57 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data) } } - /* update loopfilter info */ - Segment = (alt_flt_enabled) ? xd->mode_info_context->mbmi.segment_id : 0; - skip_lf = (xd->mode_info_context->mbmi.mode != B_PRED && - xd->mode_info_context->mbmi.mode != SPLITMV && - xd->mode_info_context->mbmi.mb_skip_coeff); - - filter_level = pbi->mt_baseline_filter_level[Segment]; - /* Distance of Mb to the various image edges. 
- * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units - * Apply any context driven MB level adjustment - */ - filter_level = vp8_adjust_mb_lf_value(xd, filter_level); - /* loopfilter on this macroblock. */ if (filter_level) { - if (mb_col > 0) - pc->lf_mbv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]); + if(pc->filter_type == NORMAL_LOOPFILTER) + { + loop_filter_info lfi; + FRAME_TYPE frame_type = pc->frame_type; + const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level]; + lfi.mblim = lfi_n->mblim[filter_level]; + lfi.blim = lfi_n->blim[filter_level]; + lfi.lim = lfi_n->lim[filter_level]; + lfi.hev_thr = lfi_n->hev_thr[hev_index]; - if (!skip_lf) - pc->lf_bv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]); + if (mb_col > 0) + LF_INVOKE(&pc->rtcd.loopfilter, normal_mb_v) + (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi); - /* don't apply across umv border */ - if (mb_row > 0) - pc->lf_mbh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]); + if (!skip_lf) + LF_INVOKE(&pc->rtcd.loopfilter, normal_b_v) + (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi); - if (!skip_lf) - pc->lf_bh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]); + /* don't apply across umv border */ + if (mb_row > 0) + LF_INVOKE(&pc->rtcd.loopfilter, normal_mb_h) + (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi); + + if (!skip_lf) + LF_INVOKE(&pc->rtcd.loopfilter, normal_b_h) + (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi); + } + else + { + if (mb_col > 0) + LF_INVOKE(&pc->rtcd.loopfilter, simple_mb_v) + (xd->dst.y_buffer, 
recon_y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + LF_INVOKE(&pc->rtcd.loopfilter, simple_b_v) + (xd->dst.y_buffer, recon_y_stride, lfi_n->blim[filter_level]); + + /* don't apply across umv border */ + if (mb_row > 0) + LF_INVOKE(&pc->rtcd.loopfilter, simple_mb_h) + (xd->dst.y_buffer, recon_y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + LF_INVOKE(&pc->rtcd.loopfilter, simple_b_h) + (xd->dst.y_buffer, recon_y_stride, lfi_n->blim[filter_level]); + } } + } recon_yoffset += 16; @@ -681,53 +710,6 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi) } } - -static void lpf_init( VP8D_COMP *pbi, int default_filt_lvl) -{ - VP8_COMMON *cm = &pbi->common; - MACROBLOCKD *mbd = &pbi->mb; - /*YV12_BUFFER_CONFIG *post = &cm->new_frame;*/ /*frame_to_show;*/ - loop_filter_info *lfi = cm->lf_info; - FRAME_TYPE frame_type = cm->frame_type; - - /*int mb_row; - int mb_col; - int baseline_filter_level[MAX_MB_SEGMENTS];*/ - int alt_flt_enabled = mbd->segmentation_enabled; - - int i; - /*unsigned char *y_ptr, *u_ptr, *v_ptr;*/ - - /* Note the baseline filter values for each segment */ - if (alt_flt_enabled) - { - for (i = 0; i < MAX_MB_SEGMENTS; i++) - { - /* Abs value */ - if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA) - pbi->mt_baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i]; - /* Delta Value */ - else - { - pbi->mt_baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i]; - pbi->mt_baseline_filter_level[i] = (pbi->mt_baseline_filter_level[i] >= 0) ? ((pbi->mt_baseline_filter_level[i] <= MAX_LOOP_FILTER) ? pbi->mt_baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; /* Clamp to valid range */ - } - } - } - else - { - for (i = 0; i < MAX_MB_SEGMENTS; i++) - pbi->mt_baseline_filter_level[i] = default_filt_lvl; - } - - /* Initialize the loop filter for this frame. 
*/ - if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level)) - vp8_init_loop_filter(cm); - else if (frame_type != cm->last_frame_type) - vp8_frame_init_loop_filter(lfi, frame_type); -} - - void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd) { int mb_row; @@ -738,12 +720,10 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd) volatile int *last_row_current_mb_col = NULL; int nsync = pbi->sync_range; - int filter_level; - loop_filter_info *lfi = pc->lf_info; - int alt_flt_enabled = xd->segmentation_enabled; - int Segment; + int filter_level = pc->filter_level; + loop_filter_info_n *lfi_n = &pc->lf_info; - if(pbi->common.filter_level) + if (filter_level) { /* Set above_row buffer to 127 for decoding first MB row */ vpx_memset(pbi->mt_yabove_row[0] + VP8BORDERINPIXELS-1, 127, pc->yv12_fb[pc->lst_fb_idx].y_width + 5); @@ -764,7 +744,9 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd) vpx_memset(pbi->mt_uleft_col[i], (unsigned char)129, 8); vpx_memset(pbi->mt_vleft_col[i], (unsigned char)129, 8); } - lpf_init(pbi, pc->filter_level); + + /* Initialize the loop filter for this frame. 
*/ + vp8_loop_filter_frame_init(pc, &pbi->mb, filter_level, pc->sharpness_level); } setup_decoding_thread_data(pbi, xd, pbi->mb_row_di, pbi->decoding_thread_count); @@ -774,7 +756,6 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd) for (mb_row = 0; mb_row < pc->mb_rows; mb_row += (pbi->decoding_thread_count + 1)) { - xd->current_bc = &pbi->mbc[mb_row%num_part]; /* vp8_decode_mb_row(pbi, pc, mb_row, xd); */ @@ -875,7 +856,16 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd) if (pbi->common.filter_level) { - int skip_lf; + int skip_lf = (xd->mode_info_context->mbmi.mode != B_PRED && + xd->mode_info_context->mbmi.mode != SPLITMV && + xd->mode_info_context->mbmi.mb_skip_coeff); + + const int mode_index = lfi_n->mode_lf_lut[xd->mode_info_context->mbmi.mode]; + const int seg = xd->mode_info_context->mbmi.segment_id; + const int ref_frame = xd->mode_info_context->mbmi.ref_frame; + + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; + /* Save decoded MB last row data for next-row decoding */ if(mb_row != pc->mb_rows-1) { @@ -901,36 +891,58 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd) } } - /* update loopfilter info */ - Segment = (alt_flt_enabled) ? xd->mode_info_context->mbmi.segment_id : 0; - skip_lf = (xd->mode_info_context->mbmi.mode != B_PRED && - xd->mode_info_context->mbmi.mode != SPLITMV && - xd->mode_info_context->mbmi.mb_skip_coeff); - filter_level = pbi->mt_baseline_filter_level[Segment]; - /* Distance of Mb to the various image edges. - * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units - * Apply any context driven MB level adjustment - */ - filter_level = vp8_adjust_mb_lf_value(xd, filter_level); - /* loopfilter on this macroblock. 
*/ if (filter_level) { - if (mb_col > 0) - pc->lf_mbv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]); + if(pc->filter_type == NORMAL_LOOPFILTER) + { + loop_filter_info lfi; + FRAME_TYPE frame_type = pc->frame_type; + const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level]; + lfi.mblim = lfi_n->mblim[filter_level]; + lfi.blim = lfi_n->blim[filter_level]; + lfi.lim = lfi_n->lim[filter_level]; + lfi.hev_thr = lfi_n->hev_thr[hev_index]; - if (!skip_lf) - pc->lf_bv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]); + if (mb_col > 0) + LF_INVOKE(&pc->rtcd.loopfilter, normal_mb_v) + (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi); - /* don't apply across umv border */ - if (mb_row > 0) - pc->lf_mbh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]); + if (!skip_lf) + LF_INVOKE(&pc->rtcd.loopfilter, normal_b_v) + (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi); - if (!skip_lf) - pc->lf_bh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]); + /* don't apply across umv border */ + if (mb_row > 0) + LF_INVOKE(&pc->rtcd.loopfilter, normal_mb_h) + (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi); + + if (!skip_lf) + LF_INVOKE(&pc->rtcd.loopfilter, normal_b_h) + (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi); + } + else + { + if (mb_col > 0) + LF_INVOKE(&pc->rtcd.loopfilter, simple_mb_v) + (xd->dst.y_buffer, recon_y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + LF_INVOKE(&pc->rtcd.loopfilter, simple_b_v) + (xd->dst.y_buffer, recon_y_stride, lfi_n->blim[filter_level]); + + /* don't apply across umv border */ + if (mb_row > 0) + 
LF_INVOKE(&pc->rtcd.loopfilter, simple_mb_h) + (xd->dst.y_buffer, recon_y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + LF_INVOKE(&pc->rtcd.loopfilter, simple_b_h) + (xd->dst.y_buffer, recon_y_stride, lfi_n->blim[filter_level]); + } } - } + } recon_yoffset += 16; recon_uvoffset += 8; diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 73b4c7dcd..d2b0bf36a 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -2105,7 +2105,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) //when needed. This will avoid unnecessary calls of vp8cx_init_quantizer() for every frame. vp8cx_init_quantizer(cpi); { - vp8_init_loop_filter(cm); + vp8_loop_filter_init(cm); cm->last_frame_type = KEY_FRAME; cm->last_filter_type = cm->filter_type; cm->last_sharpness_level = cm->sharpness_level; From 01433c50436a669b7e10faf94382dbe03a8827bf Mon Sep 17 00:00:00 2001 From: Johann Date: Mon, 20 Jun 2011 14:48:57 -0400 Subject: [PATCH 2/8] update x86 asm for loopfilter Change-Id: I1ed739522db7c00c189851c7095c1b64ef6412ce --- vp8/common/x86/loopfilter_mmx.asm | 78 +++++------- vp8/common/x86/loopfilter_sse2.asm | 63 ++++------ vp8/common/x86/loopfilter_x86.c | 170 +++++++-------------------- vp8/common/x86/loopfilter_x86.h | 24 ++-- vp8/common/x86/x86_systemdependent.c | 10 +- 5 files changed, 111 insertions(+), 234 deletions(-) diff --git a/vp8/common/x86/loopfilter_mmx.asm b/vp8/common/x86/loopfilter_mmx.asm index c6c215c3c..ad47284cf 100644 --- a/vp8/common/x86/loopfilter_mmx.asm +++ b/vp8/common/x86/loopfilter_mmx.asm @@ -16,7 +16,7 @@ ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, +; const char *blimit, ; const char *limit, ; const char *thresh, ; int count @@ -122,12 +122,10 @@ next8_h: paddusb mm5, mm5 ; abs(p0-q0)*2 paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - mov rdx, arg(2) ;flimit ; get flimit - movq mm2, [rdx] ; flimit mm2 - paddb mm2, mm2 ; flimit*2 (less than 255) - paddb mm7, mm2 ; flimit * 2 + limit (less than 
255) + mov rdx, arg(2) ;blimit ; get blimit + movq mm7, [rdx] ; blimit - psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit por mm1, mm5 pxor mm5, mm5 pcmpeqb mm1, mm5 ; mask mm1 @@ -230,7 +228,7 @@ next8_h: ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, +; const char *blimit, ; const char *limit, ; const char *thresh, ; int count @@ -406,9 +404,9 @@ next8_v: pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero psrlw mm5, 1 ; abs(p1-q1)/2 - mov rdx, arg(2) ;flimit ; + mov rdx, arg(2) ;blimit ; - movq mm2, [rdx] ;flimit mm2 + movq mm4, [rdx] ;blimit movq mm1, mm3 ; mm1=mm3=p0 movq mm7, mm6 ; mm7=mm6=q0 @@ -419,10 +417,7 @@ next8_v: paddusb mm1, mm1 ; abs(q0-p0)*2 paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - paddb mm2, mm2 ; flimit*2 (less than 255) - paddb mm4, mm2 ; flimit * 2 + limit (less than 255) - - psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit por mm1, mm0; ; mask pxor mm0, mm0 @@ -603,7 +598,7 @@ next8_v: ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, +; const char *blimit, ; const char *limit, ; const char *thresh, ; int count @@ -719,17 +714,15 @@ next8_mbh: paddusb mm5, mm5 ; abs(p0-q0)*2 paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - mov rdx, arg(2) ;flimit ; get flimit - movq mm2, [rdx] ; flimit mm2 - paddb mm2, mm2 ; flimit*2 (less than 255) - paddb mm7, mm2 ; flimit * 2 + limit (less than 255) + mov rdx, arg(2) ;blimit ; get blimit + movq mm7, [rdx] ; blimit - psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit por mm1, mm5 pxor mm5, mm5 pcmpeqb mm1, mm5 ; mask mm1 - ; mm1 = mask, mm0=q0, mm7 = flimit, t0 = abs(q0-q1) t1 = abs(p1-p0) + ; mm1 = mask, mm0=q0, mm7 = blimit, t0 = abs(q0-q1) t1 = abs(p1-p0) ; mm6 = p0, ; calculate high 
edge variance @@ -922,7 +915,7 @@ next8_mbh: ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, +; const char *blimit, ; const char *limit, ; const char *thresh, ; int count @@ -1108,9 +1101,9 @@ next8_mbv: pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero psrlw mm5, 1 ; abs(p1-q1)/2 - mov rdx, arg(2) ;flimit ; + mov rdx, arg(2) ;blimit ; - movq mm2, [rdx] ;flimit mm2 + movq mm4, [rdx] ;blimit movq mm1, mm3 ; mm1=mm3=p0 movq mm7, mm6 ; mm7=mm6=q0 @@ -1121,10 +1114,7 @@ next8_mbv: paddusb mm1, mm1 ; abs(q0-p0)*2 paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - paddb mm2, mm2 ; flimit*2 (less than 255) - paddb mm4, mm2 ; flimit * 2 + limit (less than 255) - - psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit por mm1, mm0; ; mask pxor mm0, mm0 @@ -1392,16 +1382,13 @@ next8_mbv: ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, -; const char *limit, -; const char *thresh, -; int count +; const char *blimit ;) global sym(vp8_loop_filter_simple_horizontal_edge_mmx) sym(vp8_loop_filter_simple_horizontal_edge_mmx): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 + SHADOW_ARGS_TO_STACK 3 GET_GOT rbx push rsi push rdi @@ -1410,14 +1397,10 @@ sym(vp8_loop_filter_simple_horizontal_edge_mmx): mov rsi, arg(0) ;src_ptr movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
- movsxd rcx, dword ptr arg(5) ;count + mov rcx, 2 ; count nexts8_h: - mov rdx, arg(3) ;limit - movq mm7, [rdx] - mov rdx, arg(2) ;flimit ; get flimit + mov rdx, arg(2) ;blimit ; get blimit movq mm3, [rdx] ; - paddb mm3, mm3 ; flimit*2 (less than 255) - paddb mm3, mm7 ; flimit * 2 + limit (less than 255) mov rdi, rsi ; rdi points to row +1 for indirect addressing add rdi, rax @@ -1445,7 +1428,7 @@ nexts8_h: paddusb mm5, mm5 ; abs(p0-q0)*2 paddusb mm5, mm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - psubusb mm5, mm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + psubusb mm5, mm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit pxor mm3, mm3 pcmpeqb mm5, mm3 @@ -1515,16 +1498,13 @@ nexts8_h: ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, -; const char *limit, -; const char *thresh, -; int count +; const char *blimit ;) global sym(vp8_loop_filter_simple_vertical_edge_mmx) sym(vp8_loop_filter_simple_vertical_edge_mmx): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 + SHADOW_ARGS_TO_STACK 3 GET_GOT rbx push rsi push rdi @@ -1539,7 +1519,7 @@ sym(vp8_loop_filter_simple_vertical_edge_mmx): movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
lea rsi, [rsi + rax*4- 2]; ; - movsxd rcx, dword ptr arg(5) ;count + mov rcx, 2 ; count nexts8_v: lea rdi, [rsi + rax]; @@ -1602,14 +1582,10 @@ nexts8_v: paddusb mm5, mm5 ; abs(p0-q0)*2 paddusb mm5, mm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - mov rdx, arg(2) ;flimit ; get flimit + mov rdx, arg(2) ;blimit ; get blimit movq mm7, [rdx] - mov rdx, arg(3) ; get limit - movq mm6, [rdx] - paddb mm7, mm7 ; flimit*2 (less than 255) - paddb mm7, mm6 ; flimit * 2 + limit (less than 255) - psubusb mm5, mm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + psubusb mm5, mm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit pxor mm7, mm7 pcmpeqb mm5, mm7 ; mm5 = mask diff --git a/vp8/common/x86/loopfilter_sse2.asm b/vp8/common/x86/loopfilter_sse2.asm index c2ce1a106..4efff7eb5 100644 --- a/vp8/common/x86/loopfilter_sse2.asm +++ b/vp8/common/x86/loopfilter_sse2.asm @@ -110,7 +110,7 @@ psubusb xmm6, xmm5 ; p1-=p0 por xmm6, xmm4 ; abs(p1 - p0) - mov rdx, arg(2) ; get flimit + mov rdx, arg(2) ; get blimit movdqa t1, xmm6 ; save to t1 @@ -123,7 +123,7 @@ psubusb xmm1, xmm7 por xmm2, xmm3 ; abs(p1-q1) - movdqa xmm4, XMMWORD PTR [rdx] ; flimit + movdqa xmm7, XMMWORD PTR [rdx] ; blimit movdqa xmm3, xmm0 ; q0 pand xmm2, [GLOBAL(tfe)] ; set lsb of each byte to zero @@ -134,13 +134,11 @@ psrlw xmm2, 1 ; abs(p1-q1)/2 psubusb xmm5, xmm3 ; p0-=q0 - paddb xmm4, xmm4 ; flimit*2 (less than 255) psubusb xmm3, xmm6 ; q0-=p0 por xmm5, xmm3 ; abs(p0 - q0) paddusb xmm5, xmm5 ; abs(p0-q0)*2 - paddb xmm7, xmm4 ; flimit * 2 + limit (less than 255) movdqa xmm4, t0 ; hev get abs (q1 - q0) @@ -150,7 +148,7 @@ movdqa xmm2, XMMWORD PTR [rdx] ; hev - psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit psubusb xmm4, xmm2 ; hev psubusb xmm3, xmm2 ; hev @@ -278,7 +276,7 @@ ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, +; const char *blimit, ; const char *limit, ; const char *thresh, ; int count @@ -328,7 
+326,7 @@ sym(vp8_loop_filter_horizontal_edge_sse2): ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, +; const char *blimit, ; const char *limit, ; const char *thresh, ; int count @@ -574,7 +572,7 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2): ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, +; const char *blimit, ; const char *limit, ; const char *thresh, ; int count @@ -624,7 +622,7 @@ sym(vp8_mbloop_filter_horizontal_edge_sse2): ;( ; unsigned char *u, ; int src_pixel_step, -; const char *flimit, +; const char *blimit, ; const char *limit, ; const char *thresh, ; unsigned char *v @@ -904,7 +902,7 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): movdqa xmm4, XMMWORD PTR [rdx]; limit pmaxub xmm0, xmm7 - mov rdx, arg(2) ; flimit + mov rdx, arg(2) ; blimit psubusb xmm0, xmm4 movdqa xmm5, xmm2 ; q1 @@ -921,12 +919,11 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): psrlw xmm5, 1 ; abs(p1-q1)/2 psubusb xmm6, xmm3 ; q0-p0 - movdqa xmm2, XMMWORD PTR [rdx]; flimit + movdqa xmm4, XMMWORD PTR [rdx]; blimit mov rdx, arg(4) ; get thresh por xmm1, xmm6 ; abs(q0-p0) - paddb xmm2, xmm2 ; flimit*2 (less than 255) movdqa xmm6, t0 ; get abs (q1 - q0) @@ -939,10 +936,9 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 psubusb xmm6, xmm7 ; abs(q1 - q0) > thresh - paddb xmm4, xmm2 ; flimit * 2 + limit (less than 255) psubusb xmm3, xmm7 ; abs(p1 - p0)> thresh - psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit por xmm6, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh por xmm1, xmm0 ; mask @@ -1014,7 +1010,7 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, +; const char *blimit, ; const char *limit, ; const char *thresh, ; int count @@ -1081,7 +1077,7 @@ sym(vp8_loop_filter_vertical_edge_sse2): ;( ; unsigned 
char *u, ; int src_pixel_step, -; const char *flimit, +; const char *blimit, ; const char *limit, ; const char *thresh, ; unsigned char *v @@ -1239,7 +1235,7 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2): ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, +; const char *blimit, ; const char *limit, ; const char *thresh, ; int count @@ -1308,7 +1304,7 @@ sym(vp8_mbloop_filter_vertical_edge_sse2): ;( ; unsigned char *u, ; int src_pixel_step, -; const char *flimit, +; const char *blimit, ; const char *limit, ; const char *thresh, ; unsigned char *v @@ -1376,16 +1372,13 @@ sym(vp8_mbloop_filter_vertical_edge_uv_sse2): ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, -; const char *limit, -; const char *thresh, -; int count +; const char *blimit, ;) global sym(vp8_loop_filter_simple_horizontal_edge_sse2) sym(vp8_loop_filter_simple_horizontal_edge_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 + SHADOW_ARGS_TO_STACK 3 SAVE_XMM 7 GET_GOT rbx push rsi @@ -1394,13 +1387,8 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2): mov rsi, arg(0) ;src_ptr movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
- mov rdx, arg(2) ;flimit ; get flimit + mov rdx, arg(2) ;blimit movdqa xmm3, XMMWORD PTR [rdx] - mov rdx, arg(3) ;limit - movdqa xmm7, XMMWORD PTR [rdx] - - paddb xmm3, xmm3 ; flimit*2 (less than 255) - paddb xmm3, xmm7 ; flimit * 2 + limit (less than 255) mov rdi, rsi ; rdi points to row +1 for indirect addressing add rdi, rax @@ -1428,7 +1416,7 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2): paddusb xmm5, xmm5 ; abs(p0-q0)*2 paddusb xmm5, xmm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - psubusb xmm5, xmm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + psubusb xmm5, xmm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit pxor xmm3, xmm3 pcmpeqb xmm5, xmm3 @@ -1493,16 +1481,13 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2): ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, -; const char *limit, -; const char *thresh, -; int count +; const char *blimit, ;) global sym(vp8_loop_filter_simple_vertical_edge_sse2) sym(vp8_loop_filter_simple_vertical_edge_sse2): push rbp ; save old base pointer value. mov rbp, rsp ; set new base pointer value. 
- SHADOW_ARGS_TO_STACK 6 + SHADOW_ARGS_TO_STACK 3 SAVE_XMM 7 GET_GOT rbx ; save callee-saved reg push rsi @@ -1607,14 +1592,10 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2): paddusb xmm5, xmm5 ; abs(p0-q0)*2 paddusb xmm5, xmm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - mov rdx, arg(2) ;flimit + mov rdx, arg(2) ;blimit movdqa xmm7, XMMWORD PTR [rdx] - mov rdx, arg(3) ; get limit - movdqa xmm6, XMMWORD PTR [rdx] - paddb xmm7, xmm7 ; flimit*2 (less than 255) - paddb xmm7, xmm6 ; flimit * 2 + limit (less than 255) - psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit pxor xmm7, xmm7 pcmpeqb xmm5, xmm7 ; mm5 = mask diff --git a/vp8/common/x86/loopfilter_x86.c b/vp8/common/x86/loopfilter_x86.c index a52420c98..9360ac17c 100644 --- a/vp8/common/x86/loopfilter_x86.c +++ b/vp8/common/x86/loopfilter_x86.c @@ -9,30 +9,18 @@ */ -#include "vpx_ports/config.h" +#include "vpx_config.h" #include "vp8/common/loopfilter.h" -prototype_loopfilter(vp8_loop_filter_horizontal_edge_c); -prototype_loopfilter(vp8_loop_filter_vertical_edge_c); -prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_c); -prototype_loopfilter(vp8_mbloop_filter_vertical_edge_c); -prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_c); -prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_c); - prototype_loopfilter(vp8_mbloop_filter_vertical_edge_mmx); prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_mmx); prototype_loopfilter(vp8_loop_filter_vertical_edge_mmx); prototype_loopfilter(vp8_loop_filter_horizontal_edge_mmx); -prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_mmx); -prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_mmx); prototype_loopfilter(vp8_loop_filter_vertical_edge_sse2); prototype_loopfilter(vp8_loop_filter_horizontal_edge_sse2); prototype_loopfilter(vp8_mbloop_filter_vertical_edge_sse2); prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_sse2); 
-prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_sse2); -prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_sse2); -prototype_loopfilter(vp8_fast_loop_filter_vertical_edges_sse2); extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_sse2; extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_sse2; @@ -44,23 +32,13 @@ extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_sse2; void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1); + vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); if (v_ptr) - vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1); -} - - -void vp8_loop_filter_mbhs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); } @@ -68,23 +46,13 @@ void vp8_loop_filter_mbhs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsign void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, 
lfi->mbflim, lfi->lim, lfi->thr, 1); + vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); if (v_ptr) - vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1); -} - - -void vp8_loop_filter_mbvs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_vertical_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); } @@ -92,27 +60,23 @@ void vp8_loop_filter_mbvs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsign void vp8_loop_filter_bh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); if (v_ptr) - vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); } 
-void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride, const unsigned char *blimit) { - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, blimit); } @@ -120,27 +84,23 @@ void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigne void vp8_loop_filter_bv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); if (v_ptr) - vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->flim, 
lfi->lim, lfi->thr, 1); + vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); } -void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride, const unsigned char *blimit) { - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit); } #endif @@ -150,20 +110,10 @@ void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigne void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr); -} - - -void vp8_loop_filter_mbhs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr); } @@ -171,20 +121,10 @@ void vp8_loop_filter_mbhs_sse2(unsigned char 
*y_ptr, unsigned char *u_ptr, unsig void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr); -} - - -void vp8_loop_filter_mbvs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_vertical_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr); } @@ -192,24 +132,20 @@ void vp8_loop_filter_mbvs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsig void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4 * uv_stride); + vp8_loop_filter_horizontal_edge_uv_sse2(u_ptr + 
4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4 * uv_stride); } -void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit) { - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, blimit); } @@ -217,36 +153,20 @@ void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsign void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4); + vp8_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, 
lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4); } -void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit) { - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, blimit); } #endif - -#if 0 -void vp8_fast_loop_filter_vertical_edges_sse(unsigned char *y_ptr, - int y_stride, - loop_filter_info *lfi) -{ - - vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); -} -#endif diff --git a/vp8/common/x86/loopfilter_x86.h b/vp8/common/x86/loopfilter_x86.h index 80dbebc8d..1ed6c213f 100644 --- a/vp8/common/x86/loopfilter_x86.h +++ b/vp8/common/x86/loopfilter_x86.h @@ -24,10 +24,10 @@ extern prototype_loopfilter_block(vp8_loop_filter_mbv_mmx); extern prototype_loopfilter_block(vp8_loop_filter_bv_mmx); extern prototype_loopfilter_block(vp8_loop_filter_mbh_mmx); extern prototype_loopfilter_block(vp8_loop_filter_bh_mmx); -extern prototype_loopfilter_block(vp8_loop_filter_mbvs_mmx); -extern prototype_loopfilter_block(vp8_loop_filter_bvs_mmx); -extern prototype_loopfilter_block(vp8_loop_filter_mbhs_mmx); -extern 
prototype_loopfilter_block(vp8_loop_filter_bhs_mmx); +extern prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_mmx); +extern prototype_simple_loopfilter(vp8_loop_filter_bvs_mmx); +extern prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_mmx); +extern prototype_simple_loopfilter(vp8_loop_filter_bhs_mmx); #if !CONFIG_RUNTIME_CPU_DETECT @@ -44,13 +44,13 @@ extern prototype_loopfilter_block(vp8_loop_filter_bhs_mmx); #define vp8_lf_normal_b_h vp8_loop_filter_bh_mmx #undef vp8_lf_simple_mb_v -#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_mmx +#define vp8_lf_simple_mb_v vp8_loop_filter_simple_vertical_edge_mmx #undef vp8_lf_simple_b_v #define vp8_lf_simple_b_v vp8_loop_filter_bvs_mmx #undef vp8_lf_simple_mb_h -#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_mmx +#define vp8_lf_simple_mb_h vp8_loop_filter_simple_horizontal_edge_mmx #undef vp8_lf_simple_b_h #define vp8_lf_simple_b_h vp8_loop_filter_bhs_mmx @@ -63,10 +63,10 @@ extern prototype_loopfilter_block(vp8_loop_filter_mbv_sse2); extern prototype_loopfilter_block(vp8_loop_filter_bv_sse2); extern prototype_loopfilter_block(vp8_loop_filter_mbh_sse2); extern prototype_loopfilter_block(vp8_loop_filter_bh_sse2); -extern prototype_loopfilter_block(vp8_loop_filter_mbvs_sse2); -extern prototype_loopfilter_block(vp8_loop_filter_bvs_sse2); -extern prototype_loopfilter_block(vp8_loop_filter_mbhs_sse2); -extern prototype_loopfilter_block(vp8_loop_filter_bhs_sse2); +extern prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_sse2); +extern prototype_simple_loopfilter(vp8_loop_filter_bvs_sse2); +extern prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_sse2); +extern prototype_simple_loopfilter(vp8_loop_filter_bhs_sse2); #if !CONFIG_RUNTIME_CPU_DETECT @@ -83,13 +83,13 @@ extern prototype_loopfilter_block(vp8_loop_filter_bhs_sse2); #define vp8_lf_normal_b_h vp8_loop_filter_bh_sse2 #undef vp8_lf_simple_mb_v -#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_sse2 +#define 
vp8_lf_simple_mb_v vp8_loop_filter_simple_vertical_edge_sse2 #undef vp8_lf_simple_b_v #define vp8_lf_simple_b_v vp8_loop_filter_bvs_sse2 #undef vp8_lf_simple_mb_h -#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_sse2 +#define vp8_lf_simple_mb_h vp8_loop_filter_simple_horizontal_edge_sse2 #undef vp8_lf_simple_b_h #define vp8_lf_simple_b_h vp8_loop_filter_bhs_sse2 diff --git a/vp8/common/x86/x86_systemdependent.c b/vp8/common/x86/x86_systemdependent.c index 87374f3c6..33a984b79 100644 --- a/vp8/common/x86/x86_systemdependent.c +++ b/vp8/common/x86/x86_systemdependent.c @@ -9,7 +9,7 @@ */ -#include "vpx_ports/config.h" +#include "vpx_config.h" #include "vpx_ports/x86.h" #include "vp8/common/g_common.h" #include "vp8/common/subpixel.h" @@ -63,9 +63,9 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx) rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_mmx; rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_mmx; rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_mmx; - rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_mmx; + rtcd->loopfilter.simple_mb_v = vp8_loop_filter_simple_vertical_edge_mmx; rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_mmx; - rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_mmx; + rtcd->loopfilter.simple_mb_h = vp8_loop_filter_simple_horizontal_edge_mmx; rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_mmx; #if CONFIG_POSTPROC @@ -101,9 +101,9 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx) rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_sse2; rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_sse2; rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_sse2; - rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_sse2; + rtcd->loopfilter.simple_mb_v = vp8_loop_filter_simple_vertical_edge_sse2; rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_sse2; - rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_sse2; + rtcd->loopfilter.simple_mb_h = vp8_loop_filter_simple_horizontal_edge_sse2; rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_sse2; #if 
CONFIG_POSTPROC From ede0b15c9d0d05f0d53a13e5962192f9d29edc13 Mon Sep 17 00:00:00 2001 From: Fritz Koenig Date: Thu, 7 Jul 2011 09:30:24 -0700 Subject: [PATCH 3/8] Reduce motion vector search on alt-ref frame. Clamp mv search to accommodate subpixel filtering of UV mv. Change-Id: Iab3ed405993ef6bf779ad7cf60863153068fb7d1 --- vp8/encoder/temporal_filter.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/vp8/encoder/temporal_filter.c b/vp8/encoder/temporal_filter.c index c1ca7d4ed..601a364ae 100644 --- a/vp8/encoder/temporal_filter.c +++ b/vp8/encoder/temporal_filter.c @@ -259,10 +259,19 @@ static void vp8_temporal_filter_iterate_c for (mb_row = 0; mb_row < mb_rows; mb_row++) { #if ALT_REF_MC_ENABLED - // Reduced search extent by 3 for 6-tap filter & smaller UMV border - cpi->mb.mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 19)); + // Source frames are extended to 16 pixels. This is different than + // L/A/G reference frames that have a border of 32 (VP8BORDERINPIXELS) + // A 6 tap filter is used for motion search. This requires 2 pixels + // before and 3 pixels after. So the largest Y mv on a border would + // then be 16 - 3. The UV blocks are half the size of the Y and + // therefore only extended by 8. The largest mv that a UV block + // can support is 8 - 3. A UV mv is half of a Y mv. + // (16 - 3) >> 1 == 6 which is greater than 8 - 3. + // To keep the mv in play for both Y and UV planes the max that it + // can be on a border is therefore 16 - 5.
+ cpi->mb.mv_row_min = -((mb_row * 16) + (16 - 5)); cpi->mb.mv_row_max = ((cpi->common.mb_rows - 1 - mb_row) * 16) - + (VP8BORDERINPIXELS - 19); + + (16 - 5); #endif for (mb_col = 0; mb_col < mb_cols; mb_col++) @@ -274,10 +283,9 @@ static void vp8_temporal_filter_iterate_c vpx_memset(count, 0, 384*sizeof(unsigned short)); #if ALT_REF_MC_ENABLED - // Reduced search extent by 3 for 6-tap filter & smaller UMV border - cpi->mb.mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 19)); + cpi->mb.mv_col_min = -((mb_col * 16) + (16 - 5)); cpi->mb.mv_col_max = ((cpi->common.mb_cols - 1 - mb_col) * 16) - + (VP8BORDERINPIXELS - 19); + + (16 - 5); #endif for (frame = 0; frame < frame_count; frame++) From 283b0e25ac6e0a2bd3d5f0b8cd1d0a50bbda7318 Mon Sep 17 00:00:00 2001 From: Attila Nagy Date: Wed, 6 Jul 2011 13:35:33 +0300 Subject: [PATCH 4/8] Update armv7 loopfilter to new interface Change-Id: I65105a9c63832669237e6a6a7fcb4ea3ea683346 --- vp8/common/arm/loopfilter_arm.c | 157 +++--- vp8/common/arm/loopfilter_arm.h | 22 +- vp8/common/arm/neon/loopfilter_neon.asm | 296 +++++----- .../loopfiltersimplehorizontaledge_neon.asm | 100 ++-- .../loopfiltersimpleverticaledge_neon.asm | 172 +++--- vp8/common/arm/neon/mbloopfilter_neon.asm | 530 ++++++++---------- 6 files changed, 617 insertions(+), 660 deletions(-) diff --git a/vp8/common/arm/loopfilter_arm.c b/vp8/common/arm/loopfilter_arm.c index 6d1caa485..1ec2b7484 100644 --- a/vp8/common/arm/loopfilter_arm.c +++ b/vp8/common/arm/loopfilter_arm.c @@ -9,30 +9,36 @@ */ -#include "vpx_ports/config.h" -#include +#include "vpx_config.h" #include "vp8/common/loopfilter.h" #include "vp8/common/onyxc_int.h" +#if HAVE_ARMV6 extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6); extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6); extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6); extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6); extern 
prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_armv6); extern prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_armv6); +#endif -extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_y_neon); -extern prototype_loopfilter(vp8_loop_filter_vertical_edge_y_neon); -extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_y_neon); -extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_y_neon); -extern prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_neon); -extern prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_neon); +#if HAVE_ARMV7 +typedef void loopfilter_y_neon(unsigned char *src, int pitch, + unsigned char blimit, unsigned char limit, unsigned char thresh); +typedef void loopfilter_uv_neon(unsigned char *u, int pitch, + unsigned char blimit, unsigned char limit, unsigned char thresh, + unsigned char *v); -extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_neon; -extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_neon; -extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_neon; -extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_neon; +extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon; +extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon; +extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon; +extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon; +extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon; +extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon; +extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon; +extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon; +#endif #if HAVE_ARMV6 /*ARMV6 loopfilter functions*/ @@ -40,13 +46,13 @@ extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_neon; void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - 
vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1); + vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); if (v_ptr) - vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1); + vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); } void vp8_loop_filter_mbhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, @@ -55,20 +61,20 @@ void vp8_loop_filter_mbhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsi (void) u_ptr; (void) v_ptr; (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); } /* Vertical MB Filtering */ void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1); + vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); if (v_ptr) - vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1); + vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); } void vp8_loop_filter_mbvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, @@ -77,22 +83,22 @@ void 
vp8_loop_filter_mbvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsi (void) u_ptr; (void) v_ptr; (void) uv_stride; - vp8_loop_filter_simple_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); } /* Horizontal B Filtering */ void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); if (v_ptr) - vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); } void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, @@ -101,24 +107,24 @@ void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsig (void) u_ptr; (void) v_ptr; (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - 
vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); } /* Vertical B Filtering */ void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); if (v_ptr) - vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); } void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, @@ -127,9 +133,9 @@ void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsig (void) u_ptr; (void) v_ptr; (void) uv_stride; - 
vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); } #endif @@ -139,83 +145,58 @@ void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsig void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + unsigned char mblim = *lfi->mblim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; + vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr); if (u_ptr) - vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr); -} - -void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr); } /* Vertical MB Filtering */ void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + unsigned char mblim = *lfi->mblim; 
+ unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; + + vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr); if (u_ptr) - vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr); -} - -void vp8_loop_filter_mbvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr); } /* Horizontal B Filtering */ void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + unsigned char blim = *lfi->blim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; + + vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, blim, lim, hev_thr); + vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, blim, lim, hev_thr); + vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, blim, lim, hev_thr); if (u_ptr) - vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4 * uv_stride); -} - -void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 4 * y_stride, y_stride, 
lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, blim, lim, hev_thr, v_ptr + 4 * uv_stride); } /* Vertical B Filtering */ void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + unsigned char blim = *lfi->blim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; + + vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, blim, lim, hev_thr); + vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, blim, lim, hev_thr); + vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, blim, lim, hev_thr); if (u_ptr) - vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4); -} - -void vp8_loop_filter_bvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim, hev_thr, v_ptr + 4); } #endif diff --git a/vp8/common/arm/loopfilter_arm.h 
b/vp8/common/arm/loopfilter_arm.h index cd62207d7..27159b59f 100644 --- a/vp8/common/arm/loopfilter_arm.h +++ b/vp8/common/arm/loopfilter_arm.h @@ -12,6 +12,8 @@ #ifndef LOOPFILTER_ARM_H #define LOOPFILTER_ARM_H +#include "vpx_config.h" + #if HAVE_ARMV6 extern prototype_loopfilter_block(vp8_loop_filter_mbv_armv6); extern prototype_loopfilter_block(vp8_loop_filter_bv_armv6); @@ -46,18 +48,19 @@ extern prototype_loopfilter_block(vp8_loop_filter_bhs_armv6); #undef vp8_lf_simple_b_h #define vp8_lf_simple_b_h vp8_loop_filter_bhs_armv6 -#endif -#endif +#endif /* !CONFIG_RUNTIME_CPU_DETECT */ + +#endif /* HAVE_ARMV6 */ #if HAVE_ARMV7 extern prototype_loopfilter_block(vp8_loop_filter_mbv_neon); extern prototype_loopfilter_block(vp8_loop_filter_bv_neon); extern prototype_loopfilter_block(vp8_loop_filter_mbh_neon); extern prototype_loopfilter_block(vp8_loop_filter_bh_neon); -extern prototype_loopfilter_block(vp8_loop_filter_mbvs_neon); -extern prototype_loopfilter_block(vp8_loop_filter_bvs_neon); -extern prototype_loopfilter_block(vp8_loop_filter_mbhs_neon); -extern prototype_loopfilter_block(vp8_loop_filter_bhs_neon); +extern prototype_simple_loopfilter(vp8_loop_filter_mbvs_neon); +extern prototype_simple_loopfilter(vp8_loop_filter_bvs_neon); +extern prototype_simple_loopfilter(vp8_loop_filter_mbhs_neon); +extern prototype_simple_loopfilter(vp8_loop_filter_bhs_neon); #if !CONFIG_RUNTIME_CPU_DETECT #undef vp8_lf_normal_mb_v @@ -83,7 +86,8 @@ extern prototype_loopfilter_block(vp8_loop_filter_bhs_neon); #undef vp8_lf_simple_b_h #define vp8_lf_simple_b_h vp8_loop_filter_bhs_neon -#endif -#endif +#endif /* !CONFIG_RUNTIME_CPU_DETECT */ -#endif +#endif /* HAVE_ARMV7 */ + +#endif /* LOOPFILTER_ARM_H */ diff --git a/vp8/common/arm/neon/loopfilter_neon.asm b/vp8/common/arm/neon/loopfilter_neon.asm index e73dd6401..e44be0a1e 100644 --- a/vp8/common/arm/neon/loopfilter_neon.asm +++ b/vp8/common/arm/neon/loopfilter_neon.asm @@ -14,109 +14,97 @@ EXPORT 
|vp8_loop_filter_vertical_edge_y_neon| EXPORT |vp8_loop_filter_vertical_edge_uv_neon| ARM - REQUIRE8 - PRESERVE8 AREA ||.text||, CODE, READONLY, ALIGN=2 -; flimit, limit, and thresh should be positive numbers. -; All 16 elements in these variables are equal. - -; void vp8_loop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch, -; const signed char *flimit, -; const signed char *limit, -; const signed char *thresh, -; int count) ; r0 unsigned char *src ; r1 int pitch -; r2 const signed char *flimit -; r3 const signed char *limit -; sp const signed char *thresh, -; sp+4 int count (unused) +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, |vp8_loop_filter_horizontal_edge_y_neon| PROC - stmdb sp!, {lr} - vld1.s8 {d0[], d1[]}, [r2] ; flimit - vld1.s8 {d2[], d3[]}, [r3] ; limit - sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines - ldr r12, [sp, #4] ; load thresh pointer + push {lr} + vdup.u8 q0, r2 ; duplicate blimit + vdup.u8 q1, r3 ; duplicate limit + sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines + ldr r3, [sp, #4] ; load thresh + add r12, r2, r1 + add r1, r1, r1 - vld1.u8 {q3}, [r2], r1 ; p3 - vld1.u8 {q4}, [r2], r1 ; p2 - vld1.u8 {q5}, [r2], r1 ; p1 - vld1.u8 {q6}, [r2], r1 ; p0 - vld1.u8 {q7}, [r2], r1 ; q0 - vld1.u8 {q8}, [r2], r1 ; q1 - vld1.u8 {q9}, [r2], r1 ; q2 - vld1.u8 {q10}, [r2] ; q3 - vld1.s8 {d4[], d5[]}, [r12] ; thresh - sub r0, r0, r1, lsl #1 + vdup.u8 q2, r3 ; duplicate thresh + + vld1.u8 {q3}, [r2@128], r1 ; p3 + vld1.u8 {q4}, [r12@128], r1 ; p2 + vld1.u8 {q5}, [r2@128], r1 ; p1 + vld1.u8 {q6}, [r12@128], r1 ; p0 + vld1.u8 {q7}, [r2@128], r1 ; q0 + vld1.u8 {q8}, [r12@128], r1 ; q1 + vld1.u8 {q9}, [r2@128] ; q2 + vld1.u8 {q10}, [r12@128] ; q3 + + sub r2, r2, r1, lsl #1 + sub r12, r12, r1, lsl #1 bl vp8_loop_filter_neon - vst1.u8 {q5}, [r0], r1 ; store op1 - vst1.u8 {q6}, [r0], r1 ; store op0 - vst1.u8 {q7}, [r0], r1 ; store oq0 - vst1.u8 {q8}, [r0], r1 ; store oq1 + vst1.u8 {q5}, [r2@128], r1 
; store op1 + vst1.u8 {q6}, [r12@128], r1 ; store op0 + vst1.u8 {q7}, [r2@128], r1 ; store oq0 + vst1.u8 {q8}, [r12@128], r1 ; store oq1 - ldmia sp!, {pc} + pop {pc} ENDP ; |vp8_loop_filter_horizontal_edge_y_neon| -; void vp8_loop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch -; const signed char *flimit, -; const signed char *limit, -; const signed char *thresh, -; unsigned char *v) + ; r0 unsigned char *u, ; r1 int pitch, -; r2 const signed char *flimit, -; r3 const signed char *limit, -; sp const signed char *thresh, +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, ; sp+4 unsigned char *v |vp8_loop_filter_horizontal_edge_uv_neon| PROC - stmdb sp!, {lr} - vld1.s8 {d0[], d1[]}, [r2] ; flimit - vld1.s8 {d2[], d3[]}, [r3] ; limit + push {lr} + vdup.u8 q0, r2 ; duplicate blimit + vdup.u8 q1, r3 ; duplicate limit + ldr r12, [sp, #4] ; load thresh ldr r2, [sp, #8] ; load v ptr + vdup.u8 q2, r12 ; duplicate thresh sub r3, r0, r1, lsl #2 ; move u pointer down by 4 lines - vld1.u8 {d6}, [r3], r1 ; p3 - vld1.u8 {d8}, [r3], r1 ; p2 - vld1.u8 {d10}, [r3], r1 ; p1 - vld1.u8 {d12}, [r3], r1 ; p0 - vld1.u8 {d14}, [r3], r1 ; q0 - vld1.u8 {d16}, [r3], r1 ; q1 - vld1.u8 {d18}, [r3], r1 ; q2 - vld1.u8 {d20}, [r3] ; q3 - - ldr r3, [sp, #4] ; load thresh pointer - sub r12, r2, r1, lsl #2 ; move v pointer down by 4 lines - vld1.u8 {d7}, [r12], r1 ; p3 - vld1.u8 {d9}, [r12], r1 ; p2 - vld1.u8 {d11}, [r12], r1 ; p1 - vld1.u8 {d13}, [r12], r1 ; p0 - vld1.u8 {d15}, [r12], r1 ; q0 - vld1.u8 {d17}, [r12], r1 ; q1 - vld1.u8 {d19}, [r12], r1 ; q2 - vld1.u8 {d21}, [r12] ; q3 - vld1.s8 {d4[], d5[]}, [r3] ; thresh + vld1.u8 {d6}, [r3@64], r1 ; p3 + vld1.u8 {d7}, [r12@64], r1 ; p3 + vld1.u8 {d8}, [r3@64], r1 ; p2 + vld1.u8 {d9}, [r12@64], r1 ; p2 + vld1.u8 {d10}, [r3@64], r1 ; p1 + vld1.u8 {d11}, [r12@64], r1 ; p1 + vld1.u8 {d12}, [r3@64], r1 ; p0 + vld1.u8 {d13}, [r12@64], r1 ; p0 + vld1.u8 {d14}, [r3@64], r1 ; q0 + vld1.u8 {d15}, [r12@64], r1 ; q0 + 
vld1.u8 {d16}, [r3@64], r1 ; q1 + vld1.u8 {d17}, [r12@64], r1 ; q1 + vld1.u8 {d18}, [r3@64], r1 ; q2 + vld1.u8 {d19}, [r12@64], r1 ; q2 + vld1.u8 {d20}, [r3@64] ; q3 + vld1.u8 {d21}, [r12@64] ; q3 bl vp8_loop_filter_neon sub r0, r0, r1, lsl #1 sub r2, r2, r1, lsl #1 - vst1.u8 {d10}, [r0], r1 ; store u op1 - vst1.u8 {d11}, [r2], r1 ; store v op1 - vst1.u8 {d12}, [r0], r1 ; store u op0 - vst1.u8 {d13}, [r2], r1 ; store v op0 - vst1.u8 {d14}, [r0], r1 ; store u oq0 - vst1.u8 {d15}, [r2], r1 ; store v oq0 - vst1.u8 {d16}, [r0] ; store u oq1 - vst1.u8 {d17}, [r2] ; store v oq1 + vst1.u8 {d10}, [r0@64], r1 ; store u op1 + vst1.u8 {d11}, [r2@64], r1 ; store v op1 + vst1.u8 {d12}, [r0@64], r1 ; store u op0 + vst1.u8 {d13}, [r2@64], r1 ; store v op0 + vst1.u8 {d14}, [r0@64], r1 ; store u oq0 + vst1.u8 {d15}, [r2@64], r1 ; store v oq0 + vst1.u8 {d16}, [r0@64] ; store u oq1 + vst1.u8 {d17}, [r2@64] ; store v oq1 - ldmia sp!, {pc} + pop {pc} ENDP ; |vp8_loop_filter_horizontal_edge_uv_neon| ; void vp8_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch, @@ -124,39 +112,38 @@ ; const signed char *limit, ; const signed char *thresh, ; int count) -; r0 unsigned char *src, -; r1 int pitch, -; r2 const signed char *flimit, -; r3 const signed char *limit, -; sp const signed char *thresh, -; sp+4 int count (unused) +; r0 unsigned char *src +; r1 int pitch +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, + |vp8_loop_filter_vertical_edge_y_neon| PROC - stmdb sp!, {lr} - vld1.s8 {d0[], d1[]}, [r2] ; flimit - vld1.s8 {d2[], d3[]}, [r3] ; limit - sub r2, r0, #4 ; src ptr down by 4 columns - sub r0, r0, #2 ; dst ptr - ldr r12, [sp, #4] ; load thresh pointer + push {lr} + vdup.u8 q0, r2 ; duplicate blimit + vdup.u8 q1, r3 ; duplicate limit + sub r2, r0, #4 ; src ptr down by 4 columns + add r1, r1, r1 + ldr r3, [sp, #4] ; load thresh + add r12, r2, r1, asr #1 - vld1.u8 {d6}, [r2], r1 ; load first 8-line src data - vld1.u8 {d8}, [r2], r1 + vld1.u8 
{d6}, [r2], r1 + vld1.u8 {d8}, [r12], r1 vld1.u8 {d10}, [r2], r1 - vld1.u8 {d12}, [r2], r1 + vld1.u8 {d12}, [r12], r1 vld1.u8 {d14}, [r2], r1 - vld1.u8 {d16}, [r2], r1 + vld1.u8 {d16}, [r12], r1 vld1.u8 {d18}, [r2], r1 - vld1.u8 {d20}, [r2], r1 - - vld1.s8 {d4[], d5[]}, [r12] ; thresh + vld1.u8 {d20}, [r12], r1 vld1.u8 {d7}, [r2], r1 ; load second 8-line src data - vld1.u8 {d9}, [r2], r1 + vld1.u8 {d9}, [r12], r1 vld1.u8 {d11}, [r2], r1 - vld1.u8 {d13}, [r2], r1 + vld1.u8 {d13}, [r12], r1 vld1.u8 {d15}, [r2], r1 - vld1.u8 {d17}, [r2], r1 - vld1.u8 {d19}, [r2], r1 - vld1.u8 {d21}, [r2] + vld1.u8 {d17}, [r12], r1 + vld1.u8 {d19}, [r2] + vld1.u8 {d21}, [r12] ;transpose to 8x16 matrix vtrn.32 q3, q7 @@ -164,6 +151,8 @@ vtrn.32 q5, q9 vtrn.32 q6, q10 + vdup.u8 q2, r3 ; duplicate thresh + vtrn.16 q3, q5 vtrn.16 q4, q6 vtrn.16 q7, q9 @@ -178,28 +167,34 @@ vswp d12, d11 vswp d16, d13 + + sub r0, r0, #2 ; dst ptr + vswp d14, d12 vswp d16, d15 + add r12, r0, r1, asr #1 + ;store op1, op0, oq0, oq1 vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1 - vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1 + vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r12], r1 vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1 - vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1 + vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r12], r1 vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1 - vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1 + vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r12], r1 vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1 - vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0], r1 - vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r0], r1 - vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r0], r1 - vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r0], r1 - vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r0], r1 - vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r0], r1 - vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r0], r1 - vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0], r1 - vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r0] + vst4.8 {d10[7], 
d11[7], d12[7], d13[7]}, [r12], r1 - ldmia sp!, {pc} + vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r0], r1 + vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r12], r1 + vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r0], r1 + vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r12], r1 + vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r0], r1 + vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r12], r1 + vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0] + vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r12] + + pop {pc} ENDP ; |vp8_loop_filter_vertical_edge_y_neon| ; void vp8_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch @@ -209,38 +204,36 @@ ; unsigned char *v) ; r0 unsigned char *u, ; r1 int pitch, -; r2 const signed char *flimit, -; r3 const signed char *limit, -; sp const signed char *thresh, +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, ; sp+4 unsigned char *v |vp8_loop_filter_vertical_edge_uv_neon| PROC - stmdb sp!, {lr} - sub r12, r0, #4 ; move u pointer down by 4 columns - vld1.s8 {d0[], d1[]}, [r2] ; flimit - vld1.s8 {d2[], d3[]}, [r3] ; limit - + push {lr} + vdup.u8 q0, r2 ; duplicate blimit + sub r12, r0, #4 ; move u pointer down by 4 columns ldr r2, [sp, #8] ; load v ptr - - vld1.u8 {d6}, [r12], r1 ;load u data - vld1.u8 {d8}, [r12], r1 - vld1.u8 {d10}, [r12], r1 - vld1.u8 {d12}, [r12], r1 - vld1.u8 {d14}, [r12], r1 - vld1.u8 {d16}, [r12], r1 - vld1.u8 {d18}, [r12], r1 - vld1.u8 {d20}, [r12] - + vdup.u8 q1, r3 ; duplicate limit sub r3, r2, #4 ; move v pointer down by 4 columns + + vld1.u8 {d6}, [r12], r1 ;load u data vld1.u8 {d7}, [r3], r1 ;load v data + vld1.u8 {d8}, [r12], r1 vld1.u8 {d9}, [r3], r1 + vld1.u8 {d10}, [r12], r1 vld1.u8 {d11}, [r3], r1 + vld1.u8 {d12}, [r12], r1 vld1.u8 {d13}, [r3], r1 + vld1.u8 {d14}, [r12], r1 vld1.u8 {d15}, [r3], r1 + vld1.u8 {d16}, [r12], r1 vld1.u8 {d17}, [r3], r1 + vld1.u8 {d18}, [r12], r1 vld1.u8 {d19}, [r3], r1 + vld1.u8 {d20}, [r12] vld1.u8 {d21}, [r3] - ldr r12, [sp, #4] ; load thresh pointer + ldr r12, [sp, 
#4] ; load thresh ;transpose to 8x16 matrix vtrn.32 q3, q7 @@ -248,6 +241,8 @@ vtrn.32 q5, q9 vtrn.32 q6, q10 + vdup.u8 q2, r12 ; duplicate thresh + vtrn.16 q3, q5 vtrn.16 q4, q6 vtrn.16 q7, q9 @@ -258,18 +253,16 @@ vtrn.8 q7, q8 vtrn.8 q9, q10 - vld1.s8 {d4[], d5[]}, [r12] ; thresh - bl vp8_loop_filter_neon - sub r0, r0, #2 - sub r2, r2, #2 - vswp d12, d11 vswp d16, d13 vswp d14, d12 vswp d16, d15 + sub r0, r0, #2 + sub r2, r2, #2 + ;store op1, op0, oq0, oq1 vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1 vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r2], r1 @@ -288,7 +281,7 @@ vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0] vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2] - ldmia sp!, {pc} + pop {pc} ENDP ; |vp8_loop_filter_vertical_edge_uv_neon| ; void vp8_loop_filter_neon(); @@ -316,42 +309,44 @@ vabd.u8 q14, q8, q7 ; abs(q1 - q0) vabd.u8 q3, q9, q8 ; abs(q2 - q1) vabd.u8 q4, q10, q9 ; abs(q3 - q2) - vabd.u8 q9, q6, q7 ; abs(p0 - q0) vmax.u8 q11, q11, q12 vmax.u8 q12, q13, q14 vmax.u8 q3, q3, q4 vmax.u8 q15, q11, q12 + vabd.u8 q9, q6, q7 ; abs(p0 - q0) + ; vp8_hevmask vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 vmax.u8 q15, q15, q3 - vadd.u8 q0, q0, q0 ; flimit * 2 - vadd.u8 q0, q0, q1 ; flimit * 2 + limit - vcge.u8 q15, q1, q15 + vmov.u8 q10, #0x80 ; 0x80 vabd.u8 q2, q5, q8 ; a = abs(p1 - q1) vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2 - vshr.u8 q2, q2, #1 ; a = a / 2 - vqadd.u8 q9, q9, q2 ; a = b + a - vcge.u8 q9, q0, q9 ; (a > flimit * 2 + limit) * -1 - vmov.u8 q0, #0x80 ; 0x80 + vcge.u8 q15, q1, q15 ; vp8_filter() function ; convert to signed - veor q7, q7, q0 ; qs0 - veor q6, q6, q0 ; ps0 - veor q5, q5, q0 ; ps1 - veor q8, q8, q0 ; qs1 + veor q7, q7, q10 ; qs0 + vshr.u8 q2, q2, #1 ; a = a / 2 + veor q6, q6, q10 ; ps0 + + veor q5, q5, q10 ; ps1 + vqadd.u8 q9, q9, q2 ; a = b + a + + veor q8, q8, q10 ; qs1 vmov.u8 q10, #3 ; #3 vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) vsubl.s8 q11, d15, d13 + vcge.u8 q9, q0, q9 ; (a
<= blimit) * -1 + vmovl.u8 q4, d20 vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1) @@ -378,19 +373,20 @@ vshr.s8 q2, q2, #3 ; Filter2 >>= 3 vshr.s8 q1, q1, #3 ; Filter1 >>= 3 + vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + Filter2) vqsub.s8 q10, q7, q1 ; u = clamp(qs0 - Filter1) ; outer tap adjustments: ++vp8_filter >> 1 vrshr.s8 q1, q1, #1 vbic q1, q1, q14 ; vp8_filter &= ~hev - + vmov.u8 q0, #0x80 ; 0x80 vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + vp8_filter) vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - vp8_filter) - veor q5, q13, q0 ; *op1 = u^0x80 veor q6, q11, q0 ; *op0 = u^0x80 veor q7, q10, q0 ; *oq0 = u^0x80 + veor q5, q13, q0 ; *op1 = u^0x80 veor q8, q12, q0 ; *oq1 = u^0x80 bx lr diff --git a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm index 7c5ea3644..adf848b9c 100644 --- a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm +++ b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm @@ -9,99 +9,109 @@ ; - EXPORT |vp8_loop_filter_simple_horizontal_edge_neon| + ;EXPORT |vp8_loop_filter_simple_horizontal_edge_neon| + EXPORT |vp8_loop_filter_bhs_neon| + EXPORT |vp8_loop_filter_mbhs_neon| ARM - REQUIRE8 PRESERVE8 AREA ||.text||, CODE, READONLY, ALIGN=2 -;Note: flimit, limit, and thresh shpuld be positive numbers. All 16 elements in flimit -;are equal. So, in the code, only one load is needed -;for flimit. Same way applies to limit and thresh.
-; r0 unsigned char *s, -; r1 int p, //pitch -; r2 const signed char *flimit, -; r3 const signed char *limit, -; stack(r4) const signed char *thresh (unused) -; //stack(r5) int count --unused + +; r0 unsigned char *s, PRESERVE +; r1 int p, PRESERVE +; q1 limit, PRESERVE |vp8_loop_filter_simple_horizontal_edge_neon| PROC - sub r0, r0, r1, lsl #1 ; move src pointer down by 2 lines - vld1.u8 {q5}, [r0], r1 ; p1 - vld1.s8 {d2[], d3[]}, [r2] ; flimit - vld1.s8 {d26[], d27[]}, [r3] ; limit -> q13 - vld1.u8 {q6}, [r0], r1 ; p0 - vmov.u8 q0, #0x80 ; 0x80 - vld1.u8 {q7}, [r0], r1 ; q0 - vmov.u8 q10, #0x03 ; 0x03 - vld1.u8 {q8}, [r0] ; q1 + sub r3, r0, r1, lsl #1 ; move src pointer down by 2 lines + + vld1.u8 {q7}, [r0@128], r1 ; q0 + vld1.u8 {q5}, [r3@128], r1 ; p1 + vld1.u8 {q8}, [r0@128] ; q1 + vld1.u8 {q6}, [r3@128] ; p0 - ;vp8_filter_mask() function vabd.u8 q15, q6, q7 ; abs(p0 - q0) vabd.u8 q14, q5, q8 ; abs(p1 - q1) + vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2 vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2 + vmov.u8 q0, #0x80 ; 0x80 + vmov.s16 q13, #3 vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 - ;vp8_filter() function veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value - vadd.u8 q1, q1, q1 ; flimit * 2 - vadd.u8 q1, q1, q13 ; flimit * 2 + limit - vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1 + vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > limit)*-1 -;;;;;;;;;; - ;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0) vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) vsubl.s8 q3, d15, d13 vqsub.s8 q4, q5, q8 ; q4: vp8_filter = vp8_signed_char_clamp(ps1-qs1) - ;vmul.i8 q2, q2, q10 ; 3 * ( qs0 - ps0) - vadd.s16 q11, q2, q2 ; 3 * ( qs0 - ps0) - vadd.s16 q12, q3, q3 + vmul.s16 q2, q2, q13 ; 3 * ( qs0 - ps0) + vmul.s16 q3, q3, q13 + vmov.u8 q10, #0x03 ; 0x03
vmov.u8 q9, #0x04 ; 0x04 - vadd.s16 q2, q2, q11 - vadd.s16 q3, q3, q12 - vaddw.s8 q2, q2, d8 ; vp8_filter + 3 * ( qs0 - ps0) vaddw.s8 q3, q3, d9 - ;vqadd.s8 q4, q4, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) vqmovn.s16 d8, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) vqmovn.s16 d9, q3 -;;;;;;;;;;;;; - vand q4, q4, q15 ; vp8_filter &= mask + vand q14, q4, q15 ; vp8_filter &= mask - vqadd.s8 q2, q4, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3) - vqadd.s8 q4, q4, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4) + vqadd.s8 q2, q14, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3) + vqadd.s8 q3, q14, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4) vshr.s8 q2, q2, #3 ; Filter2 >>= 3 - vshr.s8 q4, q4, #3 ; Filter1 >>= 3 + vshr.s8 q4, q3, #3 ; Filter1 >>= 3 - sub r0, r0, r1, lsl #1 + sub r0, r0, r1 ;calculate output vqadd.s8 q11, q6, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2) vqsub.s8 q10, q7, q4 ; u = vp8_signed_char_clamp(qs0 - Filter1) - add r3, r0, r1 - veor q6, q11, q0 ; *op0 = u^0x80 veor q7, q10, q0 ; *oq0 = u^0x80 - vst1.u8 {q6}, [r0] ; store op0 - vst1.u8 {q7}, [r3] ; store oq0 + vst1.u8 {q6}, [r3@128] ; store op0 + vst1.u8 {q7}, [r0@128] ; store oq0 bx lr ENDP ; |vp8_loop_filter_simple_horizontal_edge_neon| -;----------------- +; r0 unsigned char *y +; r1 int ystride +; r2 const unsigned char *blimit + +|vp8_loop_filter_bhs_neon| PROC + push {r4, lr} + ldrb r3, [r2] ; load blim from mem + vdup.s8 q1, r3 ; duplicate blim + + add r0, r0, r1, lsl #2 ; src = y_ptr + 4 * y_stride + bl vp8_loop_filter_simple_horizontal_edge_neon + ; vp8_loop_filter_simple_horizontal_edge_neon preserves r0, r1 and q1 + add r0, r0, r1, lsl #2 ; src = y_ptr + 8* y_stride + bl vp8_loop_filter_simple_horizontal_edge_neon + add r0, r0, r1, lsl #2 ; src = y_ptr + 12 * y_stride + pop {r4, lr} + b vp8_loop_filter_simple_horizontal_edge_neon + ENDP ;|vp8_loop_filter_bhs_neon| + +; r0 unsigned char *y +; r1 int ystride +; r2 
const unsigned char *blimit + +|vp8_loop_filter_mbhs_neon| PROC + ldrb r3, [r2] ; load mblim from mem + vdup.s8 q1, r3 ; duplicate mblim + b vp8_loop_filter_simple_horizontal_edge_neon + ENDP ;|vp8_loop_filter_mbhs_neon| END diff --git a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm index a7f7b690e..e690df2f7 100644 --- a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm +++ b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm @@ -9,59 +9,54 @@ ; - EXPORT |vp8_loop_filter_simple_vertical_edge_neon| + ;EXPORT |vp8_loop_filter_simple_vertical_edge_neon| + EXPORT |vp8_loop_filter_bvs_neon| + EXPORT |vp8_loop_filter_mbvs_neon| ARM - REQUIRE8 PRESERVE8 AREA ||.text||, CODE, READONLY, ALIGN=2 -;Note: flimit, limit, and thresh should be positive numbers. All 16 elements in flimit -;are equal. So, in the code, only one load is needed -;for flimit. Same way applies to limit and thresh. -; r0 unsigned char *s, -; r1 int p, //pitch -; r2 const signed char *flimit, -; r3 const signed char *limit, -; stack(r4) const signed char *thresh (unused) -; //stack(r5) int count --unused + +; r0 unsigned char *s, PRESERVE +; r1 int p, PRESERVE +; q1 limit, PRESERVE |vp8_loop_filter_simple_vertical_edge_neon| PROC sub r0, r0, #2 ; move src pointer down by 2 columns + add r12, r1, r1 + add r3, r0, r1 - vld4.8 {d6[0], d7[0], d8[0], d9[0]}, [r0], r1 - vld1.s8 {d2[], d3[]}, [r2] ; flimit - vld1.s8 {d26[], d27[]}, [r3] ; limit -> q13 - vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r0], r1 - vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r1 - vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r0], r1 - vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r1 - vld4.8 {d6[5], d7[5], d8[5], d9[5]}, [r0], r1 - vld4.8 {d6[6], d7[6], d8[6], d9[6]}, [r0], r1 - vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r0], r1 + vld4.8 {d6[0], d7[0], d8[0], d9[0]}, [r0], r12 + vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r3], r12 + vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r12 +
vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r3], r12 + vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r12 + vld4.8 {d6[5], d7[5], d8[5], d9[5]}, [r3], r12 + vld4.8 {d6[6], d7[6], d8[6], d9[6]}, [r0], r12 + vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r3], r12 - vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1 - vmov.u8 q0, #0x80 ; 0x80 - vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1 - vmov.u8 q11, #0x03 ; 0x03 - vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1 - vmov.u8 q12, #0x04 ; 0x04 - vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1 - vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1 - vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1 - vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1 - vld4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0], r1 + vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r12 + vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r3], r12 + vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r12 + vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r3], r12 + vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r12 + vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r3], r12 + vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r12 + vld4.8 {d10[7], d11[7], d12[7], d13[7]}, [r3] vswp d7, d10 vswp d12, d9 - ;vswp q4, q5 ; p1:q3, p0:q5, q0:q4, q1:q6 ;vp8_filter_mask() function ;vp8_hevmask() function sub r0, r0, r1, lsl #4 vabd.u8 q15, q5, q4 ; abs(p0 - q0) vabd.u8 q14, q3, q6 ; abs(p1 - q1) + vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2 vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2 + vmov.u8 q0, #0x80 ; 0x80 + vmov.s16 q11, #3 vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 veor q4, q4, q0 ; qs0: q0 offset to convert to a signed value @@ -69,80 +64,91 @@ veor q3, q3, q0 ; ps1: p1 offset to convert to a signed value veor q6, q6, q0 ; qs1: q1 offset to convert to a signed value - vadd.u8 q1, q1, q1 ; flimit * 2 - vadd.u8 q1, q1, q13 ; flimit * 2 + limit vcge.u8 q15, q1, q15 ; abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1 - ;vp8_filter() function -;;;;;;;;;; - ;vqsub.s8 q2, q5, q4 ; ( qs0 - 
ps0) vsubl.s8 q2, d8, d10 ; ( qs0 - ps0) vsubl.s8 q13, d9, d11 - vqsub.s8 q1, q3, q6 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1) + vqsub.s8 q14, q3, q6 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1) - ;vmul.i8 q2, q2, q11 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) - vadd.s16 q10, q2, q2 ; 3 * ( qs0 - ps0) - vadd.s16 q14, q13, q13 - vadd.s16 q2, q2, q10 - vadd.s16 q13, q13, q14 + vmul.s16 q2, q2, q11 ; 3 * ( qs0 - ps0) + vmul.s16 q13, q13, q11 - ;vqadd.s8 q1, q1, q2 - vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0) - vaddw.s8 q13, q13, d3 + vmov.u8 q11, #0x03 ; 0x03 + vmov.u8 q12, #0x04 ; 0x04 - vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d3, q13 + vaddw.s8 q2, q2, d28 ; vp8_filter + 3 * ( qs0 - ps0) + vaddw.s8 q13, q13, d29 + + vqmovn.s16 d28, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d29, q13 add r0, r0, #1 - add r2, r0, r1 -;;;;;;;;;;; + add r3, r0, r1 - vand q1, q1, q15 ; vp8_filter &= mask + vand q14, q14, q15 ; vp8_filter &= mask - vqadd.s8 q2, q1, q11 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3) - vqadd.s8 q1, q1, q12 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4) + vqadd.s8 q2, q14, q11 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3) + vqadd.s8 q3, q14, q12 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4) vshr.s8 q2, q2, #3 ; Filter2 >>= 3 - vshr.s8 q1, q1, #3 ; Filter1 >>= 3 + vshr.s8 q14, q3, #3 ; Filter1 >>= 3 ;calculate output - vqsub.s8 q10, q4, q1 ; u = vp8_signed_char_clamp(qs0 - Filter1) vqadd.s8 q11, q5, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2) + vqsub.s8 q10, q4, q14 ; u = vp8_signed_char_clamp(qs0 - Filter1) - veor q7, q10, q0 ; *oq0 = u^0x80 veor q6, q11, q0 ; *op0 = u^0x80 - - add r3, r2, r1 + veor q7, q10, q0 ; *oq0 = u^0x80 + add r12, r1, r1 vswp d13, d14 - add r12, r3, r1 ;store op1, op0, oq0, oq1 - vst2.8 {d12[0], d13[0]}, [r0] - vst2.8 {d12[1], d13[1]}, [r2] - vst2.8 {d12[2], d13[2]}, [r3] - vst2.8 {d12[3], 
d13[3]}, [r12], r1 - add r0, r12, r1 - vst2.8 {d12[4], d13[4]}, [r12] - vst2.8 {d12[5], d13[5]}, [r0], r1 - add r2, r0, r1 - vst2.8 {d12[6], d13[6]}, [r0] - vst2.8 {d12[7], d13[7]}, [r2], r1 - add r3, r2, r1 - vst2.8 {d14[0], d15[0]}, [r2] - vst2.8 {d14[1], d15[1]}, [r3], r1 - add r12, r3, r1 - vst2.8 {d14[2], d15[2]}, [r3] - vst2.8 {d14[3], d15[3]}, [r12], r1 - add r0, r12, r1 - vst2.8 {d14[4], d15[4]}, [r12] - vst2.8 {d14[5], d15[5]}, [r0], r1 - add r2, r0, r1 - vst2.8 {d14[6], d15[6]}, [r0] - vst2.8 {d14[7], d15[7]}, [r2] + vst2.8 {d12[0], d13[0]}, [r0], r12 + vst2.8 {d12[1], d13[1]}, [r3], r12 + vst2.8 {d12[2], d13[2]}, [r0], r12 + vst2.8 {d12[3], d13[3]}, [r3], r12 + vst2.8 {d12[4], d13[4]}, [r0], r12 + vst2.8 {d12[5], d13[5]}, [r3], r12 + vst2.8 {d12[6], d13[6]}, [r0], r12 + vst2.8 {d12[7], d13[7]}, [r3], r12 + vst2.8 {d14[0], d15[0]}, [r0], r12 + vst2.8 {d14[1], d15[1]}, [r3], r12 + vst2.8 {d14[2], d15[2]}, [r0], r12 + vst2.8 {d14[3], d15[3]}, [r3], r12 + vst2.8 {d14[4], d15[4]}, [r0], r12 + vst2.8 {d14[5], d15[5]}, [r3], r12 + vst2.8 {d14[6], d15[6]}, [r0], r12 + vst2.8 {d14[7], d15[7]}, [r3] bx lr ENDP ; |vp8_loop_filter_simple_vertical_edge_neon| -;----------------- +; r0 unsigned char *y +; r1 int ystride +; r2 const unsigned char *blimit +|vp8_loop_filter_bvs_neon| PROC + push {r4, lr} + ldrb r3, [r2] ; load blim from mem + mov r4, r0 + add r0, r0, #4 + vdup.s8 q1, r3 ; duplicate blim + bl vp8_loop_filter_simple_vertical_edge_neon + ; vp8_loop_filter_simple_vertical_edge_neon preserves r1 and q1 + add r0, r4, #8 + bl vp8_loop_filter_simple_vertical_edge_neon + add r0, r4, #12 + pop {r4, lr} + b vp8_loop_filter_simple_vertical_edge_neon + ENDP ;|vp8_loop_filter_bvs_neon| + +; r0 unsigned char *y +; r1 int ystride +; r2 const unsigned char *blimit + +|vp8_loop_filter_mbvs_neon| PROC + ldrb r3, [r2] ; load mblim from mem + vdup.s8 q1, r3 ; duplicate mblim + b vp8_loop_filter_simple_vertical_edge_neon + ENDP ;|vp8_loop_filter_mbvs_neon| END diff --git
a/vp8/common/arm/neon/mbloopfilter_neon.asm b/vp8/common/arm/neon/mbloopfilter_neon.asm index 72f0f9271..f41c156df 100644 --- a/vp8/common/arm/neon/mbloopfilter_neon.asm +++ b/vp8/common/arm/neon/mbloopfilter_neon.asm @@ -14,155 +14,143 @@ EXPORT |vp8_mbloop_filter_vertical_edge_y_neon| EXPORT |vp8_mbloop_filter_vertical_edge_uv_neon| ARM - REQUIRE8 - PRESERVE8 AREA ||.text||, CODE, READONLY, ALIGN=2 -; flimit, limit, and thresh should be positive numbers. -; All 16 elements in these variables are equal. - ; void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch, -; const signed char *flimit, -; const signed char *limit, -; const signed char *thresh, -; int count) +; const unsigned char *blimit, +; const unsigned char *limit, +; const unsigned char *thresh) ; r0 unsigned char *src, ; r1 int pitch, -; r2 const signed char *flimit, -; r3 const signed char *limit, -; sp const signed char *thresh, -; sp+4 int count (unused) +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, |vp8_mbloop_filter_horizontal_edge_y_neon| PROC - stmdb sp!, {lr} - sub r0, r0, r1, lsl #2 ; move src pointer down by 4 lines - ldr r12, [sp, #4] ; load thresh pointer + push {lr} + add r1, r1, r1 ; double stride + ldr r12, [sp, #4] ; load thresh + sub r0, r0, r1, lsl #1 ; move src pointer down by 4 lines + vdup.u8 q2, r12 ; thresh + add r12, r0, r1, lsr #1 ; move src pointer up by 1 line - vld1.u8 {q3}, [r0], r1 ; p3 - vld1.s8 {d2[], d3[]}, [r3] ; limit - vld1.u8 {q4}, [r0], r1 ; p2 - vld1.s8 {d4[], d5[]}, [r12] ; thresh - vld1.u8 {q5}, [r0], r1 ; p1 - vld1.u8 {q6}, [r0], r1 ; p0 - vld1.u8 {q7}, [r0], r1 ; q0 - vld1.u8 {q8}, [r0], r1 ; q1 - vld1.u8 {q9}, [r0], r1 ; q2 - vld1.u8 {q10}, [r0], r1 ; q3 + vld1.u8 {q3}, [r0@128], r1 ; p3 + vld1.u8 {q4}, [r12@128], r1 ; p2 + vld1.u8 {q5}, [r0@128], r1 ; p1 + vld1.u8 {q6}, [r12@128], r1 ; p0 + vld1.u8 {q7}, [r0@128], r1 ; q0 + vld1.u8 {q8}, [r12@128], r1 ; q1 + vld1.u8 {q9}, [r0@128], r1 ; q2 + vld1.u8 
{q10}, [r12@128], r1 ; q3 bl vp8_mbloop_filter_neon - sub r0, r0, r1, lsl #3 - add r0, r0, r1 - add r2, r0, r1 - add r3, r2, r1 + sub r12, r12, r1, lsl #2 + add r0, r12, r1, lsr #1 - vst1.u8 {q4}, [r0] ; store op2 - vst1.u8 {q5}, [r2] ; store op1 - vst1.u8 {q6}, [r3], r1 ; store op0 - add r12, r3, r1 - vst1.u8 {q7}, [r3] ; store oq0 - vst1.u8 {q8}, [r12], r1 ; store oq1 - vst1.u8 {q9}, [r12] ; store oq2 + vst1.u8 {q4}, [r12@128],r1 ; store op2 + vst1.u8 {q5}, [r0@128],r1 ; store op1 + vst1.u8 {q6}, [r12@128], r1 ; store op0 + vst1.u8 {q7}, [r0@128],r1 ; store oq0 + vst1.u8 {q8}, [r12@128] ; store oq1 + vst1.u8 {q9}, [r0@128] ; store oq2 - ldmia sp!, {pc} + pop {pc} ENDP ; |vp8_mbloop_filter_horizontal_edge_y_neon| ; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch, -; const signed char *flimit, -; const signed char *limit, -; const signed char *thresh, +; const unsigned char *blimit, +; const unsigned char *limit, +; const unsigned char *thresh, ; unsigned char *v) ; r0 unsigned char *u, ; r1 int pitch, -; r2 const signed char *flimit, -; r3 const signed char *limit, -; sp const signed char *thresh, +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, ; sp+4 unsigned char *v + |vp8_mbloop_filter_horizontal_edge_uv_neon| PROC - stmdb sp!, {lr} - sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines - vld1.s8 {d2[], d3[]}, [r3] ; limit - ldr r3, [sp, #8] ; load v ptr - ldr r12, [sp, #4] ; load thresh pointer - sub r3, r3, r1, lsl #2 ; move v pointer down by 4 lines + push {lr} + ldr r12, [sp, #4] ; load thresh + sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines + vdup.u8 q2, r12 ; thresh + ldr r12, [sp, #8] ; load v ptr + sub r12, r12, r1, lsl #2 ; move v pointer down by 4 lines - vld1.u8 {d6}, [r0], r1 ; p3 - vld1.u8 {d7}, [r3], r1 ; p3 - vld1.u8 {d8}, [r0], r1 ; p2 - vld1.u8 {d9}, [r3], r1 ; p2 - vld1.u8 {d10}, [r0], r1 ; p1 - vld1.u8 {d11}, [r3], r1 ; p1 - vld1.u8 {d12}, [r0], r1 ; p0 - vld1.u8 {d13}, 
[r3], r1 ; p0 - vld1.u8 {d14}, [r0], r1 ; q0 - vld1.u8 {d15}, [r3], r1 ; q0 - vld1.u8 {d16}, [r0], r1 ; q1 - vld1.u8 {d17}, [r3], r1 ; q1 - vld1.u8 {d18}, [r0], r1 ; q2 - vld1.u8 {d19}, [r3], r1 ; q2 - vld1.u8 {d20}, [r0], r1 ; q3 - vld1.u8 {d21}, [r3], r1 ; q3 - - vld1.s8 {d4[], d5[]}, [r12] ; thresh + vld1.u8 {d6}, [r0@64], r1 ; p3 + vld1.u8 {d7}, [r12@64], r1 ; p3 + vld1.u8 {d8}, [r0@64], r1 ; p2 + vld1.u8 {d9}, [r12@64], r1 ; p2 + vld1.u8 {d10}, [r0@64], r1 ; p1 + vld1.u8 {d11}, [r12@64], r1 ; p1 + vld1.u8 {d12}, [r0@64], r1 ; p0 + vld1.u8 {d13}, [r12@64], r1 ; p0 + vld1.u8 {d14}, [r0@64], r1 ; q0 + vld1.u8 {d15}, [r12@64], r1 ; q0 + vld1.u8 {d16}, [r0@64], r1 ; q1 + vld1.u8 {d17}, [r12@64], r1 ; q1 + vld1.u8 {d18}, [r0@64], r1 ; q2 + vld1.u8 {d19}, [r12@64], r1 ; q2 + vld1.u8 {d20}, [r0@64], r1 ; q3 + vld1.u8 {d21}, [r12@64], r1 ; q3 bl vp8_mbloop_filter_neon sub r0, r0, r1, lsl #3 - sub r3, r3, r1, lsl #3 + sub r12, r12, r1, lsl #3 add r0, r0, r1 - add r3, r3, r1 + add r12, r12, r1 - vst1.u8 {d8}, [r0], r1 ; store u op2 - vst1.u8 {d9}, [r3], r1 ; store v op2 - vst1.u8 {d10}, [r0], r1 ; store u op1 - vst1.u8 {d11}, [r3], r1 ; store v op1 - vst1.u8 {d12}, [r0], r1 ; store u op0 - vst1.u8 {d13}, [r3], r1 ; store v op0 - vst1.u8 {d14}, [r0], r1 ; store u oq0 - vst1.u8 {d15}, [r3], r1 ; store v oq0 - vst1.u8 {d16}, [r0], r1 ; store u oq1 - vst1.u8 {d17}, [r3], r1 ; store v oq1 - vst1.u8 {d18}, [r0], r1 ; store u oq2 - vst1.u8 {d19}, [r3], r1 ; store v oq2 + vst1.u8 {d8}, [r0@64], r1 ; store u op2 + vst1.u8 {d9}, [r12@64], r1 ; store v op2 + vst1.u8 {d10}, [r0@64], r1 ; store u op1 + vst1.u8 {d11}, [r12@64], r1 ; store v op1 + vst1.u8 {d12}, [r0@64], r1 ; store u op0 + vst1.u8 {d13}, [r12@64], r1 ; store v op0 + vst1.u8 {d14}, [r0@64], r1 ; store u oq0 + vst1.u8 {d15}, [r12@64], r1 ; store v oq0 + vst1.u8 {d16}, [r0@64], r1 ; store u oq1 + vst1.u8 {d17}, [r12@64], r1 ; store v oq1 + vst1.u8 {d18}, [r0@64], r1 ; store u oq2 + vst1.u8 {d19}, [r12@64], r1 ; store v 
oq2 - ldmia sp!, {pc} + pop {pc} ENDP ; |vp8_mbloop_filter_horizontal_edge_uv_neon| ; void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch, -; const signed char *flimit, -; const signed char *limit, -; const signed char *thresh, -; int count) +; const unsigned char *blimit, +; const unsigned char *limit, +; const unsigned char *thresh) ; r0 unsigned char *src, ; r1 int pitch, -; r2 const signed char *flimit, -; r3 const signed char *limit, -; sp const signed char *thresh, -; sp+4 int count (unused) +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, |vp8_mbloop_filter_vertical_edge_y_neon| PROC - stmdb sp!, {lr} + push {lr} + ldr r12, [sp, #4] ; load thresh sub r0, r0, #4 ; move src pointer down by 4 columns + vdup.s8 q2, r12 ; thresh + add r12, r0, r1, lsl #3 ; move src pointer down by 8 lines vld1.u8 {d6}, [r0], r1 ; load first 8-line src data - ldr r12, [sp, #4] ; load thresh pointer + vld1.u8 {d7}, [r12], r1 ; load second 8-line src data vld1.u8 {d8}, [r0], r1 - sub sp, sp, #32 + vld1.u8 {d9}, [r12], r1 vld1.u8 {d10}, [r0], r1 + vld1.u8 {d11}, [r12], r1 vld1.u8 {d12}, [r0], r1 + vld1.u8 {d13}, [r12], r1 vld1.u8 {d14}, [r0], r1 + vld1.u8 {d15}, [r12], r1 vld1.u8 {d16}, [r0], r1 + vld1.u8 {d17}, [r12], r1 vld1.u8 {d18}, [r0], r1 + vld1.u8 {d19}, [r12], r1 vld1.u8 {d20}, [r0], r1 - - vld1.u8 {d7}, [r0], r1 ; load second 8-line src data - vld1.u8 {d9}, [r0], r1 - vld1.u8 {d11}, [r0], r1 - vld1.u8 {d13}, [r0], r1 - vld1.u8 {d15}, [r0], r1 - vld1.u8 {d17}, [r0], r1 - vld1.u8 {d19}, [r0], r1 - vld1.u8 {d21}, [r0], r1 + vld1.u8 {d21}, [r12], r1 ;transpose to 8x16 matrix vtrn.32 q3, q7 @@ -180,133 +168,11 @@ vtrn.8 q7, q8 vtrn.8 q9, q10 - vld1.s8 {d4[], d5[]}, [r12] ; thresh - vld1.s8 {d2[], d3[]}, [r3] ; limit - mov r12, sp - vst1.u8 {q3}, [r12]! - vst1.u8 {q10}, [r12]! - - bl vp8_mbloop_filter_neon - - sub r0, r0, r1, lsl #4 - - add r2, r0, r1 - - add r3, r2, r1 - - vld1.u8 {q3}, [sp]! - vld1.u8 {q10}, [sp]! 
- - ;transpose to 16x8 matrix - vtrn.32 q3, q7 - vtrn.32 q4, q8 - vtrn.32 q5, q9 - vtrn.32 q6, q10 - add r12, r3, r1 - - vtrn.16 q3, q5 - vtrn.16 q4, q6 - vtrn.16 q7, q9 - vtrn.16 q8, q10 - - vtrn.8 q3, q4 - vtrn.8 q5, q6 - vtrn.8 q7, q8 - vtrn.8 q9, q10 - - ;store op2, op1, op0, oq0, oq1, oq2 - vst1.8 {d6}, [r0] - vst1.8 {d8}, [r2] - vst1.8 {d10}, [r3] - vst1.8 {d12}, [r12], r1 - add r0, r12, r1 - vst1.8 {d14}, [r12] - vst1.8 {d16}, [r0], r1 - add r2, r0, r1 - vst1.8 {d18}, [r0] - vst1.8 {d20}, [r2], r1 - add r3, r2, r1 - vst1.8 {d7}, [r2] - vst1.8 {d9}, [r3], r1 - add r12, r3, r1 - vst1.8 {d11}, [r3] - vst1.8 {d13}, [r12], r1 - add r0, r12, r1 - vst1.8 {d15}, [r12] - vst1.8 {d17}, [r0], r1 - add r2, r0, r1 - vst1.8 {d19}, [r0] - vst1.8 {d21}, [r2] - - ldmia sp!, {pc} - ENDP ; |vp8_mbloop_filter_vertical_edge_y_neon| - -; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch, -; const signed char *flimit, -; const signed char *limit, -; const signed char *thresh, -; unsigned char *v) -; r0 unsigned char *u, -; r1 int pitch, -; r2 const signed char *flimit, -; r3 const signed char *limit, -; sp const signed char *thresh, -; sp+4 unsigned char *v -|vp8_mbloop_filter_vertical_edge_uv_neon| PROC - stmdb sp!, {lr} - sub r0, r0, #4 ; move src pointer down by 4 columns - vld1.s8 {d2[], d3[]}, [r3] ; limit - ldr r3, [sp, #8] ; load v ptr - ldr r12, [sp, #4] ; load thresh pointer - - sub r3, r3, #4 ; move v pointer down by 4 columns - - vld1.u8 {d6}, [r0], r1 ;load u data - vld1.u8 {d7}, [r3], r1 ;load v data - vld1.u8 {d8}, [r0], r1 - vld1.u8 {d9}, [r3], r1 - vld1.u8 {d10}, [r0], r1 - vld1.u8 {d11}, [r3], r1 - vld1.u8 {d12}, [r0], r1 - vld1.u8 {d13}, [r3], r1 - vld1.u8 {d14}, [r0], r1 - vld1.u8 {d15}, [r3], r1 - vld1.u8 {d16}, [r0], r1 - vld1.u8 {d17}, [r3], r1 - vld1.u8 {d18}, [r0], r1 - vld1.u8 {d19}, [r3], r1 - vld1.u8 {d20}, [r0], r1 - vld1.u8 {d21}, [r3], r1 - - ;transpose to 8x16 matrix - vtrn.32 q3, q7 - vtrn.32 q4, q8 - vtrn.32 q5, q9 - vtrn.32 
q6, q10 - - vtrn.16 q3, q5 - vtrn.16 q4, q6 - vtrn.16 q7, q9 - vtrn.16 q8, q10 - - vtrn.8 q3, q4 - vtrn.8 q5, q6 - vtrn.8 q7, q8 - vtrn.8 q9, q10 - - sub sp, sp, #32 - vld1.s8 {d4[], d5[]}, [r12] ; thresh - mov r12, sp - vst1.u8 {q3}, [r12]! - vst1.u8 {q10}, [r12]! - - bl vp8_mbloop_filter_neon - sub r0, r0, r1, lsl #3 - sub r3, r3, r1, lsl #3 - vld1.u8 {q3}, [sp]! - vld1.u8 {q10}, [sp]! + bl vp8_mbloop_filter_neon + + sub r12, r12, r1, lsl #3 ;transpose to 16x8 matrix vtrn.32 q3, q7 @@ -326,23 +192,118 @@ ;store op2, op1, op0, oq0, oq1, oq2 vst1.8 {d6}, [r0], r1 - vst1.8 {d7}, [r3], r1 + vst1.8 {d7}, [r12], r1 vst1.8 {d8}, [r0], r1 - vst1.8 {d9}, [r3], r1 + vst1.8 {d9}, [r12], r1 vst1.8 {d10}, [r0], r1 - vst1.8 {d11}, [r3], r1 + vst1.8 {d11}, [r12], r1 vst1.8 {d12}, [r0], r1 - vst1.8 {d13}, [r3], r1 + vst1.8 {d13}, [r12], r1 vst1.8 {d14}, [r0], r1 - vst1.8 {d15}, [r3], r1 + vst1.8 {d15}, [r12], r1 vst1.8 {d16}, [r0], r1 - vst1.8 {d17}, [r3], r1 + vst1.8 {d17}, [r12], r1 vst1.8 {d18}, [r0], r1 - vst1.8 {d19}, [r3], r1 - vst1.8 {d20}, [r0], r1 - vst1.8 {d21}, [r3], r1 + vst1.8 {d19}, [r12], r1 + vst1.8 {d20}, [r0] + vst1.8 {d21}, [r12] - ldmia sp!, {pc} + pop {pc} + ENDP ; |vp8_mbloop_filter_vertical_edge_y_neon| + +; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch, +; const unsigned char *blimit, +; const unsigned char *limit, +; const unsigned char *thresh, +; unsigned char *v) +; r0 unsigned char *u, +; r1 int pitch, +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, +; sp+4 unsigned char *v +|vp8_mbloop_filter_vertical_edge_uv_neon| PROC + push {lr} + ldr r12, [sp, #4] ; load thresh + sub r0, r0, #4 ; move u pointer down by 4 columns + vdup.u8 q2, r12 ; thresh + ldr r12, [sp, #8] ; load v ptr + sub r12, r12, #4 ; move v pointer down by 4 columns + + vld1.u8 {d6}, [r0], r1 ;load u data + vld1.u8 {d7}, [r12], r1 ;load v data + vld1.u8 {d8}, [r0], r1 + vld1.u8 {d9}, [r12], r1 + vld1.u8 {d10}, 
[r0], r1 + vld1.u8 {d11}, [r12], r1 + vld1.u8 {d12}, [r0], r1 + vld1.u8 {d13}, [r12], r1 + vld1.u8 {d14}, [r0], r1 + vld1.u8 {d15}, [r12], r1 + vld1.u8 {d16}, [r0], r1 + vld1.u8 {d17}, [r12], r1 + vld1.u8 {d18}, [r0], r1 + vld1.u8 {d19}, [r12], r1 + vld1.u8 {d20}, [r0], r1 + vld1.u8 {d21}, [r12], r1 + + ;transpose to 8x16 matrix + vtrn.32 q3, q7 + vtrn.32 q4, q8 + vtrn.32 q5, q9 + vtrn.32 q6, q10 + + vtrn.16 q3, q5 + vtrn.16 q4, q6 + vtrn.16 q7, q9 + vtrn.16 q8, q10 + + vtrn.8 q3, q4 + vtrn.8 q5, q6 + vtrn.8 q7, q8 + vtrn.8 q9, q10 + + sub r0, r0, r1, lsl #3 + + bl vp8_mbloop_filter_neon + + sub r12, r12, r1, lsl #3 + + ;transpose to 16x8 matrix + vtrn.32 q3, q7 + vtrn.32 q4, q8 + vtrn.32 q5, q9 + vtrn.32 q6, q10 + + vtrn.16 q3, q5 + vtrn.16 q4, q6 + vtrn.16 q7, q9 + vtrn.16 q8, q10 + + vtrn.8 q3, q4 + vtrn.8 q5, q6 + vtrn.8 q7, q8 + vtrn.8 q9, q10 + + ;store op2, op1, op0, oq0, oq1, oq2 + vst1.8 {d6}, [r0], r1 + vst1.8 {d7}, [r12], r1 + vst1.8 {d8}, [r0], r1 + vst1.8 {d9}, [r12], r1 + vst1.8 {d10}, [r0], r1 + vst1.8 {d11}, [r12], r1 + vst1.8 {d12}, [r0], r1 + vst1.8 {d13}, [r12], r1 + vst1.8 {d14}, [r0], r1 + vst1.8 {d15}, [r12], r1 + vst1.8 {d16}, [r0], r1 + vst1.8 {d17}, [r12], r1 + vst1.8 {d18}, [r0], r1 + vst1.8 {d19}, [r12], r1 + vst1.8 {d20}, [r0] + vst1.8 {d21}, [r12] + + pop {pc} ENDP ; |vp8_mbloop_filter_vertical_edge_uv_neon| ; void vp8_mbloop_filter_neon() @@ -350,26 +311,19 @@ ; functions do the necessary load, transpose (if necessary), preserve (if ; necessary) and store. -; TODO: -; The vertical filter writes p3/q3 back out because two 4 element writes are -; much simpler than ordering and writing two 3 element sets (or three 2 elements -; sets, or whichever other combinations are possible). -; If we can preserve q3 and q10, the vertical filter will be able to avoid -; storing those values on the stack and reading them back after the filter. 
- ; r0,r1 PRESERVE -; r2 flimit -; r3 PRESERVE -; q1 limit +; r2 mblimit +; r3 limit + ; q2 thresh -; q3 p3 +; q3 p3 PRESERVE ; q4 p2 ; q5 p1 ; q6 p0 ; q7 q0 ; q8 q1 ; q9 q2 -; q10 q3 +; q10 q3 PRESERVE |vp8_mbloop_filter_neon| PROC @@ -378,12 +332,12 @@ vabd.u8 q12, q4, q5 ; abs(p2 - p1) vabd.u8 q13, q5, q6 ; abs(p1 - p0) vabd.u8 q14, q8, q7 ; abs(q1 - q0) - vabd.u8 q3, q9, q8 ; abs(q2 - q1) + vabd.u8 q1, q9, q8 ; abs(q2 - q1) vabd.u8 q0, q10, q9 ; abs(q3 - q2) vmax.u8 q11, q11, q12 vmax.u8 q12, q13, q14 - vmax.u8 q3, q3, q0 + vmax.u8 q1, q1, q0 vmax.u8 q15, q11, q12 vabd.u8 q12, q6, q7 ; abs(p0 - q0) @@ -391,44 +345,46 @@ ; vp8_hevmask vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh) * -1 vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh) * -1 - vmax.u8 q15, q15, q3 + vmax.u8 q15, q15, q1 - vld1.s8 {d4[], d5[]}, [r2] ; flimit + vdup.u8 q1, r3 ; limit + vdup.u8 q2, r2 ; mblimit vmov.u8 q0, #0x80 ; 0x80 - vadd.u8 q2, q2, q2 ; flimit * 2 - vadd.u8 q2, q2, q1 ; flimit * 2 + limit vcge.u8 q15, q1, q15 vabd.u8 q1, q5, q8 ; a = abs(p1 - q1) vqadd.u8 q12, q12, q12 ; b = abs(p0 - q0) * 2 - vshr.u8 q1, q1, #1 ; a = a / 2 - vqadd.u8 q12, q12, q1 ; a = b + a - vcge.u8 q12, q2, q12 ; (a > flimit * 2 + limit) * -1 + vmov.u16 q11, #3 ; #3 ; vp8_filter ; convert to signed veor q7, q7, q0 ; qs0 + vshr.u8 q1, q1, #1 ; a = a / 2 veor q6, q6, q0 ; ps0 veor q5, q5, q0 ; ps1 + + vqadd.u8 q12, q12, q1 ; a = b + a + veor q8, q8, q0 ; qs1 veor q4, q4, q0 ; ps2 veor q9, q9, q0 ; qs2 vorr q14, q13, q14 ; vp8_hevmask + vcge.u8 q12, q2, q12 ; (a > flimit * 2 + limit) * -1 + vsubl.s8 q2, d14, d12 ; qs0 - ps0 vsubl.s8 q13, d15, d13 vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1) - vadd.s16 q10, q2, q2 ; 3 * (qs0 - ps0) - vadd.s16 q11, q13, q13 + vmul.i16 q2, q2, q11 ; 3 * ( qs0 - ps0) + vand q15, q15, q12 ; vp8_filter_mask - vadd.s16 q2, q2, q10 - vadd.s16 q13, q13, q11 + vmul.i16 q13, q13, q11 vmov.u8 q12, #3 ; #3 @@ -447,23 +403,19 @@ vand q13, q1, q14 ; Filter2 &= hev - vmov.u8 d7, #9 ; #9 - 
vqadd.s8 q2, q13, q11 ; Filter1 = clamp(Filter2+4) vqadd.s8 q13, q13, q12 ; Filter2 = clamp(Filter2+3) - vmov.u8 d6, #18 ; #18 + vmov q0, q15 vshr.s8 q2, q2, #3 ; Filter1 >>= 3 vshr.s8 q13, q13, #3 ; Filter2 >>= 3 - vmov q10, q15 + vmov q11, q15 vmov q12, q15 vqsub.s8 q7, q7, q2 ; qs0 = clamp(qs0 - Filter1) - vmov.u8 d5, #27 ; #27 - vqadd.s8 q6, q6, q13 ; ps0 = clamp(ps0 + Filter2) vbic q1, q1, q14 ; vp8_filter &= ~hev @@ -471,35 +423,43 @@ ; roughly 1/7th difference across boundary ; roughly 2/7th difference across boundary ; roughly 3/7th difference across boundary - vmov q11, q15 + + vmov.u8 d5, #9 ; #9 + vmov.u8 d4, #18 ; #18 + vmov q13, q15 vmov q14, q15 - vmlal.s8 q10, d2, d7 ; Filter2 * 9 - vmlal.s8 q11, d3, d7 - vmlal.s8 q12, d2, d6 ; Filter2 * 18 - vmlal.s8 q13, d3, d6 - vmlal.s8 q14, d2, d5 ; Filter2 * 27 + vmlal.s8 q0, d2, d5 ; 63 + Filter2 * 9 + vmlal.s8 q11, d3, d5 + vmov.u8 d5, #27 ; #27 + vmlal.s8 q12, d2, d4 ; 63 + Filter2 * 18 + vmlal.s8 q13, d3, d4 + vmlal.s8 q14, d2, d5 ; 63 + Filter2 * 27 vmlal.s8 q15, d3, d5 - vqshrn.s16 d20, q10, #7 ; u = clamp((63 + Filter2 * 9)>>7) - vqshrn.s16 d21, q11, #7 + + vqshrn.s16 d0, q0, #7 ; u = clamp((63 + Filter2 * 9)>>7) + vqshrn.s16 d1, q11, #7 vqshrn.s16 d24, q12, #7 ; u = clamp((63 + Filter2 * 18)>>7) vqshrn.s16 d25, q13, #7 vqshrn.s16 d28, q14, #7 ; u = clamp((63 + Filter2 * 27)>>7) vqshrn.s16 d29, q15, #7 - vqsub.s8 q11, q9, q10 ; s = clamp(qs2 - u) - vqadd.s8 q10, q4, q10 ; s = clamp(ps2 + u) + vmov.u8 q1, #0x80 ; 0x80 + + vqsub.s8 q11, q9, q0 ; s = clamp(qs2 - u) + vqadd.s8 q0, q4, q0 ; s = clamp(ps2 + u) vqsub.s8 q13, q8, q12 ; s = clamp(qs1 - u) vqadd.s8 q12, q5, q12 ; s = clamp(ps1 + u) vqsub.s8 q15, q7, q14 ; s = clamp(qs0 - u) vqadd.s8 q14, q6, q14 ; s = clamp(ps0 + u) - veor q9, q11, q0 ; *oq2 = s^0x80 - veor q4, q10, q0 ; *op2 = s^0x80 - veor q8, q13, q0 ; *oq1 = s^0x80 - veor q5, q12, q0 ; *op2 = s^0x80 - veor q7, q15, q0 ; *oq0 = s^0x80 - veor q6, q14, q0 ; *op0 = s^0x80 + + veor q9, q11, q1 ; 
*oq2 = s^0x80 + veor q4, q0, q1 ; *op2 = s^0x80 + veor q8, q13, q1 ; *oq1 = s^0x80 + veor q5, q12, q1 ; *op1 = s^0x80 + veor q7, q15, q1 ; *oq0 = s^0x80 + veor q6, q14, q1 ; *op0 = s^0x80 bx lr ENDP ; |vp8_mbloop_filter_neon| From c231b0175dbfb8b1a1b06707bc300d2910abba7a Mon Sep 17 00:00:00 2001 From: Attila Nagy Date: Mon, 11 Jul 2011 12:39:18 +0300 Subject: [PATCH 5/8] Update armv6 loopfilter to new interface Change-Id: I5fe581d797571a7a9432fbd17fc557591d0c1afa --- vp8/common/arm/arm_systemdependent.c | 6 +- vp8/common/arm/armv6/loopfilter_v6.asm | 64 ++++++++++++-------- vp8/common/arm/armv6/simpleloopfilter_v6.asm | 29 +++------ vp8/common/arm/loopfilter_arm.c | 46 +++----------- vp8/common/arm/loopfilter_arm.h | 12 ++-- 5 files changed, 70 insertions(+), 87 deletions(-) diff --git a/vp8/common/arm/arm_systemdependent.c b/vp8/common/arm/arm_systemdependent.c index bd5c0759d..8896cf03f 100644 --- a/vp8/common/arm/arm_systemdependent.c +++ b/vp8/common/arm/arm_systemdependent.c @@ -51,9 +51,11 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx) rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_armv6; rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_armv6; rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_armv6; - rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_armv6; + rtcd->loopfilter.simple_mb_v = + vp8_loop_filter_simple_vertical_edge_armv6; rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_armv6; - rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_armv6; + rtcd->loopfilter.simple_mb_h = + vp8_loop_filter_simple_horizontal_edge_armv6; rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_armv6; rtcd->recon.copy16x16 = vp8_copy_mem16x16_v6; diff --git a/vp8/common/arm/armv6/loopfilter_v6.asm b/vp8/common/arm/armv6/loopfilter_v6.asm index c7441b055..1cbbbcdef 100644 --- a/vp8/common/arm/armv6/loopfilter_v6.asm +++ b/vp8/common/arm/armv6/loopfilter_v6.asm @@ -53,14 +53,11 @@ count RN r5 ;r0 unsigned char *src_ptr, ;r1 int src_pixel_step, -;r2 const char *flimit, 
+;r2 const char *blimit, ;r3 const char *limit, ;stack const char *thresh, ;stack int count -;Note: All 16 elements in flimit are equal. So, in the code, only one load is needed -;for flimit. Same way applies to limit and thresh. - ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- |vp8_loop_filter_horizontal_edge_armv6| PROC ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- @@ -72,14 +69,18 @@ count RN r5 sub sp, sp, #16 ; create temp buffer ldr r9, [src], pstep ; p3 - ldr r4, [r2], #4 ; flimit + ldrb r4, [r2] ; blimit ldr r10, [src], pstep ; p2 - ldr r2, [r3], #4 ; limit + ldrb r2, [r3] ; limit ldr r11, [src], pstep ; p1 - uadd8 r4, r4, r4 ; flimit * 2 - ldr r3, [r6], #4 ; thresh + orr r4, r4, r4, lsl #8 + ldrb r3, [r6] ; thresh + orr r2, r2, r2, lsl #8 mov count, count, lsl #1 ; 4-in-parallel - uadd8 r4, r4, r2 ; flimit * 2 + limit + orr r4, r4, r4, lsl #16 + orr r3, r3, r3, lsl #8 + orr r2, r2, r2, lsl #16 + orr r3, r3, r3, lsl #16 |Hnext8| ; vp8_filter_mask() function @@ -275,14 +276,18 @@ count RN r5 sub sp, sp, #16 ; create temp buffer ldr r9, [src], pstep ; p3 - ldr r4, [r2], #4 ; flimit + ldrb r4, [r2] ; blimit ldr r10, [src], pstep ; p2 - ldr r2, [r3], #4 ; limit + ldrb r2, [r3] ; limit ldr r11, [src], pstep ; p1 - uadd8 r4, r4, r4 ; flimit * 2 - ldr r3, [r6], #4 ; thresh + orr r4, r4, r4, lsl #8 + ldrb r3, [r6] ; thresh + orr r2, r2, r2, lsl #8 mov count, count, lsl #1 ; 4-in-parallel - uadd8 r4, r4, r2 ; flimit * 2 + limit + orr r4, r4, r4, lsl #16 + orr r3, r3, r3, lsl #8 + orr r2, r2, r2, lsl #16 + orr r3, r3, r3, lsl #16 |MBHnext8| @@ -584,15 +589,19 @@ count RN r5 sub sp, sp, #16 ; create temp buffer ldr r6, [src], pstep ; load source data - ldr r4, [r2], #4 ; flimit + ldrb r4, [r2] ; blimit ldr r7, [src], pstep - ldr r2, [r3], #4 ; limit + ldrb r2, [r3] ; limit ldr r8, [src], pstep - uadd8 r4, r4, r4 ; flimit * 2 - ldr r3, [r12], #4 ; thresh + orr r4, r4, r4, lsl #8 + ldrb r3, [r12] ; thresh 
+ orr r2, r2, r2, lsl #8 ldr lr, [src], pstep mov count, count, lsl #1 ; 4-in-parallel - uadd8 r4, r4, r2 ; flimit * 2 + limit + orr r4, r4, r4, lsl #16 + orr r3, r3, r3, lsl #8 + orr r2, r2, r2, lsl #16 + orr r3, r3, r3, lsl #16 |Vnext8| @@ -855,18 +864,22 @@ count RN r5 sub sp, sp, #16 ; create temp buffer ldr r6, [src], pstep ; load source data - ldr r4, [r2], #4 ; flimit + ldrb r4, [r2] ; blimit pld [src, #23] ldr r7, [src], pstep - ldr r2, [r3], #4 ; limit + ldrb r2, [r3] ; limit pld [src, #23] ldr r8, [src], pstep - uadd8 r4, r4, r4 ; flimit * 2 - ldr r3, [r12], #4 ; thresh + orr r4, r4, r4, lsl #8 + ldrb r3, [r12] ; thresh + orr r2, r2, r2, lsl #8 pld [src, #23] ldr lr, [src], pstep mov count, count, lsl #1 ; 4-in-parallel - uadd8 r4, r4, r2 ; flimit * 2 + limit + orr r4, r4, r4, lsl #16 + orr r3, r3, r3, lsl #8 + orr r2, r2, r2, lsl #16 + orr r3, r3, r3, lsl #16 |MBVnext8| ; vp8_filter_mask() function @@ -906,6 +919,7 @@ count RN r5 str lr, [sp, #8] ldr lr, [src], pstep + TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 ldr lr, [sp, #8] ; load back (f)limit accumulator @@ -954,6 +968,7 @@ count RN r5 beq mbvskip_filter ; skip filtering + ;vp8_hevmask() function ;calculate high edge variance @@ -1121,6 +1136,7 @@ count RN r5 smlabb r8, r6, lr, r7 smlatb r6, r6, lr, r7 smlabb r9, r10, lr, r7 + smlatb r10, r10, lr, r7 ssat r8, #8, r8, asr #7 ssat r6, #8, r6, asr #7 diff --git a/vp8/common/arm/armv6/simpleloopfilter_v6.asm b/vp8/common/arm/armv6/simpleloopfilter_v6.asm index 40a71f49d..5e00cf01b 100644 --- a/vp8/common/arm/armv6/simpleloopfilter_v6.asm +++ b/vp8/common/arm/armv6/simpleloopfilter_v6.asm @@ -45,35 +45,28 @@ MEND + src RN r0 pstep RN r1 ;r0 unsigned char *src_ptr, ;r1 int src_pixel_step, -;r2 const char *flimit, -;r3 const char *limit, -;stack const char *thresh, -;stack int count - -; All 16 elements in flimit are equal. So, in the code, only one load is needed -; for flimit. Same applies to limit. 
thresh is not used in simple looopfilter +;r2 const char *blimit ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- |vp8_loop_filter_simple_horizontal_edge_armv6| PROC ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- stmdb sp!, {r4 - r11, lr} - ldr r12, [r3] ; limit + ldrb r12, [r2] ; blimit ldr r3, [src, -pstep, lsl #1] ; p1 ldr r4, [src, -pstep] ; p0 ldr r5, [src] ; q0 ldr r6, [src, pstep] ; q1 - ldr r7, [r2] ; flimit + orr r12, r12, r12, lsl #8 ; blimit ldr r2, c0x80808080 - ldr r9, [sp, #40] ; count for 8-in-parallel - uadd8 r7, r7, r7 ; flimit * 2 - mov r9, r9, lsl #1 ; double the count. we're doing 4 at a time - uadd8 r12, r7, r12 ; flimit * 2 + limit + orr r12, r12, r12, lsl #16 ; blimit + mov r9, #4 ; double the count. we're doing 4 at a time mov lr, #0 ; need 0 in a couple places |simple_hnext8| @@ -148,34 +141,32 @@ pstep RN r1 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- stmdb sp!, {r4 - r11, lr} - ldr r12, [r2] ; r12: flimit + ldrb r12, [r2] ; r12: blimit ldr r2, c0x80808080 - ldr r7, [r3] ; limit + orr r12, r12, r12, lsl #8 ; load soure data to r7, r8, r9, r10 ldrh r3, [src, #-2] pld [src, #23] ; preload for next block ldrh r4, [src], pstep - uadd8 r12, r12, r12 ; flimit * 2 + orr r12, r12, r12, lsl #16 ldrh r5, [src, #-2] pld [src, #23] ldrh r6, [src], pstep - uadd8 r12, r12, r7 ; flimit * 2 + limit pkhbt r7, r3, r4, lsl #16 ldrh r3, [src, #-2] pld [src, #23] ldrh r4, [src], pstep - ldr r11, [sp, #40] ; count (r11) for 8-in-parallel pkhbt r8, r5, r6, lsl #16 ldrh r5, [src, #-2] pld [src, #23] ldrh r6, [src], pstep - mov r11, r11, lsl #1 ; 4-in-parallel + mov r11, #4 ; double the count. 
we're doing 4 at a time |simple_vnext8| ; vp8_simple_filter_mask() function diff --git a/vp8/common/arm/loopfilter_arm.c b/vp8/common/arm/loopfilter_arm.c index 1ec2b7484..c841d455a 100644 --- a/vp8/common/arm/loopfilter_arm.c +++ b/vp8/common/arm/loopfilter_arm.c @@ -18,8 +18,6 @@ extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6); extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6); extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6); extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6); -extern prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_armv6); -extern prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_armv6); #endif #if HAVE_ARMV7 @@ -55,15 +53,6 @@ void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsig vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); } -void vp8_loop_filter_mbhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); -} - /* Vertical MB Filtering */ void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) @@ -77,15 +66,6 @@ void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsig vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); } -void vp8_loop_filter_mbvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); -} - /* Horizontal B Filtering */ void 
vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) @@ -101,15 +81,12 @@ void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsign vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); } -void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); - vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); - vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, blimit); } /* Vertical B Filtering */ @@ -127,15 +104,12 @@ void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsign vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); } -void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); - vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, 
lfi->blim, lfi->lim, lfi->hev_thr, 2); - vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, blimit); } #endif diff --git a/vp8/common/arm/loopfilter_arm.h b/vp8/common/arm/loopfilter_arm.h index 27159b59f..390a547b0 100644 --- a/vp8/common/arm/loopfilter_arm.h +++ b/vp8/common/arm/loopfilter_arm.h @@ -19,10 +19,10 @@ extern prototype_loopfilter_block(vp8_loop_filter_mbv_armv6); extern prototype_loopfilter_block(vp8_loop_filter_bv_armv6); extern prototype_loopfilter_block(vp8_loop_filter_mbh_armv6); extern prototype_loopfilter_block(vp8_loop_filter_bh_armv6); -extern prototype_loopfilter_block(vp8_loop_filter_mbvs_armv6); -extern prototype_loopfilter_block(vp8_loop_filter_bvs_armv6); -extern prototype_loopfilter_block(vp8_loop_filter_mbhs_armv6); -extern prototype_loopfilter_block(vp8_loop_filter_bhs_armv6); +extern prototype_simple_loopfilter(vp8_loop_filter_bvs_armv6); +extern prototype_simple_loopfilter(vp8_loop_filter_bhs_armv6); +extern prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_armv6); +extern prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_armv6); #if !CONFIG_RUNTIME_CPU_DETECT #undef vp8_lf_normal_mb_v @@ -38,13 +38,13 @@ extern prototype_loopfilter_block(vp8_loop_filter_bhs_armv6); #define vp8_lf_normal_b_h vp8_loop_filter_bh_armv6 #undef vp8_lf_simple_mb_v -#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_armv6 +#define vp8_lf_simple_mb_v vp8_loop_filter_simple_vertical_edge_armv6 #undef vp8_lf_simple_b_v #define vp8_lf_simple_b_v vp8_loop_filter_bvs_armv6 #undef vp8_lf_simple_mb_h -#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_armv6 +#define vp8_lf_simple_mb_h vp8_loop_filter_simple_horizontal_edge_armv6 #undef vp8_lf_simple_b_h #define 
vp8_lf_simple_b_h vp8_loop_filter_bhs_armv6 From d89eb6ad5a597b8651295326c9d5f31d4c392cd2 Mon Sep 17 00:00:00 2001 From: Fritz Koenig Date: Tue, 12 Jul 2011 16:29:15 -0700 Subject: [PATCH 6/8] Remove rotting NDS_NITRO code. Code has not been used and is no longer relevant. Change-Id: I38590513da7c7a436804ff8a1a3805d9697f575d --- vpx_mem/include/nds/vpx_mem_nds.h | 30 ---- vpx_mem/vpx_mem_tracker.c | 25 ---- vpx_scale/arm/nds/yv12extend.c | 221 ------------------------------ 3 files changed, 276 deletions(-) delete mode 100644 vpx_mem/include/nds/vpx_mem_nds.h delete mode 100644 vpx_scale/arm/nds/yv12extend.c diff --git a/vpx_mem/include/nds/vpx_mem_nds.h b/vpx_mem/include/nds/vpx_mem_nds.h deleted file mode 100644 index e54f54d9b..000000000 --- a/vpx_mem/include/nds/vpx_mem_nds.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - -#ifndef __VPX_MEM_NDS_H__ -#define __VPX_MEM_NDS_H__ - -#if defined(__cplusplus) -extern "C" { -#endif - -#include -#include - - void *vpx_mem_nds_alloc(osarena_id id, osheap_handle handle, size_t size, size_t align); - void vpx_mem_nds_free(osarena_id id, osheap_handle handle, void *mem); - int vpx_nds_alloc_heap(osarena_id id, u32 size); - -#if defined(__cplusplus) -} -#endif - -#endif /*__VPX_MEM_NDS_H__*/ diff --git a/vpx_mem/vpx_mem_tracker.c b/vpx_mem/vpx_mem_tracker.c index 938ad0716..9e8623a9a 100644 --- a/vpx_mem/vpx_mem_tracker.c +++ b/vpx_mem/vpx_mem_tracker.c @@ -36,9 +36,6 @@ # include #elif defined(VXWORKS) # include -#elif defined(NDS_NITRO) -# include -# include #endif #include @@ -112,8 +109,6 @@ struct memory_tracker HANDLE mutex; #elif defined(VXWORKS) SEM_ID mutex; -#elif defined(NDS_NITRO) - OSMutex mutex; #elif defined(NO_MUTEX) #else #error "No mutex type defined for this platform!" @@ -193,9 +188,6 @@ int vpx_memory_tracker_init(int padding_size, int pad_value) memtrack.mutex = sem_bcreate(SEM_Q_FIFO, /*SEM_Q_FIFO non-priority based mutex*/ SEM_FULL); /*SEM_FULL initial state is unlocked*/ ret = !memtrack.mutex; -#elif defined(NDS_NITRO) - os_init_mutex(&memtrack.mutex); - ret = 0; #elif defined(NO_MUTEX) ret = 0; #endif @@ -251,9 +243,7 @@ void vpx_memory_tracker_destroy() if (!g_logging.type && g_logging.file && g_logging.file != stderr) { -#if !defined(NDS_NITRO) fclose(g_logging.file); -#endif g_logging.file = NULL; } @@ -368,15 +358,12 @@ int vpx_memory_tracker_set_log_type(int type, char *option) g_logging.file = stderr; ret = 0; } - -#if !defined(NDS_NITRO) else { if ((g_logging.file = fopen((char *)option, "w"))) ret = 0; } -#endif break; #if defined(WIN32) && !defined(_WIN32_WCE) case 1: @@ -506,12 +493,6 @@ static void memory_tracker_dump() p->addr, i, p->size, p->file, p->line); -#ifdef NDS_NITRO - - if (!(i % 20)) os_sleep(500); - -#endif - p = p->next; ++i; } @@ -719,9 +700,6 @@ static int memory_tracker_lock_mutex() 
ret = WaitForSingleObject(memtrack.mutex, INFINITE); #elif defined(VXWORKS) ret = sem_take(memtrack.mutex, WAIT_FOREVER); -#elif defined(NDS_NITRO) - os_lock_mutex(&memtrack.mutex); - ret = 0; #endif if (ret) @@ -754,9 +732,6 @@ static int memory_tracker_unlock_mutex() ret = !ReleaseMutex(memtrack.mutex); #elif defined(VXWORKS) ret = sem_give(memtrack.mutex); -#elif defined(NDS_NITRO) - os_unlock_mutex(&memtrack.mutex); - ret = 0; #endif if (ret) diff --git a/vpx_scale/arm/nds/yv12extend.c b/vpx_scale/arm/nds/yv12extend.c deleted file mode 100644 index 48c0dfb33..000000000 --- a/vpx_scale/arm/nds/yv12extend.c +++ /dev/null @@ -1,221 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - -/**************************************************************************** -* -* Module Title : yv12extend.c -* -* Description : -* -***************************************************************************/ - -/**************************************************************************** -* Header Files -****************************************************************************/ -#include "vpx_scale/yv12config.h" -#include "vpx_mem/vpx_mem.h" -#include -#include -#include - -//---- DMA Number -#define DMA_NO 3 - -/**************************************************************************** -* Exports -****************************************************************************/ - -/**************************************************************************** -* -****************************************************************************/ -void -vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf) -{ - int i; - unsigned char *src_ptr1, *src_ptr2; - unsigned char *dest_ptr1, *dest_ptr2; - - unsigned int Border; - int plane_stride; - int plane_height; - int plane_width; - - /***********/ - /* Y Plane */ - /***********/ - Border = ybf->border; - plane_stride = ybf->y_stride; - plane_height = ybf->y_height; - plane_width = ybf->y_width; - - // copy the left and right most columns out - src_ptr1 = ybf->y_buffer; - src_ptr2 = src_ptr1 + plane_width - 1; - dest_ptr1 = src_ptr1 - Border; - dest_ptr2 = src_ptr2 + 1; - - for (i = 0; i < plane_height; i++) - { - mi_cpu_fill8(dest_ptr1, src_ptr1[0], Border); - mi_cpu_fill8(dest_ptr2, src_ptr2[0], Border); - src_ptr1 += plane_stride; - src_ptr2 += plane_stride; - dest_ptr1 += plane_stride; - dest_ptr2 += plane_stride; - } - - // Now copy the top and bottom source lines into each line of the respective borders - src_ptr1 = ybf->y_buffer - Border; - src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride; - dest_ptr1 = src_ptr1 - (Border * plane_stride); - dest_ptr2 = src_ptr2 + plane_stride; - - for 
(i = 0; i < (int)Border; i++) - { - mi_cpu_copy_fast(src_ptr1, dest_ptr1, plane_stride); - mi_cpu_copy_fast(src_ptr2, dest_ptr2, plane_stride); - dest_ptr1 += plane_stride; - dest_ptr2 += plane_stride; - } - - plane_stride /= 2; - plane_height /= 2; - plane_width /= 2; - Border /= 2; - - /***********/ - /* U Plane */ - /***********/ - - // copy the left and right most columns out - src_ptr1 = ybf->u_buffer; - src_ptr2 = src_ptr1 + plane_width - 1; - dest_ptr1 = src_ptr1 - Border; - dest_ptr2 = src_ptr2 + 1; - - for (i = 0; i < plane_height; i++) - { - mi_cpu_fill8(dest_ptr1, src_ptr1[0], Border); - mi_cpu_fill8(dest_ptr2, src_ptr2[0], Border); - src_ptr1 += plane_stride; - src_ptr2 += plane_stride; - dest_ptr1 += plane_stride; - dest_ptr2 += plane_stride; - } - - // Now copy the top and bottom source lines into each line of the respective borders - src_ptr1 = ybf->u_buffer - Border; - src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride; - dest_ptr1 = src_ptr1 - (Border * plane_stride); - dest_ptr2 = src_ptr2 + plane_stride; - - for (i = 0; i < (int)(Border); i++) - { - mi_cpu_copy_fast(src_ptr1, dest_ptr1, plane_stride); - mi_cpu_copy_fast(src_ptr2, dest_ptr2, plane_stride); - dest_ptr1 += plane_stride; - dest_ptr2 += plane_stride; - } - - /***********/ - /* V Plane */ - /***********/ - - // copy the left and right most columns out - src_ptr1 = ybf->v_buffer; - src_ptr2 = src_ptr1 + plane_width - 1; - dest_ptr1 = src_ptr1 - Border; - dest_ptr2 = src_ptr2 + 1; - - for (i = 0; i < plane_height; i++) - { - mi_cpu_fill8(dest_ptr1, src_ptr1[0], Border); - mi_cpu_fill8(dest_ptr2, src_ptr2[0], Border); - src_ptr1 += plane_stride; - src_ptr2 += plane_stride; - dest_ptr1 += plane_stride; - dest_ptr2 += plane_stride; - } - - // Now copy the top and bottom source lines into each line of the respective borders - src_ptr1 = ybf->v_buffer - Border; - src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride; - dest_ptr1 = src_ptr1 - (Border * 
plane_stride); - dest_ptr2 = src_ptr2 + plane_stride; - - for (i = 0; i < (int)(Border); i++) - { - mi_cpu_copy_fast(src_ptr1, dest_ptr1, plane_stride); - mi_cpu_copy_fast(src_ptr2, dest_ptr2, plane_stride); - dest_ptr1 += plane_stride; - dest_ptr2 += plane_stride; - } -} - - - -/**************************************************************************** -* -* ROUTINE : vp8_yv12_copy_frame -* -* INPUTS : -* -* OUTPUTS : None. -* -* RETURNS : void -* -* FUNCTION : Copies the source image into the destination image and -* updates the destination's UMV borders. -* -* SPECIAL NOTES : The frames are assumed to be identical in size. -* -****************************************************************************/ -void -vp8_yv12_copy_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc) -{ - int yplane_size = (src_ybc->y_height + 2 * src_ybc->border) * (src_ybc->y_stride); - int mem_size = (yplane_size * 3 / 2) + (src_ybc->y_stride * 2); - - mi_cpu_copy_fast(src_ybc->buffer_alloc, dst_ybc->buffer_alloc, mem_size); - - /* unsigned char *src_y, *dst_y; - unsigned char *src_u, *dst_u; - unsigned char *src_v, *dst_v; - - int yheight, uv_height; - int ystride, uv_stride; - int border; - int yoffset, uvoffset; - - border = src_ybc->border; - yheight = src_ybc->y_height; - uv_height = src_ybc->uv_height; - - ystride = src_ybc->y_stride; - uv_stride = src_ybc->uv_stride; - - yoffset = border * (ystride + 1); - uvoffset = border/2 * (uv_stride + 1); - - src_y = src_ybc->y_buffer - yoffset; - dst_y = dst_ybc->y_buffer - yoffset; - src_u = src_ybc->u_buffer - uvoffset; - dst_u = dst_ybc->u_buffer - uvoffset; - src_v = src_ybc->v_buffer - uvoffset; - dst_v = dst_ybc->v_buffer - uvoffset; - - mi_cpu_copy_fast (src_y, dst_y, ystride * (yheight + 2 * border)); - mi_cpu_copy_fast (src_u, dst_u, uv_stride * (uv_height + border)); - mi_cpu_copy_fast (src_v, dst_v, uv_stride * (uv_height + border)); - */ -} - -#include From e9751d4b748052429ac26df4364be255da048c5b Mon Sep 17 
00:00:00 2001 From: Fritz Koenig Date: Mon, 11 Jul 2011 11:42:28 -0700 Subject: [PATCH 7/8] Better allocate yuv buffers. Previously allocated more memory than necessary for yuv buffers. This makes it harder to track bugs with reading uninitialized data. Change-Id: I510f7b298d3c647c869be6e5d51608becc63cce9 --- vpx_scale/generic/yv12config.c | 36 ++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/vpx_scale/generic/yv12config.c b/vpx_scale/generic/yv12config.c index cb0ab9466..d02cde28f 100644 --- a/vpx_scale/generic/yv12config.c +++ b/vpx_scale/generic/yv12config.c @@ -24,9 +24,12 @@ vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf) { if (ybf) { - duck_free(ybf->buffer_alloc); + vpx_free(ybf->buffer_alloc); - ybf->buffer_alloc = 0; + /* buffer_alloc isn't accessed by most functions. Rather y_buffer, + u_buffer and v_buffer point to buffer_alloc and are used. Clear out + all of this so that a freed pointer isn't inadvertently used */ + vpx_memset (ybf, 0, sizeof (YV12_BUFFER_CONFIG)); } else { @@ -44,38 +47,37 @@ vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int { /*NOTE:*/ - int yplane_size = (height + 2 * border) * (width + 2 * border); - int uvplane_size = ((1 + height) / 2 + border) * ((1 + width) / 2 + border); - if (ybf) { + int uv_width = width >> 1; + int uv_height = height >> 1; + int yplane_size = (height + 2 * border) * (width + 2 * border); + int uvplane_size = (uv_height + border) * (uv_width + border); + vp8_yv12_de_alloc_frame_buffer(ybf); + /* only support allocating buffers that have + a height and width that are multiples of 16 */ + if ((width & 0xf) | (height & 0xf)) + return -3; + ybf->y_width = width; ybf->y_height = height; ybf->y_stride = width + 2 * border; - ybf->uv_width = (1 + width) / 2; - ybf->uv_height = (1 + height) / 2; - ybf->uv_stride = ybf->uv_width + border; + ybf->uv_width = uv_width; + ybf->uv_height = uv_height; + ybf->uv_stride = uv_width + 
border; ybf->border = border; ybf->frame_size = yplane_size + 2 * uvplane_size; - /* Added 2 extra lines to framebuffer so that copy12x12 doesn't fail - * when we have a large motion vector in V on the last v block. - * Note : We never use these pixels anyway so this doesn't hurt. - */ - ybf->buffer_alloc = (unsigned char *) duck_memalign(32, ybf->frame_size + (ybf->y_stride * 2) + 32, 0); + ybf->buffer_alloc = (unsigned char *) vpx_memalign(32, ybf->frame_size); if (ybf->buffer_alloc == NULL) return -1; ybf->y_buffer = ybf->buffer_alloc + (border * ybf->y_stride) + border; - - if (yplane_size & 0xf) - yplane_size += 16 - (yplane_size & 0xf); - ybf->u_buffer = ybf->buffer_alloc + yplane_size + (border / 2 * ybf->uv_stride) + border / 2; ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size + (border / 2 * ybf->uv_stride) + border / 2; From 139577f9376e1607581a0152f97ba90b3af964f4 Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Tue, 12 Jul 2011 17:22:36 -0400 Subject: [PATCH 8/8] Fix unnecessary casting of B_PREDICTION_MODE (issue 349) Minor fix. 
Change-Id: Iaf93f6e47e882a33c479e57c7a0d0bf321e291c0 --- vp8/decoder/decodemv.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vp8/decoder/decodemv.c b/vp8/decoder/decodemv.c index 01d940233..0a7942d89 100644 --- a/vp8/decoder/decodemv.c +++ b/vp8/decoder/decodemv.c @@ -180,11 +180,11 @@ static MB_PREDICTION_MODE read_mv_ref(vp8_reader *bc, const vp8_prob *p) return (MB_PREDICTION_MODE)i; } -static MB_PREDICTION_MODE sub_mv_ref(vp8_reader *bc, const vp8_prob *p) +static B_PREDICTION_MODE sub_mv_ref(vp8_reader *bc, const vp8_prob *p) { const int i = vp8_treed_read(bc, vp8_sub_mv_ref_tree, p); - return (MB_PREDICTION_MODE)i; + return (B_PREDICTION_MODE)i; } #ifdef VPX_MODE_COUNT @@ -334,7 +334,7 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, abovemv.as_int = above_block_mv(mi, k, mis); mv_contz = vp8_mv_cont(&leftmv, &abovemv); - switch ((B_PREDICTION_MODE) sub_mv_ref(bc, vp8_sub_mv_ref_prob2 [mv_contz])) /*pc->fc.sub_mv_ref_prob))*/ + switch (sub_mv_ref(bc, vp8_sub_mv_ref_prob2 [mv_contz])) /*pc->fc.sub_mv_ref_prob))*/ { case NEW4X4: read_mv(bc, &blockmv.as_mv, (const MV_CONTEXT *) mvc);