diff --git a/configure b/configure index 976988043..5eec2a117 100755 --- a/configure +++ b/configure @@ -283,6 +283,7 @@ EXPERIMENT_LIST=" loop_restoration ext_partition obmc + affine_motion " CONFIG_LIST=" dependency_tracking diff --git a/vp10/encoder/encoder.c b/vp10/encoder/encoder.c index f6ae6c032..fc9e2e924 100644 --- a/vp10/encoder/encoder.c +++ b/vp10/encoder/encoder.c @@ -410,6 +410,15 @@ static void dealloc_compressor_data(VP10_COMP *cpi) { vpx_free(cpi->active_map.map); cpi->active_map.map = NULL; +#if CONFIG_AFFINE_MOTION + { + // Free up-sampled reference buffers. + int i; + for (i = 0; i < MAX_REF_FRAMES; i++) + vpx_free_frame_buffer(&cpi->upsampled_ref_bufs[i].buf); + } +#endif + vp10_free_ref_frame_buffers(cm->buffer_pool); #if CONFIG_VP9_POSTPROC vp10_free_postproc_buffers(cm); @@ -744,6 +753,26 @@ static void alloc_util_frame_buffers(VP10_COMP *cpi) { NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate scaled last source buffer"); + +#if CONFIG_AFFINE_MOTION + { + // Allocate up-sampled reference buffers. + int i; + + for (i = 0; i < MAX_REF_FRAMES; i++) + if (vpx_realloc_frame_buffer(&cpi->upsampled_ref_bufs[i].buf, + (cm->width << 3), (cm->height << 3), + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + (VP9_ENC_BORDER_IN_PIXELS << 3), + cm->byte_alignment, + NULL, NULL, NULL)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate up-sampled reference frame buffer"); + } +#endif } @@ -2353,10 +2382,11 @@ static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, #if CONFIG_VP9_HIGHBITDEPTH static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst, int bd) { + YV12_BUFFER_CONFIG *dst, int planes, + int bd) { #else static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst) { + YV12_BUFFER_CONFIG *dst, int planes) { #endif // CONFIG_VP9_HIGHBITDEPTH const int src_w = src->y_crop_width; const int src_h = src->y_crop_height; @@ -2374,7 +2404,7 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, for (y = 0; y < dst_h; y += 16) { for (x = 0; x < dst_w; x += 16) { - for (i = 0; i < MAX_MB_PLANE; ++i) { + for (i = 0; i < planes; ++i) { const int factor = (i == 0 || i == 3 ? 
1 : 2); const int x_q4 = x * (16 / factor) * src_w / dst_w; const int y_q4 = y * (16 / factor) * src_h / dst_h; @@ -2391,13 +2421,13 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, &kernel[(y_q4 & 0xf) * taps], 16 * src_h / dst_h, 16 / factor, 16 / factor, bd); } else { - vpx_convolve8(src_ptr, src_stride, dst_ptr, dst_stride, + vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, &kernel[(x_q4 & 0xf) * taps], 16 * src_w / dst_w, &kernel[(y_q4 & 0xf) * taps], 16 * src_h / dst_h, 16 / factor, 16 / factor); } #else - vpx_convolve8(src_ptr, src_stride, dst_ptr, dst_stride, + vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, &kernel[(x_q4 & 0xf) * taps], 16 * src_w / dst_w, &kernel[(y_q4 & 0xf) * taps], 16 * src_h / dst_h, 16 / factor, 16 / factor); @@ -2406,7 +2436,10 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, } } - vpx_extend_frame_borders(dst); + if (planes == 1) + vpx_extend_frame_borders_y(dst); + else + vpx_extend_frame_borders(dst); } static int scale_down(VP10_COMP *cpi, int q) { @@ -2462,6 +2495,45 @@ static int recode_loop_test(VP10_COMP *cpi, return force_recode; } +#if CONFIG_AFFINE_MOTION +static INLINE int get_free_upsampled_ref_buf(EncRefCntBuffer *ubufs) { + int i; + + for (i = 0; i < MAX_REF_FRAMES; i++) { + if (!ubufs[i].ref_count) { + return i; + } + } + return INVALID_IDX; +} + +// Up-sample reference frames. +static INLINE int upsample_ref_frame(RefCntBuffer *bufs, +#if CONFIG_VP9_HIGHBITDEPTH + EncRefCntBuffer *ubufs, int new_idx, + int bit_depth) { +#else + EncRefCntBuffer *ubufs, int new_idx) { +#endif + int new_uidx = get_free_upsampled_ref_buf(ubufs); + + if (new_uidx == INVALID_IDX) { + return INVALID_IDX; + } else { + const YV12_BUFFER_CONFIG *const ref = &bufs[new_idx].buf; + YV12_BUFFER_CONFIG *upsampled_ref = &ubufs[new_uidx].buf; + + // Currently, only Y plane is up-sampled, U, V are not used. +#if CONFIG_VP9_HIGHBITDEPTH + scale_and_extend_frame(ref, upsampled_ref, 1, bit_depth); +#else + scale_and_extend_frame(ref, upsampled_ref, 1); +#endif + return new_uidx; + } +} +#endif + void vp10_update_reference_frames(VP10_COMP *cpi) { VP10_COMMON * const cm = &cpi->common; BufferPool *const pool = cm->buffer_pool; @@ -2469,6 +2541,17 @@ void vp10_update_reference_frames(VP10_COMP *cpi) { int ref_frame; #endif // CONFIG_EXT_REFS +#if CONFIG_AFFINE_MOTION + // Always up-sample the current encoded frame. +#if CONFIG_VP9_HIGHBITDEPTH + int new_uidx = upsample_ref_frame(pool->frame_bufs, cpi->upsampled_ref_bufs, + cm->new_fb_idx, (int)cm->bit_depth); +#else + int new_uidx = upsample_ref_frame(pool->frame_bufs, cpi->upsampled_ref_bufs, + cm->new_fb_idx); +#endif +#endif + // At this point the new frame has been encoded. // If any buffer copy / swapping is signaled it should be done here. if (cm->frame_type == KEY_FRAME) { @@ -2476,6 +2559,13 @@ void vp10_update_reference_frames(VP10_COMP *cpi) { &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx); ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx); + +#if CONFIG_AFFINE_MOTION + uref_cnt_fb(cpi->upsampled_ref_bufs, + &cpi->upsampled_ref_idx[cpi->gld_fb_idx], new_uidx); + uref_cnt_fb(cpi->upsampled_ref_bufs, + &cpi->upsampled_ref_idx[cpi->alt_fb_idx], new_uidx); +#endif } else if (vp10_preserve_existing_gf(cpi)) { // We have decided to preserve the previously existing golden frame as our // new ARF frame. 
However, in the short term in function @@ -2489,7 +2579,10 @@ void vp10_update_reference_frames(VP10_COMP *cpi) { ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx); - +#if CONFIG_AFFINE_MOTION + uref_cnt_fb(cpi->upsampled_ref_bufs, + &cpi->upsampled_ref_idx[cpi->alt_fb_idx], new_uidx); +#endif tmp = cpi->alt_fb_idx; cpi->alt_fb_idx = cpi->gld_fb_idx; cpi->gld_fb_idx = tmp; @@ -2503,6 +2596,10 @@ void vp10_update_reference_frames(VP10_COMP *cpi) { ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[arf_idx], cm->new_fb_idx); +#if CONFIG_AFFINE_MOTION + uref_cnt_fb(cpi->upsampled_ref_bufs, + &cpi->upsampled_ref_idx[cpi->alt_fb_idx], new_uidx); +#endif memcpy(cpi->interp_filter_selected[ALTREF_FRAME], cpi->interp_filter_selected[0], sizeof(cpi->interp_filter_selected[0])); @@ -2511,6 +2608,10 @@ void vp10_update_reference_frames(VP10_COMP *cpi) { if (cpi->refresh_golden_frame) { ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx); +#if CONFIG_AFFINE_MOTION + uref_cnt_fb(cpi->upsampled_ref_bufs, + &cpi->upsampled_ref_idx[cpi->gld_fb_idx], new_uidx); +#endif if (!cpi->rc.is_src_frame_alt_ref) memcpy(cpi->interp_filter_selected[GOLDEN_FRAME], cpi->interp_filter_selected[0], @@ -2545,6 +2646,10 @@ void vp10_update_reference_frames(VP10_COMP *cpi) { if (cpi->refresh_last_frame) { ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx); +#if CONFIG_AFFINE_MOTION + uref_cnt_fb(cpi->upsampled_ref_bufs, + &cpi->upsampled_ref_idx[cpi->lst_fb_idx], new_uidx); +#endif if (!cpi->rc.is_src_frame_alt_ref) { memcpy(cpi->interp_filter_selected[LAST_FRAME], cpi->interp_filter_selected[0], @@ -2678,7 +2783,8 @@ void vp10_scale_references(VP10_COMP *cpi) { cm->byte_alignment, NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); - scale_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth); + scale_and_extend_frame(ref, &new_fb_ptr->buf, MAX_MB_PLANE, + (int)cm->bit_depth); cpi->scaled_ref_idx[ref_frame - 1] = new_fb; alloc_frame_mvs(cm, new_fb); } @@ -2703,11 +2809,39 @@ void vp10_scale_references(VP10_COMP *cpi) { cm->byte_alignment, NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); - scale_and_extend_frame(ref, &new_fb_ptr->buf); + scale_and_extend_frame(ref, &new_fb_ptr->buf, MAX_MB_PLANE); cpi->scaled_ref_idx[ref_frame - 1] = new_fb; alloc_frame_mvs(cm, new_fb); } #endif // CONFIG_VP9_HIGHBITDEPTH + +#if CONFIG_AFFINE_MOTION + { + const int map_idx = get_ref_frame_map_idx(cpi, ref_frame); + EncRefCntBuffer *ubuf = + &cpi->upsampled_ref_bufs[cpi->upsampled_ref_idx[map_idx]]; + + if (vpx_realloc_frame_buffer(&ubuf->buf, + (cm->width << 3), (cm->height << 3), + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + (VP9_ENC_BORDER_IN_PIXELS << 3), + cm->byte_alignment, + NULL, NULL, NULL)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate up-sampled frame buffer"); +#if CONFIG_VP9_HIGHBITDEPTH + scale_and_extend_frame(&new_fb_ptr->buf, &ubuf->buf, MAX_MB_PLANE, + (int)cm->bit_depth); +#else + scale_and_extend_frame(&new_fb_ptr->buf, &ubuf->buf, MAX_MB_PLANE); +#endif + cpi->scaled_ref_idx[ref_frame - LAST_FRAME] = new_fb; + alloc_frame_mvs(cm, new_fb); + } +#endif } else { const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); RefCntBuffer *const buf = &pool->frame_bufs[buf_idx]; @@ -3787,6 +3921,17 @@ static void 
init_ref_frame_bufs(VP10_COMMON *cm) { } } +#if CONFIG_AFFINE_MOTION +static INLINE void init_upsampled_ref_frame_bufs(VP10_COMP *cpi) { + int i; + + for (i = 0; i < MAX_REF_FRAMES; ++i) { + cpi->upsampled_ref_bufs[i].ref_count = 0; + cpi->upsampled_ref_idx[i] = INVALID_IDX; + } +} +#endif + static void check_initial_width(VP10_COMP *cpi, #if CONFIG_VP9_HIGHBITDEPTH int use_highbitdepth, @@ -3809,7 +3954,9 @@ static void check_initial_width(VP10_COMP *cpi, alloc_raw_frame_buffers(cpi); init_ref_frame_bufs(cm); alloc_util_frame_buffers(cpi); - +#if CONFIG_AFFINE_MOTION + init_upsampled_ref_frame_bufs(cpi); +#endif init_motion_estimation(cpi); // TODO(agrange) This can be removed. cpi->initial_width = cm->width; diff --git a/vp10/encoder/encoder.h b/vp10/encoder/encoder.h index 292494caf..2c158a47c 100644 --- a/vp10/encoder/encoder.h +++ b/vp10/encoder/encoder.h @@ -286,6 +286,13 @@ typedef struct IMAGE_STAT { double worst; } ImageStat; +#if CONFIG_AFFINE_MOTION +typedef struct { + int ref_count; + YV12_BUFFER_CONFIG buf; +} EncRefCntBuffer; +#endif + typedef struct VP10_COMP { QUANTS quants; ThreadData td; @@ -304,6 +311,12 @@ typedef struct VP10_COMP { YV12_BUFFER_CONFIG *unscaled_last_source; YV12_BUFFER_CONFIG scaled_last_source; +#if CONFIG_AFFINE_MOTION + // Up-sampled reference buffers + EncRefCntBuffer upsampled_ref_bufs[MAX_REF_FRAMES]; + int upsampled_ref_idx[MAX_REF_FRAMES]; +#endif + TileDataEnc *tile_data; int allocated_tiles; // Keep track of memory allocated for tiles. @@ -692,4 +705,18 @@ void vp10_new_framerate(VP10_COMP *cpi, double framerate); } // extern "C" #endif +#if CONFIG_AFFINE_MOTION +// Update up-sampled reference frame index. +static INLINE void uref_cnt_fb(EncRefCntBuffer *ubufs, int *uidx, + int new_uidx) { + const int ref_index = *uidx; + + if (ref_index >= 0 && ubufs[ref_index].ref_count > 0) + ubufs[ref_index].ref_count--; + + *uidx = new_uidx; + ubufs[new_uidx].ref_count++; +} +#endif + #endif // VP10_ENCODER_ENCODER_H_ diff --git a/vp10/encoder/mbgraph.c b/vp10/encoder/mbgraph.c index 2d3a33e39..1f467b811 100644 --- a/vp10/encoder/mbgraph.c +++ b/vp10/encoder/mbgraph.c @@ -64,7 +64,11 @@ static unsigned int do_16x16_motion_iteration(VP10_COMP *cpi, &v_fn_ptr, 0, mv_sf->subpel_iters_per_step, cond_cost_list(cpi, cost_list), NULL, NULL, +#if CONFIG_AFFINE_MOTION + &distortion, &sse, NULL, 0, 0, 0); +#else &distortion, &sse, NULL, 0, 0); +#endif } #if CONFIG_EXT_INTER diff --git a/vp10/encoder/mcomp.c b/vp10/encoder/mcomp.c index 6e3b06ab9..8949f76bc 100644 --- a/vp10/encoder/mcomp.c +++ b/vp10/encoder/mcomp.c @@ -208,6 +208,32 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { v = INT_MAX; \ } +#define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c) + +#if CONFIG_AFFINE_MOTION +static INLINE const uint8_t *upre(const uint8_t *buf, int stride, + int r, int c) { + return &buf[(r) * stride + (c)]; +} + +/* checks if (r, c) has better score than previous best */ +#define CHECK_BETTER1(v, r, c) \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + thismse = upsampled_pref_error(xd, vfp, z, src_stride, \ + upre(y, y_stride, r, c), y_stride, \ + second_pred, w, h, &sse); \ + if ((v = MVC(r, c) + thismse) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ + } +#endif + #define FIRST_LEVEL_CHECKS \ { \ unsigned int left, right, up, down, diag; \ @@ -276,7 +302,7 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { 
// TODO(yunqingwang): SECOND_LEVEL_CHECKS_BEST was a rewrote of // SECOND_LEVEL_CHECKS, and SECOND_LEVEL_CHECKS should be rewritten // later in the same way. -#define SECOND_LEVEL_CHECKS_BEST \ +#define SECOND_LEVEL_CHECKS_BEST(k) \ { \ unsigned int second; \ int br0 = br; \ @@ -287,10 +313,10 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { } else if (tr != br && tc == bc) { \ kr = br - tr; \ } \ - CHECK_BETTER(second, br0 + kr, bc0); \ - CHECK_BETTER(second, br0, bc0 + kc); \ + CHECK_BETTER##k(second, br0 + kr, bc0); \ + CHECK_BETTER##k(second, br0, bc0 + kc); \ if (br0 != br || bc0 != bc) { \ - CHECK_BETTER(second, br0 + kr, bc0 + kc); \ + CHECK_BETTER##k(second, br0 + kr, bc0 + kc); \ } \ } @@ -412,7 +438,11 @@ int vp10_find_best_sub_pixel_tree_pruned_evenmore( int *distortion, unsigned int *sse1, const uint8_t *second_pred, +#if CONFIG_AFFINE_MOTION + int w, int h, int use_upsampled_ref) { +#else int w, int h) { +#endif SETUP_SUBPEL_SEARCH; besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z, src_stride, y, y_stride, second_pred, @@ -425,6 +455,9 @@ int vp10_find_best_sub_pixel_tree_pruned_evenmore( (void) allow_hp; (void) forced_stop; (void) hstep; +#if CONFIG_AFFINE_MOTION + (void) use_upsampled_ref; +#endif if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX && @@ -491,8 +524,17 @@ int vp10_find_best_sub_pixel_tree_pruned_more(const MACROBLOCK *x, int *distortion, unsigned int *sse1, const uint8_t *second_pred, +#if CONFIG_AFFINE_MOTION + int w, int h, + int use_upsampled_ref) { +#else int w, int h) { +#endif SETUP_SUBPEL_SEARCH; +#if CONFIG_AFFINE_MOTION + (void) use_upsampled_ref; +#endif + besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z, src_stride, y, y_stride, second_pred, w, h, offset, mvjcost, mvcost, @@ -565,8 +607,16 @@ int vp10_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x, int *distortion, unsigned int *sse1, const uint8_t *second_pred, +#if CONFIG_AFFINE_MOTION + int w, int h, int use_upsampled_ref) { +#else int w, int h) { +#endif SETUP_SUBPEL_SEARCH; +#if CONFIG_AFFINE_MOTION + (void) use_upsampled_ref; +#endif + besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z, src_stride, y, y_stride, second_pred, w, h, offset, mvjcost, mvcost, @@ -655,6 +705,101 @@ static const MV search_step_table[12] = { {0, -1}, {0, 1}, {-1, 0}, {1, 0} }; + +#if CONFIG_AFFINE_MOTION +#if CONFIG_VP9_HIGHBITDEPTH +static void highbd_comp_avg_upsampled_pred(uint16_t *comp_pred, + const uint8_t *pred8, + int width, int height, + const uint8_t *ref8, + int ref_stride) { + int i, j; + int stride = ref_stride << 3; + + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int tmp = pred[j] + ref[(j << 3)]; + comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); + } + comp_pred += width; + pred += width; + ref += stride; + } +} + +static void highbd_upsampled_pred(uint16_t *comp_pred, + int width, int height, + const uint8_t *ref8, + int ref_stride) { + int i, j; + int stride = ref_stride << 3; + + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + comp_pred[j] = ref[(j << 3)]; + } + comp_pred += width; + ref += stride; + } +} +#endif + +static int upsampled_pref_error(const MACROBLOCKD *xd, + const vp9_variance_fn_ptr_t *vfp, + const uint8_t *const src, const int src_stride, + const uint8_t *const y, int y_stride, + const uint8_t 
*second_pred, + int w, int h, unsigned int *sse) { + unsigned int besterr; +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + DECLARE_ALIGNED(16, uint16_t, pred16[64 * 64]); + if (second_pred != NULL) + highbd_comp_avg_upsampled_pred(pred16, second_pred, w, h, y, + y_stride); + else + highbd_upsampled_pred(pred16, w, h, y, y_stride); + + besterr = vfp->vf(CONVERT_TO_BYTEPTR(pred16), w, src, src_stride, + sse); + } else { + DECLARE_ALIGNED(16, uint8_t, pred[64 * 64]); +#else + DECLARE_ALIGNED(16, uint8_t, pred[64 * 64]); + (void) xd; +#endif // CONFIG_VP9_HIGHBITDEPTH + if (second_pred != NULL) + vpx_comp_avg_upsampled_pred(pred, second_pred, w, h, y, + y_stride); + else + vpx_upsampled_pred(pred, w, h, y, y_stride); + + besterr = vfp->vf(pred, w, src, src_stride, sse); +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif +return besterr; +} + +static unsigned int upsampled_setup_center_error( + const MACROBLOCKD *xd, const MV *bestmv, const MV *ref_mv, + int error_per_bit, const vp9_variance_fn_ptr_t *vfp, + const uint8_t *const src, const int src_stride, + const uint8_t *const y, int y_stride, const uint8_t *second_pred, + int w, int h, int offset, int *mvjcost, int *mvcost[2], + unsigned int *sse1, int *distortion) { + unsigned int besterr = upsampled_pref_error(xd, vfp, src, src_stride, + y + offset, y_stride, second_pred, + w, h, sse1); + *distortion = besterr; + besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); + return besterr; +} +#endif + int vp10_find_best_sub_pixel_tree(const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, @@ -667,14 +812,18 @@ int vp10_find_best_sub_pixel_tree(const MACROBLOCK *x, int *distortion, unsigned int *sse1, const uint8_t *second_pred, +#if CONFIG_AFFINE_MOTION + int w, int h, int use_upsampled_ref) { +#else int w, int h) { +#endif const uint8_t *const z = x->plane[0].src.buf; const uint8_t *const src_address = z; const int src_stride = x->plane[0].src.stride; const MACROBLOCKD *xd = &x->e_mbd; unsigned int besterr = INT_MAX; unsigned int sse; - int thismse; + unsigned int thismse; const int y_stride = xd->plane[0].pre[0].stride; const int offset = bestmv->row * y_stride + bestmv->col; const uint8_t *const y = xd->plane[0].pre[0].buf; @@ -703,10 +852,19 @@ int vp10_find_best_sub_pixel_tree(const MACROBLOCK *x, bestmv->row *= 8; bestmv->col *= 8; - besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, - z, src_stride, y, y_stride, second_pred, - w, h, offset, mvjcost, mvcost, - sse1, distortion); +#if CONFIG_AFFINE_MOTION + // use_upsampled_ref can be 0 or 1 + if (use_upsampled_ref) + besterr = upsampled_setup_center_error(xd, bestmv, ref_mv, error_per_bit, + vfp, z, src_stride, y, y_stride, + second_pred, w, h, (offset << 3), + mvjcost, mvcost, sse1, distortion); + else +#endif + besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, + z, src_stride, y, y_stride, second_pred, + w, h, offset, mvjcost, mvcost, + sse1, distortion); (void) cost_list; // to silence compiler warning @@ -716,16 +874,29 @@ int vp10_find_best_sub_pixel_tree(const MACROBLOCK *x, tr = br + search_step[idx].row; tc = bc + search_step[idx].col; if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) { - const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3); - MV this_mv; - this_mv.row = tr; - this_mv.col = tc; - if (second_pred == NULL) - thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), - src_address, src_stride, &sse); - else - thismse = vfp->svaf(pre_address, y_stride, 
sp(tc), sp(tr), - src_address, src_stride, &sse, second_pred); + MV this_mv = {tr, tc}; + +#if CONFIG_AFFINE_MOTION + if (use_upsampled_ref) { + const uint8_t *const pre_address = y + tr * y_stride + tc; + + thismse = upsampled_pref_error(xd, vfp, src_address, src_stride, + pre_address, y_stride, second_pred, + w, h, &sse); + } else { +#endif + const uint8_t *const pre_address = y + (tr >> 3) * y_stride + + (tc >> 3); + if (second_pred == NULL) + thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), + src_address, src_stride, &sse); + else + thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), + src_address, src_stride, &sse, second_pred); +#if CONFIG_AFFINE_MOTION + } +#endif + cost_array[idx] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); @@ -747,14 +918,29 @@ int vp10_find_best_sub_pixel_tree(const MACROBLOCK *x, tc = bc + kc; tr = br + kr; if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) { - const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3); MV this_mv = {tr, tc}; - if (second_pred == NULL) - thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), - src_address, src_stride, &sse); - else - thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), - src_address, src_stride, &sse, second_pred); + +#if CONFIG_AFFINE_MOTION + if (use_upsampled_ref) { + const uint8_t *const pre_address = y + tr * y_stride + tc; + + thismse = upsampled_pref_error(xd, vfp, src_address, src_stride, + pre_address, y_stride, second_pred, + w, h, &sse); + } else { +#endif + const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3); + + if (second_pred == NULL) + thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), + src_address, src_stride, &sse); + else + thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), + src_address, src_stride, &sse, second_pred); +#if CONFIG_AFFINE_MOTION + } +#endif + cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); @@ -776,8 +962,17 @@ int vp10_find_best_sub_pixel_tree(const MACROBLOCK *x, bc = tc; } - if (iters_per_step > 1 && best_idx != -1) - SECOND_LEVEL_CHECKS_BEST; + if (iters_per_step > 1 && best_idx != -1) { +#if CONFIG_AFFINE_MOTION + if (use_upsampled_ref) { + SECOND_LEVEL_CHECKS_BEST(1); + } else { +#endif + SECOND_LEVEL_CHECKS_BEST(0); +#if CONFIG_AFFINE_MOTION + } +#endif + } tr = br; tc = bc; diff --git a/vp10/encoder/mcomp.h b/vp10/encoder/mcomp.h index 9d1ab2aab..3063b996e 100644 --- a/vp10/encoder/mcomp.h +++ b/vp10/encoder/mcomp.h @@ -116,7 +116,11 @@ typedef int (fractional_mv_step_fp) ( int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1, const uint8_t *second_pred, +#if CONFIG_AFFINE_MOTION + int w, int h, int use_upsampled_ref); +#else int w, int h); +#endif extern fractional_mv_step_fp vp10_find_best_sub_pixel_tree; extern fractional_mv_step_fp vp10_find_best_sub_pixel_tree_pruned; diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c index 03aa9f086..5c74d32eb 100644 --- a/vp10/encoder/rdopt.c +++ b/vp10/encoder/rdopt.c @@ -3929,7 +3929,8 @@ static void joint_motion_search(VP10_COMP *cpi, MACROBLOCK *x, int_mv* ref_mv_sub8x8[2], #endif int_mv single_newmv[MAX_REF_FRAMES], - int *rate_mv) { + int *rate_mv, + const int block) { const VP10_COMMON *const cm = &cpi->common; const int pw = 4 * num_4x4_blocks_wide_lookup[bsize]; const int ph = 4 * num_4x4_blocks_high_lookup[bsize]; @@ -4076,6 +4077,40 @@ static void joint_motion_search(VP10_COMP *cpi, MACROBLOCK *x, if (bestsme < INT_MAX) { int dis; /* TODO: use dis in 
distortion calculation later. */ unsigned int sse; +#if CONFIG_AFFINE_MOTION + // Use up-sampled reference frames. + struct macroblockd_plane *const pd = &xd->plane[0]; + struct buf_2d backup_pred = pd->pre[0]; + const YV12_BUFFER_CONFIG *upsampled_ref = + get_upsampled_ref(cpi, refs[id]); + + // Set pred for Y plane + setup_pred_plane(&pd->pre[0], upsampled_ref->y_buffer, + upsampled_ref->y_stride, (mi_row << 3), (mi_col << 3), + NULL, pd->subsampling_x, pd->subsampling_y); + + // If bsize < BLOCK_8X8, adjust pred pointer for this block + if (bsize < BLOCK_8X8) + pd->pre[0].buf = + &pd->pre[0].buf[(vp10_raster_block_offset(BLOCK_8X8, block, + pd->pre[0].stride)) << 3]; + + bestsme = cpi->find_fractional_mv_step( + x, &tmp_mv, + &ref_mv[id].as_mv, + cpi->common.allow_high_precision_mv, + x->errorperbit, + &cpi->fn_ptr[bsize], + 0, cpi->sf.mv.subpel_iters_per_step, + NULL, + x->nmvjointcost, x->mvcost, + &dis, &sse, second_pred, + pw, ph, 1); + + // Restore the reference frames. + pd->pre[0] = backup_pred; +#else + (void) block; bestsme = cpi->find_fractional_mv_step( x, &tmp_mv, &ref_mv[id].as_mv, @@ -4087,6 +4122,7 @@ static void joint_motion_search(VP10_COMP *cpi, MACROBLOCK *x, x->nmvjointcost, x->mvcost, &dis, &sse, second_pred, pw, ph); +#endif } // Restore the pointer to the first (possibly scaled) prediction buffer. @@ -4367,6 +4403,43 @@ static int64_t rd_pick_best_sub8x8_mode(VP10_COMP *cpi, MACROBLOCK *x, if (bestsme < INT_MAX) { int distortion; +#if CONFIG_AFFINE_MOTION + const int pw = 4 * num_4x4_blocks_wide_lookup[bsize]; + const int ph = 4 * num_4x4_blocks_high_lookup[bsize]; + // Use up-sampled reference frames. + struct macroblockd_plane *const pd = &xd->plane[0]; + struct buf_2d backup_pred = pd->pre[0]; + const YV12_BUFFER_CONFIG *upsampled_ref = + get_upsampled_ref(cpi, mbmi->ref_frame[0]); + + // Set pred for Y plane + setup_pred_plane(&pd->pre[0], upsampled_ref->y_buffer, + upsampled_ref->y_stride, + (mi_row << 3), (mi_col << 3), + NULL, pd->subsampling_x, pd->subsampling_y); + + // adjust pred pointer for this block + pd->pre[0].buf = + &pd->pre[0].buf[(vp10_raster_block_offset(BLOCK_8X8, i, + pd->pre[0].stride)) << 3]; + + cpi->find_fractional_mv_step( + x, + new_mv, + &bsi->ref_mv[0]->as_mv, + cm->allow_high_precision_mv, + x->errorperbit, &cpi->fn_ptr[bsize], + cpi->sf.mv.subpel_force_stop, + cpi->sf.mv.subpel_iters_per_step, + cond_cost_list(cpi, cost_list), + x->nmvjointcost, x->mvcost, + &distortion, + &x->pred_sse[mbmi->ref_frame[0]], + NULL, pw, ph, 1); + + // Restore the reference frames. + pd->pre[0] = backup_pred; +#else cpi->find_fractional_mv_step( x, new_mv, @@ -4380,6 +4453,7 @@ static int64_t rd_pick_best_sub8x8_mode(VP10_COMP *cpi, MACROBLOCK *x, &distortion, &x->pred_sse[mbmi->ref_frame[0]], NULL, 0, 0); +#endif // save motion search result for use in compound prediction #if CONFIG_EXT_INTER @@ -4426,7 +4500,7 @@ static int64_t rd_pick_best_sub8x8_mode(VP10_COMP *cpi, MACROBLOCK *x, #else seg_mvs[i], #endif // CONFIG_EXT_INTER - &rate_mv); + &rate_mv, i); #if CONFIG_EXT_INTER compound_seg_newmvs[i][0].as_int = frame_mv[this_mode][mbmi->ref_frame[0]].as_int; @@ -4975,6 +5049,33 @@ static void single_motion_search(VP10_COMP *cpi, MACROBLOCK *x, if (bestsme < INT_MAX) { int dis; /* TODO: use dis in distortion calculation later. */ +#if CONFIG_AFFINE_MOTION + const int pw = 4 * num_4x4_blocks_wide_lookup[bsize]; + const int ph = 4 * num_4x4_blocks_high_lookup[bsize]; + // Use up-sampled reference frames. 
+ struct macroblockd_plane *const pd = &xd->plane[0]; + struct buf_2d backup_pred = pd->pre[0]; + const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref); + + // Set pred for Y plane + setup_pred_plane(&pd->pre[0], upsampled_ref->y_buffer, + upsampled_ref->y_stride, (mi_row << 3), (mi_col << 3), + NULL, pd->subsampling_x, pd->subsampling_y); + + bestsme = cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv, + cm->allow_high_precision_mv, + x->errorperbit, + &cpi->fn_ptr[bsize], + cpi->sf.mv.subpel_force_stop, + cpi->sf.mv.subpel_iters_per_step, + cond_cost_list(cpi, cost_list), + x->nmvjointcost, x->mvcost, + &dis, &x->pred_sse[ref], NULL, + pw, ph, 1); + + // Restore the reference frames. + pd->pre[0] = backup_pred; +#else cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv, cm->allow_high_precision_mv, x->errorperbit, @@ -4984,6 +5085,7 @@ static void single_motion_search(VP10_COMP *cpi, MACROBLOCK *x, cond_cost_list(cpi, cost_list), x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, 0, 0); +#endif } *rate_mv = vp10_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); @@ -5328,7 +5430,7 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x, if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { joint_motion_search(cpi, x, bsize, frame_mv, - mi_row, mi_col, NULL, single_newmv, &rate_mv); + mi_row, mi_col, NULL, single_newmv, &rate_mv, 0); } else { rate_mv = vp10_mv_bit_cost(&frame_mv[refs[0]].as_mv, &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv, @@ -5358,7 +5460,7 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x, if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col, - single_newmv, &rate_mv); + single_newmv, &rate_mv, 0); } else { rate_mv = vp10_mv_bit_cost(&frame_mv[refs[0]].as_mv, &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv, diff --git a/vp10/encoder/rdopt.h b/vp10/encoder/rdopt.h index 066bf69f5..74702a95b 100644 --- a/vp10/encoder/rdopt.h +++ b/vp10/encoder/rdopt.h @@ -106,4 +106,20 @@ void vp10_build_prediction_by_left_preds(VP10_COMP *cpi, } // extern "C" #endif +#if CONFIG_AFFINE_MOTION +static INLINE const YV12_BUFFER_CONFIG *get_upsampled_ref(VP10_COMP *cpi, + const int ref) { + // Use up-sampled reference frames. + int ref_idx = 0; + if (ref == LAST_FRAME) + ref_idx = cpi->lst_fb_idx; + else if (ref == GOLDEN_FRAME) + ref_idx = cpi->gld_fb_idx; + else if (ref == ALTREF_FRAME) + ref_idx = cpi->alt_fb_idx; + + return &cpi->upsampled_ref_bufs[cpi->upsampled_ref_idx[ref_idx]].buf; +} +#endif + #endif // VP10_ENCODER_RDOPT_H_ diff --git a/vp10/encoder/temporal_filter.c b/vp10/encoder/temporal_filter.c index d16e4a4e2..3e1246a80 100644 --- a/vp10/encoder/temporal_filter.c +++ b/vp10/encoder/temporal_filter.c @@ -320,7 +320,11 @@ static int temporal_filter_find_matching_mb_c(VP10_COMP *cpi, 0, mv_sf->subpel_iters_per_step, cond_cost_list(cpi, cost_list), NULL, NULL, +#if CONFIG_AFFINE_MOTION + &distortion, &sse, NULL, 0, 0, 0); +#else &distortion, &sse, NULL, 0, 0); +#endif // Restore input state x->plane[0].src = src; diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c index e8bddb0a0..3b6c41974 100644 --- a/vpx_dsp/variance.c +++ b/vpx_dsp/variance.c @@ -272,6 +272,41 @@ void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, } } +#if CONFIG_AFFINE_MOTION +// Get pred block from up-sampled reference. 
+void vpx_upsampled_pred_c(uint8_t *comp_pred, + int width, int height, + const uint8_t *ref, int ref_stride) { + int i, j, k; + int stride = ref_stride << 3; + + for (i = 0; i < height; i++) { + for (j = 0, k = 0; j < width; j++, k += 8) { + comp_pred[j] = ref[k]; + } + comp_pred += width; + ref += stride; + } +} + +void vpx_comp_avg_upsampled_pred_c(uint8_t *comp_pred, const uint8_t *pred, + int width, int height, + const uint8_t *ref, int ref_stride) { + int i, j; + int stride = ref_stride << 3; + + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + const int tmp = ref[(j << 3)] + pred[j]; + comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); + } + comp_pred += width; + pred += width; + ref += stride; + } +} +#endif + #if CONFIG_VP9_HIGHBITDEPTH static void highbd_variance64(const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 5457d00bf..8d1afdfac 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1464,6 +1464,13 @@ add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride"; +if (vpx_config("CONFIG_AFFINE_MOTION") eq "yes") { + add_proto qw/void vpx_upsampled_pred/, "uint8_t *comp_pred, int width, int height, const uint8_t *ref, int ref_stride"; + specialize qw/vpx_upsampled_pred sse2/; + add_proto qw/void vpx_comp_avg_upsampled_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride"; + specialize qw/vpx_comp_avg_upsampled_pred sse2/; +} + # # Subpixel Variance # diff --git a/vpx_dsp/x86/variance_sse2.c b/vpx_dsp/x86/variance_sse2.c index e6c9365ab..7943c843c 100644 --- a/vpx_dsp/x86/variance_sse2.c +++ b/vpx_dsp/x86/variance_sse2.c @@ -475,3 +475,232 @@ FNS(ssse3, ssse3); #undef FNS #undef FN #endif // CONFIG_USE_X86INC + +#if CONFIG_AFFINE_MOTION +void vpx_upsampled_pred_sse2(uint8_t *comp_pred, + int width, int height, + const uint8_t *ref, int ref_stride) { + int i, j; + int stride = ref_stride << 3; + + if (width >= 16) { + // read 16 points at one time + for (i = 0; i < height; i++) { + for (j = 0; j < width; j+= 16) { + __m128i s0 = _mm_loadu_si128((const __m128i *)ref); + __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); + __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32)); + __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48)); + __m128i s4 = _mm_loadu_si128((const __m128i *)(ref + 64)); + __m128i s5 = _mm_loadu_si128((const __m128i *)(ref + 80)); + __m128i s6 = _mm_loadu_si128((const __m128i *)(ref + 96)); + __m128i s7 = _mm_loadu_si128((const __m128i *)(ref + 112)); + __m128i t0, t1, t2, t3; + + t0 = _mm_unpacklo_epi8(s0, s1); + s1 = _mm_unpackhi_epi8(s0, s1); + t1 = _mm_unpacklo_epi8(s2, s3); + s3 = _mm_unpackhi_epi8(s2, s3); + t2 = _mm_unpacklo_epi8(s4, s5); + s5 = _mm_unpackhi_epi8(s4, s5); + t3 = _mm_unpacklo_epi8(s6, s7); + s7 = _mm_unpackhi_epi8(s6, s7); + + s0 = _mm_unpacklo_epi8(t0, s1); + s2 = _mm_unpacklo_epi8(t1, s3); + s4 = _mm_unpacklo_epi8(t2, s5); + s6 = _mm_unpacklo_epi8(t3, s7); + + *(int *)comp_pred = _mm_cvtsi128_si32(s0); + *(int *)(comp_pred + 4) = _mm_cvtsi128_si32(s2); + *(int *)(comp_pred + 8) = _mm_cvtsi128_si32(s4); + *(int *)(comp_pred + 12) = _mm_cvtsi128_si32(s6); + + comp_pred += 16; + ref += 16 * 8; + } + ref += stride - (width << 3); + } + } else if (width >= 8) { + // read 8 points at one 
time + for (i = 0; i < height; i++) { + for (j = 0; j < width; j+= 8) { + __m128i s0 = _mm_loadu_si128((const __m128i *)ref); + __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); + __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32)); + __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48)); + __m128i t0, t1; + + t0 = _mm_unpacklo_epi8(s0, s1); + s1 = _mm_unpackhi_epi8(s0, s1); + t1 = _mm_unpacklo_epi8(s2, s3); + s3 = _mm_unpackhi_epi8(s2, s3); + + s0 = _mm_unpacklo_epi8(t0, s1); + s2 = _mm_unpacklo_epi8(t1, s3); + + *(int *)comp_pred = _mm_cvtsi128_si32(s0); + *(int *)(comp_pred + 4) = _mm_cvtsi128_si32(s2); + comp_pred += 8; + ref += 8 * 8; + } + ref += stride - (width << 3); + } + } else { + // read 4 points at one time + for (i = 0; i < height; i++) { + for (j = 0; j < width; j+= 4) { + __m128i s0 = _mm_loadu_si128((const __m128i *)ref); + __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); + __m128i t0; + + t0 = _mm_unpacklo_epi8(s0, s1); + s1 = _mm_unpackhi_epi8(s0, s1); + s0 = _mm_unpacklo_epi8(t0, s1); + + *(int *)comp_pred = _mm_cvtsi128_si32(s0); + + comp_pred += 4; + ref += 4 * 8; + } + ref += stride - (width << 3); + } + } +} + +void vpx_comp_avg_upsampled_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, + int width, int height, + const uint8_t *ref, int ref_stride) { + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi16(1); + int i, j; + int stride = ref_stride << 3; + + if (width >= 16) { + // read 16 points at one time + for (i = 0; i < height; i++) { + for (j = 0; j < width; j+= 16) { + __m128i s0 = _mm_loadu_si128((const __m128i *)ref); + __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); + __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32)); + __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48)); + __m128i s4 = _mm_loadu_si128((const __m128i *)(ref + 64)); + __m128i s5 = _mm_loadu_si128((const __m128i *)(ref + 80)); + __m128i s6 = _mm_loadu_si128((const __m128i *)(ref + 96)); + __m128i s7 = _mm_loadu_si128((const __m128i *)(ref + 112)); + __m128i p0 = _mm_loadu_si128((const __m128i *)pred); + __m128i p1; + __m128i t0, t1, t2, t3; + + t0 = _mm_unpacklo_epi8(s0, s1); + s1 = _mm_unpackhi_epi8(s0, s1); + t1 = _mm_unpacklo_epi8(s2, s3); + s3 = _mm_unpackhi_epi8(s2, s3); + t2 = _mm_unpacklo_epi8(s4, s5); + s5 = _mm_unpackhi_epi8(s4, s5); + t3 = _mm_unpacklo_epi8(s6, s7); + s7 = _mm_unpackhi_epi8(s6, s7); + + s0 = _mm_unpacklo_epi8(t0, s1); + s2 = _mm_unpacklo_epi8(t1, s3); + s4 = _mm_unpacklo_epi8(t2, s5); + s6 = _mm_unpacklo_epi8(t3, s7); + + s0 = _mm_unpacklo_epi32(s0, s2); + s4 = _mm_unpacklo_epi32(s4, s6); + s0 = _mm_unpacklo_epi8(s0, zero); + s4 = _mm_unpacklo_epi8(s4, zero); + + p1 = _mm_unpackhi_epi8(p0, zero); + p0 = _mm_unpacklo_epi8(p0, zero); + p0 = _mm_adds_epu16(s0, p0); + p1 = _mm_adds_epu16(s4, p1); + p0 = _mm_adds_epu16(p0, one); + p1 = _mm_adds_epu16(p1, one); + + p0 = _mm_srli_epi16(p0, 1); + p1 = _mm_srli_epi16(p1, 1); + p0 = _mm_packus_epi16(p0, p1); + + *(int *)comp_pred = _mm_cvtsi128_si32(p0); + p0 = _mm_srli_si128(p0, 4); + *(int *)(comp_pred + 4) = _mm_cvtsi128_si32(p0); + p0 = _mm_srli_si128(p0, 4); + *(int *)(comp_pred + 8) = _mm_cvtsi128_si32(p0); + p0 = _mm_srli_si128(p0, 4); + *(int *)(comp_pred + 12) = _mm_cvtsi128_si32(p0); + + comp_pred += 16; + pred += 16; + ref += 16 * 8; + } + ref += stride - (width << 3); + } + } else if (width >= 8) { + // read 8 points at one time + for (i = 0; i < height; i++) { + for (j = 0; j < width; j+= 8) { + __m128i s0 = _mm_loadu_si128((const 
__m128i *)ref); + __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); + __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32)); + __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48)); + __m128i p0 = _mm_loadl_epi64((const __m128i *)pred); + __m128i t0, t1; + + t0 = _mm_unpacklo_epi8(s0, s1); + s1 = _mm_unpackhi_epi8(s0, s1); + t1 = _mm_unpacklo_epi8(s2, s3); + s3 = _mm_unpackhi_epi8(s2, s3); + + s0 = _mm_unpacklo_epi8(t0, s1); + s2 = _mm_unpacklo_epi8(t1, s3); + s0 = _mm_unpacklo_epi32(s0, s2); + s0 = _mm_unpacklo_epi8(s0, zero); + + p0 = _mm_unpacklo_epi8(p0, zero); + p0 = _mm_adds_epu16(s0, p0); + p0 = _mm_adds_epu16(p0, one); + p0 = _mm_srli_epi16(p0, 1); + p0 = _mm_packus_epi16(p0, zero); + + *(int *)comp_pred = _mm_cvtsi128_si32(p0); + p0 = _mm_srli_si128(p0, 4); + *(int *)(comp_pred + 4) = _mm_cvtsi128_si32(p0); + + comp_pred += 8; + pred += 8; + ref += 8 * 8; + } + ref += stride - (width << 3); + } + } else { + // read 4 points at one time + for (i = 0; i < height; i++) { + for (j = 0; j < width; j+= 4) { + __m128i s0 = _mm_loadu_si128((const __m128i *)ref); + __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); + __m128i p0 = _mm_cvtsi32_si128(*(const uint32_t *)pred); + __m128i t0; + + t0 = _mm_unpacklo_epi8(s0, s1); + s1 = _mm_unpackhi_epi8(s0, s1); + s0 = _mm_unpacklo_epi8(t0, s1); + s0 = _mm_unpacklo_epi8(s0, zero); + + p0 = _mm_unpacklo_epi8(p0, zero); + p0 = _mm_adds_epu16(s0, p0); + p0 = _mm_adds_epu16(p0, one); + p0 = _mm_srli_epi16(p0, 1); + p0 = _mm_packus_epi16(p0, zero); + + *(int *)comp_pred = _mm_cvtsi128_si32(p0); + + comp_pred += 4; + pred += 4; + ref += 4 * 8; + } + ref += stride - (width << 3); + } + } +} +#endif diff --git a/vpx_scale/generic/yv12extend.c b/vpx_scale/generic/yv12extend.c index 670144bc1..521207589 100644 --- a/vpx_scale/generic/yv12extend.c +++ b/vpx_scale/generic/yv12extend.c @@ -210,6 +210,30 @@ void vpx_extend_frame_inner_borders_c(YV12_BUFFER_CONFIG *ybf) { extend_frame(ybf, inner_bw); } +void vpx_extend_frame_borders_y_c(YV12_BUFFER_CONFIG *ybf) { + int ext_size = ybf->border; + assert(ybf->y_height - ybf->y_crop_height < 16); + assert(ybf->y_width - ybf->y_crop_width < 16); + assert(ybf->y_height - ybf->y_crop_height >= 0); + assert(ybf->y_width - ybf->y_crop_width >= 0); + +#if CONFIG_VP9_HIGHBITDEPTH + if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) { + extend_plane_high(ybf->y_buffer, ybf->y_stride, + ybf->y_crop_width, ybf->y_crop_height, + ext_size, ext_size, + ext_size + ybf->y_height - ybf->y_crop_height, + ext_size + ybf->y_width - ybf->y_crop_width); + return; + } +#endif + extend_plane(ybf->y_buffer, ybf->y_stride, + ybf->y_crop_width, ybf->y_crop_height, + ext_size, ext_size, + ext_size + ybf->y_height - ybf->y_crop_height, + ext_size + ybf->y_width - ybf->y_crop_width); +} + #if CONFIG_VP9_HIGHBITDEPTH void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) { uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); diff --git a/vpx_scale/vpx_scale_rtcd.pl b/vpx_scale/vpx_scale_rtcd.pl index 56b952ba3..68a1a3ec0 100644 --- a/vpx_scale/vpx_scale_rtcd.pl +++ b/vpx_scale/vpx_scale_rtcd.pl @@ -28,5 +28,8 @@ if ((vpx_config("CONFIG_VP9") eq "yes") || (vpx_config("CONFIG_VP10") eq "yes")) add_proto qw/void vpx_extend_frame_inner_borders/, "struct yv12_buffer_config *ybf"; specialize qw/vpx_extend_frame_inner_borders dspr2/; + + add_proto qw/void vpx_extend_frame_borders_y/, "struct yv12_buffer_config *ybf"; + specialize qw/vpx_extend_frame_borders_y/; } 1;
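
Notes on how the new up-sampled reference path fits together.

Indexing convention: the reference is up-sampled by 8 in each dimension, so a motion vector expressed in 1/8-pel units addresses the up-sampled Y plane with plain integer arithmetic. This is what upre() and upsampled_pref_error() rely on, why vp10_find_best_sub_pixel_tree() shifts the full-pel offset left by 3 once the pred plane points at the up-sampled buffer, and why vpx_upsampled_pred_c() extracts a block by reading ref[j << 3] with stride (ref_stride << 3). A minimal sketch of the convention — the helper name and parameters below are illustrative only, not part of the patch:

    #include <stdint.h>

    /* With the reference up-sampled 8x in each dimension, full-pel sample
     * (x, y) of the original plane sits at (x << 3, y << 3) and the row
     * stride becomes (ref_stride << 3), so a 1/8-pel MV component is a
     * direct sample index into the up-sampled plane. */
    static const uint8_t *upsampled_ref_pixel(const uint8_t *up_buf,
                                              int up_stride,
                                              int mv_row_q3, int mv_col_q3) {
      return &up_buf[mv_row_q3 * up_stride + mv_col_q3];
    }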
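
Calling convention for the new use_upsampled_ref argument: passing 1 to find_fractional_mv_step() is only valid after xd->plane[0].pre[0] has been repointed at the up-sampled plane with setup_pred_plane() at (mi_row << 3, mi_col << 3) — and, for sub-8x8 partitions, after the raster-block offset has also been scaled by << 3 — with the saved buf_2d restored afterwards. All three rdopt.c call sites follow this backup/swap/restore pattern; mbgraph.c and temporal_filter.c keep passing 0 and are otherwise unchanged.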
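
Buffer lifecycle: the up-sampled pool is managed the same way as the encoder's ordinary frame-buffer pool — upsampled_ref_idx[] parallels cm->ref_frame_map[], uref_cnt_fb() mirrors ref_cnt_fb(), and get_free_upsampled_ref_buf() hands out any slot whose ref_count has dropped back to zero. vp10_update_reference_frames() up-samples each newly coded frame exactly once and then points every reference being refreshed at that single slot, so after a key frame one slot typically carries the last, golden and alt-ref references at the same time.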
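
Memory footprint: each slot is allocated at (cm->width << 3) x (cm->height << 3) with a (VP9_ENC_BORDER_IN_PIXELS << 3) border, i.e. 64x the sample count of the source Y plane, and the chroma planes are allocated as well even though only Y is ever filled (scale_and_extend_frame() is called with planes == 1). As a rough illustration, assuming 8-bit 1920x1080 input and ignoring borders and chroma, one slot's Y area is (1920 << 3) * (1080 << 3) = 15360 * 8640 ≈ 132.7 million samples, about 127 MiB, with MAX_REF_FRAMES such slots kept by alloc_util_frame_buffers().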