From 432cd4bfb795534c8ba479fd735cc11dc8562469 Mon Sep 17 00:00:00 2001
From: Jingning Han
Date: Mon, 6 Jul 2015 09:33:27 -0700
Subject: [PATCH] Move subtract functions from vp9 to vpx_dsp

Factor out the subtraction operator as a common function.

Change-Id: I526e703477c6a290e0e3e3c8898f8bb1ca82779b
---
 test/vp9_subtract_test.cc    |   7 +-
 vp8/encoder/encodemb.c       |   2 +
 vp9/common/vp9_rtcd_defs.pl  |   6 -
 vp9/encoder/vp9_encodemb.c   |  60 +-----
 vp9/encoder/vp9_rdopt.c      |  11 +-
 vp9/vp9cx.mk                 |   3 -
 .../arm/subtract_neon.c      |   5 +-
 vpx_dsp/mips/macros_msa.h    | 193 ++++++++++++++++++
 .../mips/subtract_msa.c      |  10 +-
 vpx_dsp/subtract.c           |  56 +++++
 vpx_dsp/vpx_dsp.mk           |   4 +
 vpx_dsp/vpx_dsp_rtcd_defs.pl |  12 ++
 .../x86/subtract_sse2.asm    |   3 +-
 13 files changed, 297 insertions(+), 75 deletions(-)
 rename vp9/encoder/arm/neon/vp9_subtract_neon.c => vpx_dsp/arm/subtract_neon.c (97%)
 rename vp9/encoder/mips/msa/vp9_subtract_msa.c => vpx_dsp/mips/subtract_msa.c (97%)
 create mode 100644 vpx_dsp/subtract.c
 rename vp9/encoder/x86/vp9_subtract_sse2.asm => vpx_dsp/x86/subtract_sse2.asm (98%)

diff --git a/test/vp9_subtract_test.cc b/test/vp9_subtract_test.cc
index fabb43824..a1798d706 100644
--- a/test/vp9_subtract_test.cc
+++ b/test/vp9_subtract_test.cc
@@ -14,6 +14,7 @@
 #include "test/register_state_check.h"
 #include "./vpx_config.h"
 #include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vpx_mem/vpx_mem.h"

@@ -89,15 +90,15 @@ TEST_P(VP9SubtractBlockTest, SimpleSubtract) {
 }

 INSTANTIATE_TEST_CASE_P(C, VP9SubtractBlockTest,
-                        ::testing::Values(vp9_subtract_block_c));
+                        ::testing::Values(vpx_subtract_block_c));

 #if HAVE_SSE2 && CONFIG_USE_X86INC
 INSTANTIATE_TEST_CASE_P(SSE2, VP9SubtractBlockTest,
-                        ::testing::Values(vp9_subtract_block_sse2));
+                        ::testing::Values(vpx_subtract_block_sse2));
 #endif
 #if HAVE_NEON
 INSTANTIATE_TEST_CASE_P(NEON, VP9SubtractBlockTest,
-                        ::testing::Values(vp9_subtract_block_neon));
+                        ::testing::Values(vpx_subtract_block_neon));
 #endif

 }  // namespace vp9
diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c
index dfd0a237a..820b1376b 100644
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c
@@ -19,6 +19,8 @@
 #include "vpx_mem/vpx_mem.h"
 #include "rdopt.h"

+// TODO(jingning,johannkoenig): use vpx_subtract_block to replace
+// codec specific vp9_subtract_ functions.
void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch) { unsigned char *src_ptr = (*(be->base_src) + be->src); diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index d8c7e1447..e3ab06d93 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -922,9 +922,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # ENCODEMB INVOKE -add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; -specialize qw/vp9_subtract_block neon msa/, "$sse2_x86inc"; - # # Denoiser # @@ -1328,9 +1325,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd"; specialize qw/vp9_highbd_block_error sse2/; - add_proto qw/void vp9_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd"; - specialize qw/vp9_highbd_subtract_block/; - add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_highbd_quantize_fp/; diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 2829365e5..313094140 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -11,6 +11,7 @@ #include "./vp9_rtcd.h" #include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" @@ -31,45 +32,6 @@ struct optimize_ctx { ENTROPY_CONTEXT tl[MAX_MB_PLANE][16]; }; -void vp9_subtract_block_c(int rows, int cols, - int16_t *diff, ptrdiff_t diff_stride, - const uint8_t *src, ptrdiff_t src_stride, - const uint8_t *pred, ptrdiff_t pred_stride) { - int r, c; - - for (r = 0; r < rows; r++) { - for (c = 0; c < cols; c++) - diff[c] = src[c] - pred[c]; - - diff += diff_stride; - pred += pred_stride; - src += src_stride; - } -} - -#if CONFIG_VP9_HIGHBITDEPTH -void vp9_highbd_subtract_block_c(int rows, int cols, - int16_t *diff, ptrdiff_t diff_stride, - const uint8_t *src8, ptrdiff_t src_stride, - const uint8_t *pred8, ptrdiff_t pred_stride, - int bd) { - int r, c; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - (void) bd; - - for (r = 0; r < rows; r++) { - for (c = 0; c < cols; c++) { - diff[c] = src[c] - pred[c]; - } - - diff += diff_stride; - pred += pred_stride; - src += src_stride; - } -} -#endif // CONFIG_VP9_HIGHBITDEPTH - void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane]; @@ -79,13 +41,13 @@ void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { #if CONFIG_VP9_HIGHBITDEPTH if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vp9_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, + vpx_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, x->e_mbd.bd); return; } #endif // CONFIG_VP9_HIGHBITDEPTH - vp9_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, 
p->src.stride, + vpx_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride); } @@ -838,7 +800,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, x->skip_encode ? src_stride : dst_stride, dst, dst_stride, i, j, plane); if (!x->skip_recode) { - vp9_highbd_subtract_block(32, 32, src_diff, diff_stride, + vpx_highbd_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst, dst_stride, xd->bd); highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vp9_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, @@ -859,7 +821,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, x->skip_encode ? src_stride : dst_stride, dst, dst_stride, i, j, plane); if (!x->skip_recode) { - vp9_highbd_subtract_block(16, 16, src_diff, diff_stride, + vpx_highbd_subtract_block(16, 16, src_diff, diff_stride, src, src_stride, dst, dst_stride, xd->bd); vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type); vp9_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, @@ -881,7 +843,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, x->skip_encode ? src_stride : dst_stride, dst, dst_stride, i, j, plane); if (!x->skip_recode) { - vp9_highbd_subtract_block(8, 8, src_diff, diff_stride, + vpx_highbd_subtract_block(8, 8, src_diff, diff_stride, src, src_stride, dst, dst_stride, xd->bd); vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type); vp9_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, @@ -904,7 +866,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, dst, dst_stride, i, j, plane); if (!x->skip_recode) { - vp9_highbd_subtract_block(4, 4, src_diff, diff_stride, + vpx_highbd_subtract_block(4, 4, src_diff, diff_stride, src, src_stride, dst, dst_stride, xd->bd); if (tx_type != DCT_DCT) vp9_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type); @@ -946,7 +908,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, x->skip_encode ? src_stride : dst_stride, dst, dst_stride, i, j, plane); if (!x->skip_recode) { - vp9_subtract_block(32, 32, src_diff, diff_stride, + vpx_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst, dst_stride); fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, @@ -966,7 +928,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, x->skip_encode ? src_stride : dst_stride, dst, dst_stride, i, j, plane); if (!x->skip_recode) { - vp9_subtract_block(16, 16, src_diff, diff_stride, + vpx_subtract_block(16, 16, src_diff, diff_stride, src, src_stride, dst, dst_stride); vp9_fht16x16(src_diff, coeff, diff_stride, tx_type); vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, @@ -986,7 +948,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, x->skip_encode ? 
src_stride : dst_stride, dst, dst_stride, i, j, plane); if (!x->skip_recode) { - vp9_subtract_block(8, 8, src_diff, diff_stride, + vpx_subtract_block(8, 8, src_diff, diff_stride, src, src_stride, dst, dst_stride); vp9_fht8x8(src_diff, coeff, diff_stride, tx_type); vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant, @@ -1007,7 +969,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, dst, dst_stride, i, j, plane); if (!x->skip_recode) { - vp9_subtract_block(4, 4, src_diff, diff_stride, + vpx_subtract_block(4, 4, src_diff, diff_stride, src, src_stride, dst, dst_stride); if (tx_type != DCT_DCT) vp9_fht4x4(src_diff, coeff, diff_stride, tx_type); diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index e6b7f193a..dc054f0c1 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -12,6 +12,7 @@ #include #include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" @@ -832,7 +833,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, x->skip_encode ? src : dst, x->skip_encode ? src_stride : dst_stride, dst, dst_stride, idx, idy, 0); - vp9_highbd_subtract_block(4, 4, src_diff, 8, src, src_stride, + vpx_highbd_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride, xd->bd); if (xd->lossless) { const scan_order *so = &vp9_default_scan_orders[TX_4X4]; @@ -932,7 +933,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, x->skip_encode ? src : dst, x->skip_encode ? src_stride : dst_stride, dst, dst_stride, idx, idy, 0); - vp9_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride); + vpx_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride); if (xd->lossless) { const scan_order *so = &vp9_default_scan_orders[TX_4X4]; @@ -1394,16 +1395,16 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vp9_highbd_subtract_block( + vpx_highbd_subtract_block( height, width, vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8, src, p->src.stride, dst, pd->dst.stride, xd->bd); } else { - vp9_subtract_block( + vpx_subtract_block( height, width, vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8, src, p->src.stride, dst, pd->dst.stride); } #else - vp9_subtract_block(height, width, + vpx_subtract_block(height, width, vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8, src, p->src.stride, dst, pd->dst.stride); #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 5a039accc..f670d82e7 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -114,7 +114,6 @@ endif ifeq ($(CONFIG_USE_X86INC),yes) VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) @@ -151,7 +150,6 @@ VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c endif VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_avg_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c -VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_subtract_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_avg_msa.c @@ -161,7 +159,6 @@ VP9_CX_SRCS-$(HAVE_MSA) += 
encoder/mips/msa/vp9_fdct8x8_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct32x32_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h -VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_subtract_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_temporal_filter_msa.c VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes)) diff --git a/vp9/encoder/arm/neon/vp9_subtract_neon.c b/vpx_dsp/arm/subtract_neon.c similarity index 97% rename from vp9/encoder/arm/neon/vp9_subtract_neon.c rename to vpx_dsp/arm/subtract_neon.c index b4bf567db..7b146095e 100644 --- a/vp9/encoder/arm/neon/vp9_subtract_neon.c +++ b/vpx_dsp/arm/subtract_neon.c @@ -9,12 +9,11 @@ */ #include -#include "./vp9_rtcd.h" -#include "./vpx_config.h" +#include "./vpx_config.h" #include "vpx/vpx_integer.h" -void vp9_subtract_block_neon(int rows, int cols, +void vpx_subtract_block_neon(int rows, int cols, int16_t *diff, ptrdiff_t diff_stride, const uint8_t *src, ptrdiff_t src_stride, const uint8_t *pred, ptrdiff_t pred_stride) { diff --git a/vpx_dsp/mips/macros_msa.h b/vpx_dsp/mips/macros_msa.h index 30e152bc1..8410f4bbd 100644 --- a/vpx_dsp/mips/macros_msa.h +++ b/vpx_dsp/mips/macros_msa.h @@ -24,6 +24,9 @@ #define LD_UH(...) LD_H(v8u16, __VA_ARGS__) #define LD_SH(...) LD_H(v8i16, __VA_ARGS__) +#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_SH(...) ST_H(v8i16, __VA_ARGS__) + #if (__mips_isa_rev >= 6) #define LW(psrc) ({ \ const uint8_t *psrc_m = (const uint8_t *)(psrc); \ @@ -38,6 +41,61 @@ \ val_m; \ }) + +#if (__mips == 64) +#define LD(psrc) ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint64_t val_m = 0; \ + \ + __asm__ __volatile__ ( \ + "ld %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r" (val_m) \ + : [psrc_m] "m" (*psrc_m) \ + ); \ + \ + val_m; \ +}) +#else // !(__mips == 64) +#define LD(psrc) ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint32_t val0_m, val1_m; \ + uint64_t val_m = 0; \ + \ + val0_m = LW(psrc_m); \ + val1_m = LW(psrc_m + 4); \ + \ + val_m = (uint64_t)(val1_m); \ + val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ + val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ + \ + val_m; \ +}) +#endif // (__mips == 64) + +#define SW(val, pdst) { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint32_t val_m = (val); \ + \ + __asm__ __volatile__ ( \ + "sw %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m" (*pdst_m) \ + : [val_m] "r" (val_m) \ + ); \ +} + +#define SD(val, pdst) { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint64_t val_m = (val); \ + \ + __asm__ __volatile__ ( \ + "sd %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m" (*pdst_m) \ + : [val_m] "r" (val_m) \ + ); \ +} #else // !(__mips_isa_rev >= 6) #define LW(psrc) ({ \ const uint8_t *psrc_m = (const uint8_t *)(psrc); \ @@ -52,6 +110,60 @@ \ val_m; \ }) + +#define SW(val, pdst) { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint32_t val_m = (val); \ + \ + __asm__ __volatile__ ( \ + "usw %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m" (*pdst_m) \ + : [val_m] "r" (val_m) \ + ); \ +} + +#if (__mips == 64) +#define LD(psrc) ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint64_t val_m = 0; \ + \ + __asm__ __volatile__ ( \ + "uld %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r" (val_m) \ + : [psrc_m] "m" (*psrc_m) \ + ); \ + \ + val_m; \ +}) +#else // !(__mips == 64) +#define LD(psrc) ({ \ + const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ + uint32_t val0_m, val1_m; 
\ + uint64_t val_m = 0; \ + \ + val0_m = LW(psrc_m1); \ + val1_m = LW(psrc_m1 + 4); \ + \ + val_m = (uint64_t)(val1_m); \ + val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ + val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ + \ + val_m; \ +}) +#endif // (__mips == 64) + +#define SD(val, pdst) { \ + uint8_t *pdst_m1 = (uint8_t *)(pdst); \ + uint32_t val0_m, val1_m; \ + \ + val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF); \ + val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ + \ + SW(val0_m, pdst_m1); \ + SW(val1_m, pdst_m1 + 4); \ +} #endif // (__mips_isa_rev >= 6) /* Description : Load 4 words with stride @@ -69,6 +181,21 @@ out3 = LW((psrc) + 3 * stride); \ } +/* Description : Load double words with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Details : Load double word in 'out0' from (psrc) + Load double word in 'out1' from (psrc + stride) +*/ +#define LD2(psrc, stride, out0, out1) { \ + out0 = LD((psrc)); \ + out1 = LD((psrc) + stride); \ +} +#define LD4(psrc, stride, out0, out1, out2, out3) { \ + LD2((psrc), stride, out0, out1); \ + LD2((psrc) + 2 * stride, stride, out2, out3); \ +} + /* Description : Load vectors with 16 byte elements with stride Arguments : Inputs - psrc, stride Outputs - out0, out1 @@ -81,6 +208,7 @@ out1 = LD_B(RTYPE, (psrc) + stride); \ } #define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) +#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__) #define LD_B3(RTYPE, psrc, stride, out0, out1, out2) { \ LD_B2(RTYPE, (psrc), stride, out0, out1); \ @@ -93,6 +221,7 @@ LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3); \ } #define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) +#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__) #define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) { \ LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ @@ -100,6 +229,14 @@ } #define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__) +#define LD_B8(RTYPE, psrc, stride, \ + out0, out1, out2, out3, out4, out5, out6, out7) { \ + LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ +} +#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__) +#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__) + /* Description : Load vectors with 8 halfword elements with stride Arguments : Inputs - psrc, stride Outputs - out0, out1 @@ -271,6 +408,13 @@ #define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__) #define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__) +#define INSERT_D2(RTYPE, in0, in1, out) { \ + out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \ + out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \ +} +#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__) +#define INSERT_D2_SB(...) 
INSERT_D2(v16i8, __VA_ARGS__) + /* Description : Interleave both left and right half of input vectors Arguments : Inputs - in0, in1 Outputs - out0, out1 @@ -328,4 +472,53 @@ tmp_m = __msa_clti_s_h((v8i16)in, 0); \ ILVRL_H2_SW(tmp_m, in, out0, out1); \ } + +/* Description : Store 4 double words with stride + Arguments : Inputs - in0, in1, in2, in3, pdst, stride + Details : Store double word from 'in0' to (pdst) + Store double word from 'in1' to (pdst + stride) + Store double word from 'in2' to (pdst + 2 * stride) + Store double word from 'in3' to (pdst + 3 * stride) +*/ +#define SD4(in0, in1, in2, in3, pdst, stride) { \ + SD(in0, (pdst)) \ + SD(in1, (pdst) + stride); \ + SD(in2, (pdst) + 2 * stride); \ + SD(in3, (pdst) + 3 * stride); \ +} + +/* Description : Store vectors of 8 halfword elements with stride + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 8 halfword elements from 'in0' to (pdst) + Store 8 halfword elements from 'in1' to (pdst + stride) +*/ +#define ST_H2(RTYPE, in0, in1, pdst, stride) { \ + ST_H(RTYPE, in0, (pdst)); \ + ST_H(RTYPE, in1, (pdst) + stride); \ +} +#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__) + +/* Description : Store 8x4 byte block to destination memory from input + vectors + Arguments : Inputs - in0, in1, pdst, stride + Details : Index 0 double word element from 'in0' vector is copied to the + GP register and stored to (pdst) + Index 1 double word element from 'in0' vector is copied to the + GP register and stored to (pdst + stride) + Index 0 double word element from 'in1' vector is copied to the + GP register and stored to (pdst + 2 * stride) + Index 1 double word element from 'in1' vector is copied to the + GP register and stored to (pdst + 3 * stride) +*/ +#define ST8x4_UB(in0, in1, pdst, stride) { \ + uint64_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_d((v2i64)in0, 0); \ + out1_m = __msa_copy_u_d((v2i64)in0, 1); \ + out2_m = __msa_copy_u_d((v2i64)in1, 0); \ + out3_m = __msa_copy_u_d((v2i64)in1, 1); \ + \ + SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \ +} #endif /* VPX_DSP_MIPS_MACROS_MSA_H_ */ diff --git a/vp9/encoder/mips/msa/vp9_subtract_msa.c b/vpx_dsp/mips/subtract_msa.c similarity index 97% rename from vp9/encoder/mips/msa/vp9_subtract_msa.c rename to vpx_dsp/mips/subtract_msa.c index 1b8b694ce..9ac43c5cd 100644 --- a/vp9/encoder/mips/msa/vp9_subtract_msa.c +++ b/vpx_dsp/mips/subtract_msa.c @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
 */

-#include "./vp9_rtcd.h"
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/macros_msa.h"

 static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride,
                             const uint8_t *pred_ptr, int32_t pred_stride,
@@ -226,7 +226,7 @@ static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride,
   }
 }

-void vp9_subtract_block_msa(int32_t rows, int32_t cols,
+void vpx_subtract_block_msa(int32_t rows, int32_t cols,
                             int16_t *diff_ptr, ptrdiff_t diff_stride,
                             const uint8_t *src_ptr, ptrdiff_t src_stride,
                             const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
@@ -253,12 +253,12 @@ void vp9_subtract_block_msa(int32_t rows, int32_t cols,
                           diff_ptr, diff_stride);
         break;
       default:
-        vp9_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,
+        vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,
                              src_stride, pred_ptr, pred_stride);
         break;
     }
   } else {
-    vp9_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride,
+    vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride,
                          pred_ptr, pred_stride);
   }
 }
diff --git a/vpx_dsp/subtract.c b/vpx_dsp/subtract.c
new file mode 100644
index 000000000..556e0134f
--- /dev/null
+++ b/vpx_dsp/subtract.c
@@ -0,0 +1,56 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+void vpx_subtract_block_c(int rows, int cols,
+                          int16_t *diff, ptrdiff_t diff_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          const uint8_t *pred, ptrdiff_t pred_stride) {
+  int r, c;
+
+  for (r = 0; r < rows; r++) {
+    for (c = 0; c < cols; c++)
+      diff[c] = src[c] - pred[c];
+
+    diff += diff_stride;
+    pred += pred_stride;
+    src += src_stride;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_subtract_block_c(int rows, int cols,
+                                 int16_t *diff, ptrdiff_t diff_stride,
+                                 const uint8_t *src8, ptrdiff_t src_stride,
+                                 const uint8_t *pred8, ptrdiff_t pred_stride,
+                                 int bd) {
+  int r, c;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  (void) bd;
+
+  for (r = 0; r < rows; r++) {
+    for (c = 0; c < cols; c++) {
+      diff[c] = src[c] - pred[c];
+    }
+
+    diff += diff_stride;
+    pred += pred_stride;
+    src += src_stride;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index d8ab10822..ecfaf4e01 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -12,13 +12,16 @@ DSP_SRCS-yes += vpx_dsp.mk

 ifeq ($(CONFIG_ENCODERS),yes)
 DSP_SRCS-yes += sad.c
+DSP_SRCS-yes += subtract.c

 DSP_SRCS-$(HAVE_MEDIA) += arm/sad_media$(ASM)
 DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c
 DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/subtract_neon.c

 DSP_SRCS-$(HAVE_MSA) += mips/macros_msa.h
 DSP_SRCS-$(HAVE_MSA) += mips/sad_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/subtract_msa.c

 DSP_SRCS-$(HAVE_MMX) += x86/sad_mmx.asm
 DSP_SRCS-$(HAVE_SSE3) += x86/sad_sse3.asm
@@ -30,6 +33,7 @@ DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c
 ifeq ($(CONFIG_USE_X86INC),yes)
 DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/sad_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/subtract_sse2.asm

 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index f12270c1c..8130eb782 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -36,6 +36,12 @@ if ($opts{arch} eq "x86_64") {
 }

 if (vpx_config("CONFIG_ENCODERS") eq "yes") {
+#
+# Block subtraction
+#
+add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
+specialize qw/vpx_subtract_block neon msa/, "$sse2_x86inc";
+
 #
 # Single block SAD
 #
@@ -210,6 +216,12 @@ add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const
 specialize qw/vpx_sad4x4x4d msa/, "$sse_x86inc";

 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  #
+  # Block subtraction
+  #
+  add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
+  specialize qw/vpx_highbd_subtract_block/;
+
   #
   # Single block SAD
   #
diff --git a/vp9/encoder/x86/vp9_subtract_sse2.asm b/vpx_dsp/x86/subtract_sse2.asm
similarity index 98%
rename from vp9/encoder/x86/vp9_subtract_sse2.asm
rename to vpx_dsp/x86/subtract_sse2.asm
index 982408083..a18645112 100644
--- a/vp9/encoder/x86/vp9_subtract_sse2.asm
+++ b/vpx_dsp/x86/subtract_sse2.asm
@@ -7,12 +7,13 @@
 ; in the file PATENTS.  All contributing project authors may
 ; be found in the AUTHORS file in the root of the source tree.
 ;

+%define program_name vpx
 %include "third_party/x86inc/x86inc.asm"

 SECTION .text

-; void vp9_subtract_block(int rows, int cols,
+; void vpx_subtract_block(int rows, int cols,
 ;                         int16_t *diff, ptrdiff_t diff_stride,
 ;                         const uint8_t *src, ptrdiff_t src_stride,
 ;                         const uint8_t *pred, ptrdiff_t pred_stride)
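
Usage sketch (illustrative, not part of the patch): after this change an encoder-side caller reaches the factored-out routine through the vpx_dsp RTCD dispatch, which resolves vpx_subtract_block to the C, SSE2, NEON, or MSA specialization registered in vpx_dsp_rtcd_defs.pl above. The helper name and the fixed 8x8 block size below are hypothetical; the vpx_subtract_block prototype is the one declared in the patch.

#include <stddef.h>
#include <stdint.h>

#include "./vpx_dsp_rtcd.h"  /* declares vpx_subtract_block() */

/* Hypothetical helper: compute the residual diff = src - pred for one
 * 8x8 block.  Strides are in elements of the respective buffers, and the
 * diff buffer is assumed to be a contiguous 8x8 block (stride of 8). */
static void example_residual_8x8(const uint8_t *src, ptrdiff_t src_stride,
                                 const uint8_t *pred, ptrdiff_t pred_stride,
                                 int16_t *diff) {
  vpx_subtract_block(8, 8, diff, 8, src, src_stride, pred, pred_stride);
}

The high-bit-depth variant vpx_highbd_subtract_block takes the same arguments plus the bit depth, with src and pred passed as uint8_t handles wrapping uint16_t sample data (unwrapped internally via CONVERT_TO_SHORTPTR, as shown in vpx_dsp/subtract.c).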