diff --git a/PATENTS b/PATENTS index 4414d8385..79d17d7d6 100644 --- a/PATENTS +++ b/PATENTS @@ -1,22 +1,23 @@ Additional IP Rights Grant (Patents) +------------------------------------ -"This implementation" means the copyrightable works distributed by -Google as part of the WebM Project. +"These implementations" means the copyrightable works that implement the WebM +codecs distributed by Google as part of the WebM Project. -Google hereby grants to you a perpetual, worldwide, non-exclusive, -no-charge, royalty-free, irrevocable (except as stated in this section) -patent license to make, have made, use, offer to sell, sell, import, -transfer, and otherwise run, modify and propagate the contents of this -implementation of VP8, where such license applies only to those patent -claims, both currently owned by Google and acquired in the future, -licensable by Google that are necessarily infringed by this -implementation of VP8. This grant does not include claims that would be -infringed only as a consequence of further modification of this -implementation. If you or your agent or exclusive licensee institute or -order or agree to the institution of patent litigation against any -entity (including a cross-claim or counterclaim in a lawsuit) alleging -that this implementation of VP8 or any code incorporated within this -implementation of VP8 constitutes direct or contributory patent -infringement, or inducement of patent infringement, then any patent -rights granted to you under this License for this implementation of VP8 -shall terminate as of the date such litigation is filed. +Google hereby grants to you a perpetual, worldwide, non-exclusive, no-charge, +royalty-free, irrevocable (except as stated in this section) patent license to +make, have made, use, offer to sell, sell, import, transfer, and otherwise +run, modify and propagate the contents of these implementations of WebM, where +such license applies only to those patent claims, both currently owned by +Google and acquired in the future, licensable by Google that are necessarily +infringed by these implementations of WebM. This grant does not include claims +that would be infringed only as a consequence of further modification of these +implementations. If you or your agent or exclusive licensee institute or order +or agree to the institution of patent litigation or any other patent +enforcement activity against any entity (including a cross-claim or +counterclaim in a lawsuit) alleging that any of these implementations of WebM +or any code incorporated within any of these implementations of WebM +constitutes direct or contributory patent infringement, or inducement of +patent infringement, then any patent rights granted to you under this License +for these implementations of WebM shall terminate as of the date such +litigation is filed. 
diff --git a/build/make/Makefile b/build/make/Makefile index dc61429a9..ed90397f0 100644 --- a/build/make/Makefile +++ b/build/make/Makefile @@ -118,20 +118,26 @@ testdata:: utiltest: # Add compiler flags for intrinsic files +ifeq ($(TOOLCHAIN), x86-os2-gcc) +STACKREALIGN=-mstackrealign +else +STACKREALIGN= +endif + $(BUILD_PFX)%_mmx.c.d: CFLAGS += -mmmx $(BUILD_PFX)%_mmx.c.o: CFLAGS += -mmmx -$(BUILD_PFX)%_sse2.c.d: CFLAGS += -msse2 -$(BUILD_PFX)%_sse2.c.o: CFLAGS += -msse2 -$(BUILD_PFX)%_sse3.c.d: CFLAGS += -msse3 -$(BUILD_PFX)%_sse3.c.o: CFLAGS += -msse3 -$(BUILD_PFX)%_ssse3.c.d: CFLAGS += -mssse3 -$(BUILD_PFX)%_ssse3.c.o: CFLAGS += -mssse3 -$(BUILD_PFX)%_sse4.c.d: CFLAGS += -msse4.1 -$(BUILD_PFX)%_sse4.c.o: CFLAGS += -msse4.1 -$(BUILD_PFX)%_avx.c.d: CFLAGS += -mavx -$(BUILD_PFX)%_avx.c.o: CFLAGS += -mavx -$(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2 -$(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2 +$(BUILD_PFX)%_sse2.c.d: CFLAGS += -msse2 $(STACKREALIGN) +$(BUILD_PFX)%_sse2.c.o: CFLAGS += -msse2 $(STACKREALIGN) +$(BUILD_PFX)%_sse3.c.d: CFLAGS += -msse3 $(STACKREALIGN) +$(BUILD_PFX)%_sse3.c.o: CFLAGS += -msse3 $(STACKREALIGN) +$(BUILD_PFX)%_ssse3.c.d: CFLAGS += -mssse3 $(STACKREALIGN) +$(BUILD_PFX)%_ssse3.c.o: CFLAGS += -mssse3 $(STACKREALIGN) +$(BUILD_PFX)%_sse4.c.d: CFLAGS += -msse4.1 $(STACKREALIGN) +$(BUILD_PFX)%_sse4.c.o: CFLAGS += -msse4.1 $(STACKREALIGN) +$(BUILD_PFX)%_avx.c.d: CFLAGS += -mavx $(STACKREALIGN) +$(BUILD_PFX)%_avx.c.o: CFLAGS += -mavx $(STACKREALIGN) +$(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2 $(STACKREALIGN) +$(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2 $(STACKREALIGN) $(BUILD_PFX)%.c.d: %.c $(if $(quiet),@echo " [DEP] $@") @@ -196,13 +202,13 @@ $(BUILD_PFX)%.asm.s: %.asm # the copy implementation HAVE_GNU_STRIP := $(if $(CONFIG_DEBUG),,$(HAVE_GNU_STRIP)) ifeq ($(HAVE_GNU_STRIP),yes) -# Older binutils strip global sybols not needed for relocation processing -# when given --strip-unneeded. Use nm and awk to identify globals and -# keep them. +# Older binutils strip global symbols not needed for relocation processing +# when given --strip-unneeded. Using nm and awk to identify globals and +# keep them caused command line length issues under mingw and segfaults in +# test_libvpx were observed under OS/2: simply use --strip-debug. %.a: %_g.a $(if $(quiet),@echo " [STRIP] $@ < $<") - $(qexec)$(STRIP) --strip-unneeded \ - `$(NM) $< | grep ' [A-TV-Z] ' | awk '{print "-K"$$3'}`\ + $(qexec)$(STRIP) --strip-debug \ -o $@ $< else %.a: %_g.a diff --git a/build/make/configure.sh b/build/make/configure.sh old mode 100755 new mode 100644 index f22e3e0ac..ab6687f73 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -252,7 +252,7 @@ tolower(){ # source_path=${0%/*} enable_feature source_path_used -if test -z "$source_path" -o "$source_path" = "." ; then +if [ -z "$source_path" ] || [ "$source_path" = "." ]; then source_path="`pwd`" disable_feature source_path_used fi @@ -381,8 +381,8 @@ EOF # tests for -m$1 toggling the feature given in $2. If $2 is empty $1 is used. check_gcc_machine_option() { - local opt="$1" - local feature="$2" + opt="$1" + feature="$2" [ -n "$feature" ] || feature="$opt" if enabled gcc && ! disabled "$feature" && ! 
check_cflags "-m$opt"; then @@ -419,8 +419,8 @@ true } write_common_target_config_mk() { - local CC="${CC}" - local CXX="${CXX}" + saved_CC="${CC}" + saved_CXX="${CXX}" enabled ccache && CC="ccache ${CC}" enabled ccache && CXX="ccache ${CXX}" print_webm_license $1 "##" "" @@ -470,6 +470,8 @@ EOF enabled msvs && echo "CONFIG_VS_VERSION=${vs_version}" >> "${1}" + CC="${saved_CC}" + CXX="${saved_CXX}" } @@ -547,7 +549,8 @@ process_common_cmdline() { alt_libc="${optval}" ;; --as=*) - [ "${optval}" = yasm -o "${optval}" = nasm -o "${optval}" = auto ] \ + [ "${optval}" = yasm ] || [ "${optval}" = nasm ] \ + || [ "${optval}" = auto ] \ || die "Must be yasm, nasm or auto: ${optval}" alt_as="${optval}" ;; @@ -555,8 +558,8 @@ process_common_cmdline() { w="${optval%%x*}" h="${optval##*x}" VAR_LIST="DECODE_WIDTH_LIMIT ${w} DECODE_HEIGHT_LIMIT ${h}" - [ ${w} -gt 0 -a ${h} -gt 0 ] || die "Invalid size-limit: too small." - [ ${w} -lt 65536 -a ${h} -lt 65536 ] \ + [ ${w} -gt 0 ] && [ ${h} -gt 0 ] || die "Invalid size-limit: too small." + [ ${w} -lt 65536 ] && [ ${h} -lt 65536 ] \ || die "Invalid size-limit: too big." enable_feature size_limit ;; @@ -1150,7 +1153,7 @@ EOF auto|"") which nasm >/dev/null 2>&1 && AS=nasm which yasm >/dev/null 2>&1 && AS=yasm - [ "${AS}" = auto -o -z "${AS}" ] \ + [ "${AS}" = auto ] || [ -z "${AS}" ] \ && die "Neither yasm nor nasm have been found" ;; esac @@ -1222,7 +1225,12 @@ EOF fi fi - enabled debug && check_add_cflags -g && check_add_ldflags -g + if enabled debug; then + check_add_cflags -g && check_add_ldflags -g + else + check_add_cflags -DNDEBUG + fi + enabled gprof && check_add_cflags -pg && check_add_ldflags -pg enabled gcov && check_add_cflags -fprofile-arcs -ftest-coverage && @@ -1309,8 +1317,9 @@ process_toolchain() { } print_config_mk() { - local prefix=$1 - local makefile=$2 + saved_prefix="${prefix}" + prefix=$1 + makefile=$2 shift 2 for cfg; do if enabled $cfg; then @@ -1318,11 +1327,13 @@ echo "${prefix}_${upname}=yes" >> $makefile fi done + prefix="${saved_prefix}" } print_config_h() { - local prefix=$1 - local header=$2 + saved_prefix="${prefix}" + prefix=$1 + header=$2 shift 2 for cfg; do upname="`toupper $cfg`" @@ -1332,10 +1343,11 @@ echo "#define ${prefix}_${upname} 0" >> $header fi done + prefix="${saved_prefix}" } print_config_vars_h() { - local header=$1 + header=$1 shift while [ $# -gt 0 ]; do upname="`toupper $1`" @@ -1345,9 +1357,10 @@ } print_webm_license() { - local destination=$1 - local prefix="$2" - local suffix="$3" + saved_prefix="${prefix}" + destination=$1 + prefix="$2" + suffix="$3" shift 3 cat <<EOF > ${destination} ${prefix} Copyright (c) 2011 The WebM project authors. All Rights Reserved.${suffix} @@ -1358,6 +1371,7 @@ ${prefix} tree. An additional intellectual property rights grant can be found${s ${prefix} in the file PATENTS.
All contributing project authors may${suffix} ${prefix} be found in the AUTHORS file in the root of the source tree.${suffix} EOF + prefix="${saved_prefix}" } process_targets() { diff --git a/build/make/rtcd.pl b/build/make/rtcd.pl index f5f59b146..28ef69c23 100755 --- a/build/make/rtcd.pl +++ b/build/make/rtcd.pl @@ -3,7 +3,7 @@ no strict 'refs'; use warnings; use Getopt::Long; -Getopt::Long::Configure("auto_help"); +Getopt::Long::Configure("auto_help") if $Getopt::Long::VERSION > 2.32; my %ALL_FUNCS = (); my @ALL_ARCHS; @@ -385,6 +385,8 @@ if ($opts{arch} eq 'x86') { arm; } elsif ($opts{arch} eq 'armv7') { @ALL_ARCHS = filter(qw/edsp media neon_asm neon/); + @REQUIRES = filter(keys %required ? keys %required : qw/media/); + &require(@REQUIRES); arm; } elsif ($opts{arch} eq 'armv8') { @ALL_ARCHS = filter(qw/neon/); diff --git a/configure b/configure index d5be3a2fd..2ac43c109 100755 --- a/configure +++ b/configure @@ -25,6 +25,7 @@ Advanced options: ${toggle_docs} documentation ${toggle_unit_tests} unit tests ${toggle_decode_perf_tests} build decoder perf tests with unit tests + ${toggle_encode_perf_tests} build encoder perf tests with unit tests --libc=PATH path to alternate libc --size-limit=WxH max size to allow in the decoder --as={yasm|nasm|auto} use specified assembler [auto, yasm preferred] @@ -45,6 +46,9 @@ Advanced options: ${toggle_realtime_only} enable this option while building for real-time encoding ${toggle_onthefly_bitpacking} enable on-the-fly bitpacking in real-time encoding ${toggle_error_concealment} enable this option to get a decoder which is able to conceal losses + ${toggle_coefficient_range_checking} + enable decoder to check if intermediate + transform coefficients are in valid range ${toggle_runtime_cpu_detect} runtime cpu detection ${toggle_shared} shared library support ${toggle_static} static library support @@ -66,10 +70,10 @@ Codecs: EOF #restore editor state ' - local family; - local last_family; - local c; - local str; + family=""; + last_family=""; + c=""; + str=""; for c in ${CODECS}; do family=${c%_*} if [ "${family}" != "${last_family}" ]; then @@ -271,12 +275,10 @@ HAVE_LIST=" unistd_h " EXPERIMENT_LIST=" - alpha vp9_high - multiple_arf spatial_svc high_quant - denoising + vp9_temporal_denoising emulate_hw_high fp_mb_stats " @@ -328,8 +330,10 @@ CONFIG_LIST=" webm_io libyuv decode_perf_tests + encode_perf_tests multi_res_encoding temporal_denoising + coefficient_range_checking experimental size_limit ${EXPERIMENT_LIST} @@ -384,8 +388,10 @@ CMDLINE_SELECT=" webm_io libyuv decode_perf_tests + encode_perf_tests multi_res_encoding temporal_denoising + coefficient_range_checking experimental " @@ -413,7 +419,7 @@ process_cmdline() { } post_process_cmdline() { - local c + c="" # If the codec family is disabled, disable all components of that family. # If the codec family is enabled, enable all components of that family. 
@@ -460,8 +466,8 @@ process_targets() { enabled universal && echo "FAT_ARCHS=${fat_bin_archs}" >> config.mk # Calculate the default distribution name, based on the enabled features - local cf - local DIST_DIR=vpx + cf="" + DIST_DIR=vpx for cf in $CODEC_FAMILIES; do if enabled ${cf}_encoder && enabled ${cf}_decoder; then DIST_DIR="${DIST_DIR}-${cf}" @@ -483,7 +489,7 @@ process_targets() { ;; esac if [ -f "${source_path}/build/make/version.sh" ]; then - local ver=`"$source_path/build/make/version.sh" --bare "$source_path"` + ver=`"$source_path/build/make/version.sh" --bare "$source_path"` DIST_DIR="${DIST_DIR}-${ver}" VERSION_STRING=${ver} ver=${ver%%-*} @@ -517,7 +523,7 @@ EOF # Write makefiles for all enabled targets # for tgt in libs examples docs solution; do - local tgt_fn="$tgt-$toolchain.mk" + tgt_fn="$tgt-$toolchain.mk" if enabled $tgt; then echo "Creating makefiles for ${toolchain} ${tgt}" @@ -556,7 +562,7 @@ process_detect() { true; ;; *) - local result=false + result=false for d in "$@"; do [ -f "${d##-I}/$header" ] && result=true && break done @@ -605,7 +611,7 @@ process_toolchain() { # Handle universal binaries for this architecture case $toolchain in universal-darwin*) - local darwin_ver=${tgt_os##darwin} + darwin_ver=${tgt_os##darwin} # Snow Leopard (10.6/darwin10) dropped support for PPC # Include PPC support for all prior versions diff --git a/examples.mk b/examples.mk index a47db04ae..91bd45aa4 100644 --- a/examples.mk +++ b/examples.mk @@ -9,8 +9,12 @@ ## LIBYUV_SRCS += third_party/libyuv/include/libyuv/basic_types.h \ + third_party/libyuv/include/libyuv/convert.h \ + third_party/libyuv/include/libyuv/convert_argb.h \ + third_party/libyuv/include/libyuv/convert_from.h \ third_party/libyuv/include/libyuv/cpu_id.h \ third_party/libyuv/include/libyuv/planar_functions.h \ + third_party/libyuv/include/libyuv/rotate.h \ third_party/libyuv/include/libyuv/row.h \ third_party/libyuv/include/libyuv/scale.h \ third_party/libyuv/include/libyuv/scale_row.h \ @@ -20,14 +24,15 @@ LIBYUV_SRCS += third_party/libyuv/include/libyuv/basic_types.h \ third_party/libyuv/source/row_common.cc \ third_party/libyuv/source/row_mips.cc \ third_party/libyuv/source/row_neon.cc \ + third_party/libyuv/source/row_neon64.cc \ third_party/libyuv/source/row_posix.cc \ third_party/libyuv/source/row_win.cc \ - third_party/libyuv/source/scale.cc \ + third_party/libyuv/source/scale.cc \ third_party/libyuv/source/scale_common.cc \ third_party/libyuv/source/scale_mips.cc \ third_party/libyuv/source/scale_neon.cc \ third_party/libyuv/source/scale_posix.cc \ - third_party/libyuv/source/scale_win.cc + third_party/libyuv/source/scale_win.cc \ LIBWEBM_MUXER_SRCS += third_party/libwebm/mkvmuxer.cpp \ third_party/libwebm/mkvmuxerutil.cpp \ @@ -210,17 +215,18 @@ endif # from an installed tree or a version controlled tree. Determine # the proper paths. ifeq ($(HAVE_ALT_TREE_LAYOUT),yes) - LIB_PATH := $(SRC_PATH_BARE)/../lib - INC_PATH := $(SRC_PATH_BARE)/../include + LIB_PATH-yes := $(SRC_PATH_BARE)/../lib + INC_PATH-yes := $(SRC_PATH_BARE)/../include else LIB_PATH-yes += $(if $(BUILD_PFX),$(BUILD_PFX),.) 
INC_PATH-$(CONFIG_VP8_DECODER) += $(SRC_PATH_BARE)/vp8 INC_PATH-$(CONFIG_VP8_ENCODER) += $(SRC_PATH_BARE)/vp8 INC_PATH-$(CONFIG_VP9_DECODER) += $(SRC_PATH_BARE)/vp9 INC_PATH-$(CONFIG_VP9_ENCODER) += $(SRC_PATH_BARE)/vp9 - LIB_PATH := $(call enabled,LIB_PATH) - INC_PATH := $(call enabled,INC_PATH) endif +INC_PATH-$(CONFIG_LIBYUV) += $(SRC_PATH_BARE)/third_party/libyuv/include +LIB_PATH := $(call enabled,LIB_PATH) +INC_PATH := $(call enabled,INC_PATH) INTERNAL_CFLAGS = $(addprefix -I,$(INC_PATH)) INTERNAL_LDFLAGS += $(addprefix -L,$(LIB_PATH)) diff --git a/examples/simple_encoder.c b/examples/simple_encoder.c index dc9bc06b1..4c1b60bad 100644 --- a/examples/simple_encoder.c +++ b/examples/simple_encoder.c @@ -118,11 +118,12 @@ void usage_exit() { exit(EXIT_FAILURE); } -static void encode_frame(vpx_codec_ctx_t *codec, - vpx_image_t *img, - int frame_index, - int flags, - VpxVideoWriter *writer) { +static int encode_frame(vpx_codec_ctx_t *codec, + vpx_image_t *img, + int frame_index, + int flags, + VpxVideoWriter *writer) { + int got_pkts = 0; vpx_codec_iter_t iter = NULL; const vpx_codec_cx_pkt_t *pkt = NULL; const vpx_codec_err_t res = vpx_codec_encode(codec, img, frame_index, 1, @@ -131,6 +132,8 @@ static void encode_frame(vpx_codec_ctx_t *codec, die_codec(codec, "Failed to encode frame"); while ((pkt = vpx_codec_get_cx_data(codec, &iter)) != NULL) { + got_pkts = 1; + if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) { const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0; if (!vpx_video_writer_write_frame(writer, @@ -139,11 +142,12 @@ static void encode_frame(vpx_codec_ctx_t *codec, pkt->data.frame.pts)) { die_codec(codec, "Failed to write compressed frame"); } - printf(keyframe ? "K" : "."); fflush(stdout); } } + + return got_pkts; } int main(int argc, char **argv) { @@ -230,13 +234,16 @@ int main(int argc, char **argv) { if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0)) die_codec(&codec, "Failed to initialize encoder"); + // Encode frames. while (vpx_img_read(&raw, infile)) { int flags = 0; if (keyframe_interval > 0 && frame_count % keyframe_interval == 0) flags |= VPX_EFLAG_FORCE_KF; encode_frame(&codec, &raw, frame_count++, flags, writer); } - encode_frame(&codec, NULL, -1, 0, writer); // flush the encoder + + // Flush encoder. 
+ while (encode_frame(&codec, NULL, -1, 0, writer)) {} printf("\n"); fclose(infile); diff --git a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c index 7872048b1..2720088b3 100644 --- a/examples/vpx_temporal_svc_encoder.c +++ b/examples/vpx_temporal_svc_encoder.c @@ -636,7 +636,7 @@ int main(int argc, char **argv) { if (strncmp(encoder->name, "vp8", 3) == 0) { vpx_codec_control(&codec, VP8E_SET_CPUUSED, -speed); - vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOnYUV); + vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOnYOnly); } else if (strncmp(encoder->name, "vp9", 3) == 0) { vpx_codec_control(&codec, VP8E_SET_CPUUSED, speed); vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3); diff --git a/libs.mk b/libs.mk index 1e01639c7..25fbc2cb9 100644 --- a/libs.mk +++ b/libs.mk @@ -409,12 +409,16 @@ $(LIBVPX_TEST_DATA): $(SRC_PATH_BARE)/test/test-data.sha1 curl -L -o $@ $(call libvpx_test_data_url,$(@F)) testdata:: $(LIBVPX_TEST_DATA) - $(qexec)if [ -x "$$(which sha1sum)" ]; then\ + $(qexec)[ -x "$$(which sha1sum)" ] && sha1sum=sha1sum;\ + [ -x "$$(which shasum)" ] && sha1sum=shasum;\ + [ -x "$$(which sha1)" ] && sha1sum=sha1;\ + if [ -n "$${sha1sum}" ]; then\ + set -e;\ echo "Checking test data:";\ if [ -n "$(LIBVPX_TEST_DATA)" ]; then\ for f in $(call enabled,LIBVPX_TEST_DATA); do\ grep $$f $(SRC_PATH_BARE)/test/test-data.sha1 |\ - (cd $(LIBVPX_TEST_DATA_PATH); sha1sum -c);\ + (cd $(LIBVPX_TEST_DATA_PATH); $${sha1sum} -c);\ done; \ fi; \ else\ diff --git a/test/convolve_test.cc b/test/convolve_test.cc index 1456791a1..fdc722797 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -21,29 +21,28 @@ #include "vpx_ports/mem.h" namespace { -typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h); +typedef void (*ConvolveFunc)(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h); struct ConvolveFunctions { - ConvolveFunctions(convolve_fn_t h8, convolve_fn_t h8_avg, - convolve_fn_t v8, convolve_fn_t v8_avg, - convolve_fn_t hv8, convolve_fn_t hv8_avg, int bps) + ConvolveFunctions(ConvolveFunc h8, ConvolveFunc h8_avg, + ConvolveFunc v8, ConvolveFunc v8_avg, + ConvolveFunc hv8, ConvolveFunc hv8_avg, int bps) : h8_(h8), v8_(v8), hv8_(hv8), h8_avg_(h8_avg), v8_avg_(v8_avg), hv8_avg_(hv8_avg), use_high_bps_(bps) {} - - convolve_fn_t h8_; - convolve_fn_t v8_; - convolve_fn_t hv8_; - convolve_fn_t h8_avg_; - convolve_fn_t v8_avg_; - convolve_fn_t hv8_avg_; + ConvolveFunc h8_; + ConvolveFunc v8_; + ConvolveFunc hv8_; + ConvolveFunc h8_avg_; + ConvolveFunc v8_avg_; + ConvolveFunc hv8_avg_; int use_high_bps_; }; -typedef std::tr1::tuple<int, int, const ConvolveFunctions *> convolve_param_t; +typedef std::tr1::tuple<int, int, const ConvolveFunctions *> ConvolveParam; // Reference 8-tap subpixel filter, slightly modified to fit into this test.
#define VP9_FILTER_WEIGHT 128 @@ -171,14 +170,14 @@ void filter_average_block2d_8_c(const uint8_t *src_ptr, } void high_filter_block2d_8_c(const uint16_t *src_ptr, - const unsigned int src_stride, - const int16_t *HFilter, - const int16_t *VFilter, - uint16_t *dst_ptr, - unsigned int dst_stride, - unsigned int output_width, - unsigned int output_height, - int bps) { + const unsigned int src_stride, + const int16_t *HFilter, + const int16_t *VFilter, + uint16_t *dst_ptr, + unsigned int dst_stride, + unsigned int output_width, + unsigned int output_height, + int bps) { // Between passes, we use an intermediate buffer whose height is extended to // have enough horizontally filtered values as input for the vertical pass. // This buffer is allocated to be big enough for the largest block type we @@ -255,12 +254,12 @@ void high_filter_block2d_8_c(const uint16_t *src_ptr, } void high_block2d_average_c(uint16_t *src, - unsigned int src_stride, - uint16_t *output_ptr, - unsigned int output_stride, - unsigned int output_width, - unsigned int output_height, - int bps) { + unsigned int src_stride, + uint16_t *output_ptr, + unsigned int output_stride, + unsigned int output_width, + unsigned int output_height, + int bps) { unsigned int i, j; for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; ++j) { @@ -271,14 +270,14 @@ void high_block2d_average_c(uint16_t *src, } void high_filter_average_block2d_8_c(const uint16_t *src_ptr, - const unsigned int src_stride, - const int16_t *HFilter, - const int16_t *VFilter, - uint16_t *dst_ptr, - unsigned int dst_stride, - unsigned int output_width, - unsigned int output_height, - int bps) { + const unsigned int src_stride, + const int16_t *HFilter, + const int16_t *VFilter, + uint16_t *dst_ptr, + unsigned int dst_stride, + unsigned int output_width, + unsigned int output_height, + int bps) { uint16_t tmp[64 * 64]; assert(output_width <= 64); @@ -289,7 +288,7 @@ void high_filter_average_block2d_8_c(const uint16_t *src_ptr, output_width, output_height, bps); } -class ConvolveTest : public ::testing::TestWithParam<convolve_param_t> { +class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> { public: static void SetUpTestCase() { // Force input_ to be unaligned, output to be 16 byte aligned. diff --git a/test/cq_test.cc b/test/cq_test.cc index 7da7b80aa..4e8019a87 100644 --- a/test/cq_test.cc +++ b/test/cq_test.cc @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ #include <cmath> +#include <map> #include "third_party/googletest/src/include/gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" @@ -24,6 +25,28 @@ const unsigned int kCQTargetBitrate = 2000; class CQTest : public ::libvpx_test::EncoderTest, public ::libvpx_test::CodecTestWithParam<int> { + public: + // maps the cqlevel to the bitrate produced.
+ typedef std::map<int, uint32_t> BitrateMap; + + static void SetUpTestCase() { + bitrates_.clear(); + } + + static void TearDownTestCase() { + ASSERT_TRUE(!HasFailure()) + << "skipping bitrate validation due to earlier failure."; + uint32_t prev_actual_bitrate = kCQTargetBitrate; + for (BitrateMap::const_iterator iter = bitrates_.begin(); + iter != bitrates_.end(); ++iter) { + const uint32_t cq_actual_bitrate = iter->second; + EXPECT_LE(cq_actual_bitrate, prev_actual_bitrate) + << "cq_level: " << iter->first + << ", bitrate should decrease with increase in CQ level."; + prev_actual_bitrate = cq_actual_bitrate; + } + } + protected: CQTest() : EncoderTest(GET_PARAM(0)), cq_level_(GET_PARAM(1)) { init_flags_ = VPX_CODEC_USE_PSNR; @@ -66,9 +89,12 @@ class CQTest : public ::libvpx_test::EncoderTest, return pow(10.0, avg_psnr / 10.0) / file_size_; } + int cq_level() const { return cq_level_; } size_t file_size() const { return file_size_; } int n_frames() const { return n_frames_; } + static BitrateMap bitrates_; + private: int cq_level_; size_t file_size_; @@ -76,7 +102,8 @@ class CQTest : public ::libvpx_test::EncoderTest, int n_frames_; }; -unsigned int prev_actual_bitrate = kCQTargetBitrate; +CQTest::BitrateMap CQTest::bitrates_; + TEST_P(CQTest, LinearPSNRIsHigherForCQLevel) { const vpx_rational timebase = { 33333333, 1000000000 }; cfg_.g_timebase = timebase; @@ -91,8 +118,7 @@ TEST_P(CQTest, LinearPSNRIsHigherForCQLevel) { const unsigned int cq_actual_bitrate = static_cast<unsigned int>(file_size()) * 8 * 30 / (n_frames() * 1000); EXPECT_LE(cq_actual_bitrate, kCQTargetBitrate); - EXPECT_LE(cq_actual_bitrate, prev_actual_bitrate); - prev_actual_bitrate = cq_actual_bitrate; + bitrates_[cq_level()] = cq_actual_bitrate; // try targeting the approximate same bitrate with VBR mode cfg_.rc_end_usage = VPX_VBR; diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc index fdc685804..94fa08331 100644 --- a/test/dct16x16_test.cc +++ b/test/dct16x16_test.cc @@ -259,15 +259,15 @@ void reference_16x16_dct_2d(int16_t input[256], double output[256]) { } } -typedef void (*fdct_t)(const int16_t *in, tran_low_t *out, int stride); -typedef void (*idct_t)(const tran_low_t *in, uint8_t *out, int stride); -typedef void (*fht_t) (const int16_t *in, tran_low_t *out, int stride, - int tx_type); -typedef void (*iht_t) (const tran_low_t *in, uint8_t *out, int stride, - int tx_type); +typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride); +typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride); +typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride, + int tx_type); +typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride, + int tx_type); -typedef std::tr1::tuple<fdct_t, idct_t, int, int> dct_16x16_param_t; -typedef std::tr1::tuple<fht_t, iht_t, int, int> ht_16x16_param_t; +typedef std::tr1::tuple<FdctFunc, IdctFunc, int, int> Dct16x16Param; +typedef std::tr1::tuple<FhtFunc, IhtFunc, int, int> Ht16x16Param; void fdct16x16_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) { @@ -550,15 +550,15 @@ class Trans16x16TestBase { } int pitch_; int tx_type_; - fht_t fwd_txfm_ref; int bit_depth_; int mask_; - iht_t inv_txfm_ref; + FhtFunc fwd_txfm_ref; + IhtFunc inv_txfm_ref; }; class Trans16x16DCT : public Trans16x16TestBase, - public ::testing::TestWithParam<dct_16x16_param_t> { + public ::testing::TestWithParam<Dct16x16Param> { public: virtual ~Trans16x16DCT() {} @@ -596,8 +596,8 @@ class Trans16x16DCT inv_txfm_(out, dst, stride); } - fdct_t fwd_txfm_; - idct_t inv_txfm_; + FdctFunc fwd_txfm_; + IdctFunc inv_txfm_; }; TEST_P(Trans16x16DCT, AccuracyCheck) { @@ -624,7 +624,7 @@ TEST_P(Trans16x16DCT, 
InvAccuracyCheck) { class Trans16x16HT : public Trans16x16TestBase, - public ::testing::TestWithParam<ht_16x16_param_t> { + public ::testing::TestWithParam<Ht16x16Param> { public: virtual ~Trans16x16HT() {} @@ -662,8 +662,8 @@ class Trans16x16HT inv_txfm_(out, dst, stride, tx_type_); } - fht_t fwd_txfm_; - iht_t inv_txfm_; + FhtFunc fwd_txfm_; + IhtFunc inv_txfm_; }; TEST_P(Trans16x16HT, AccuracyCheck) { @@ -755,29 +755,4 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values( make_tuple(&vp9_fdct16x16_c, &vp9_idct16x16_256_add_ssse3, 0, 8))); #endif - -#if HAVE_AVX2 && !CONFIG_VP9_HIGH -// TODO(jzern): these prototypes can be removed after the avx2 versions are -// reenabled in vp9_rtcd_defs.pl. -extern "C" { -void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride); -void vp9_fht16x16_avx2(const int16_t *input, int16_t *output, int stride, - int tx_type); -} -INSTANTIATE_TEST_CASE_P( - DISABLED_AVX2, Trans16x16DCT, - ::testing::Values( - make_tuple(&vp9_fdct16x16_avx2, - &vp9_idct16x16_256_add_c, 0, 8))); -INSTANTIATE_TEST_CASE_P( - AVX2, Trans16x16HT, - ::testing::Values( - make_tuple(&vp9_fht16x16_avx2, &vp9_iht16x16_256_add_c, 3, 8))); -INSTANTIATE_TEST_CASE_P( - DISABLED_AVX2, Trans16x16HT, - ::testing::Values( - make_tuple(&vp9_fht16x16_avx2, &vp9_iht16x16_256_add_c, 0, 8), - make_tuple(&vp9_fht16x16_avx2, &vp9_iht16x16_256_add_c, 1, 8), - make_tuple(&vp9_fht16x16_avx2, &vp9_iht16x16_256_add_c, 2, 8))); -#endif } // namespace diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc index 597a808ec..a05e09826 100644 --- a/test/dct32x32_test.cc +++ b/test/dct32x32_test.cc @@ -71,10 +71,10 @@ void reference_32x32_dct_2d(const int16_t input[kNumCoeffs], } } -typedef void (*fwd_txfm_t)(const int16_t *in, tran_low_t *out, int stride); -typedef void (*inv_txfm_t)(const tran_low_t *in, uint8_t *out, int stride); +typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride); +typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride); -typedef std::tr1::tuple<fwd_txfm_t, inv_txfm_t, int, int> trans_32x32_param_t; +typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmFunc, int, int> Trans32x32Param; #if CONFIG_VP9_HIGH @@ -88,7 +88,7 @@ void idct32x32_12(const tran_low_t *in, uint8_t *out, int stride) { #endif -class Trans32x32Test : public ::testing::TestWithParam<trans_32x32_param_t> { +class Trans32x32Test : public ::testing::TestWithParam<Trans32x32Param> { public: virtual ~Trans32x32Test() {} virtual void SetUp() { @@ -106,8 +106,8 @@ class Trans32x32Test : public ::testing::TestWithParam<trans_32x32_param_t> { int version_; int bit_depth_; int mask_; - fwd_txfm_t fwd_txfm_; - inv_txfm_t inv_txfm_; + FwdTxfmFunc fwd_txfm_; + InvTxfmFunc inv_txfm_; }; TEST_P(Trans32x32Test, AccuracyCheck) { diff --git a/test/decode_api_test.cc b/test/decode_api_test.cc new file mode 100644 index 000000000..2837f8cbe --- /dev/null +++ b/test/decode_api_test.cc @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "test/ivf_video_source.h" +#include "./vpx_config.h" +#include "vpx/vp8dx.h" +#include "vpx/vpx_decoder.h" + +namespace { + +#define NELEMENTS(x) static_cast<int>(sizeof(x) / sizeof(x[0])) + +TEST(DecodeAPI, InvalidParams) { + static const vpx_codec_iface_t *kCodecs[] = { +#if CONFIG_VP8_DECODER + &vpx_codec_vp8_dx_algo, +#endif +#if CONFIG_VP9_DECODER + &vpx_codec_vp9_dx_algo, +#endif + }; + uint8_t buf[1] = {0}; + vpx_codec_ctx_t dec; + + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_dec_init(NULL, NULL, NULL, 0)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_dec_init(&dec, NULL, NULL, 0)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_decode(NULL, NULL, 0, NULL, 0)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_decode(NULL, buf, 0, NULL, 0)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_decode(NULL, buf, NELEMENTS(buf), NULL, 0)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_decode(NULL, NULL, NELEMENTS(buf), NULL, 0)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_destroy(NULL)); + EXPECT_TRUE(vpx_codec_error(NULL) != NULL); + + for (int i = 0; i < NELEMENTS(kCodecs); ++i) { + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_dec_init(NULL, kCodecs[i], NULL, 0)); + + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_dec_init(&dec, kCodecs[i], NULL, 0)); + EXPECT_EQ(VPX_CODEC_UNSUP_BITSTREAM, + vpx_codec_decode(&dec, buf, NELEMENTS(buf), NULL, 0)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_decode(&dec, NULL, NELEMENTS(buf), NULL, 0)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_decode(&dec, buf, 0, NULL, 0)); + + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&dec)); + } +} + +#if CONFIG_VP9_DECODER +// Test VP9 codec controls after a decode error to ensure the code doesn't +// misbehave. 
+void TestVp9Controls(vpx_codec_ctx_t *dec) { + static const int kControls[] = { + VP8D_GET_LAST_REF_UPDATES, + VP8D_GET_FRAME_CORRUPTED, + VP9D_GET_DISPLAY_SIZE, + }; + int val[2]; + + for (int i = 0; i < NELEMENTS(kControls); ++i) { + const vpx_codec_err_t res = vpx_codec_control_(dec, kControls[i], val); + switch (kControls[i]) { + case VP8D_GET_FRAME_CORRUPTED: + EXPECT_EQ(VPX_CODEC_ERROR, res) << kControls[i]; + break; + default: + EXPECT_EQ(VPX_CODEC_OK, res) << kControls[i]; + break; + } + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_control_(dec, kControls[i], NULL)); + } + + vp9_ref_frame_t ref; + ref.idx = 0; + EXPECT_EQ(VPX_CODEC_ERROR, vpx_codec_control(dec, VP9_GET_REFERENCE, &ref)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_control(dec, VP9_GET_REFERENCE, NULL)); + + vpx_ref_frame_t ref_copy; + const int width = 352; + const int height = 288; + ASSERT_TRUE( + vpx_img_alloc(&ref_copy.img, VPX_IMG_FMT_I420, width, height, 1) != NULL); + ref_copy.frame_type = VP8_LAST_FRAME; + EXPECT_EQ(VPX_CODEC_ERROR, + vpx_codec_control(dec, VP8_COPY_REFERENCE, &ref_copy)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_control(dec, VP8_COPY_REFERENCE, NULL)); + vpx_img_free(&ref_copy.img); +} + +TEST(DecodeAPI, Vp9InvalidDecode) { + const vpx_codec_iface_t *const codec = &vpx_codec_vp9_dx_algo; + const char filename[] = + "invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf"; + libvpx_test::IVFVideoSource video(filename); + video.Init(); + video.Begin(); + ASSERT_TRUE(!HasFailure()); + + vpx_codec_ctx_t dec; + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_dec_init(&dec, codec, NULL, 0)); + const uint32_t frame_size = static_cast<uint32_t>(video.frame_size()); + EXPECT_EQ(VPX_CODEC_MEM_ERROR, + vpx_codec_decode(&dec, video.cxdata(), frame_size, NULL, 0)); + vpx_codec_iter_t iter = NULL; + EXPECT_EQ(NULL, vpx_codec_get_frame(&dec, &iter)); + + TestVp9Controls(&dec); + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&dec)); +} +#endif // CONFIG_VP9_DECODER + +} // namespace diff --git a/test/decode_perf_test.cc b/test/decode_perf_test.cc index 9b423b353..11529b349 100644 --- a/test/decode_perf_test.cc +++ b/test/decode_perf_test.cc @@ -29,9 +29,9 @@ const double kUsecsInSec = 1000000.0; /* DecodePerfTest takes a tuple of filename + number of threads to decode with */ -typedef std::tr1::tuple<const char *, unsigned> decode_perf_param_t; +typedef std::tr1::tuple<const char *, unsigned> DecodePerfParam; -const decode_perf_param_t kVP9DecodePerfVectors[] = { +const DecodePerfParam kVP9DecodePerfVectors[] = { make_tuple("vp90-2-bbb_426x240_tile_1x1_180kbps.webm", 1), make_tuple("vp90-2-bbb_640x360_tile_1x2_337kbps.webm", 2), make_tuple("vp90-2-bbb_854x480_tile_1x2_651kbps.webm", 2), @@ -64,7 +64,7 @@ const decode_perf_param_t kVP9DecodePerfVectors[] = { power/temp/min max frame decode times/etc */ -class DecodePerfTest : public ::testing::TestWithParam<decode_perf_param_t> { +class DecodePerfTest : public ::testing::TestWithParam<DecodePerfParam> { }; TEST_P(DecodePerfTest, PerfTest) { @@ -92,6 +92,7 @@ TEST_P(DecodePerfTest, PerfTest) { const double fps = double(frames) / elapsed_secs; printf("{\n"); + printf("\t\"type\" : \"decode_perf_test\",\n"); printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP); printf("\t\"videoName\" : \"%s\",\n", video_name); printf("\t\"threadCount\" : %u,\n", threads); diff --git a/test/decode_test_driver.cc b/test/decode_test_driver.cc index 3e4ef0ad1..99610ebc5 100644 --- a/test/decode_test_driver.cc +++ b/test/decode_test_driver.cc @@ -67,24 +67,33 @@ void DecoderTest::RunLoop(CompressedVideoSource *video, const vpx_codec_dec_cfg_t &dec_cfg) { Decoder* 
const decoder = codec_->CreateDecoder(dec_cfg, 0); ASSERT_TRUE(decoder != NULL); + bool end_of_file = false; // Decode frames. - for (video->Begin(); !::testing::Test::HasFailure() && video->cxdata(); + for (video->Begin(); !::testing::Test::HasFailure() && !end_of_file; video->Next()) { PreDecodeFrameHook(*video, decoder); vpx_codec_stream_info_t stream_info; stream_info.sz = sizeof(stream_info); - const vpx_codec_err_t res_peek = decoder->PeekStream(video->cxdata(), - video->frame_size(), - &stream_info); - HandlePeekResult(decoder, video, res_peek); - ASSERT_FALSE(::testing::Test::HasFailure()); - vpx_codec_err_t res_dec = decoder->DecodeFrame(video->cxdata(), - video->frame_size()); - if (!HandleDecodeResult(res_dec, *video, decoder)) - break; + if (video->cxdata() != NULL) { + const vpx_codec_err_t res_peek = decoder->PeekStream(video->cxdata(), + video->frame_size(), + &stream_info); + HandlePeekResult(decoder, video, res_peek); + ASSERT_FALSE(::testing::Test::HasFailure()); + + vpx_codec_err_t res_dec = decoder->DecodeFrame(video->cxdata(), + video->frame_size()); + if (!HandleDecodeResult(res_dec, *video, decoder)) + break; + } else { + // Signal end of the file to the decoder. + const vpx_codec_err_t res_dec = decoder->DecodeFrame(NULL, 0); + ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError(); + end_of_file = true; + } DxDataIterator dec_iter = decoder->GetDxData(); const vpx_image_t *img = NULL; diff --git a/test/decode_to_md5.sh b/test/decode_to_md5.sh index 6cb7d0e6e..854b74f84 100755 --- a/test/decode_to_md5.sh +++ b/test/decode_to_md5.sh @@ -44,8 +44,8 @@ decode_to_md5() { [ -e "${output_file}" ] || return 1 - local md5_last_frame=$(tail -n1 "${output_file}") - local actual_md5=$(echo "${md5_last_frame% *}" | tr -d [:space:]) + local md5_last_frame="$(tail -n1 "${output_file}" | awk '{print $1}')" + local actual_md5="$(echo "${md5_last_frame}" | awk '{print $1}')" [ "${actual_md5}" = "${expected_md5}" ] || return 1 } diff --git a/test/encode_perf_test.cc b/test/encode_perf_test.cc new file mode 100644 index 000000000..feef37e7b --- /dev/null +++ b/test/encode_perf_test.cc @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "./vpx_config.h" +#include "./vpx_version.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "vpx_ports/vpx_timer.h" + +namespace { + +const int kMaxPsnr = 100; +const double kUsecsInSec = 1000000.0; + +struct EncodePerfTestVideo { + EncodePerfTestVideo(const char *name_, uint32_t width_, uint32_t height_, + uint32_t bitrate_, int frames_) + : name(name_), + width(width_), + height(height_), + bitrate(bitrate_), + frames(frames_) {} + const char *name; + uint32_t width; + uint32_t height; + uint32_t bitrate; + int frames; +}; + +const EncodePerfTestVideo kVP9EncodePerfTestVectors[] = { + EncodePerfTestVideo("desktop_640_360_30.yuv", 640, 360, 200, 2484), + EncodePerfTestVideo("kirland_640_480_30.yuv", 640, 480, 200, 300), + EncodePerfTestVideo("macmarcomoving_640_480_30.yuv", 640, 480, 200, 987), + EncodePerfTestVideo("macmarcostationary_640_480_30.yuv", 640, 480, 200, 718), + EncodePerfTestVideo("niklas_640_480_30.yuv", 640, 480, 200, 471), + EncodePerfTestVideo("tacomanarrows_640_480_30.yuv", 640, 480, 200, 300), + EncodePerfTestVideo("tacomasmallcameramovement_640_480_30.yuv", + 640, 480, 200, 300), + EncodePerfTestVideo("thaloundeskmtg_640_480_30.yuv", 640, 480, 200, 300), + EncodePerfTestVideo("niklas_1280_720_30.yuv", 1280, 720, 600, 470), +}; + +const int kEncodePerfTestSpeeds[] = { 5, 6, 7, 12 }; + +#define NELEMENTS(x) (sizeof((x)) / sizeof((x)[0])) + +class VP9EncodePerfTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> { + protected: + VP9EncodePerfTest() + : EncoderTest(GET_PARAM(0)), + min_psnr_(kMaxPsnr), + nframes_(0), + encoding_mode_(GET_PARAM(1)), + speed_(0) {} + + virtual ~VP9EncodePerfTest() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(encoding_mode_); + + cfg_.g_lag_in_frames = 0; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_undershoot_pct = 50; + cfg_.rc_overshoot_pct = 50; + cfg_.rc_buf_sz = 1000; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 600; + cfg_.rc_resize_allowed = 0; + cfg_.rc_end_usage = VPX_CBR; + } + + virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) { + if (video->frame() == 1) { + encoder->Control(VP8E_SET_CPUUSED, speed_); + } + } + + virtual void BeginPassHook(unsigned int /*pass*/) { + min_psnr_ = kMaxPsnr; + nframes_ = 0; + } + + virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + if (pkt->data.psnr.psnr[0] < min_psnr_) { + min_psnr_ = pkt->data.psnr.psnr[0]; + } + } + + // for performance reasons don't decode + virtual bool DoDecode() { return 0; } + + double min_psnr() const { + return min_psnr_; + } + + void set_speed(unsigned int speed) { + speed_ = speed; + } + + private: + double min_psnr_; + unsigned int nframes_; + libvpx_test::TestMode encoding_mode_; + unsigned speed_; +}; + +TEST_P(VP9EncodePerfTest, PerfTest) { + for (size_t i = 0; i < NELEMENTS(kVP9EncodePerfTestVectors); ++i) { + for (size_t j = 0; j < NELEMENTS(kEncodePerfTestSpeeds); ++j) { + SetUp(); + + const vpx_rational timebase = { 33333333, 1000000000 }; + cfg_.g_timebase = timebase; + cfg_.rc_target_bitrate = kVP9EncodePerfTestVectors[i].bitrate; + + init_flags_ = VPX_CODEC_USE_PSNR; + + const unsigned frames = kVP9EncodePerfTestVectors[i].frames; + const char *video_name = 
kVP9EncodePerfTestVectors[i].name; + libvpx_test::I420VideoSource video( + video_name, + kVP9EncodePerfTestVectors[i].width, + kVP9EncodePerfTestVectors[i].height, + timebase.den, timebase.num, 0, + kVP9EncodePerfTestVectors[i].frames); + set_speed(kEncodePerfTestSpeeds[j]); + + vpx_usec_timer t; + vpx_usec_timer_start(&t); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + vpx_usec_timer_mark(&t); + const double elapsed_secs = vpx_usec_timer_elapsed(&t) / kUsecsInSec; + const double fps = frames / elapsed_secs; + const double minimum_psnr = min_psnr(); + + printf("{\n"); + printf("\t\"type\" : \"encode_perf_test\",\n"); + printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP); + printf("\t\"videoName\" : \"%s\",\n", video_name); + printf("\t\"encodeTimeSecs\" : %f,\n", elapsed_secs); + printf("\t\"totalFrames\" : %u,\n", frames); + printf("\t\"framesPerSecond\" : %f,\n", fps); + printf("\t\"minPsnr\" : %f,\n", minimum_psnr); + printf("\t\"speed\" : %d\n", kEncodePerfTestSpeeds[j]); + printf("}\n"); + } + } +} + +VP9_INSTANTIATE_TEST_CASE( + VP9EncodePerfTest, ::testing::Values(::libvpx_test::kRealTime)); +} // namespace diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc index 278133aac..9500fe49c 100644 --- a/test/fdct4x4_test.cc +++ b/test/fdct4x4_test.cc @@ -30,15 +30,15 @@ using libvpx_test::ACMRandom; namespace { const int kNumCoeffs = 16; -typedef void (*fdct_t)(const int16_t *in, tran_low_t *out, int stride); -typedef void (*idct_t)(const tran_low_t *in, uint8_t *out, int stride); -typedef void (*fht_t) (const int16_t *in, tran_low_t *out, int stride, - int tx_type); -typedef void (*iht_t) (const tran_low_t *in, uint8_t *out, int stride, - int tx_type); +typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride); +typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride); +typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride, + int tx_type); +typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride, + int tx_type); -typedef std::tr1::tuple<fdct_t, idct_t, int, int> dct_4x4_param_t; -typedef std::tr1::tuple<fht_t, iht_t, int, int> ht_4x4_param_t; +typedef std::tr1::tuple<FdctFunc, IdctFunc, int, int> Dct4x4Param; +typedef std::tr1::tuple<FhtFunc, IhtFunc, int, int> Ht4x4Param; void fdct4x4_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) { vp9_fdct4x4_c(in, out, stride); @@ -245,14 +245,14 @@ class Trans4x4TestBase { } int pitch_; int tx_type_; - fht_t fwd_txfm_ref; int bit_depth_; int mask_; + FhtFunc fwd_txfm_ref; }; class Trans4x4DCT : public Trans4x4TestBase, - public ::testing::TestWithParam<dct_4x4_param_t> { + public ::testing::TestWithParam<Dct4x4Param> { public: virtual ~Trans4x4DCT() {} @@ -275,8 +275,8 @@ class Trans4x4DCT inv_txfm_(out, dst, stride); } - fdct_t fwd_txfm_; - idct_t inv_txfm_; + FdctFunc fwd_txfm_; + IdctFunc inv_txfm_; }; TEST_P(Trans4x4DCT, AccuracyCheck) { @@ -297,7 +297,7 @@ TEST_P(Trans4x4DCT, InvAccuracyCheck) { class Trans4x4HT : public Trans4x4TestBase, - public ::testing::TestWithParam<ht_4x4_param_t> { + public ::testing::TestWithParam<Ht4x4Param> { public: virtual ~Trans4x4HT() {} @@ -321,8 +321,8 @@ class Trans4x4HT inv_txfm_(out, dst, stride, tx_type_); } - fht_t fwd_txfm_; - iht_t inv_txfm_; + FhtFunc fwd_txfm_; + IhtFunc inv_txfm_; }; TEST_P(Trans4x4HT, AccuracyCheck) { @@ -343,7 +343,7 @@ TEST_P(Trans4x4HT, InvAccuracyCheck) { class Trans4x4WHT : public Trans4x4TestBase, - public ::testing::TestWithParam<dct_4x4_param_t> { + public ::testing::TestWithParam<Dct4x4Param> { public: virtual ~Trans4x4WHT() {} @@ -366,8 +366,8 @@ class Trans4x4WHT inv_txfm_(out, dst, stride); } - fdct_t fwd_txfm_; - idct_t inv_txfm_; + FdctFunc fwd_txfm_; + IdctFunc 
inv_txfm_; }; TEST_P(Trans4x4WHT, AccuracyCheck) { @@ -477,19 +477,4 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3, 8))); #endif -#if HAVE_AVX2 && !CONFIG_VP9_HIGH -INSTANTIATE_TEST_CASE_P( - AVX2, Trans4x4DCT, - ::testing::Values( - make_tuple(&vp9_fdct4x4_avx2, - &vp9_idct4x4_16_add_c, 0, 8))); -INSTANTIATE_TEST_CASE_P( - AVX2, Trans4x4HT, - ::testing::Values( - make_tuple(&vp9_fht4x4_avx2, &vp9_iht4x4_16_add_c, 0, 8), - make_tuple(&vp9_fht4x4_avx2, &vp9_iht4x4_16_add_c, 1, 8), - make_tuple(&vp9_fht4x4_avx2, &vp9_iht4x4_16_add_c, 2, 8), - make_tuple(&vp9_fht4x4_avx2, &vp9_iht4x4_16_add_c, 3, 8))); -#endif - } // namespace diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc index e46091f97..fad424fe8 100644 --- a/test/fdct8x8_test.cc +++ b/test/fdct8x8_test.cc @@ -65,15 +65,15 @@ void reference_8x8_dct_2d(const int16_t input[kNumCoeffs], using libvpx_test::ACMRandom; namespace { -typedef void (*fdct_t)(const int16_t *in, tran_low_t *out, int stride); -typedef void (*idct_t)(const tran_low_t *in, uint8_t *out, int stride); -typedef void (*fht_t) (const int16_t *in, tran_low_t *out, int stride, - int tx_type); -typedef void (*iht_t) (const tran_low_t *in, uint8_t *out, int stride, - int tx_type); +typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride); +typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride); +typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride, + int tx_type); +typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride, + int tx_type); -typedef std::tr1::tuple<fdct_t, idct_t, int, int> dct_8x8_param_t; -typedef std::tr1::tuple<fht_t, iht_t, int, int> ht_8x8_param_t; +typedef std::tr1::tuple<FdctFunc, IdctFunc, int, int> Dct8x8Param; +typedef std::tr1::tuple<FhtFunc, IhtFunc, int, int> Ht8x8Param; void fdct8x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) { vp9_fdct8x8_c(in, out, stride); @@ -412,14 +412,14 @@ class FwdTrans8x8TestBase { } int pitch_; int tx_type_; - fht_t fwd_txfm_ref; + FhtFunc fwd_txfm_ref; int bit_depth_; int mask_; }; class FwdTrans8x8DCT : public FwdTrans8x8TestBase, - public ::testing::TestWithParam<dct_8x8_param_t> { + public ::testing::TestWithParam<Dct8x8Param> { public: virtual ~FwdTrans8x8DCT() {} @@ -443,8 +443,8 @@ class FwdTrans8x8DCT inv_txfm_(out, dst, stride); } - fdct_t fwd_txfm_; - idct_t inv_txfm_; + FdctFunc fwd_txfm_; + IdctFunc inv_txfm_; }; TEST_P(FwdTrans8x8DCT, SignBiasCheck) { @@ -469,7 +469,7 @@ TEST_P(FwdTrans8x8DCT, InvAccuracyCheck) { class FwdTrans8x8HT : public FwdTrans8x8TestBase, - public ::testing::TestWithParam<ht_8x8_param_t> { + public ::testing::TestWithParam<Ht8x8Param> { public: virtual ~FwdTrans8x8HT() {} @@ -493,8 +493,8 @@ class FwdTrans8x8HT inv_txfm_(out, dst, stride, tx_type_); } - fht_t fwd_txfm_; - iht_t inv_txfm_; + FhtFunc fwd_txfm_; + IhtFunc inv_txfm_; }; TEST_P(FwdTrans8x8HT, SignBiasCheck) { @@ -554,7 +554,7 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( NEON, FwdTrans8x8DCT, ::testing::Values( - make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_neon, 0, 8))); + make_tuple(&vp9_fdct8x8_neon, &vp9_idct8x8_64_add_neon, 0, 8))); INSTANTIATE_TEST_CASE_P( DISABLED_NEON, FwdTrans8x8HT, ::testing::Values( @@ -584,18 +584,4 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values( make_tuple(&vp9_fdct8x8_ssse3, &vp9_idct8x8_64_add_ssse3, 0, 8))); #endif - -#if HAVE_AVX2 && !CONFIG_VP9_HIGH -INSTANTIATE_TEST_CASE_P( - AVX2, FwdTrans8x8DCT, - ::testing::Values( - make_tuple(&vp9_fdct8x8_avx2, &vp9_idct8x8_64_add_c, 0, 8))); -INSTANTIATE_TEST_CASE_P( - AVX2, FwdTrans8x8HT, - ::testing::Values( - make_tuple(&vp9_fht8x8_avx2, &vp9_iht8x8_64_add_c, 0, 8), - 
make_tuple(&vp9_fht8x8_avx2, &vp9_iht8x8_64_add_c, 1, 8), - make_tuple(&vp9_fht8x8_avx2, &vp9_iht8x8_64_add_c, 2, 8), - make_tuple(&vp9_fht8x8_avx2, &vp9_iht8x8_64_add_c, 3, 8))); -#endif } // namespace diff --git a/test/frame_size_tests.cc b/test/frame_size_tests.cc index d1ea0967f..2400c2021 100644 --- a/test/frame_size_tests.cc +++ b/test/frame_size_tests.cc @@ -7,13 +7,9 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#include <climits> -#include <vector> #include "third_party/googletest/src/include/gtest/gtest.h" #include "test/codec_factory.h" -#include "test/encode_test_driver.h" -#include "test/i420_video_source.h" -#include "test/util.h" +#include "test/video_source.h" namespace { @@ -33,10 +29,7 @@ class VP9FrameSizeTestsLarge virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec, const libvpx_test::VideoSource &video, libvpx_test::Decoder *decoder) { - EXPECT_EQ(expected_res_, res_dec) - << "Expected " << expected_res_ - << "but got " << res_dec; - + EXPECT_EQ(expected_res_, res_dec) << decoder->DecodeError(); return !::testing::Test::HasFailure(); } @@ -62,16 +55,6 @@ TEST_F(VP9FrameSizeTestsLarge, TestInvalidSizes) { video.set_limit(2); expected_res_ = VPX_CODEC_CORRUPT_FRAME; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); -#else - // If we are on a 32 bit platform we can't possibly allocate enough memory - // for the largest video frame size (64kx64k). This test checks that we - // properly return a memory error. - if (sizeof(size_t) == 4) { - video.SetSize(65535, 65535); - video.set_limit(2); - expected_res_ = VPX_CODEC_MEM_ERROR; - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - } #endif } diff --git a/test/idct_test.cc b/test/idct_test.cc index c7f609d58..2ff9e6446 100644 --- a/test/idct_test.cc +++ b/test/idct_test.cc @@ -16,11 +16,11 @@ #include "vpx/vpx_integer.h" -typedef void (*idct_fn_t)(int16_t *input, unsigned char *pred_ptr, - int pred_stride, unsigned char *dst_ptr, - int dst_stride); +typedef void (*IdctFunc)(int16_t *input, unsigned char *pred_ptr, + int pred_stride, unsigned char *dst_ptr, + int dst_stride); namespace { -class IDCTTest : public ::testing::TestWithParam<idct_fn_t> { +class IDCTTest : public ::testing::TestWithParam<IdctFunc> { protected: virtual void SetUp() { int i; @@ -33,7 +33,7 @@ class IDCTTest : public ::testing::TestWithParam<idct_fn_t> { virtual void TearDown() { libvpx_test::ClearSystemState(); } - idct_fn_t UUT; + IdctFunc UUT; int16_t input[16]; unsigned char output[256]; unsigned char predict[256]; diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc index d93251dfe..ead476030 100644 --- a/test/intrapred_test.cc +++ b/test/intrapred_test.cc @@ -216,16 +216,16 @@ class IntraPredBase { int num_planes_; }; -typedef void (*intra_pred_y_fn_t)(MACROBLOCKD *x, - uint8_t *yabove_row, - uint8_t *yleft, - int left_stride, - uint8_t *ypred_ptr, - int y_stride); +typedef void (*IntraPredYFunc)(MACROBLOCKD *x, + uint8_t *yabove_row, + uint8_t *yleft, + int left_stride, + uint8_t *ypred_ptr, + int y_stride); class IntraPredYTest : public IntraPredBase, - public ::testing::TestWithParam<intra_pred_y_fn_t> { + public ::testing::TestWithParam<IntraPredYFunc> { public: static void SetUpTestCase() { mb_ = reinterpret_cast<MACROBLOCKD*>( @@ -267,7 +267,7 @@ class IntraPredYTest data_ptr_[0], kStride)); } - intra_pred_y_fn_t pred_fn_; + IntraPredYFunc pred_fn_; static uint8_t* data_array_; static MACROBLOCKD * mb_; static MODE_INFO *mi_; @@ -295,19 +295,19 @@ INSTANTIATE_TEST_CASE_P(SSSE3, IntraPredYTest, vp8_build_intra_predictors_mby_s_ssse3)); #endif -typedef 
void (*intra_pred_uv_fn_t)(MACROBLOCKD *x, - uint8_t *uabove_row, - uint8_t *vabove_row, - uint8_t *uleft, - uint8_t *vleft, - int left_stride, - uint8_t *upred_ptr, - uint8_t *vpred_ptr, - int pred_stride); +typedef void (*IntraPredUvFunc)(MACROBLOCKD *x, + uint8_t *uabove_row, + uint8_t *vabove_row, + uint8_t *uleft, + uint8_t *vleft, + int left_stride, + uint8_t *upred_ptr, + uint8_t *vpred_ptr, + int pred_stride); class IntraPredUVTest : public IntraPredBase, - public ::testing::TestWithParam<intra_pred_uv_fn_t> { + public ::testing::TestWithParam<IntraPredUvFunc> { public: static void SetUpTestCase() { mb_ = reinterpret_cast<MACROBLOCKD*>( @@ -349,7 +349,7 @@ class IntraPredUVTest data_ptr_[0], data_ptr_[1], kStride); } - intra_pred_uv_fn_t pred_fn_; + IntraPredUvFunc pred_fn_; // We use 24 so that the data pointer of the first pixel in each row of // each macroblock is 8-byte aligned, and this gives us access to the // top-left and top-right corner pixels belonging to the top-left/right diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc index 777e0e9fc..eb333fe37 100644 --- a/test/partial_idct_test.cc +++ b/test/partial_idct_test.cc @@ -26,14 +26,14 @@ using libvpx_test::ACMRandom; namespace { -typedef void (*fwd_txfm_t)(const int16_t *in, tran_low_t *out, int stride); -typedef void (*inv_txfm_t)(const tran_low_t *in, uint8_t *out, int stride); -typedef std::tr1::tuple<fwd_txfm_t, inv_txfm_t, inv_txfm_t, TX_SIZE, int> partial_itxfm_param_t; +typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride); +typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride); +typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmFunc, InvTxfmFunc, TX_SIZE, int> PartialInvTxfmParam; const int kMaxNumCoeffs = 1024; -class PartialIDctTest : public ::testing::TestWithParam<partial_itxfm_param_t> { +class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> { public: virtual ~PartialIDctTest() {} virtual void SetUp() { @@ -49,9 +49,9 @@ class PartialIDctTest : public ::testing::TestWithParam<partial_itxfm_param_t> { protected: int last_nonzero_; TX_SIZE tx_size_; - fwd_txfm_t ftxfm_; - inv_txfm_t full_itxfm_; - inv_txfm_t partial_itxfm_; + FwdTxfmFunc ftxfm_; + InvTxfmFunc full_itxfm_; + InvTxfmFunc partial_itxfm_; }; TEST_P(PartialIDctTest, RunQuantCheck) { diff --git a/test/pp_filter_test.cc b/test/pp_filter_test.cc index 1144083ad..a9b16e055 100644 --- a/test/pp_filter_test.cc +++ b/test/pp_filter_test.cc @@ -15,18 +15,18 @@ #include "vpx/vpx_integer.h" #include "vpx_mem/vpx_mem.h" -typedef void (*post_proc_func_t)(unsigned char *src_ptr, - unsigned char *dst_ptr, - int src_pixels_per_line, - int dst_pixels_per_line, - int cols, - unsigned char *flimit, - int size); +typedef void (*PostProcFunc)(unsigned char *src_ptr, - unsigned char *dst_ptr, + unsigned char *dst_ptr, + int src_pixels_per_line, + int dst_pixels_per_line, + int cols, + unsigned char *flimit, + int size); namespace { class VP8PostProcessingFilterTest - : public ::testing::TestWithParam<post_proc_func_t> { + : public ::testing::TestWithParam<PostProcFunc> { public: virtual void TearDown() { libvpx_test::ClearSystemState(); } diff --git a/test/sad_test.cc b/test/sad_test.cc index 23dbd0565..e63770bd4 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -30,29 +30,27 @@ #if CONFIG_VP8_ENCODER -typedef unsigned int (*sad_m_by_n_fn_t)(const unsigned char *source_ptr, - int source_stride, - const unsigned char *reference_ptr, - int reference_stride, - unsigned int max_sad); -typedef std::tr1::tuple<int, int, sad_m_by_n_fn_t> sad_m_by_n_test_param_t; +typedef unsigned int (*SadMxNFunc)(const unsigned char *source_ptr, + int source_stride, + const unsigned char *reference_ptr, + int reference_stride, + unsigned int max_sad); +typedef std::tr1::tuple<int, int, SadMxNFunc> SadMxNParam; #endif #if 
CONFIG_VP9_ENCODER -typedef unsigned int (*sad_m_by_n_fn_vp9_t)(const unsigned char *source_ptr, - int source_stride, - const unsigned char *reference_ptr, - int reference_stride); -typedef std::tr1::tuple<int, int, sad_m_by_n_fn_vp9_t> - sad_m_by_n_test_param_vp9_t; +typedef unsigned int (*SadMxNVp9Func)(const unsigned char *source_ptr, + int source_stride, + const unsigned char *reference_ptr, + int reference_stride); +typedef std::tr1::tuple<int, int, SadMxNVp9Func> SadMxNVp9Param; #endif -typedef void (*sad_n_by_n_by_4_fn_t)(const uint8_t *src_ptr, - int src_stride, - const unsigned char * const ref_ptr[], - int ref_stride, - unsigned int *sad_array); -typedef std::tr1::tuple<int, int, sad_n_by_n_by_4_fn_t> - sad_n_by_n_by_4_test_param_t; +typedef void (*SadMxNx4Func)(const uint8_t *src_ptr, + int src_stride, + const unsigned char *const ref_ptr[], + int ref_stride, + unsigned int *sad_array); +typedef std::tr1::tuple<int, int, SadMxNx4Func> SadMxNx4Param; using libvpx_test::ACMRandom; @@ -140,7 +138,7 @@ class SADTestBase : public ::testing::Test { class SADx4Test : public SADTestBase, - public ::testing::WithParamInterface<sad_n_by_n_by_4_test_param_t> { + public ::testing::WithParamInterface<SadMxNx4Param> { public: SADx4Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1)) {} @@ -169,7 +167,7 @@ class SADx4Test #if CONFIG_VP8_ENCODER class SADTest : public SADTestBase, - public ::testing::WithParamInterface<sad_m_by_n_test_param_t> { + public ::testing::WithParamInterface<SadMxNParam> { public: SADTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1)) {} @@ -201,7 +199,7 @@ class SADTest #if CONFIG_VP9_ENCODER class SADVP9Test : public SADTestBase, - public ::testing::WithParamInterface<sad_m_by_n_test_param_vp9_t> { + public ::testing::WithParamInterface<SadMxNVp9Param> { public: SADVP9Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1)) {} @@ -382,12 +380,12 @@ using std::tr1::make_tuple; //------------------------------------------------------------------------------ // C functions #if CONFIG_VP8_ENCODER -const sad_m_by_n_fn_t sad_16x16_c = vp8_sad16x16_c; -const sad_m_by_n_fn_t sad_8x16_c = vp8_sad8x16_c; -const sad_m_by_n_fn_t sad_16x8_c = vp8_sad16x8_c; -const sad_m_by_n_fn_t sad_8x8_c = vp8_sad8x8_c; -const sad_m_by_n_fn_t sad_4x4_c = vp8_sad4x4_c; -const sad_m_by_n_test_param_t c_tests[] = { +const SadMxNFunc sad_16x16_c = vp8_sad16x16_c; +const SadMxNFunc sad_8x16_c = vp8_sad8x16_c; +const SadMxNFunc sad_16x8_c = vp8_sad16x8_c; +const SadMxNFunc sad_8x8_c = vp8_sad8x8_c; +const SadMxNFunc sad_4x4_c = vp8_sad4x4_c; +const SadMxNParam c_tests[] = { make_tuple(16, 16, sad_16x16_c), make_tuple(8, 16, sad_8x16_c), make_tuple(16, 8, sad_16x8_c), @@ -398,16 +396,16 @@ INSTANTIATE_TEST_CASE_P(C, SADTest, ::testing::ValuesIn(c_tests)); #endif // CONFIG_VP8_ENCODER #if CONFIG_VP9_ENCODER -const sad_m_by_n_fn_vp9_t sad_64x64_c_vp9 = vp9_sad64x64_c; -const sad_m_by_n_fn_vp9_t sad_32x32_c_vp9 = vp9_sad32x32_c; -const sad_m_by_n_fn_vp9_t sad_16x16_c_vp9 = vp9_sad16x16_c; -const sad_m_by_n_fn_vp9_t sad_8x16_c_vp9 = vp9_sad8x16_c; -const sad_m_by_n_fn_vp9_t sad_16x8_c_vp9 = vp9_sad16x8_c; -const sad_m_by_n_fn_vp9_t sad_8x8_c_vp9 = vp9_sad8x8_c; -const sad_m_by_n_fn_vp9_t sad_8x4_c_vp9 = vp9_sad8x4_c; -const sad_m_by_n_fn_vp9_t sad_4x8_c_vp9 = vp9_sad4x8_c; -const sad_m_by_n_fn_vp9_t sad_4x4_c_vp9 = vp9_sad4x4_c; -const sad_m_by_n_test_param_vp9_t c_vp9_tests[] = { +const SadMxNVp9Func sad_64x64_c_vp9 = vp9_sad64x64_c; +const SadMxNVp9Func sad_32x32_c_vp9 = vp9_sad32x32_c; +const SadMxNVp9Func sad_16x16_c_vp9 = vp9_sad16x16_c; +const SadMxNVp9Func sad_8x16_c_vp9 = vp9_sad8x16_c; +const SadMxNVp9Func sad_16x8_c_vp9 = vp9_sad16x8_c; +const SadMxNVp9Func sad_8x8_c_vp9 = vp9_sad8x8_c; +const SadMxNVp9Func sad_8x4_c_vp9 = vp9_sad8x4_c; +const 
SadMxNVp9Func sad_4x8_c_vp9 = vp9_sad4x8_c; +const SadMxNVp9Func sad_4x4_c_vp9 = vp9_sad4x4_c; +const SadMxNVp9Param c_vp9_tests[] = { make_tuple(64, 64, sad_64x64_c_vp9), make_tuple(32, 32, sad_32x32_c_vp9), make_tuple(16, 16, sad_16x16_c_vp9), @@ -420,19 +418,19 @@ const sad_m_by_n_test_param_vp9_t c_vp9_tests[] = { }; INSTANTIATE_TEST_CASE_P(C, SADVP9Test, ::testing::ValuesIn(c_vp9_tests)); -const sad_n_by_n_by_4_fn_t sad_64x64x4d_c = vp9_sad64x64x4d_c; -const sad_n_by_n_by_4_fn_t sad_64x32x4d_c = vp9_sad64x32x4d_c; -const sad_n_by_n_by_4_fn_t sad_32x64x4d_c = vp9_sad32x64x4d_c; -const sad_n_by_n_by_4_fn_t sad_32x32x4d_c = vp9_sad32x32x4d_c; -const sad_n_by_n_by_4_fn_t sad_32x16x4d_c = vp9_sad32x16x4d_c; -const sad_n_by_n_by_4_fn_t sad_16x32x4d_c = vp9_sad16x32x4d_c; -const sad_n_by_n_by_4_fn_t sad_16x16x4d_c = vp9_sad16x16x4d_c; -const sad_n_by_n_by_4_fn_t sad_16x8x4d_c = vp9_sad16x8x4d_c; -const sad_n_by_n_by_4_fn_t sad_8x16x4d_c = vp9_sad8x16x4d_c; -const sad_n_by_n_by_4_fn_t sad_8x8x4d_c = vp9_sad8x8x4d_c; -const sad_n_by_n_by_4_fn_t sad_8x4x4d_c = vp9_sad8x4x4d_c; -const sad_n_by_n_by_4_fn_t sad_4x8x4d_c = vp9_sad4x8x4d_c; -const sad_n_by_n_by_4_fn_t sad_4x4x4d_c = vp9_sad4x4x4d_c; +const SadMxNx4Func sad_64x64x4d_c = vp9_sad64x64x4d_c; +const SadMxNx4Func sad_64x32x4d_c = vp9_sad64x32x4d_c; +const SadMxNx4Func sad_32x64x4d_c = vp9_sad32x64x4d_c; +const SadMxNx4Func sad_32x32x4d_c = vp9_sad32x32x4d_c; +const SadMxNx4Func sad_32x16x4d_c = vp9_sad32x16x4d_c; +const SadMxNx4Func sad_16x32x4d_c = vp9_sad16x32x4d_c; +const SadMxNx4Func sad_16x16x4d_c = vp9_sad16x16x4d_c; +const SadMxNx4Func sad_16x8x4d_c = vp9_sad16x8x4d_c; +const SadMxNx4Func sad_8x16x4d_c = vp9_sad8x16x4d_c; +const SadMxNx4Func sad_8x8x4d_c = vp9_sad8x8x4d_c; +const SadMxNx4Func sad_8x4x4d_c = vp9_sad8x4x4d_c; +const SadMxNx4Func sad_4x8x4d_c = vp9_sad4x8x4d_c; +const SadMxNx4Func sad_4x4x4d_c = vp9_sad4x4x4d_c; INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::Values( make_tuple(64, 64, sad_64x64x4d_c), make_tuple(64, 32, sad_64x32x4d_c), @@ -453,7 +451,7 @@ INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::Values( // ARM functions #if HAVE_MEDIA #if CONFIG_VP8_ENCODER -const sad_m_by_n_fn_t sad_16x16_armv6 = vp8_sad16x16_armv6; +const SadMxNFunc sad_16x16_armv6 = vp8_sad16x16_armv6; INSTANTIATE_TEST_CASE_P(MEDIA, SADTest, ::testing::Values( make_tuple(16, 16, sad_16x16_armv6))); #endif // CONFIG_VP8_ENCODER @@ -461,11 +459,11 @@ INSTANTIATE_TEST_CASE_P(MEDIA, SADTest, ::testing::Values( #if HAVE_NEON #if CONFIG_VP8_ENCODER -const sad_m_by_n_fn_t sad_16x16_neon = vp8_sad16x16_neon; -const sad_m_by_n_fn_t sad_8x16_neon = vp8_sad8x16_neon; -const sad_m_by_n_fn_t sad_16x8_neon = vp8_sad16x8_neon; -const sad_m_by_n_fn_t sad_8x8_neon = vp8_sad8x8_neon; -const sad_m_by_n_fn_t sad_4x4_neon = vp8_sad4x4_neon; +const SadMxNFunc sad_16x16_neon = vp8_sad16x16_neon; +const SadMxNFunc sad_8x16_neon = vp8_sad8x16_neon; +const SadMxNFunc sad_16x8_neon = vp8_sad16x8_neon; +const SadMxNFunc sad_8x8_neon = vp8_sad8x8_neon; +const SadMxNFunc sad_4x4_neon = vp8_sad4x4_neon; INSTANTIATE_TEST_CASE_P(NEON, SADTest, ::testing::Values( make_tuple(16, 16, sad_16x16_neon), make_tuple(8, 16, sad_8x16_neon), @@ -474,13 +472,15 @@ INSTANTIATE_TEST_CASE_P(NEON, SADTest, ::testing::Values( make_tuple(4, 4, sad_4x4_neon))); #endif // CONFIG_VP8_ENCODER #if CONFIG_VP9_ENCODER -const sad_m_by_n_fn_vp9_t sad_64x64_neon_vp9 = vp9_sad64x64_neon; -const sad_m_by_n_fn_vp9_t sad_32x32_neon_vp9 = vp9_sad32x32_neon; -const sad_m_by_n_fn_vp9_t sad_16x16_neon_vp9 
= vp9_sad16x16_neon; -const sad_m_by_n_test_param_vp9_t neon_vp9_tests[] = { +const SadMxNVp9Func sad_64x64_neon_vp9 = vp9_sad64x64_neon; +const SadMxNVp9Func sad_32x32_neon_vp9 = vp9_sad32x32_neon; +const SadMxNVp9Func sad_16x16_neon_vp9 = vp9_sad16x16_neon; +const SadMxNVp9Func sad_8x8_neon_vp9 = vp9_sad8x8_neon; +const SadMxNVp9Param neon_vp9_tests[] = { make_tuple(64, 64, sad_64x64_neon_vp9), make_tuple(32, 32, sad_32x32_neon_vp9), make_tuple(16, 16, sad_16x16_neon_vp9), + make_tuple(8, 8, sad_8x8_neon_vp9), }; INSTANTIATE_TEST_CASE_P(NEON, SADVP9Test, ::testing::ValuesIn(neon_vp9_tests)); #endif // CONFIG_VP9_ENCODER @@ -490,12 +490,12 @@ INSTANTIATE_TEST_CASE_P(NEON, SADVP9Test, ::testing::ValuesIn(neon_vp9_tests)); // x86 functions #if HAVE_MMX #if CONFIG_VP8_ENCODER -const sad_m_by_n_fn_t sad_16x16_mmx = vp8_sad16x16_mmx; -const sad_m_by_n_fn_t sad_8x16_mmx = vp8_sad8x16_mmx; -const sad_m_by_n_fn_t sad_16x8_mmx = vp8_sad16x8_mmx; -const sad_m_by_n_fn_t sad_8x8_mmx = vp8_sad8x8_mmx; -const sad_m_by_n_fn_t sad_4x4_mmx = vp8_sad4x4_mmx; -const sad_m_by_n_test_param_t mmx_tests[] = { +const SadMxNFunc sad_16x16_mmx = vp8_sad16x16_mmx; +const SadMxNFunc sad_8x16_mmx = vp8_sad8x16_mmx; +const SadMxNFunc sad_16x8_mmx = vp8_sad16x8_mmx; +const SadMxNFunc sad_8x8_mmx = vp8_sad8x8_mmx; +const SadMxNFunc sad_4x4_mmx = vp8_sad4x4_mmx; +const SadMxNParam mmx_tests[] = { make_tuple(16, 16, sad_16x16_mmx), make_tuple(8, 16, sad_8x16_mmx), make_tuple(16, 8, sad_16x8_mmx), @@ -506,12 +506,12 @@ INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::ValuesIn(mmx_tests)); #endif // CONFIG_VP8_ENCODER #if CONFIG_VP9_ENCODER -const sad_m_by_n_fn_vp9_t sad_16x16_mmx_vp9 = vp9_sad16x16_mmx; -const sad_m_by_n_fn_vp9_t sad_8x16_mmx_vp9 = vp9_sad8x16_mmx; -const sad_m_by_n_fn_vp9_t sad_16x8_mmx_vp9 = vp9_sad16x8_mmx; -const sad_m_by_n_fn_vp9_t sad_8x8_mmx_vp9 = vp9_sad8x8_mmx; -const sad_m_by_n_fn_vp9_t sad_4x4_mmx_vp9 = vp9_sad4x4_mmx; -const sad_m_by_n_test_param_vp9_t mmx_vp9_tests[] = { +const SadMxNVp9Func sad_16x16_mmx_vp9 = vp9_sad16x16_mmx; +const SadMxNVp9Func sad_8x16_mmx_vp9 = vp9_sad8x16_mmx; +const SadMxNVp9Func sad_16x8_mmx_vp9 = vp9_sad16x8_mmx; +const SadMxNVp9Func sad_8x8_mmx_vp9 = vp9_sad8x8_mmx; +const SadMxNVp9Func sad_4x4_mmx_vp9 = vp9_sad4x4_mmx; +const SadMxNVp9Param mmx_vp9_tests[] = { make_tuple(16, 16, sad_16x16_mmx_vp9), make_tuple(8, 16, sad_8x16_mmx_vp9), make_tuple(16, 8, sad_16x8_mmx_vp9), @@ -525,14 +525,14 @@ INSTANTIATE_TEST_CASE_P(MMX, SADVP9Test, ::testing::ValuesIn(mmx_vp9_tests)); #if HAVE_SSE #if CONFIG_VP9_ENCODER #if CONFIG_USE_X86INC -const sad_m_by_n_fn_vp9_t sad_4x4_sse_vp9 = vp9_sad4x4_sse; -const sad_m_by_n_fn_vp9_t sad_4x8_sse_vp9 = vp9_sad4x8_sse; +const SadMxNVp9Func sad_4x4_sse_vp9 = vp9_sad4x4_sse; +const SadMxNVp9Func sad_4x8_sse_vp9 = vp9_sad4x8_sse; INSTANTIATE_TEST_CASE_P(SSE, SADVP9Test, ::testing::Values( make_tuple(4, 4, sad_4x4_sse_vp9), make_tuple(4, 8, sad_4x8_sse_vp9))); -const sad_n_by_n_by_4_fn_t sad_4x8x4d_sse = vp9_sad4x8x4d_sse; -const sad_n_by_n_by_4_fn_t sad_4x4x4d_sse = vp9_sad4x4x4d_sse; +const SadMxNx4Func sad_4x8x4d_sse = vp9_sad4x8x4d_sse; +const SadMxNx4Func sad_4x4x4d_sse = vp9_sad4x4x4d_sse; INSTANTIATE_TEST_CASE_P(SSE, SADx4Test, ::testing::Values( make_tuple(4, 8, sad_4x8x4d_sse), make_tuple(4, 4, sad_4x4x4d_sse))); @@ -542,12 +542,12 @@ INSTANTIATE_TEST_CASE_P(SSE, SADx4Test, ::testing::Values( #if HAVE_SSE2 #if CONFIG_VP8_ENCODER -const sad_m_by_n_fn_t sad_16x16_wmt = vp8_sad16x16_wmt; -const sad_m_by_n_fn_t sad_8x16_wmt = 
vp8_sad8x16_wmt; -const sad_m_by_n_fn_t sad_16x8_wmt = vp8_sad16x8_wmt; -const sad_m_by_n_fn_t sad_8x8_wmt = vp8_sad8x8_wmt; -const sad_m_by_n_fn_t sad_4x4_wmt = vp8_sad4x4_wmt; -const sad_m_by_n_test_param_t sse2_tests[] = { +const SadMxNFunc sad_16x16_wmt = vp8_sad16x16_wmt; +const SadMxNFunc sad_8x16_wmt = vp8_sad8x16_wmt; +const SadMxNFunc sad_16x8_wmt = vp8_sad16x8_wmt; +const SadMxNFunc sad_8x8_wmt = vp8_sad8x8_wmt; +const SadMxNFunc sad_4x4_wmt = vp8_sad4x4_wmt; +const SadMxNParam sse2_tests[] = { make_tuple(16, 16, sad_16x16_wmt), make_tuple(8, 16, sad_8x16_wmt), make_tuple(16, 8, sad_16x8_wmt), @@ -559,18 +559,18 @@ INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests)); #if CONFIG_VP9_ENCODER #if CONFIG_USE_X86INC -const sad_m_by_n_fn_vp9_t sad_64x64_sse2_vp9 = vp9_sad64x64_sse2; -const sad_m_by_n_fn_vp9_t sad_64x32_sse2_vp9 = vp9_sad64x32_sse2; -const sad_m_by_n_fn_vp9_t sad_32x64_sse2_vp9 = vp9_sad32x64_sse2; -const sad_m_by_n_fn_vp9_t sad_32x32_sse2_vp9 = vp9_sad32x32_sse2; -const sad_m_by_n_fn_vp9_t sad_32x16_sse2_vp9 = vp9_sad32x16_sse2; -const sad_m_by_n_fn_vp9_t sad_16x32_sse2_vp9 = vp9_sad16x32_sse2; -const sad_m_by_n_fn_vp9_t sad_16x16_sse2_vp9 = vp9_sad16x16_sse2; -const sad_m_by_n_fn_vp9_t sad_16x8_sse2_vp9 = vp9_sad16x8_sse2; -const sad_m_by_n_fn_vp9_t sad_8x16_sse2_vp9 = vp9_sad8x16_sse2; -const sad_m_by_n_fn_vp9_t sad_8x8_sse2_vp9 = vp9_sad8x8_sse2; -const sad_m_by_n_fn_vp9_t sad_8x4_sse2_vp9 = vp9_sad8x4_sse2; -const sad_m_by_n_test_param_vp9_t sse2_vp9_tests[] = { +const SadMxNVp9Func sad_64x64_sse2_vp9 = vp9_sad64x64_sse2; +const SadMxNVp9Func sad_64x32_sse2_vp9 = vp9_sad64x32_sse2; +const SadMxNVp9Func sad_32x64_sse2_vp9 = vp9_sad32x64_sse2; +const SadMxNVp9Func sad_32x32_sse2_vp9 = vp9_sad32x32_sse2; +const SadMxNVp9Func sad_32x16_sse2_vp9 = vp9_sad32x16_sse2; +const SadMxNVp9Func sad_16x32_sse2_vp9 = vp9_sad16x32_sse2; +const SadMxNVp9Func sad_16x16_sse2_vp9 = vp9_sad16x16_sse2; +const SadMxNVp9Func sad_16x8_sse2_vp9 = vp9_sad16x8_sse2; +const SadMxNVp9Func sad_8x16_sse2_vp9 = vp9_sad8x16_sse2; +const SadMxNVp9Func sad_8x8_sse2_vp9 = vp9_sad8x8_sse2; +const SadMxNVp9Func sad_8x4_sse2_vp9 = vp9_sad8x4_sse2; +const SadMxNVp9Param sse2_vp9_tests[] = { make_tuple(64, 64, sad_64x64_sse2_vp9), make_tuple(64, 32, sad_64x32_sse2_vp9), make_tuple(32, 64, sad_32x64_sse2_vp9), @@ -585,17 +585,17 @@ const sad_m_by_n_test_param_vp9_t sse2_vp9_tests[] = { }; INSTANTIATE_TEST_CASE_P(SSE2, SADVP9Test, ::testing::ValuesIn(sse2_vp9_tests)); -const sad_n_by_n_by_4_fn_t sad_64x64x4d_sse2 = vp9_sad64x64x4d_sse2; -const sad_n_by_n_by_4_fn_t sad_64x32x4d_sse2 = vp9_sad64x32x4d_sse2; -const sad_n_by_n_by_4_fn_t sad_32x64x4d_sse2 = vp9_sad32x64x4d_sse2; -const sad_n_by_n_by_4_fn_t sad_32x32x4d_sse2 = vp9_sad32x32x4d_sse2; -const sad_n_by_n_by_4_fn_t sad_32x16x4d_sse2 = vp9_sad32x16x4d_sse2; -const sad_n_by_n_by_4_fn_t sad_16x32x4d_sse2 = vp9_sad16x32x4d_sse2; -const sad_n_by_n_by_4_fn_t sad_16x16x4d_sse2 = vp9_sad16x16x4d_sse2; -const sad_n_by_n_by_4_fn_t sad_16x8x4d_sse2 = vp9_sad16x8x4d_sse2; -const sad_n_by_n_by_4_fn_t sad_8x16x4d_sse2 = vp9_sad8x16x4d_sse2; -const sad_n_by_n_by_4_fn_t sad_8x8x4d_sse2 = vp9_sad8x8x4d_sse2; -const sad_n_by_n_by_4_fn_t sad_8x4x4d_sse2 = vp9_sad8x4x4d_sse2; +const SadMxNx4Func sad_64x64x4d_sse2 = vp9_sad64x64x4d_sse2; +const SadMxNx4Func sad_64x32x4d_sse2 = vp9_sad64x32x4d_sse2; +const SadMxNx4Func sad_32x64x4d_sse2 = vp9_sad32x64x4d_sse2; +const SadMxNx4Func sad_32x32x4d_sse2 = vp9_sad32x32x4d_sse2; +const SadMxNx4Func 
sad_32x16x4d_sse2 = vp9_sad32x16x4d_sse2; +const SadMxNx4Func sad_16x32x4d_sse2 = vp9_sad16x32x4d_sse2; +const SadMxNx4Func sad_16x16x4d_sse2 = vp9_sad16x16x4d_sse2; +const SadMxNx4Func sad_16x8x4d_sse2 = vp9_sad16x8x4d_sse2; +const SadMxNx4Func sad_8x16x4d_sse2 = vp9_sad8x16x4d_sse2; +const SadMxNx4Func sad_8x8x4d_sse2 = vp9_sad8x8x4d_sse2; +const SadMxNx4Func sad_8x4x4d_sse2 = vp9_sad8x4x4d_sse2; INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::Values( make_tuple(64, 64, sad_64x64x4d_sse2), make_tuple(64, 32, sad_64x32x4d_sse2), @@ -614,11 +614,11 @@ INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::Values( #if HAVE_SSE3 #if CONFIG_VP8_ENCODER -const sad_n_by_n_by_4_fn_t sad_16x16x4d_sse3 = vp8_sad16x16x4d_sse3; -const sad_n_by_n_by_4_fn_t sad_16x8x4d_sse3 = vp8_sad16x8x4d_sse3; -const sad_n_by_n_by_4_fn_t sad_8x16x4d_sse3 = vp8_sad8x16x4d_sse3; -const sad_n_by_n_by_4_fn_t sad_8x8x4d_sse3 = vp8_sad8x8x4d_sse3; -const sad_n_by_n_by_4_fn_t sad_4x4x4d_sse3 = vp8_sad4x4x4d_sse3; +const SadMxNx4Func sad_16x16x4d_sse3 = vp8_sad16x16x4d_sse3; +const SadMxNx4Func sad_16x8x4d_sse3 = vp8_sad16x8x4d_sse3; +const SadMxNx4Func sad_8x16x4d_sse3 = vp8_sad8x16x4d_sse3; +const SadMxNx4Func sad_8x8x4d_sse3 = vp8_sad8x8x4d_sse3; +const SadMxNx4Func sad_4x4x4d_sse3 = vp8_sad4x4x4d_sse3; INSTANTIATE_TEST_CASE_P(SSE3, SADx4Test, ::testing::Values( make_tuple(16, 16, sad_16x16x4d_sse3), make_tuple(16, 8, sad_16x8x4d_sse3), @@ -631,7 +631,7 @@ INSTANTIATE_TEST_CASE_P(SSE3, SADx4Test, ::testing::Values( #if HAVE_SSSE3 #if CONFIG_USE_X86INC #if CONFIG_VP8_ENCODER -const sad_m_by_n_fn_t sad_16x16_sse3 = vp8_sad16x16_sse3; +const SadMxNFunc sad_16x16_sse3 = vp8_sad16x16_sse3; INSTANTIATE_TEST_CASE_P(SSE3, SADTest, ::testing::Values( make_tuple(16, 16, sad_16x16_sse3))); #endif // CONFIG_VP8_ENCODER @@ -640,19 +640,9 @@ INSTANTIATE_TEST_CASE_P(SSE3, SADTest, ::testing::Values( #if HAVE_AVX2 #if CONFIG_VP9_ENCODER -// TODO(jzern): these prototypes can be removed after the avx2 versions are -// reenabled in vp9_rtcd_defs.pl. 
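For orientation before the AVX2 cleanup below: a function bound to SadMxNFunc returns the sum of absolute differences between an MxN source block and a reference block, each addressed through its own stride, and the VP8 flavor may stop early once max_sad is exceeded. The sketch that follows is illustrative only and is not part of this patch; libvpx's real implementations are the per-size C and SIMD functions instantiated throughout this file. A concrete instantiation such as ReferenceSadMxN<16, 16> matches the SadMxNFunc signature.

```cpp
// Illustrative reference SAD (not libvpx code). Instantiate per block
// size, e.g. ReferenceSadMxN<16, 16>, to obtain a SadMxNFunc-compatible
// function.
template <int kWidth, int kHeight>
unsigned int ReferenceSadMxN(const unsigned char *source_ptr,
                             int source_stride,
                             const unsigned char *reference_ptr,
                             int reference_stride,
                             unsigned int max_sad) {
  unsigned int sad = 0;
  for (int y = 0; y < kHeight; ++y) {
    for (int x = 0; x < kWidth; ++x) {
      const int s = source_ptr[x];
      const int r = reference_ptr[x];
      sad += (s > r) ? (s - r) : (r - s);
    }
    // Early exit: the encoder only needs to know this candidate is already
    // worse than the best match found so far.
    if (sad > max_sad) return sad;
    source_ptr += source_stride;
    reference_ptr += reference_stride;
  }
  return sad;
}
```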
-extern "C" { -void vp9_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_ptr[], int ref_stride, - unsigned int *sad_array); -void vp9_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_ptr[], int ref_stride, - unsigned int *sad_array); -} -const sad_n_by_n_by_4_fn_t sad_64x64x4d_avx2 = vp9_sad64x64x4d_avx2; -const sad_n_by_n_by_4_fn_t sad_32x32x4d_avx2 = vp9_sad32x32x4d_avx2; -INSTANTIATE_TEST_CASE_P(DISABLED_AVX2, SADx4Test, ::testing::Values( +const SadMxNx4Func sad_64x64x4d_avx2 = vp9_sad64x64x4d_avx2; +const SadMxNx4Func sad_32x32x4d_avx2 = vp9_sad32x32x4d_avx2; +INSTANTIATE_TEST_CASE_P(AVX2, SADx4Test, ::testing::Values( make_tuple(32, 32, sad_32x32x4d_avx2), make_tuple(64, 64, sad_64x64x4d_avx2))); #endif // CONFIG_VP9_ENCODER diff --git a/test/sixtap_predict_test.cc b/test/sixtap_predict_test.cc index ac7aa9947..1e6d91547 100644 --- a/test/sixtap_predict_test.cc +++ b/test/sixtap_predict_test.cc @@ -23,17 +23,17 @@ namespace { -typedef void (*sixtap_predict_fn_t)(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch); +typedef void (*SixtapPredictFunc)(uint8_t *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + uint8_t *dst_ptr, + int dst_pitch); -typedef std::tr1::tuple sixtap_predict_param_t; +typedef std::tr1::tuple SixtapPredictParam; class SixtapPredictTest - : public ::testing::TestWithParam { + : public ::testing::TestWithParam { public: static void SetUpTestCase() { src_ = reinterpret_cast(vpx_memalign(kDataAlignment, kSrcSize)); @@ -74,7 +74,7 @@ class SixtapPredictTest int width_; int height_; - sixtap_predict_fn_t sixtap_predict_; + SixtapPredictFunc sixtap_predict_; // The src stores the macroblock we will filter on, and makes it 1 byte larger // in order to test unaligned access. The result is stored in dst and dst_c(c // reference code result). 
@@ -184,10 +184,10 @@ TEST_P(SixtapPredictTest, TestWithRandomData) { using std::tr1::make_tuple; -const sixtap_predict_fn_t sixtap_16x16_c = vp8_sixtap_predict16x16_c; -const sixtap_predict_fn_t sixtap_8x8_c = vp8_sixtap_predict8x8_c; -const sixtap_predict_fn_t sixtap_8x4_c = vp8_sixtap_predict8x4_c; -const sixtap_predict_fn_t sixtap_4x4_c = vp8_sixtap_predict4x4_c; +const SixtapPredictFunc sixtap_16x16_c = vp8_sixtap_predict16x16_c; +const SixtapPredictFunc sixtap_8x8_c = vp8_sixtap_predict8x8_c; +const SixtapPredictFunc sixtap_8x4_c = vp8_sixtap_predict8x4_c; +const SixtapPredictFunc sixtap_4x4_c = vp8_sixtap_predict4x4_c; INSTANTIATE_TEST_CASE_P( C, SixtapPredictTest, ::testing::Values( make_tuple(16, 16, sixtap_16x16_c), @@ -195,9 +195,9 @@ INSTANTIATE_TEST_CASE_P( make_tuple(8, 4, sixtap_8x4_c), make_tuple(4, 4, sixtap_4x4_c))); #if HAVE_NEON -const sixtap_predict_fn_t sixtap_16x16_neon = vp8_sixtap_predict16x16_neon; -const sixtap_predict_fn_t sixtap_8x8_neon = vp8_sixtap_predict8x8_neon; -const sixtap_predict_fn_t sixtap_8x4_neon = vp8_sixtap_predict8x4_neon; +const SixtapPredictFunc sixtap_16x16_neon = vp8_sixtap_predict16x16_neon; +const SixtapPredictFunc sixtap_8x8_neon = vp8_sixtap_predict8x8_neon; +const SixtapPredictFunc sixtap_8x4_neon = vp8_sixtap_predict8x4_neon; INSTANTIATE_TEST_CASE_P( DISABLED_NEON, SixtapPredictTest, ::testing::Values( make_tuple(16, 16, sixtap_16x16_neon), @@ -205,10 +205,10 @@ INSTANTIATE_TEST_CASE_P( make_tuple(8, 4, sixtap_8x4_neon))); #endif #if HAVE_MMX -const sixtap_predict_fn_t sixtap_16x16_mmx = vp8_sixtap_predict16x16_mmx; -const sixtap_predict_fn_t sixtap_8x8_mmx = vp8_sixtap_predict8x8_mmx; -const sixtap_predict_fn_t sixtap_8x4_mmx = vp8_sixtap_predict8x4_mmx; -const sixtap_predict_fn_t sixtap_4x4_mmx = vp8_sixtap_predict4x4_mmx; +const SixtapPredictFunc sixtap_16x16_mmx = vp8_sixtap_predict16x16_mmx; +const SixtapPredictFunc sixtap_8x8_mmx = vp8_sixtap_predict8x8_mmx; +const SixtapPredictFunc sixtap_8x4_mmx = vp8_sixtap_predict8x4_mmx; +const SixtapPredictFunc sixtap_4x4_mmx = vp8_sixtap_predict4x4_mmx; INSTANTIATE_TEST_CASE_P( MMX, SixtapPredictTest, ::testing::Values( make_tuple(16, 16, sixtap_16x16_mmx), @@ -217,9 +217,9 @@ INSTANTIATE_TEST_CASE_P( make_tuple(4, 4, sixtap_4x4_mmx))); #endif #if HAVE_SSE2 -const sixtap_predict_fn_t sixtap_16x16_sse2 = vp8_sixtap_predict16x16_sse2; -const sixtap_predict_fn_t sixtap_8x8_sse2 = vp8_sixtap_predict8x8_sse2; -const sixtap_predict_fn_t sixtap_8x4_sse2 = vp8_sixtap_predict8x4_sse2; +const SixtapPredictFunc sixtap_16x16_sse2 = vp8_sixtap_predict16x16_sse2; +const SixtapPredictFunc sixtap_8x8_sse2 = vp8_sixtap_predict8x8_sse2; +const SixtapPredictFunc sixtap_8x4_sse2 = vp8_sixtap_predict8x4_sse2; INSTANTIATE_TEST_CASE_P( SSE2, SixtapPredictTest, ::testing::Values( make_tuple(16, 16, sixtap_16x16_sse2), @@ -227,10 +227,10 @@ INSTANTIATE_TEST_CASE_P( make_tuple(8, 4, sixtap_8x4_sse2))); #endif #if HAVE_SSSE3 -const sixtap_predict_fn_t sixtap_16x16_ssse3 = vp8_sixtap_predict16x16_ssse3; -const sixtap_predict_fn_t sixtap_8x8_ssse3 = vp8_sixtap_predict8x8_ssse3; -const sixtap_predict_fn_t sixtap_8x4_ssse3 = vp8_sixtap_predict8x4_ssse3; -const sixtap_predict_fn_t sixtap_4x4_ssse3 = vp8_sixtap_predict4x4_ssse3; +const SixtapPredictFunc sixtap_16x16_ssse3 = vp8_sixtap_predict16x16_ssse3; +const SixtapPredictFunc sixtap_8x8_ssse3 = vp8_sixtap_predict8x8_ssse3; +const SixtapPredictFunc sixtap_8x4_ssse3 = vp8_sixtap_predict8x4_ssse3; +const SixtapPredictFunc sixtap_4x4_ssse3 = vp8_sixtap_predict4x4_ssse3; 
INSTANTIATE_TEST_CASE_P( SSSE3, SixtapPredictTest, ::testing::Values( make_tuple(16, 16, sixtap_16x16_ssse3), diff --git a/test/subtract_test.cc b/test/subtract_test.cc index 2db3dd785..6619fb158 100644 --- a/test/subtract_test.cc +++ b/test/subtract_test.cc @@ -18,11 +18,11 @@ #include "vp8/encoder/block.h" #include "vpx_mem/vpx_mem.h" -typedef void (*subtract_b_fn_t)(BLOCK *be, BLOCKD *bd, int pitch); +typedef void (*SubtractBlockFunc)(BLOCK *be, BLOCKD *bd, int pitch); namespace { -class SubtractBlockTest : public ::testing::TestWithParam { +class SubtractBlockTest : public ::testing::TestWithParam { public: virtual void TearDown() { libvpx_test::ClearSystemState(); diff --git a/test/test-data.sha1 b/test/test-data.sha1 index 74f7842d4..ee6289f1e 100644 --- a/test/test-data.sha1 +++ b/test/test-data.sha1 @@ -669,3 +669,15 @@ c01bb7938f9a9f25e0c37afdec2f2fb73b6cc7fa vp90-2-17-show-existing-frame.webm cc75f351818b9a619818f5cc77b9bc013d0c1e11 vp90-2-17-show-existing-frame.webm.md5 0321d507ce62dedc8a51b4e9011f7a19aed9c3dc vp91-2-04-yuv444.webm 367e423dd41fdb49aa028574a2cfec5c2f325c5c vp91-2-04-yuv444.webm.md5 +eb438c6540eb429f74404eedfa3228d409c57874 desktop_640_360_30.yuv +89e70ebd22c27d275fe14dc2f1a41841a6d8b9ab kirland_640_480_30.yuv +33c533192759e5bb4f07abfbac389dc259db4686 macmarcomoving_640_480_30.yuv +8bfaab121080821b8f03b23467911e59ec59b8fe macmarcostationary_640_480_30.yuv +70894878d916a599842d9ad0dcd24e10c13e5467 niklas_640_480_30.yuv +8784b6df2d8cc946195a90ac00540500d2e522e4 tacomanarrows_640_480_30.yuv +edd86a1f5e62fd9da9a9d46078247759c2638009 tacomasmallcameramovement_640_480_30.yuv +9a70e8b7d14fba9234d0e51dce876635413ce444 thaloundeskmtg_640_480_30.yuv +e7d315dbf4f3928779e0dc624311196d44491d32 niklas_1280_720_30.yuv +c77e4a26616add298a05dd5d12397be22c0e40c5 vp90-2-18-resize.ivf +c12918cf0a716417fba2de35c3fc5ab90e52dfce vp90-2-18-resize.ivf.md5 +717da707afcaa1f692ff1946f291054eb75a4f06 screendata.y4m diff --git a/test/test.mk b/test/test.mk index 888a4bbc7..44cff1455 100644 --- a/test/test.mk +++ b/test/test.mk @@ -59,6 +59,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ../webmdec.h LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += webm_video_source.h endif +LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += decode_api_test.cc LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += invalid_file_test.cc LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += test_vector_test.cc @@ -69,6 +70,11 @@ ifeq ($(CONFIG_DECODE_PERF_TESTS)$(CONFIG_VP9_DECODER)$(CONFIG_WEBM_IO), \ LIBVPX_TEST_SRCS-yes += decode_perf_test.cc endif +# encode perf tests are vp9 only +ifeq ($(CONFIG_ENCODE_PERF_TESTS)$(CONFIG_VP9_ENCODER), yesyes) +LIBVPX_TEST_SRCS-yes += encode_perf_test.cc +endif + ## ## WHITE BOX TESTS ## @@ -95,6 +101,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc LIBVPX_TEST_SRCS-yes += idct_test.cc LIBVPX_TEST_SRCS-yes += intrapred_test.cc LIBVPX_TEST_SRCS-yes += sixtap_predict_test.cc +LIBVPX_TEST_SRCS-yes += vpx_scale_test.cc endif # VP8 @@ -778,6 +785,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-16-intra-only.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-16-intra-only.webm.md5 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-17-show-existing-frame.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-17-show-existing-frame.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-18-resize.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-18-resize.ivf.md5 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm.md5 @@ -840,3 
+849,15 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \ vp90-2-tos_1920x800_tile_1x4_fpm_2335kbps.webm endif # CONFIG_DECODE_PERF_TESTS + +ifeq ($(CONFIG_ENCODE_PERF_TESTS),yes) +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += desktop_640_360_30.yuv +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += kirland_640_480_30.yuv +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += macmarcomoving_640_480_30.yuv +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += macmarcostationary_640_480_30.yuv +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_640_480_30.yuv +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += tacomanarrows_640_480_30.yuv +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += tacomasmallcameramovement_640_480_30.yuv +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += thaloundeskmtg_640_480_30.yuv +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_1280_720_30.yuv +endif # CONFIG_ENCODE_PERF_TESTS diff --git a/test/test_vectors.cc b/test/test_vectors.cc index 4ea4b9dab..dbdbdd6f9 100644 --- a/test/test_vectors.cc +++ b/test/test_vectors.cc @@ -181,7 +181,7 @@ const char *const kVP9TestVectors[] = { "vp90-2-14-resize-fp-tiles-8-2.webm", "vp90-2-14-resize-fp-tiles-8-4.webm", "vp90-2-15-segkey.webm", "vp90-2-15-segkey_adpq.webm", "vp90-2-16-intra-only.webm", "vp90-2-17-show-existing-frame.webm", - "vp91-2-04-yuv444.webm", + "vp90-2-18-resize.ivf", "vp91-2-04-yuv444.webm", }; const int kNumVP9TestVectors = NELEMENTS(kVP9TestVectors); #endif // CONFIG_VP9_DECODER diff --git a/test/tools_common.sh b/test/tools_common.sh index 3e69c3687..0bfefba46 100755 --- a/test/tools_common.sh +++ b/test/tools_common.sh @@ -182,97 +182,6 @@ webm_io_available() { [ "$(vpx_config_option_enabled CONFIG_WEBM_IO)" = "yes" ] && echo yes } -# Echoes yes to stdout when vpxdec exists according to vpx_tool_available(). -vpxdec_available() { - [ -n $(vpx_tool_available vpxdec) ] && echo yes -} - -# Wrapper function for running vpxdec in noblit mode. Requires that -# LIBVPX_BIN_PATH points to the directory containing vpxdec. Positional -# parameter one is used as the input file path. Positional parameter two, when -# present, is interpreted as a boolean flag that means the input should be sent -# to vpxdec via pipe from cat instead of directly. -vpxdec() { - local input="${1}" - local pipe_input=${2} - - if [ $# -gt 2 ]; then - # shift away $1 and $2 so the remaining arguments can be passed to vpxdec - # via $@. - shift 2 - fi - - local decoder="${LIBVPX_BIN_PATH}/vpxdec${VPX_TEST_EXE_SUFFIX}" - - if [ -z "${pipe_input}" ]; then - eval "${VPX_TEST_PREFIX}" "${decoder}" "$input" --summary --noblit "$@" \ - ${devnull} - else - cat "${input}" \ - | eval "${VPX_TEST_PREFIX}" "${decoder}" - --summary --noblit "$@" \ - ${devnull} - fi -} - -# Echoes yes to stdout when vpxenc exists according to vpx_tool_available(). -vpxenc_available() { - [ -n $(vpx_tool_available vpxenc) ] && echo yes -} - -# Wrapper function for running vpxenc. Positional parameters are interpreted as -# follows: -# 1 - codec name -# 2 - input width -# 3 - input height -# 4 - number of frames to encode -# 5 - path to input file -# 6 - path to output file -# Note: The output file path must end in .ivf to output an IVF file. -# 7 - extra flags -# Note: Extra flags currently supports a special case: when set to "-" -# input is piped to vpxenc via cat. 
-vpxenc() { - local encoder="${LIBVPX_BIN_PATH}/vpxenc${VPX_TEST_EXE_SUFFIX}" - local codec="${1}" - local width=${2} - local height=${3} - local frames=${4} - local input=${5} - local output="${VPX_TEST_OUTPUT_DIR}/${6}" - local extra_flags=${7} - - # Because --ivf must be within the command line to get IVF from vpxenc. - if echo "${output}" | egrep -q 'ivf$'; then - use_ivf=--ivf - else - unset use_ivf - fi - - if [ "${extra_flags}" = "-" ]; then - pipe_input=yes - extra_flags=${8} - else - unset pipe_input - fi - - if [ -z "${pipe_input}" ]; then - eval "${VPX_TEST_PREFIX}" "${encoder}" --codec=${codec} --width=${width} \ - --height=${height} --limit=${frames} ${use_ivf} ${extra_flags} \ - --output="${output}" "${input}" ${devnull} - else - cat "${input}" \ - | eval "${VPX_TEST_PREFIX}" "${encoder}" --codec=${codec} \ - --width=${width} --height=${height} --limit=${frames} ${use_ivf} \ - ${extra_flags} --output="${output}" - ${devnull} - fi - - if [ ! -e "${output}" ]; then - # Return non-zero exit status: output file doesn't exist, so something - # definitely went wrong. - return 1 - fi -} - # Filters strings from positional parameter one using the filter specified by # positional parameter two. Filter behavior depends on the presence of a third # positional parameter. When parameter three is present, strings that match the diff --git a/test/variance_test.cc b/test/variance_test.cc index 546977069..7d8118235 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -90,14 +90,14 @@ class VarianceTest rnd(ACMRandom::DeterministicSeed()); block_size_ = width_ * height_; - src_ = new uint8_t[block_size_]; + src_ = reinterpret_cast(vpx_memalign(16, block_size_)); ref_ = new uint8_t[block_size_]; ASSERT_TRUE(src_ != NULL); ASSERT_TRUE(ref_ != NULL); } virtual void TearDown() { - delete[] src_; + vpx_free(src_); delete[] ref_; libvpx_test::ClearSystemState(); } @@ -707,24 +707,7 @@ INSTANTIATE_TEST_CASE_P( #endif #if HAVE_AVX2 -// TODO(jzern): these prototypes can be removed after the avx2 versions are -// reenabled in vp9_rtcd_defs.pl. 
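Two changes meet here: just below, variance_test.cc drops stale AVX2 prototypes, but note first the allocation change above, where src_ moves from operator new[] to vpx_memalign(16, ...). SIMD variance kernels may issue aligned 16-byte loads (SSE2 movdqa, NEON vld1q), and a new[] buffer gives no alignment guarantee. A minimal sketch of the pairing rule, illustrative and not part of the patch:

```cpp
#include "vpx_mem/vpx_mem.h"

// Memory from vpx_memalign() must be released with vpx_free(); pairing
// it with delete[] or free() is undefined behavior.
unsigned char *AllocAlignedBlock(int block_size) {
  return reinterpret_cast<unsigned char *>(vpx_memalign(16, block_size));
}

void ReleaseAlignedBlock(unsigned char *block) {
  vpx_free(block);
}
```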
-extern "C" { -unsigned int vp9_sub_pixel_variance32x32_avx2( - const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, - const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance64x64_avx2( - const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, - const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_avg_variance32x32_avx2( - const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, - const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, - const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance64x64_avx2( - const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, - const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, - const uint8_t *second_pred); -} + const vp9_variance_fn_t variance16x16_avx2 = vp9_variance16x16_avx2; const vp9_variance_fn_t variance32x16_avx2 = vp9_variance32x16_avx2; const vp9_variance_fn_t variance32x32_avx2 = vp9_variance32x32_avx2; @@ -743,7 +726,7 @@ const vp9_subpixvariance_fn_t subpel_variance32x32_avx2 = const vp9_subpixvariance_fn_t subpel_variance64x64_avx2 = vp9_sub_pixel_variance64x64_avx2; INSTANTIATE_TEST_CASE_P( - DISABLED_AVX2, VP9SubpelVarianceTest, + AVX2, VP9SubpelVarianceTest, ::testing::Values(make_tuple(5, 5, subpel_variance32x32_avx2), make_tuple(6, 6, subpel_variance64x64_avx2))); @@ -752,10 +735,32 @@ const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_avx2 = const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_avx2 = vp9_sub_pixel_avg_variance64x64_avx2; INSTANTIATE_TEST_CASE_P( - DISABLED_AVX2, VP9SubpelAvgVarianceTest, + AVX2, VP9SubpelAvgVarianceTest, ::testing::Values(make_tuple(5, 5, subpel_avg_variance32x32_avx2), make_tuple(6, 6, subpel_avg_variance64x64_avx2))); #endif // HAVE_AVX2 +#if HAVE_NEON +const vp9_variance_fn_t variance8x8_neon = vp9_variance8x8_neon; +const vp9_variance_fn_t variance16x16_neon = vp9_variance16x16_neon; +const vp9_variance_fn_t variance32x32_neon = vp9_variance32x32_neon; +INSTANTIATE_TEST_CASE_P( + NEON, VP9VarianceTest, + ::testing::Values(make_tuple(3, 3, variance8x8_neon), + make_tuple(4, 4, variance16x16_neon), + make_tuple(5, 5, variance32x32_neon))); + +const vp9_subpixvariance_fn_t subpel_variance8x8_neon = + vp9_sub_pixel_variance8x8_neon; +const vp9_subpixvariance_fn_t subpel_variance16x16_neon = + vp9_sub_pixel_variance16x16_neon; +const vp9_subpixvariance_fn_t subpel_variance32x32_neon = + vp9_sub_pixel_variance32x32_neon; +INSTANTIATE_TEST_CASE_P( + NEON, VP9SubpelVarianceTest, + ::testing::Values(make_tuple(3, 3, subpel_variance8x8_neon), + make_tuple(4, 4, subpel_variance16x16_neon), + make_tuple(5, 5, subpel_variance32x32_neon))); +#endif // HAVE_NEON #endif // CONFIG_VP9_ENCODER } // namespace vp9 diff --git a/test/video_source.h b/test/video_source.h index 94500b43d..c924f964f 100644 --- a/test/video_source.h +++ b/test/video_source.h @@ -10,6 +10,9 @@ #ifndef TEST_VIDEO_SOURCE_H_ #define TEST_VIDEO_SOURCE_H_ +#if defined(_WIN32) +#include +#endif #include #include #include @@ -50,14 +53,57 @@ static FILE *OpenTestDataFile(const std::string& file_name) { return fopen(path_to_source.c_str(), "rb"); } -static FILE *OpenTestOutFile(const std::string& file_name) { - const std::string path_to_source = GetDataPath() + "/" + file_name; - return fopen(path_to_source.c_str(), "wb"); +static FILE *GetTempOutFile(std::string *file_name) { + file_name->clear(); +#if defined(_WIN32) + char fname[MAX_PATH]; + char tmppath[MAX_PATH]; + if 
(GetTempPathA(MAX_PATH, tmppath)) { + // Assume for now that the filename generated is unique per process + if (GetTempFileNameA(tmppath, "lvx", 0, fname)) { + file_name->assign(fname); + return fopen(fname, "wb+"); + } + } + return NULL; +#else + return tmpfile(); +#endif } -static FILE *OpenTempOutFile() { - return tmpfile(); -} +class TempOutFile { + public: + TempOutFile() { + file_ = GetTempOutFile(&file_name_); + } + ~TempOutFile() { + CloseFile(); + if (!file_name_.empty()) { + EXPECT_EQ(0, remove(file_name_.c_str())); + } + } + FILE *file() { + return file_; + } + const std::string& file_name() { + return file_name_; + } + + protected: + void CloseFile() { + if (file_) { + // Close if file pointer is associated with an open file +#if defined(_WIN32) + if (file_->_ptr != NULL) fclose(file_); +#else + if (fileno(file_) != -1) fclose(file_); +#endif + file_ = NULL; + } + } + FILE *file_; + std::string file_name_; +}; // Abstract base class for test video sources, which provide a stream of // vpx_image_t images with associated timestamps and duration. diff --git a/test/vp9_spatial_svc_encoder.sh b/test/vp9_spatial_svc_encoder.sh index a5728f677..6dd5f171b 100755 --- a/test/vp9_spatial_svc_encoder.sh +++ b/test/vp9_spatial_svc_encoder.sh @@ -25,12 +25,13 @@ vp9_spatial_svc_encoder_verify_environment() { # Runs vp9_spatial_svc_encoder. $1 is the test name. vp9_spatial_svc_encoder() { - local encoder="${LIBVPX_BIN_PATH}/vp9_spatial_svc_encoder" - encoder="${encoder}${VPX_TEST_EXE_SUFFIX}" - local test_name="$1" - local output_file="${VPX_TEST_OUTPUT_DIR}/vp9_ssvc_encoder${test_name}.ivf" - local frames_to_encode="10" - local max_kf="9999" + local readonly \ + encoder="${LIBVPX_BIN_PATH}/vp9_spatial_svc_encoder${VPX_TEST_EXE_SUFFIX}" + local readonly test_name="$1" + local readonly \ + output_file="${VPX_TEST_OUTPUT_DIR}/vp9_ssvc_encoder${test_name}.ivf" + local readonly frames_to_encode=10 + local readonly max_kf=9999 shift @@ -40,52 +41,32 @@ vp9_spatial_svc_encoder() { fi eval "${VPX_TEST_PREFIX}" "${encoder}" -w "${YUV_RAW_INPUT_WIDTH}" \ - -h "${YUV_RAW_INPUT_HEIGHT}" -k "${max_kf}" -f "${frames_to_encode}" \ - "$@" "${YUV_RAW_INPUT}" "${output_file}" ${devnull} + -h "${YUV_RAW_INPUT_HEIGHT}" -k "${max_kf}" -f "${frames_to_encode}" \ + "$@" "${YUV_RAW_INPUT}" "${output_file}" ${devnull} [ -e "${output_file}" ] || return 1 } -# Each mode is run with layer count 1-$vp9_ssvc_test_layers. +# Each test is run with layer count 1-$vp9_ssvc_test_layers. 
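The TempOutFile class introduced in video_source.h above hides a platform split: POSIX tmpfile() returns an anonymous FILE* that the OS deletes automatically on close, while Windows needs a named file from GetTempFileNameA that must be closed and removed explicitly, which the destructor handles. An illustrative usage sketch, not part of the patch:

```cpp
#include <cstdio>
#include "test/video_source.h"

void WriteScratch() {
  libvpx_test::TempOutFile tmp;       // opens a writable temp file
  if (tmp.file() == NULL) return;     // creation can fail
  fputs("scratch bytes\n", tmp.file());
  // No explicit cleanup: ~TempOutFile() closes the FILE* and, when the
  // name is known (the Windows path), removes the file from disk.
}
```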
vp9_ssvc_test_layers=5 -vp9_spatial_svc_mode_i() { +vp9_spatial_svc() { if [ "$(vp9_encode_available)" = "yes" ]; then - local test_name="${FUNCNAME}" + local readonly test_name="vp9_spatial_svc" for layers in $(seq 1 ${vp9_ssvc_test_layers}); do - vp9_spatial_svc_encoder "${test_name}" -m i -l ${layers} + vp9_spatial_svc_encoder "${test_name}" -l ${layers} done fi } -vp9_spatial_svc_mode_altip() { - if [ "$(vp9_encode_available)" = "yes" ]; then - local test_name="${FUNCNAME}" - for layers in $(seq 1 ${vp9_ssvc_test_layers}); do - vp9_spatial_svc_encoder "${test_name}" -m "alt-ip" -l ${layers} - done - fi -} +readonly vp9_spatial_svc_tests="DISABLED_vp9_spatial_svc_mode_i + DISABLED_vp9_spatial_svc_mode_altip + DISABLED_vp9_spatial_svc_mode_ip + DISABLED_vp9_spatial_svc_mode_gf + vp9_spatial_svc" -vp9_spatial_svc_mode_ip() { - if [ "$(vp9_encode_available)" = "yes" ]; then - local test_name="${FUNCNAME}" - vp9_spatial_svc_encoder "${test_name}" -m ip -l 1 - fi -} - -vp9_spatial_svc_mode_gf() { - if [ "$(vp9_encode_available)" = "yes" ]; then - local test_name="${FUNCNAME}" - for layers in $(seq 1 ${vp9_ssvc_test_layers}); do - vp9_spatial_svc_encoder "${test_name}" -m gf -l ${layers} - done - fi -} - -vp9_spatial_svc_tests="vp9_spatial_svc_mode_i - vp9_spatial_svc_mode_altip - vp9_spatial_svc_mode_ip - vp9_spatial_svc_mode_gf" - -run_tests vp9_spatial_svc_encoder_verify_environment "${vp9_spatial_svc_tests}" +if [ "$(vpx_config_option_enabled CONFIG_SPATIAL_SVC)" = "yes" ]; then + run_tests \ + vp9_spatial_svc_encoder_verify_environment \ + "${vp9_spatial_svc_tests}" +fi diff --git a/test/vp9_subtract_test.cc b/test/vp9_subtract_test.cc index d7df2867d..fabb43824 100644 --- a/test/vp9_subtract_test.cc +++ b/test/vp9_subtract_test.cc @@ -17,14 +17,14 @@ #include "vp9/common/vp9_blockd.h" #include "vpx_mem/vpx_mem.h" -typedef void (*subtract_fn_t)(int rows, int cols, - int16_t *diff_ptr, ptrdiff_t diff_stride, - const uint8_t *src_ptr, ptrdiff_t src_stride, - const uint8_t *pred_ptr, ptrdiff_t pred_stride); +typedef void (*SubtractFunc)(int rows, int cols, + int16_t *diff_ptr, ptrdiff_t diff_stride, + const uint8_t *src_ptr, ptrdiff_t src_stride, + const uint8_t *pred_ptr, ptrdiff_t pred_stride); namespace vp9 { -class VP9SubtractBlockTest : public ::testing::TestWithParam { +class VP9SubtractBlockTest : public ::testing::TestWithParam { public: virtual void TearDown() { libvpx_test::ClearSystemState(); @@ -95,4 +95,9 @@ INSTANTIATE_TEST_CASE_P(C, VP9SubtractBlockTest, INSTANTIATE_TEST_CASE_P(SSE2, VP9SubtractBlockTest, ::testing::Values(vp9_subtract_block_sse2)); #endif +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P(NEON, VP9SubtractBlockTest, + ::testing::Values(vp9_subtract_block_neon)); +#endif + } // namespace vp9 diff --git a/test/vpx_scale_test.cc b/test/vpx_scale_test.cc new file mode 100644 index 000000000..b3302d942 --- /dev/null +++ b/test/vpx_scale_test.cc @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "test/clear_system_state.h" +#include "test/register_state_check.h" + +#include "./vpx_config.h" +#include "./vpx_scale_rtcd.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_scale/yv12config.h" + +namespace { + +typedef void (*ExtendFrameBorderFunc)(YV12_BUFFER_CONFIG *ybf); +typedef void (*CopyFrameFunc)(const YV12_BUFFER_CONFIG *src_ybf, + YV12_BUFFER_CONFIG *dst_ybf); + +class VpxScaleBase { + public: + virtual ~VpxScaleBase() { + libvpx_test::ClearSystemState(); + } + + void ResetImage(int width, int height) { + width_ = width; + height_ = height; + vpx_memset(&img_, 0, sizeof(img_)); + ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&img_, width_, height_, + VP8BORDERINPIXELS)); + vpx_memset(img_.buffer_alloc, kBufFiller, img_.frame_size); + FillPlane(img_.y_buffer, img_.y_crop_width, img_.y_crop_height, + img_.y_stride); + FillPlane(img_.u_buffer, img_.uv_crop_width, img_.uv_crop_height, + img_.uv_stride); + FillPlane(img_.v_buffer, img_.uv_crop_width, img_.uv_crop_height, + img_.uv_stride); + + vpx_memset(&ref_img_, 0, sizeof(ref_img_)); + ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&ref_img_, width_, height_, + VP8BORDERINPIXELS)); + vpx_memset(ref_img_.buffer_alloc, kBufFiller, ref_img_.frame_size); + + vpx_memset(&cpy_img_, 0, sizeof(cpy_img_)); + ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&cpy_img_, width_, height_, + VP8BORDERINPIXELS)); + vpx_memset(cpy_img_.buffer_alloc, kBufFiller, cpy_img_.frame_size); + ReferenceCopyFrame(); + } + + void DeallocImage() { + vp8_yv12_de_alloc_frame_buffer(&img_); + vp8_yv12_de_alloc_frame_buffer(&ref_img_); + vp8_yv12_de_alloc_frame_buffer(&cpy_img_); + } + + protected: + static const int kBufFiller = 123; + static const int kBufMax = kBufFiller - 1; + + static void FillPlane(uint8_t *buf, int width, int height, int stride) { + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + buf[x + (y * stride)] = (x + (width * y)) % kBufMax; + } + } + } + + static void ExtendPlane(uint8_t *buf, int crop_width, int crop_height, + int width, int height, int stride, int padding) { + // Copy the outermost visible pixel to a distance of at least 'padding.' + // The buffers are allocated such that there may be excess space outside the + // padding. As long as the minimum amount of padding is achieved it is not + // necessary to fill this space as well. + uint8_t *left = buf - padding; + uint8_t *right = buf + crop_width; + const int right_extend = padding + (width - crop_width); + const int bottom_extend = padding + (height - crop_height); + + // Fill the border pixels from the nearest image pixel. + for (int y = 0; y < crop_height; ++y) { + vpx_memset(left, left[padding], padding); + vpx_memset(right, right[-1], right_extend); + left += stride; + right += stride; + } + + left = buf - padding; + uint8_t *top = left - (stride * padding); + // The buffer does not always extend as far as the stride. + // Equivalent to padding + width + padding. + const int extend_width = padding + crop_width + right_extend; + + // The first row was already extended to the left and right. Copy it up. 
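The replication rule that ExtendPlane implements — every border pixel takes the value of the nearest visible pixel, with the loop that follows finishing the job by copying the already-extended first and last rows upward and downward — can be seen in one dimension with a toy example (illustrative only, not part of the patch):

```cpp
#include <cstdio>
#include <cstring>

int main() {
  // A row of 4 visible pixels with 2 pixels of padding on each side.
  unsigned char row[8] = { 0, 0, 10, 20, 30, 40, 0, 0 };
  unsigned char *visible = row + 2;
  memset(visible - 2, visible[0], 2);  // left border:  10 10
  memset(visible + 4, visible[3], 2);  // right border: 40 40
  for (int i = 0; i < 8; ++i) printf("%d ", row[i]);
  // Prints: 10 10 10 20 30 40 40 40
  return 0;
}
```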
+ for (int y = 0; y < padding; ++y) { + vpx_memcpy(top, left, extend_width); + top += stride; + } + + uint8_t *bottom = left + (crop_height * stride); + for (int y = 0; y < bottom_extend; ++y) { + vpx_memcpy(bottom, left + (crop_height - 1) * stride, extend_width); + bottom += stride; + } + } + + void ReferenceExtendBorder() { + ExtendPlane(ref_img_.y_buffer, + ref_img_.y_crop_width, ref_img_.y_crop_height, + ref_img_.y_width, ref_img_.y_height, + ref_img_.y_stride, + ref_img_.border); + ExtendPlane(ref_img_.u_buffer, + ref_img_.uv_crop_width, ref_img_.uv_crop_height, + ref_img_.uv_width, ref_img_.uv_height, + ref_img_.uv_stride, + ref_img_.border / 2); + ExtendPlane(ref_img_.v_buffer, + ref_img_.uv_crop_width, ref_img_.uv_crop_height, + ref_img_.uv_width, ref_img_.uv_height, + ref_img_.uv_stride, + ref_img_.border / 2); + } + + void ReferenceCopyFrame() { + // Copy img_ to ref_img_ and extend frame borders. This will be used for + // verifying extend_fn_ as well as copy_frame_fn_. + EXPECT_EQ(ref_img_.frame_size, img_.frame_size); + for (int y = 0; y < img_.y_crop_height; ++y) { + for (int x = 0; x < img_.y_crop_width; ++x) { + ref_img_.y_buffer[x + y * ref_img_.y_stride] = + img_.y_buffer[x + y * img_.y_stride]; + } + } + + for (int y = 0; y < img_.uv_crop_height; ++y) { + for (int x = 0; x < img_.uv_crop_width; ++x) { + ref_img_.u_buffer[x + y * ref_img_.uv_stride] = + img_.u_buffer[x + y * img_.uv_stride]; + ref_img_.v_buffer[x + y * ref_img_.uv_stride] = + img_.v_buffer[x + y * img_.uv_stride]; + } + } + + ReferenceExtendBorder(); + } + + void CompareImages(const YV12_BUFFER_CONFIG actual) { + EXPECT_EQ(ref_img_.frame_size, actual.frame_size); + EXPECT_EQ(0, memcmp(ref_img_.buffer_alloc, actual.buffer_alloc, + ref_img_.frame_size)); + } + + YV12_BUFFER_CONFIG img_; + YV12_BUFFER_CONFIG ref_img_; + YV12_BUFFER_CONFIG cpy_img_; + int width_; + int height_; +}; + +class ExtendBorderTest + : public VpxScaleBase, + public ::testing::TestWithParam { + public: + virtual ~ExtendBorderTest() {} + + protected: + virtual void SetUp() { + extend_fn_ = GetParam(); + } + + void ExtendBorder() { + ASM_REGISTER_STATE_CHECK(extend_fn_(&img_)); + } + + void RunTest() { +#if ARCH_ARM + // Some arm devices OOM when trying to allocate the largest buffers. + static const int kNumSizesToTest = 6; +#else + static const int kNumSizesToTest = 7; +#endif + static const int kSizesToTest[] = {1, 15, 33, 145, 512, 1025, 16383}; + for (int h = 0; h < kNumSizesToTest; ++h) { + for (int w = 0; w < kNumSizesToTest; ++w) { + ResetImage(kSizesToTest[w], kSizesToTest[h]); + ExtendBorder(); + ReferenceExtendBorder(); + CompareImages(img_); + DeallocImage(); + } + } + } + + ExtendFrameBorderFunc extend_fn_; +}; + +TEST_P(ExtendBorderTest, ExtendBorder) { + ASSERT_NO_FATAL_FAILURE(RunTest()); +} + +INSTANTIATE_TEST_CASE_P(C, ExtendBorderTest, + ::testing::Values(vp8_yv12_extend_frame_borders_c)); + +class CopyFrameTest + : public VpxScaleBase, + public ::testing::TestWithParam { + public: + virtual ~CopyFrameTest() {} + + protected: + virtual void SetUp() { + copy_frame_fn_ = GetParam(); + } + + void CopyFrame() { + ASM_REGISTER_STATE_CHECK(copy_frame_fn_(&img_, &cpy_img_)); + } + + void RunTest() { +#if ARCH_ARM + // Some arm devices OOM when trying to allocate the largest buffers. 
+    static const int kNumSizesToTest = 6;
+#else
+    static const int kNumSizesToTest = 7;
+#endif
+    static const int kSizesToTest[] = {1, 15, 33, 145, 512, 1025, 16383};
+    for (int h = 0; h < kNumSizesToTest; ++h) {
+      for (int w = 0; w < kNumSizesToTest; ++w) {
+        ResetImage(kSizesToTest[w], kSizesToTest[h]);
+        ReferenceCopyFrame();
+        CopyFrame();
+        CompareImages(cpy_img_);
+        DeallocImage();
+      }
+    }
+  }
+
+  CopyFrameFunc copy_frame_fn_;
+};
+
+TEST_P(CopyFrameTest, CopyFrame) {
+  ASSERT_NO_FATAL_FAILURE(RunTest());
+}
+
+INSTANTIATE_TEST_CASE_P(C, CopyFrameTest,
+                        ::testing::Values(vp8_yv12_copy_frame_c));
+}  // namespace
diff --git a/test/vpxdec.sh b/test/vpxdec.sh
index 093230b69..836b13cce 100755
--- a/test/vpxdec.sh
+++ b/test/vpxdec.sh
@@ -22,6 +22,32 @@ vpxdec_verify_environment() {
  fi
 }
 
+# Echoes yes to stdout when vpxdec exists according to vpx_tool_available().
+vpxdec_available() {
+  [ -n "$(vpx_tool_available vpxdec)" ] && echo yes
+}
+
+# Wrapper function for running vpxdec with pipe input. Requires that
+# LIBVPX_BIN_PATH points to the directory containing vpxdec. $1 is used as the
+# input file path and shifted away. All remaining parameters are passed through
+# to vpxdec.
+vpxdec_pipe() {
+  local decoder="${LIBVPX_BIN_PATH}/vpxdec${VPX_TEST_EXE_SUFFIX}"
+  local input="$1"
+  shift
+  cat "${input}" | eval "${VPX_TEST_PREFIX}" "${decoder}" - "$@" ${devnull}
+}
+
+# Wrapper function for running vpxdec. Requires that LIBVPX_BIN_PATH points to
+# the directory containing vpxdec. $1 is used as the input file path and
+# shifted away. All remaining parameters are passed through to vpxdec.
+vpxdec() {
+  local decoder="${LIBVPX_BIN_PATH}/vpxdec${VPX_TEST_EXE_SUFFIX}"
+  local input="${1}"
+  shift
+  eval "${VPX_TEST_PREFIX}" "${decoder}" "$input" "$@" ${devnull}
+}
+
 vpxdec_can_decode_vp8() {
  if [ "$(vpxdec_available)" = "yes" ] && \
     [ "$(vp8_decode_available)" = "yes" ]; then
@@ -38,20 +64,20 @@ vpxdec_can_decode_vp9() {
 
 vpxdec_vp8_ivf() {
  if [ "$(vpxdec_can_decode_vp8)" = "yes" ]; then
-    vpxdec "${VP8_IVF_FILE}"
+    vpxdec "${VP8_IVF_FILE}" --summary --noblit
  fi
 }
 
 vpxdec_vp8_ivf_pipe_input() {
  if [ "$(vpxdec_can_decode_vp8)" = "yes" ]; then
-    vpxdec "${VP8_IVF_FILE}" -
+    vpxdec_pipe "${VP8_IVF_FILE}" --summary --noblit
  fi
 }
 
 vpxdec_vp9_webm() {
  if [ "$(vpxdec_can_decode_vp9)" = "yes" ] && \
     [ "$(webm_io_available)" = "yes" ]; then
-    vpxdec "${VP9_WEBM_FILE}"
+    vpxdec "${VP9_WEBM_FILE}" --summary --noblit
  fi
 }
diff --git a/test/vpxenc.sh b/test/vpxenc.sh
index f08c04878..3cf3f4d40 100755
--- a/test/vpxenc.sh
+++ b/test/vpxenc.sh
@@ -15,7 +15,7 @@
 ##
 . $(dirname $0)/tools_common.sh
 
-TEST_FRAMES=10
+readonly TEST_FRAMES=10
 
 # Environment check: Make sure input is available.
 vpxenc_verify_environment() {
@@ -39,55 +39,172 @@ vpxenc_can_encode_vp9() {
  fi
 }
 
+# Echoes yes to stdout when vpxenc exists according to vpx_tool_available().
+vpxenc_available() {
+  [ -n "$(vpx_tool_available vpxenc)" ] && echo yes
+}
+
+# Wrapper function for running vpxenc with pipe input. Requires that
+# LIBVPX_BIN_PATH points to the directory containing vpxenc. $1 is used as the
+# input file path and shifted away. All remaining parameters are passed through
+# to vpxenc.
+vpxenc_pipe() {
+  local readonly encoder="${LIBVPX_BIN_PATH}/vpxenc${VPX_TEST_EXE_SUFFIX}"
+  local readonly input="$1"
+  shift
+  cat "${input}" | eval "${VPX_TEST_PREFIX}" "${encoder}" - "$@" ${devnull}
+}
+
+# Wrapper function for running vpxenc. Requires that LIBVPX_BIN_PATH points to
+# the directory containing vpxenc. $1 is used as the input file path and
+# shifted away. All remaining parameters are passed through to vpxenc.
+vpxenc() {
+  local readonly encoder="${LIBVPX_BIN_PATH}/vpxenc${VPX_TEST_EXE_SUFFIX}"
+  local readonly input="${1}"
+  shift
+  eval "${VPX_TEST_PREFIX}" "${encoder}" "$input" "$@" ${devnull}
+}
+
 vpxenc_vp8_ivf() {
  if [ "$(vpxenc_can_encode_vp8)" = "yes" ]; then
-    vpxenc vp8 ${YUV_RAW_INPUT_WIDTH} ${YUV_RAW_INPUT_HEIGHT} ${TEST_FRAMES} \
-      "${YUV_RAW_INPUT}" vp8.ivf
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8.ivf"
+    vpxenc --codec=vp8 \
+      --width="${YUV_RAW_INPUT_WIDTH}" \
+      --height="${YUV_RAW_INPUT_HEIGHT}" \
+      --limit="${TEST_FRAMES}" \
+      --ivf \
+      --output="${output}" \
+      "${YUV_RAW_INPUT}"
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
  fi
 }
 
-vpxenc_vp8_ivf_pipe_input() {
+vpxenc_vp8_ivf_piped_input() {
  if [ "$(vpxenc_can_encode_vp8)" = "yes" ]; then
-    vpxenc vp8 ${YUV_RAW_INPUT_WIDTH} ${YUV_RAW_INPUT_HEIGHT} ${TEST_FRAMES} \
-      "${YUV_RAW_INPUT}" vp8.ivf -
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_piped_input.ivf"
+    cat "${YUV_RAW_INPUT}" \
+      | vpxenc --codec=vp8 \
+        --width="${YUV_RAW_INPUT_WIDTH}" \
+        --height="${YUV_RAW_INPUT_HEIGHT}" \
+        --limit="${TEST_FRAMES}" \
+        --ivf \
+        --output="${output}" \
+        -
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
  fi
 }
 
 vpxenc_vp8_webm() {
-  if [ "$(vpxenc_can_encode_vp8)" = "yes" ] &&
-     [ "$(webm_io_available)" = "yes" ] ; then
-    vpxenc vp8 ${YUV_RAW_INPUT_WIDTH} ${YUV_RAW_INPUT_HEIGHT} ${TEST_FRAMES} \
-      "${YUV_RAW_INPUT}" vp8.webm
+  if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8.webm"
+    vpxenc --codec=vp8 \
+      --width="${YUV_RAW_INPUT_WIDTH}" \
+      --height="${YUV_RAW_INPUT_HEIGHT}" \
+      --limit="${TEST_FRAMES}" \
+      --output="${output}" \
+      "${YUV_RAW_INPUT}"
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
  fi
 }
 
 vpxenc_vp9_ivf() {
  if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then
-    vpxenc vp9 ${YUV_RAW_INPUT_WIDTH} ${YUV_RAW_INPUT_HEIGHT} ${TEST_FRAMES} \
-      "${YUV_RAW_INPUT}" vp9.ivf
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.ivf"
+    vpxenc --codec=vp9 \
+      --width="${YUV_RAW_INPUT_WIDTH}" \
+      --height="${YUV_RAW_INPUT_HEIGHT}" \
+      --limit="${TEST_FRAMES}" \
+      --ivf \
+      --test-decode=fatal \
+      --output="${output}" \
+      "${YUV_RAW_INPUT}"
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
  fi
 }
 
 vpxenc_vp9_webm() {
-  if [ "$(vpxenc_can_encode_vp9)" = "yes" ] &&
-     [ "$(webm_io_available)" = "yes" ] ; then
-    vpxenc vp9 ${YUV_RAW_INPUT_WIDTH} ${YUV_RAW_INPUT_HEIGHT} ${TEST_FRAMES} \
-      "${YUV_RAW_INPUT}" vp9.webm
+  if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.webm"
+    vpxenc --codec=vp9 \
+      --width="${YUV_RAW_INPUT_WIDTH}" \
+      --height="${YUV_RAW_INPUT_HEIGHT}" \
+      --limit="${TEST_FRAMES}" \
+      --test-decode=fatal \
+      --output="${output}" \
+      "${YUV_RAW_INPUT}"
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+ return 1 + fi fi } -DISABLED_vpxenc_vp9_ivf_lossless() { +vpxenc_vp9_ivf_lossless() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then - vpxenc vp9 ${YUV_RAW_INPUT_WIDTH} ${YUV_RAW_INPUT_HEIGHT} ${TEST_FRAMES} \ - "${YUV_RAW_INPUT}" vp9_lossless.ivf --lossless + local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless.ivf" + vpxenc --codec=vp9 \ + --width="${YUV_RAW_INPUT_WIDTH}" \ + --height="${YUV_RAW_INPUT_HEIGHT}" \ + --limit="${TEST_FRAMES}" \ + --ivf \ + --output="${output}" \ + --lossless=1 \ + --test-decode=fatal \ + "${YUV_RAW_INPUT}" + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + fi +} + +vpxenc_vp9_ivf_minq0_maxq0() { + if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then + local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless_minq0_maxq0.ivf" + vpxenc --codec=vp9 \ + --width="${YUV_RAW_INPUT_WIDTH}" \ + --height="${YUV_RAW_INPUT_HEIGHT}" \ + --limit="${TEST_FRAMES}" \ + --ivf \ + --output="${output}" \ + --min-q=0 \ + --max-q=0 \ + --test-decode=fatal \ + "${YUV_RAW_INPUT}" + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi fi } vpxenc_tests="vpxenc_vp8_ivf vpxenc_vp8_webm - vpxenc_vp8_ivf_pipe_input + vpxenc_vp8_ivf_piped_input vpxenc_vp9_ivf vpxenc_vp9_webm - DISABLED_vpxenc_vp9_ivf_lossless" + vpxenc_vp9_ivf_lossless + vpxenc_vp9_ivf_minq0_maxq0" run_tests vpxenc_verify_environment "${vpxenc_tests}" diff --git a/test/y4m_test.cc b/test/y4m_test.cc index 73ff68308..d4a2ede20 100644 --- a/test/y4m_test.cc +++ b/test/y4m_test.cc @@ -24,14 +24,14 @@ static const unsigned int kWidth = 160; static const unsigned int kHeight = 90; static const unsigned int kFrames = 10; -typedef struct { +struct Y4mTestParam { const char *filename; unsigned int bit_depth; vpx_img_fmt format; const char *md5raw; -} test_entry_type; +}; -const test_entry_type kY4mTestVectors[] = { +const Y4mTestParam kY4mTestVectors[] = { {"park_joy_90p_8_420.y4m", 8, VPX_IMG_FMT_I420, "e5406275b9fc6bb3436c31d4a05c1cab"}, {"park_joy_90p_8_422.y4m", 8, VPX_IMG_FMT_I422, @@ -70,7 +70,7 @@ static void write_image_file(const vpx_image_t *img, FILE *file) { } class Y4mVideoSourceTest - : public ::testing::TestWithParam, + : public ::testing::TestWithParam, public ::libvpx_test::Y4mVideoSource { protected: Y4mVideoSourceTest() : Y4mVideoSource("", 0, 0) {} @@ -126,7 +126,7 @@ class Y4mVideoSourceTest }; TEST_P(Y4mVideoSourceTest, SourceTest) { - const test_entry_type t = GetParam(); + const Y4mTestParam t = GetParam(); Init(t.filename, kFrames); HeaderChecks(t.bit_depth, t.format); Md5Check(t.md5raw); @@ -138,9 +138,14 @@ INSTANTIATE_TEST_CASE_P(C, Y4mVideoSourceTest, class Y4mVideoWriteTest : public Y4mVideoSourceTest { protected: - Y4mVideoWriteTest() : Y4mVideoSourceTest() {} + Y4mVideoWriteTest() {} - virtual void ReplaceInputFp(FILE *input_file) { + virtual ~Y4mVideoWriteTest() { + CloseSource(); + delete tmpfile_; + } + + virtual void ReplaceInputFile(FILE *input_file) { CloseSource(); frame_ = 0; input_file_ = input_file; @@ -153,30 +158,31 @@ class Y4mVideoWriteTest ASSERT_TRUE(input_file_ != NULL); char buf[Y4M_BUFFER_SIZE] = {0}; const struct VpxRational framerate = {y4m_.fps_n, y4m_.fps_d}; - FILE *out_file = libvpx_test::OpenTempOutFile(); - ASSERT_TRUE(out_file != NULL); + tmpfile_ = new libvpx_test::TempOutFile; + ASSERT_TRUE(tmpfile_->file() != NULL); y4m_write_file_header(buf, sizeof(buf), kWidth, kHeight, &framerate, y4m_.vpx_fmt, y4m_.bit_depth); - fputs(buf, out_file); + fputs(buf, tmpfile_->file()); for (unsigned int i = 
start_; i < limit_; i++) { y4m_write_frame_header(buf, sizeof(buf)); - fputs(buf, out_file); - write_image_file(img(), out_file); + fputs(buf, tmpfile_->file()); + write_image_file(img(), tmpfile_->file()); Next(); } - ReplaceInputFp(out_file); + ReplaceInputFile(tmpfile_->file()); } virtual void Init(const std::string &file_name, int limit) { Y4mVideoSourceTest::Init(file_name, limit); WriteY4mAndReadBack(); } + libvpx_test::TempOutFile *tmpfile_; }; TEST_P(Y4mVideoWriteTest, WriteTest) { - const test_entry_type t = GetParam(); + const Y4mTestParam t = GetParam(); Init(t.filename, kFrames); HeaderChecks(t.bit_depth, t.format); Md5Check(t.md5raw); diff --git a/third_party/libmkv/EbmlIDs.h b/third_party/libmkv/EbmlIDs.h deleted file mode 100644 index 44d438583..000000000 --- a/third_party/libmkv/EbmlIDs.h +++ /dev/null @@ -1,231 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ -#ifndef MKV_DEFS_HPP -#define MKV_DEFS_HPP 1 - -/* Commenting out values not available in webm, but available in matroska */ - -enum mkv { - EBML = 0x1A45DFA3, - EBMLVersion = 0x4286, - EBMLReadVersion = 0x42F7, - EBMLMaxIDLength = 0x42F2, - EBMLMaxSizeLength = 0x42F3, - DocType = 0x4282, - DocTypeVersion = 0x4287, - DocTypeReadVersion = 0x4285, -/* CRC_32 = 0xBF, */ - Void = 0xEC, - SignatureSlot = 0x1B538667, - SignatureAlgo = 0x7E8A, - SignatureHash = 0x7E9A, - SignaturePublicKey = 0x7EA5, - Signature = 0x7EB5, - SignatureElements = 0x7E5B, - SignatureElementList = 0x7E7B, - SignedElement = 0x6532, - /* segment */ - Segment = 0x18538067, - /* Meta Seek Information */ - SeekHead = 0x114D9B74, - Seek = 0x4DBB, - SeekID = 0x53AB, - SeekPosition = 0x53AC, - /* Segment Information */ - Info = 0x1549A966, -/* SegmentUID = 0x73A4, */ -/* SegmentFilename = 0x7384, */ -/* PrevUID = 0x3CB923, */ -/* PrevFilename = 0x3C83AB, */ -/* NextUID = 0x3EB923, */ -/* NextFilename = 0x3E83BB, */ -/* SegmentFamily = 0x4444, */ -/* ChapterTranslate = 0x6924, */ -/* ChapterTranslateEditionUID = 0x69FC, */ -/* ChapterTranslateCodec = 0x69BF, */ -/* ChapterTranslateID = 0x69A5, */ - TimecodeScale = 0x2AD7B1, - Segment_Duration = 0x4489, - DateUTC = 0x4461, -/* Title = 0x7BA9, */ - MuxingApp = 0x4D80, - WritingApp = 0x5741, - /* Cluster */ - Cluster = 0x1F43B675, - Timecode = 0xE7, -/* SilentTracks = 0x5854, */ -/* SilentTrackNumber = 0x58D7, */ -/* Position = 0xA7, */ - PrevSize = 0xAB, - BlockGroup = 0xA0, - Block = 0xA1, -/* BlockVirtual = 0xA2, */ - BlockAdditions = 0x75A1, - BlockMore = 0xA6, - BlockAddID = 0xEE, - BlockAdditional = 0xA5, - BlockDuration = 0x9B, -/* ReferencePriority = 0xFA, */ - ReferenceBlock = 0xFB, -/* ReferenceVirtual = 0xFD, */ -/* CodecState = 0xA4, */ -/* Slices = 0x8E, */ -/* TimeSlice = 0xE8, */ - LaceNumber = 0xCC, -/* FrameNumber = 0xCD, */ -/* BlockAdditionID = 0xCB, */ -/* MkvDelay = 0xCE, */ -/* Cluster_Duration = 0xCF, */ - SimpleBlock = 0xA3, -/* EncryptedBlock = 0xAF, */ - /* Track */ - Tracks = 0x1654AE6B, - TrackEntry = 0xAE, - TrackNumber = 0xD7, - TrackUID = 0x73C5, - TrackType = 0x83, - FlagEnabled = 0xB9, - FlagDefault = 0x88, - FlagForced = 0x55AA, - FlagLacing = 0x9C, -/* MinCache = 0x6DE7, */ -/* MaxCache = 0x6DF8, */ - 
DefaultDuration = 0x23E383, -/* TrackTimecodeScale = 0x23314F, */ -/* TrackOffset = 0x537F, */ - MaxBlockAdditionID = 0x55EE, - Name = 0x536E, - Language = 0x22B59C, - CodecID = 0x86, - CodecPrivate = 0x63A2, - CodecName = 0x258688, -/* AttachmentLink = 0x7446, */ -/* CodecSettings = 0x3A9697, */ -/* CodecInfoURL = 0x3B4040, */ -/* CodecDownloadURL = 0x26B240, */ -/* CodecDecodeAll = 0xAA, */ -/* TrackOverlay = 0x6FAB, */ -/* TrackTranslate = 0x6624, */ -/* TrackTranslateEditionUID = 0x66FC, */ -/* TrackTranslateCodec = 0x66BF, */ -/* TrackTranslateTrackID = 0x66A5, */ - /* video */ - Video = 0xE0, - FlagInterlaced = 0x9A, - StereoMode = 0x53B8, - AlphaMode = 0x53C0, - PixelWidth = 0xB0, - PixelHeight = 0xBA, - PixelCropBottom = 0x54AA, - PixelCropTop = 0x54BB, - PixelCropLeft = 0x54CC, - PixelCropRight = 0x54DD, - DisplayWidth = 0x54B0, - DisplayHeight = 0x54BA, - DisplayUnit = 0x54B2, - AspectRatioType = 0x54B3, -/* ColourSpace = 0x2EB524, */ -/* GammaValue = 0x2FB523, */ - FrameRate = 0x2383E3, - /* end video */ - /* audio */ - Audio = 0xE1, - SamplingFrequency = 0xB5, - OutputSamplingFrequency = 0x78B5, - Channels = 0x9F, -/* ChannelPositions = 0x7D7B, */ - BitDepth = 0x6264, - /* end audio */ - /* content encoding */ -/* ContentEncodings = 0x6d80, */ -/* ContentEncoding = 0x6240, */ -/* ContentEncodingOrder = 0x5031, */ -/* ContentEncodingScope = 0x5032, */ -/* ContentEncodingType = 0x5033, */ -/* ContentCompression = 0x5034, */ -/* ContentCompAlgo = 0x4254, */ -/* ContentCompSettings = 0x4255, */ -/* ContentEncryption = 0x5035, */ -/* ContentEncAlgo = 0x47e1, */ -/* ContentEncKeyID = 0x47e2, */ -/* ContentSignature = 0x47e3, */ -/* ContentSigKeyID = 0x47e4, */ -/* ContentSigAlgo = 0x47e5, */ -/* ContentSigHashAlgo = 0x47e6, */ - /* end content encoding */ - /* Cueing Data */ - Cues = 0x1C53BB6B, - CuePoint = 0xBB, - CueTime = 0xB3, - CueTrackPositions = 0xB7, - CueTrack = 0xF7, - CueClusterPosition = 0xF1, - CueBlockNumber = 0x5378 -/* CueCodecState = 0xEA, */ -/* CueReference = 0xDB, */ -/* CueRefTime = 0x96, */ -/* CueRefCluster = 0x97, */ -/* CueRefNumber = 0x535F, */ -/* CueRefCodecState = 0xEB, */ - /* Attachment */ -/* Attachments = 0x1941A469, */ -/* AttachedFile = 0x61A7, */ -/* FileDescription = 0x467E, */ -/* FileName = 0x466E, */ -/* FileMimeType = 0x4660, */ -/* FileData = 0x465C, */ -/* FileUID = 0x46AE, */ -/* FileReferral = 0x4675, */ - /* Chapters */ -/* Chapters = 0x1043A770, */ -/* EditionEntry = 0x45B9, */ -/* EditionUID = 0x45BC, */ -/* EditionFlagHidden = 0x45BD, */ -/* EditionFlagDefault = 0x45DB, */ -/* EditionFlagOrdered = 0x45DD, */ -/* ChapterAtom = 0xB6, */ -/* ChapterUID = 0x73C4, */ -/* ChapterTimeStart = 0x91, */ -/* ChapterTimeEnd = 0x92, */ -/* ChapterFlagHidden = 0x98, */ -/* ChapterFlagEnabled = 0x4598, */ -/* ChapterSegmentUID = 0x6E67, */ -/* ChapterSegmentEditionUID = 0x6EBC, */ -/* ChapterPhysicalEquiv = 0x63C3, */ -/* ChapterTrack = 0x8F, */ -/* ChapterTrackNumber = 0x89, */ -/* ChapterDisplay = 0x80, */ -/* ChapString = 0x85, */ -/* ChapLanguage = 0x437C, */ -/* ChapCountry = 0x437E, */ -/* ChapProcess = 0x6944, */ -/* ChapProcessCodecID = 0x6955, */ -/* ChapProcessPrivate = 0x450D, */ -/* ChapProcessCommand = 0x6911, */ -/* ChapProcessTime = 0x6922, */ -/* ChapProcessData = 0x6933, */ - /* Tagging */ -/* Tags = 0x1254C367, */ -/* Tag = 0x7373, */ -/* Targets = 0x63C0, */ -/* TargetTypeValue = 0x68CA, */ -/* TargetType = 0x63CA, */ -/* Tagging_TrackUID = 0x63C5, */ -/* Tagging_EditionUID = 0x63C9, */ -/* Tagging_ChapterUID = 0x63C4, */ -/* 
AttachmentUID = 0x63C6, */ -/* SimpleTag = 0x67C8, */ -/* TagName = 0x45A3, */ -/* TagLanguage = 0x447A, */ -/* TagDefault = 0x4484, */ -/* TagString = 0x4487, */ -/* TagBinary = 0x4485, */ -}; -#endif diff --git a/third_party/libmkv/EbmlWriter.c b/third_party/libmkv/EbmlWriter.c deleted file mode 100644 index 27cfe861c..000000000 --- a/third_party/libmkv/EbmlWriter.c +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ -#include "EbmlWriter.h" -#include <stdlib.h> -#include <wchar.h> -#include <string.h> -#include <limits.h> -#if defined(_MSC_VER) -#define LITERALU64(n) n -#else -#define LITERALU64(n) n##LLU -#endif - -void Ebml_WriteLen(EbmlGlobal *glob, int64_t val) { - /* TODO check and make sure we are not > than 0x0100000000000000LLU */ - unsigned char size = 8; /* size in bytes to output */ - - /* mask to compare for byte size */ - int64_t minVal = 0xff; - - for (size = 1; size < 8; size ++) { - if (val < minVal) - break; - - minVal = (minVal << 7); - } - - val |= (((uint64_t)0x80) << ((size - 1) * 7)); - - Ebml_Serialize(glob, (void *) &val, sizeof(val), size); - } - -void Ebml_WriteString(EbmlGlobal *glob, const char *str) { - const size_t size_ = strlen(str); - const uint64_t size = size_; - Ebml_WriteLen(glob, size); - /* TODO: it's not clear from the spec whether the nul terminator - * should be serialized too. For now we omit the null terminator. - */ - Ebml_Write(glob, str, (unsigned long)size); -} - -void Ebml_WriteUTF8(EbmlGlobal *glob, const wchar_t *wstr) { - const size_t strlen = wcslen(wstr); - - /* TODO: it's not clear from the spec whether the nul terminator - * should be serialized too. For now we include it.
- */ - const uint64_t size = strlen; - - Ebml_WriteLen(glob, size); - Ebml_Write(glob, wstr, (unsigned long)size); -} - -void Ebml_WriteID(EbmlGlobal *glob, unsigned long class_id) { - int len; - - if (class_id >= 0x01000000) - len = 4; - else if (class_id >= 0x00010000) - len = 3; - else if (class_id >= 0x00000100) - len = 2; - else - len = 1; - - Ebml_Serialize(glob, (void *)&class_id, sizeof(class_id), len); -} - -void Ebml_SerializeUnsigned64(EbmlGlobal *glob, unsigned long class_id, uint64_t ui) { - unsigned char sizeSerialized = 8 | 0x80; - Ebml_WriteID(glob, class_id); - Ebml_Serialize(glob, &sizeSerialized, sizeof(sizeSerialized), 1); - Ebml_Serialize(glob, &ui, sizeof(ui), 8); -} - -void Ebml_SerializeUnsigned(EbmlGlobal *glob, unsigned long class_id, unsigned long ui) { - unsigned char size = 8; /* size in bytes to output */ - unsigned char sizeSerialized = 0; - unsigned long minVal; - - Ebml_WriteID(glob, class_id); - minVal = 0x7fLU; /* mask to compare for byte size */ - - for (size = 1; size < 4; size ++) { - if (ui < minVal) { - break; - } - - minVal <<= 7; - } - - sizeSerialized = 0x80 | size; - Ebml_Serialize(glob, &sizeSerialized, sizeof(sizeSerialized), 1); - Ebml_Serialize(glob, &ui, sizeof(ui), size); -} -/* TODO: perhaps this is a poor name for this id serializer helper function */ -void Ebml_SerializeBinary(EbmlGlobal *glob, unsigned long class_id, unsigned long bin) { - int size; - for (size = 4; size > 1; size--) { - if (bin & (unsigned int)0x000000ff << ((size - 1) * 8)) - break; - } - Ebml_WriteID(glob, class_id); - Ebml_WriteLen(glob, size); - Ebml_WriteID(glob, bin); -} - -void Ebml_SerializeFloat(EbmlGlobal *glob, unsigned long class_id, double d) { - unsigned char len = 0x88; - - Ebml_WriteID(glob, class_id); - Ebml_Serialize(glob, &len, sizeof(len), 1); - Ebml_Serialize(glob, &d, sizeof(d), 8); -} - -void Ebml_WriteSigned16(EbmlGlobal *glob, short val) { - signed long out = ((val & 0x003FFFFF) | 0x00200000) << 8; - Ebml_Serialize(glob, &out, sizeof(out), 3); -} - -void Ebml_SerializeString(EbmlGlobal *glob, unsigned long class_id, const char *s) { - Ebml_WriteID(glob, class_id); - Ebml_WriteString(glob, s); -} - -void Ebml_SerializeUTF8(EbmlGlobal *glob, unsigned long class_id, wchar_t *s) { - Ebml_WriteID(glob, class_id); - Ebml_WriteUTF8(glob, s); -} - -void Ebml_SerializeData(EbmlGlobal *glob, unsigned long class_id, unsigned char *data, unsigned long data_length) { - Ebml_WriteID(glob, class_id); - Ebml_WriteLen(glob, data_length); - Ebml_Write(glob, data, data_length); -} - -void Ebml_WriteVoid(EbmlGlobal *glob, unsigned long vSize) { - unsigned char tmp = 0; - unsigned long i = 0; - - Ebml_WriteID(glob, 0xEC); - Ebml_WriteLen(glob, vSize); - - for (i = 0; i < vSize; i++) { - Ebml_Write(glob, &tmp, 1); - } -} - -/* TODO Serialize Date */ diff --git a/third_party/libmkv/EbmlWriter.h b/third_party/libmkv/EbmlWriter.h deleted file mode 100644 index b94f75733..000000000 --- a/third_party/libmkv/EbmlWriter.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ -#ifndef EBMLWRITER_HPP -#define EBMLWRITER_HPP -#include <stddef.h> -#include "vpx/vpx_integer.h" - -/* note: you must define write and serialize functions as well as your own - * EBML_GLOBAL - * - * These functions MUST be implemented - */ - -typedef struct EbmlGlobal EbmlGlobal; -void Ebml_Serialize(EbmlGlobal *glob, const void *, int, unsigned long); -void Ebml_Write(EbmlGlobal *glob, const void *, unsigned long); - -/*****/ - -void Ebml_WriteLen(EbmlGlobal *glob, int64_t val); -void Ebml_WriteString(EbmlGlobal *glob, const char *str); -void Ebml_WriteUTF8(EbmlGlobal *glob, const wchar_t *wstr); -void Ebml_WriteID(EbmlGlobal *glob, unsigned long class_id); -void Ebml_SerializeUnsigned64(EbmlGlobal *glob, unsigned long class_id, uint64_t ui); -void Ebml_SerializeUnsigned(EbmlGlobal *glob, unsigned long class_id, unsigned long ui); -void Ebml_SerializeBinary(EbmlGlobal *glob, unsigned long class_id, unsigned long ui); -void Ebml_SerializeFloat(EbmlGlobal *glob, unsigned long class_id, double d); -/* TODO make this more generic to signed */ -void Ebml_WriteSigned16(EbmlGlobal *glob, short val); -void Ebml_SerializeString(EbmlGlobal *glob, unsigned long class_id, const char *s); -void Ebml_SerializeUTF8(EbmlGlobal *glob, unsigned long class_id, wchar_t *s); -void Ebml_SerializeData(EbmlGlobal *glob, unsigned long class_id, unsigned char *data, unsigned long data_length); -void Ebml_WriteVoid(EbmlGlobal *glob, unsigned long vSize); -/* TODO need date function */ -#endif diff --git a/third_party/libwebm/PATENTS.TXT b/third_party/libwebm/PATENTS.TXT index 4414d8385..79d17d7d6 100644 --- a/third_party/libwebm/PATENTS.TXT +++ b/third_party/libwebm/PATENTS.TXT @@ -1,22 +1,23 @@ Additional IP Rights Grant (Patents) +------------------------------------ -"This implementation" means the copyrightable works distributed by -Google as part of the WebM Project. +"These implementations" means the copyrightable works that implement the WebM +codecs distributed by Google as part of the WebM Project. -Google hereby grants to you a perpetual, worldwide, non-exclusive, -no-charge, royalty-free, irrevocable (except as stated in this section) -patent license to make, have made, use, offer to sell, sell, import, -transfer, and otherwise run, modify and propagate the contents of this -implementation of VP8, where such license applies only to those patent -claims, both currently owned by Google and acquired in the future, -licensable by Google that are necessarily infringed by this -implementation of VP8. This grant does not include claims that would be -infringed only as a consequence of further modification of this -implementation. If you or your agent or exclusive licensee institute or -order or agree to the institution of patent litigation against any -entity (including a cross-claim or counterclaim in a lawsuit) alleging -that this implementation of VP8 or any code incorporated within this -implementation of VP8 constitutes direct or contributory patent -infringement, or inducement of patent infringement, then any patent -rights granted to you under this License for this implementation of VP8 -shall terminate as of the date such litigation is filed.
+Google hereby grants to you a perpetual, worldwide, non-exclusive, no-charge, +royalty-free, irrevocable (except as stated in this section) patent license to +make, have made, use, offer to sell, sell, import, transfer, and otherwise +run, modify and propagate the contents of these implementations of WebM, where +such license applies only to those patent claims, both currently owned by +Google and acquired in the future, licensable by Google that are necessarily +infringed by these implementations of WebM. This grant does not include claims +that would be infringed only as a consequence of further modification of these +implementations. If you or your agent or exclusive licensee institute or order +or agree to the institution of patent litigation or any other patent +enforcement activity against any entity (including a cross-claim or +counterclaim in a lawsuit) alleging that any of these implementations of WebM +or any code incorporated within any of these implementations of WebM +constitutes direct or contributory patent infringement, or inducement of +patent infringement, then any patent rights granted to you under this License +for these implementations of WebM shall terminate as of the date such +litigation is filed. diff --git a/third_party/libyuv/README.libvpx b/third_party/libyuv/README.libvpx index dd368dcbd..fa5b498ca 100644 --- a/third_party/libyuv/README.libvpx +++ b/third_party/libyuv/README.libvpx @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1005 +Version: 1041 License: BSD License File: LICENSE @@ -13,5 +13,4 @@ which down-samples the original input video (f.g. 1280x720) a number of times in order to encode multiple resolution bit streams. Local Modifications: -Modified the original scaler code minimally with include file changes to fit -in our current build system. \ No newline at end of file +None. diff --git a/third_party/libyuv/include/libyuv/compare.h b/third_party/libyuv/include/libyuv/compare.h new file mode 100644 index 000000000..5dfac7c86 --- /dev/null +++ b/third_party/libyuv/include/libyuv/compare.h @@ -0,0 +1,73 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_COMPARE_H_ // NOLINT +#define INCLUDE_LIBYUV_COMPARE_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Compute a hash for specified memory. Seed of 5381 recommended. +LIBYUV_API +uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed); + +// Sum Square Error - used to compute Mean Square Error or PSNR. 
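+// (Editor's note, not upstream text: given the sum square error over `count`
+// samples, the usual derivation is
+//   mse  = sse / count
+//   psnr = 10.0 * log10(255.0 * 255.0 / mse)
+// capped at the kMaxPsnr constant declared below when sse is zero;
+// SumSquareErrorToPsnr() below is expected to follow this formula.)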
+LIBYUV_API +uint64 ComputeSumSquareError(const uint8* src_a, + const uint8* src_b, int count); + +LIBYUV_API +uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b, + int width, int height); + +static const int kMaxPsnr = 128; + +LIBYUV_API +double SumSquareErrorToPsnr(uint64 sse, uint64 count); + +LIBYUV_API +double CalcFramePsnr(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b, + int width, int height); + +LIBYUV_API +double I420Psnr(const uint8* src_y_a, int stride_y_a, + const uint8* src_u_a, int stride_u_a, + const uint8* src_v_a, int stride_v_a, + const uint8* src_y_b, int stride_y_b, + const uint8* src_u_b, int stride_u_b, + const uint8* src_v_b, int stride_v_b, + int width, int height); + +LIBYUV_API +double CalcFrameSsim(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b, + int width, int height); + +LIBYUV_API +double I420Ssim(const uint8* src_y_a, int stride_y_a, + const uint8* src_u_a, int stride_u_a, + const uint8* src_v_a, int stride_v_a, + const uint8* src_y_b, int stride_y_b, + const uint8* src_u_b, int stride_u_b, + const uint8* src_v_b, int stride_v_b, + int width, int height); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_COMPARE_H_ NOLINT diff --git a/third_party/libyuv/include/libyuv/convert.h b/third_party/libyuv/include/libyuv/convert.h new file mode 100644 index 000000000..1bd45c837 --- /dev/null +++ b/third_party/libyuv/include/libyuv/convert.h @@ -0,0 +1,254 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_CONVERT_H_ // NOLINT +#define INCLUDE_LIBYUV_CONVERT_H_ + +#include "libyuv/basic_types.h" +// TODO(fbarchard): Remove the following headers includes. +#include "libyuv/convert_from.h" +#include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Convert I444 to I420. +LIBYUV_API +int I444ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert I422 to I420. +LIBYUV_API +int I422ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert I411 to I420. +LIBYUV_API +int I411ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Copy I420 to I420. 
+#define I420ToI420 I420Copy +LIBYUV_API +int I420Copy(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert I400 (grey) to I420. +LIBYUV_API +int I400ToI420(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert NV12 to I420. +LIBYUV_API +int NV12ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert NV21 to I420. +LIBYUV_API +int NV21ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_vu, int src_stride_vu, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert YUY2 to I420. +LIBYUV_API +int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert UYVY to I420. +LIBYUV_API +int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert M420 to I420. +LIBYUV_API +int M420ToI420(const uint8* src_m420, int src_stride_m420, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert Q420 to I420. +LIBYUV_API +int Q420ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// ARGB little endian (bgra in memory) to I420. +LIBYUV_API +int ARGBToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// BGRA little endian (argb in memory) to I420. +LIBYUV_API +int BGRAToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// ABGR little endian (rgba in memory) to I420. +LIBYUV_API +int ABGRToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// RGBA little endian (abgr in memory) to I420. +LIBYUV_API +int RGBAToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// RGB little endian (bgr in memory) to I420. +LIBYUV_API +int RGB24ToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// RGB big endian (rgb in memory) to I420. 
+LIBYUV_API +int RAWToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// RGB16 (RGBP fourcc) little endian to I420. +LIBYUV_API +int RGB565ToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// RGB15 (RGBO fourcc) little endian to I420. +LIBYUV_API +int ARGB1555ToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// RGB12 (R444 fourcc) little endian to I420. +LIBYUV_API +int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +#ifdef HAVE_JPEG +// src_width/height provided by capture. +// dst_width/height for clipping determine final size. +LIBYUV_API +int MJPGToI420(const uint8* sample, size_t sample_size, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int src_width, int src_height, + int dst_width, int dst_height); + +// Query size of MJPG in pixels. +LIBYUV_API +int MJPGSize(const uint8* sample, size_t sample_size, + int* width, int* height); +#endif + +// Note Bayer formats (BGGR) To I420 are in format_conversion.h + +// Convert camera sample to I420 with cropping, rotation and vertical flip. +// "src_size" is needed to parse MJPG. +// "dst_stride_y" number of bytes in a row of the dst_y plane. +// Normally this would be the same as dst_width, with recommended alignment +// to 16 bytes for better efficiency. +// If rotation of 90 or 270 is used, stride is affected. The caller should +// allocate the I420 buffer according to rotation. +// "dst_stride_u" number of bytes in a row of the dst_u plane. +// Normally this would be the same as (dst_width + 1) / 2, with +// recommended alignment to 16 bytes for better efficiency. +// If rotation of 90 or 270 is used, stride is affected. +// "crop_x" and "crop_y" are starting position for cropping. +// To center, crop_x = (src_width - dst_width) / 2 +// crop_y = (src_height - dst_height) / 2 +// "src_width" / "src_height" is size of src_frame in pixels. +// "src_height" can be negative indicating a vertically flipped image source. +// "crop_width" / "crop_height" is the size to crop the src to. +// Must be less than or equal to src_width/src_height +// Cropping parameters are pre-rotation. +// "rotation" can be 0, 90, 180 or 270. +// "format" is a fourcc. ie 'I420', 'YUY2' +// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure. 
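+//
+// Illustrative sketch (editor's addition; `sample`, the dst planes, `width`
+// and `height` are hypothetical caller variables, and FOURCC_YUY2 is assumed
+// to come from video_common.h): converting a packed YUY2 capture to I420
+// with no crop and no rotation would look roughly like
+//   ConvertToI420(sample, sample_size,
+//                 dst_y, width,              // luma plane, stride == width
+//                 dst_u, (width + 1) / 2,    // chroma planes, half stride
+//                 dst_v, (width + 1) / 2,
+//                 0, 0,                      // crop_x, crop_y
+//                 width, height,             // source dimensions
+//                 width, height,             // crop == full frame
+//                 libyuv::kRotate0, libyuv::FOURCC_YUY2);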
+LIBYUV_API +int ConvertToI420(const uint8* src_frame, size_t src_size, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int crop_x, int crop_y, + int src_width, int src_height, + int crop_width, int crop_height, + enum RotationMode rotation, + uint32 format); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_CONVERT_H_ NOLINT diff --git a/third_party/libyuv/include/libyuv/convert_argb.h b/third_party/libyuv/include/libyuv/convert_argb.h new file mode 100644 index 000000000..a18014ca2 --- /dev/null +++ b/third_party/libyuv/include/libyuv/convert_argb.h @@ -0,0 +1,225 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_ // NOLINT +#define INCLUDE_LIBYUV_CONVERT_ARGB_H_ + +#include "libyuv/basic_types.h" +// TODO(fbarchard): Remove the following headers includes +#include "libyuv/convert_from.h" +#include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" + +// TODO(fbarchard): This set of functions should exactly match convert.h +// Add missing Q420. +// TODO(fbarchard): Add tests. Create random content of right size and convert +// with C vs Opt and or to I420 and compare. +// TODO(fbarchard): Some of these functions lack parameter setting. + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Alias. +#define ARGBToARGB ARGBCopy + +// Copy ARGB to ARGB. +LIBYUV_API +int ARGBCopy(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I420 to ARGB. +LIBYUV_API +int I420ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I422 to ARGB. +LIBYUV_API +int I422ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I444 to ARGB. +LIBYUV_API +int I444ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I411 to ARGB. +LIBYUV_API +int I411ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I400 (grey) to ARGB. +LIBYUV_API +int I400ToARGB(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Alias. +#define YToARGB I400ToARGB_Reference + +// Convert I400 to ARGB. Reverse of ARGBToI400. +LIBYUV_API +int I400ToARGB_Reference(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert NV12 to ARGB. +LIBYUV_API +int NV12ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert NV21 to ARGB. 
+LIBYUV_API +int NV21ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_vu, int src_stride_vu, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert M420 to ARGB. +LIBYUV_API +int M420ToARGB(const uint8* src_m420, int src_stride_m420, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// TODO(fbarchard): Convert Q420 to ARGB. +// LIBYUV_API +// int Q420ToARGB(const uint8* src_y, int src_stride_y, +// const uint8* src_yuy2, int src_stride_yuy2, +// uint8* dst_argb, int dst_stride_argb, +// int width, int height); + +// Convert YUY2 to ARGB. +LIBYUV_API +int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert UYVY to ARGB. +LIBYUV_API +int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// BGRA little endian (argb in memory) to ARGB. +LIBYUV_API +int BGRAToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// ABGR little endian (rgba in memory) to ARGB. +LIBYUV_API +int ABGRToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// RGBA little endian (abgr in memory) to ARGB. +LIBYUV_API +int RGBAToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Deprecated function name. +#define BG24ToARGB RGB24ToARGB + +// RGB little endian (bgr in memory) to ARGB. +LIBYUV_API +int RGB24ToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// RGB big endian (rgb in memory) to ARGB. +LIBYUV_API +int RAWToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// RGB16 (RGBP fourcc) little endian to ARGB. +LIBYUV_API +int RGB565ToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// RGB15 (RGBO fourcc) little endian to ARGB. +LIBYUV_API +int ARGB1555ToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// RGB12 (R444 fourcc) little endian to ARGB. +LIBYUV_API +int ARGB4444ToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +#ifdef HAVE_JPEG +// src_width/height provided by capture +// dst_width/height for clipping determine final size. +LIBYUV_API +int MJPGToARGB(const uint8* sample, size_t sample_size, + uint8* dst_argb, int dst_stride_argb, + int src_width, int src_height, + int dst_width, int dst_height); +#endif + +// Note Bayer formats (BGGR) to ARGB are in format_conversion.h. + +// Convert camera sample to ARGB with cropping, rotation and vertical flip. +// "src_size" is needed to parse MJPG. +// "dst_stride_argb" number of bytes in a row of the dst_argb plane. +// Normally this would be the same as dst_width, with recommended alignment +// to 16 bytes for better efficiency. +// If rotation of 90 or 270 is used, stride is affected. The caller should +// allocate the I420 buffer according to rotation. +// "dst_stride_u" number of bytes in a row of the dst_u plane. +// Normally this would be the same as (dst_width + 1) / 2, with +// recommended alignment to 16 bytes for better efficiency. +// If rotation of 90 or 270 is used, stride is affected. 
+// "crop_x" and "crop_y" are starting position for cropping. +// To center, crop_x = (src_width - dst_width) / 2 +// crop_y = (src_height - dst_height) / 2 +// "src_width" / "src_height" is size of src_frame in pixels. +// "src_height" can be negative indicating a vertically flipped image source. +// "crop_width" / "crop_height" is the size to crop the src to. +// Must be less than or equal to src_width/src_height +// Cropping parameters are pre-rotation. +// "rotation" can be 0, 90, 180 or 270. +// "format" is a fourcc. ie 'I420', 'YUY2' +// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure. +LIBYUV_API +int ConvertToARGB(const uint8* src_frame, size_t src_size, + uint8* dst_argb, int dst_stride_argb, + int crop_x, int crop_y, + int src_width, int src_height, + int crop_width, int crop_height, + enum RotationMode rotation, + uint32 format); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_CONVERT_ARGB_H_ NOLINT diff --git a/third_party/libyuv/include/libyuv/convert_from.h b/third_party/libyuv/include/libyuv/convert_from.h new file mode 100644 index 000000000..b1cf57f7d --- /dev/null +++ b/third_party/libyuv/include/libyuv/convert_from.h @@ -0,0 +1,173 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_ // NOLINT +#define INCLUDE_LIBYUV_CONVERT_FROM_H_ + +#include "libyuv/basic_types.h" +#include "libyuv/rotate.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// See Also convert.h for conversions from formats to I420. + +// I420Copy in convert to I420ToI420. + +LIBYUV_API +int I420ToI422(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +LIBYUV_API +int I420ToI444(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +LIBYUV_API +int I420ToI411(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21. 
+LIBYUV_API +int I400Copy(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height); + +// TODO(fbarchard): I420ToM420 +// TODO(fbarchard): I420ToQ420 + +LIBYUV_API +int I420ToNV12(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_uv, int dst_stride_uv, + int width, int height); + +LIBYUV_API +int I420ToNV21(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_vu, int dst_stride_vu, + int width, int height); + +LIBYUV_API +int I420ToYUY2(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToUYVY(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +LIBYUV_API +int I420ToBGRA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +LIBYUV_API +int I420ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +LIBYUV_API +int I420ToRGBA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_rgba, int dst_stride_rgba, + int width, int height); + +LIBYUV_API +int I420ToRGB24(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToRAW(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToARGB1555(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToARGB4444(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +// Note Bayer formats (BGGR) To I420 are in format_conversion.h. + +// Convert I420 to specified format. +// "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the +// buffer has contiguous rows. Can be negative. A multiple of 16 is optimal. 
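+// For example (editor's sketch; `y`/`u`/`v`, their strides, `dst`, `width`
+// and `height` are hypothetical, and FOURCC_YUY2 is assumed from
+// video_common.h), emitting packed YUY2 into a contiguous buffer would be
+//   ConvertFromI420(y, y_stride, u, u_stride, v, v_stride,
+//                   dst, 0,                  // 0: rows are contiguous
+//                   width, height, libyuv::FOURCC_YUY2);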
+LIBYUV_API +int ConvertFromI420(const uint8* y, int y_stride, + const uint8* u, int u_stride, + const uint8* v, int v_stride, + uint8* dst_sample, int dst_sample_stride, + int width, int height, + uint32 format); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_CONVERT_FROM_H_ NOLINT diff --git a/third_party/libyuv/include/libyuv/convert_from_argb.h b/third_party/libyuv/include/libyuv/convert_from_argb.h new file mode 100644 index 000000000..90f43af04 --- /dev/null +++ b/third_party/libyuv/include/libyuv/convert_from_argb.h @@ -0,0 +1,166 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ // NOLINT +#define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Copy ARGB to ARGB. +#define ARGBToARGB ARGBCopy +LIBYUV_API +int ARGBCopy(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert ARGB To BGRA. +LIBYUV_API +int ARGBToBGRA(const uint8* src_argb, int src_stride_argb, + uint8* dst_bgra, int dst_stride_bgra, + int width, int height); + +// Convert ARGB To ABGR. +LIBYUV_API +int ARGBToABGR(const uint8* src_argb, int src_stride_argb, + uint8* dst_abgr, int dst_stride_abgr, + int width, int height); + +// Convert ARGB To RGBA. +LIBYUV_API +int ARGBToRGBA(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgba, int dst_stride_rgba, + int width, int height); + +// Convert ARGB To RGB24. +LIBYUV_API +int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb24, int dst_stride_rgb24, + int width, int height); + +// Convert ARGB To RAW. +LIBYUV_API +int ARGBToRAW(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb, int dst_stride_rgb, + int width, int height); + +// Convert ARGB To RGB565. +LIBYUV_API +int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height); + +// Convert ARGB To ARGB1555. +LIBYUV_API +int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb1555, int dst_stride_argb1555, + int width, int height); + +// Convert ARGB To ARGB4444. +LIBYUV_API +int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb4444, int dst_stride_argb4444, + int width, int height); + +// Convert ARGB To I444. +LIBYUV_API +int ARGBToI444(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert ARGB To I422. +LIBYUV_API +int ARGBToI422(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert ARGB To I420. (also in convert.h) +LIBYUV_API +int ARGBToI420(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert ARGB to J420. (JPeg full range I420). 
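+// (Editor's note: the 'J' variants assume full-range 0-255 JPEG levels,
+// whereas the plain I420 conversions above use the 16-235 studio swing.)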
+LIBYUV_API +int ARGBToJ420(const uint8* src_argb, int src_stride_argb, + uint8* dst_yj, int dst_stride_yj, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert ARGB To I411. +LIBYUV_API +int ARGBToI411(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert ARGB to J400. (JPeg full range). +LIBYUV_API +int ARGBToJ400(const uint8* src_argb, int src_stride_argb, + uint8* dst_yj, int dst_stride_yj, + int width, int height); + +// Convert ARGB to I400. +LIBYUV_API +int ARGBToI400(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + int width, int height); + +// Convert ARGB To NV12. +LIBYUV_API +int ARGBToNV12(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_uv, int dst_stride_uv, + int width, int height); + +// Convert ARGB To NV21. +LIBYUV_API +int ARGBToNV21(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_vu, int dst_stride_vu, + int width, int height); + +// Convert ARGB To YUY2. +LIBYUV_API +int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, + uint8* dst_yuy2, int dst_stride_yuy2, + int width, int height); + +// Convert ARGB To UYVY. +LIBYUV_API +int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, + uint8* dst_uyvy, int dst_stride_uyvy, + int width, int height); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ NOLINT diff --git a/third_party/libyuv/include/libyuv/cpu_id.h b/third_party/libyuv/include/libyuv/cpu_id.h index fd6276b49..dc858a814 100644 --- a/third_party/libyuv/include/libyuv/cpu_id.h +++ b/third_party/libyuv/include/libyuv/cpu_id.h @@ -11,7 +11,7 @@ #ifndef INCLUDE_LIBYUV_CPU_ID_H_ // NOLINT #define INCLUDE_LIBYUV_CPU_ID_H_ -#include "basic_types.h" +#include "libyuv/basic_types.h" #ifdef __cplusplus namespace libyuv { diff --git a/third_party/libyuv/include/libyuv/format_conversion.h b/third_party/libyuv/include/libyuv/format_conversion.h new file mode 100644 index 000000000..b18bf0534 --- /dev/null +++ b/third_party/libyuv/include/libyuv/format_conversion.h @@ -0,0 +1,168 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_FORMATCONVERSION_H_ // NOLINT +#define INCLUDE_LIBYUV_FORMATCONVERSION_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Convert Bayer RGB formats to I420.
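+// (Editor's note: the four-letter suffixes below name the 2x2 Bayer mosaic
+// read left-to-right, top-to-bottom; BGGR, for instance, is a Blue/Green
+// top row over a Green/Red bottom row.)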
+LIBYUV_API +int BayerBGGRToI420(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +LIBYUV_API +int BayerGBRGToI420(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +LIBYUV_API +int BayerGRBGToI420(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +LIBYUV_API +int BayerRGGBToI420(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Temporary API mapper. +#define BayerRGBToI420(b, bs, f, y, ys, u, us, v, vs, w, h) \ + BayerToI420(b, bs, y, ys, u, us, v, vs, w, h, f) + +LIBYUV_API +int BayerToI420(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height, + uint32 src_fourcc_bayer); + +// Convert I420 to Bayer RGB formats. +LIBYUV_API +int I420ToBayerBGGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToBayerGBRG(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToBayerGRBG(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToBayerRGGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +// Temporary API mapper. +#define I420ToBayerRGB(y, ys, u, us, v, vs, b, bs, f, w, h) \ + I420ToBayer(y, ys, u, us, v, vs, b, bs, w, h, f) + +LIBYUV_API +int I420ToBayer(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height, + uint32 dst_fourcc_bayer); + +// Convert Bayer RGB formats to ARGB. +LIBYUV_API +int BayerBGGRToARGB(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +LIBYUV_API +int BayerGBRGToARGB(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +LIBYUV_API +int BayerGRBGToARGB(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +LIBYUV_API +int BayerRGGBToARGB(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Temporary API mapper. +#define BayerRGBToARGB(b, bs, f, a, as, w, h) BayerToARGB(b, bs, a, as, w, h, f) + +LIBYUV_API +int BayerToARGB(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_argb, int dst_stride_argb, + int width, int height, + uint32 src_fourcc_bayer); + +// Converts ARGB to Bayer RGB formats. 
+LIBYUV_API +int ARGBToBayerBGGR(const uint8* src_argb, int src_stride_argb, + uint8* dst_bayer, int dst_stride_bayer, + int width, int height); + +LIBYUV_API +int ARGBToBayerGBRG(const uint8* src_argb, int src_stride_argb, + uint8* dst_bayer, int dst_stride_bayer, + int width, int height); + +LIBYUV_API +int ARGBToBayerGRBG(const uint8* src_argb, int src_stride_argb, + uint8* dst_bayer, int dst_stride_bayer, + int width, int height); + +LIBYUV_API +int ARGBToBayerRGGB(const uint8* src_argb, int src_stride_argb, + uint8* dst_bayer, int dst_stride_bayer, + int width, int height); + +// Temporary API mapper. +#define ARGBToBayerRGB(a, as, b, bs, f, w, h) ARGBToBayer(b, bs, a, as, w, h, f) + +LIBYUV_API +int ARGBToBayer(const uint8* src_argb, int src_stride_argb, + uint8* dst_bayer, int dst_stride_bayer, + int width, int height, + uint32 dst_fourcc_bayer); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_FORMATCONVERSION_H_ NOLINT diff --git a/third_party/libyuv/include/libyuv/mjpeg_decoder.h b/third_party/libyuv/include/libyuv/mjpeg_decoder.h new file mode 100644 index 000000000..82fd95df2 --- /dev/null +++ b/third_party/libyuv/include/libyuv/mjpeg_decoder.h @@ -0,0 +1,193 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_ // NOLINT +#define INCLUDE_LIBYUV_MJPEG_DECODER_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +// NOTE: For a simplified public API use convert.h MJPGToI420(). + +struct jpeg_common_struct; +struct jpeg_decompress_struct; +struct jpeg_source_mgr; + +namespace libyuv { + +#ifdef __cplusplus +extern "C" { +#endif + +LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size); + +#ifdef __cplusplus +} // extern "C" +#endif + +static const uint32 kUnknownDataSize = 0xFFFFFFFF; + +enum JpegSubsamplingType { + kJpegYuv420, + kJpegYuv422, + kJpegYuv411, + kJpegYuv444, + kJpegYuv400, + kJpegUnknown +}; + +struct Buffer { + const uint8* data; + int len; +}; + +struct BufferVector { + Buffer* buffers; + int len; + int pos; +}; + +struct SetJmpErrorMgr; + +// MJPEG ("Motion JPEG") is a pseudo-standard video codec where the frames are +// simply independent JPEG images with a fixed huffman table (which is omitted). +// It is rarely used in video transmission, but is common as a camera capture +// format, especially in Logitech devices. This class implements a decoder for +// MJPEG frames. +// +// See http://tools.ietf.org/html/rfc2435 +class LIBYUV_API MJpegDecoder { + public: + typedef void (*CallbackFunction)(void* opaque, + const uint8* const* data, + const int* strides, + int rows); + + static const int kColorSpaceUnknown; + static const int kColorSpaceGrayscale; + static const int kColorSpaceRgb; + static const int kColorSpaceYCbCr; + static const int kColorSpaceCMYK; + static const int kColorSpaceYCCK; + + MJpegDecoder(); + ~MJpegDecoder(); + + // Loads a new frame, reads its headers, and determines the uncompressed + // image format. + // Returns LIBYUV_TRUE if image looks valid and format is supported. + // If return value is LIBYUV_TRUE, then the values for all the following + // getters are populated. 
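+  // (Editor's sketch of the typical call order, assuming only the API in
+  // this header; `src`, `src_len` and `planes` are hypothetical:
+  //    MJpegDecoder decoder;
+  //    if (decoder.LoadFrame(src, src_len)) {
+  //      int w = decoder.GetWidth();
+  //      int h = decoder.GetHeight();
+  //      // allocate GetNumComponents() buffers sized via GetComponentSize()
+  //      decoder.DecodeToBuffers(planes, w, h);
+  //    }
+  // UnloadFrame() may be called instead if decoding is not wanted.)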
+ // src_len is the size of the compressed mjpeg frame in bytes. + LIBYUV_BOOL LoadFrame(const uint8* src, size_t src_len); + + // Returns width of the last loaded frame in pixels. + int GetWidth(); + + // Returns height of the last loaded frame in pixels. + int GetHeight(); + + // Returns format of the last loaded frame. The return value is one of the + // kColorSpace* constants. + int GetColorSpace(); + + // Number of color components in the color space. + int GetNumComponents(); + + // Sample factors of the n-th component. + int GetHorizSampFactor(int component); + + int GetVertSampFactor(int component); + + int GetHorizSubSampFactor(int component); + + int GetVertSubSampFactor(int component); + + // Public for testability. + int GetImageScanlinesPerImcuRow(); + + // Public for testability. + int GetComponentScanlinesPerImcuRow(int component); + + // Width of a component in bytes. + int GetComponentWidth(int component); + + // Height of a component. + int GetComponentHeight(int component); + + // Width of a component in bytes with padding for DCTSIZE. Public for testing. + int GetComponentStride(int component); + + // Size of a component in bytes. + int GetComponentSize(int component); + + // Call this after LoadFrame() if you decide you don't want to decode it + // after all. + LIBYUV_BOOL UnloadFrame(); + + // Decodes the entire image into a one-buffer-per-color-component format. + // dst_width must match exactly. dst_height must be <= to image height; if + // less, the image is cropped. "planes" must have size equal to at least + // GetNumComponents() and they must point to non-overlapping buffers of size + // at least GetComponentSize(i). The pointers in planes are incremented + // to point to after the end of the written data. + // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded. + LIBYUV_BOOL DecodeToBuffers(uint8** planes, int dst_width, int dst_height); + + // Decodes the entire image and passes the data via repeated calls to a + // callback function. Each call will get the data for a whole number of + // image scanlines. + // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded. + LIBYUV_BOOL DecodeToCallback(CallbackFunction fn, void* opaque, + int dst_width, int dst_height); + + // The helper function which recognizes the jpeg sub-sampling type. + static JpegSubsamplingType JpegSubsamplingTypeHelper( + int* subsample_x, int* subsample_y, int number_of_components); + + private: + + void AllocOutputBuffers(int num_outbufs); + void DestroyOutputBuffers(); + + LIBYUV_BOOL StartDecode(); + LIBYUV_BOOL FinishDecode(); + + void SetScanlinePointers(uint8** data); + LIBYUV_BOOL DecodeImcuRow(); + + int GetComponentScanlinePadding(int component); + + // A buffer holding the input data for a frame. + Buffer buf_; + BufferVector buf_vec_; + + jpeg_decompress_struct* decompress_struct_; + jpeg_source_mgr* source_mgr_; + SetJmpErrorMgr* error_mgr_; + + // LIBYUV_TRUE iff at least one component has scanline padding. (i.e., + // GetComponentScanlinePadding() != 0.) + LIBYUV_BOOL has_scanline_padding_; + + // Temporaries used to point to scanline outputs. + int num_outbufs_; // Outermost size of all arrays below. + uint8*** scanlines_; + int* scanlines_sizes_; + // Temporary buffer used for decoding when we can't decode directly to the + // output buffers. Large enough for just one iMCU row. 
+ uint8** databuf_; + int* databuf_strides_; +}; + +} // namespace libyuv + +#endif // __cplusplus +#endif // INCLUDE_LIBYUV_MJPEG_DECODER_H_ NOLINT diff --git a/third_party/libyuv/include/libyuv/planar_functions.h b/third_party/libyuv/include/libyuv/planar_functions.h index 43f8df36d..d10a16985 100644 --- a/third_party/libyuv/include/libyuv/planar_functions.h +++ b/third_party/libyuv/include/libyuv/planar_functions.h @@ -11,11 +11,11 @@ #ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ // NOLINT #define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ -#include "basic_types.h" +#include "libyuv/basic_types.h" // TODO(fbarchard): Remove the following headers includes. -// #include "convert.h" -// #include "convert_argb.h" +#include "libyuv/convert.h" +#include "libyuv/convert_argb.h" #ifdef __cplusplus namespace libyuv { diff --git a/third_party/libyuv/include/libyuv/rotate.h b/third_party/libyuv/include/libyuv/rotate.h new file mode 100644 index 000000000..8af60b895 --- /dev/null +++ b/third_party/libyuv/include/libyuv/rotate.h @@ -0,0 +1,117 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_ROTATE_H_ // NOLINT +#define INCLUDE_LIBYUV_ROTATE_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Supported rotation. +typedef enum RotationMode { + kRotate0 = 0, // No rotation. + kRotate90 = 90, // Rotate 90 degrees clockwise. + kRotate180 = 180, // Rotate 180 degrees. + kRotate270 = 270, // Rotate 270 degrees clockwise. + + // Deprecated. + kRotateNone = 0, + kRotateClockwise = 90, + kRotateCounterClockwise = 270, +} RotationModeEnum; + +// Rotate I420 frame. +LIBYUV_API +int I420Rotate(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int src_width, int src_height, enum RotationMode mode); + +// Rotate NV12 input and store in I420. +LIBYUV_API +int NV12ToI420Rotate(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int src_width, int src_height, enum RotationMode mode); + +// Rotate a plane by 0, 90, 180, or 270. +LIBYUV_API +int RotatePlane(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int src_width, int src_height, enum RotationMode mode); + +// Rotate planes by 90, 180, 270. Deprecated. +LIBYUV_API +void RotatePlane90(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height); + +LIBYUV_API +void RotatePlane180(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height); + +LIBYUV_API +void RotatePlane270(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height); + +LIBYUV_API +void RotateUV90(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height); + +// Rotations for when U and V are interleaved. 
+// These functions take one input pointer and +// split the data into two buffers while +// rotating them. Deprecated. +LIBYUV_API +void RotateUV180(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height); + +LIBYUV_API +void RotateUV270(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height); + +// The 90 and 270 functions are based on transposes. +// Doing a transpose with reversing the read/write +// order will result in a rotation by +- 90 degrees. +// Deprecated. +LIBYUV_API +void TransposePlane(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height); + +LIBYUV_API +void TransposeUV(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_ROTATE_H_ NOLINT diff --git a/third_party/libyuv/include/libyuv/rotate_argb.h b/third_party/libyuv/include/libyuv/rotate_argb.h new file mode 100644 index 000000000..660ff5573 --- /dev/null +++ b/third_party/libyuv/include/libyuv/rotate_argb.h @@ -0,0 +1,33 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_ // NOLINT +#define INCLUDE_LIBYUV_ROTATE_ARGB_H_ + +#include "libyuv/basic_types.h" +#include "libyuv/rotate.h" // For RotationMode. + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Rotate ARGB frame +LIBYUV_API +int ARGBRotate(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int src_width, int src_height, enum RotationMode mode); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_ROTATE_ARGB_H_ NOLINT diff --git a/third_party/libyuv/include/libyuv/row.h b/third_party/libyuv/include/libyuv/row.h index daf5a45e1..fdfe1ae35 100644 --- a/third_party/libyuv/include/libyuv/row.h +++ b/third_party/libyuv/include/libyuv/row.h @@ -13,7 +13,11 @@ #include <stdlib.h> // For malloc. -#include "basic_types.h" +#include "libyuv/basic_types.h" + +#if defined(__native_client__) +#include "ppapi/c/pp_macros.h" // For PPAPI_RELEASE +#endif #ifdef __cplusplus namespace libyuv { @@ -38,7 +42,8 @@ extern "C" { var = 0 #if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \ - defined(TARGET_IPHONE_SIMULATOR) + defined(TARGET_IPHONE_SIMULATOR) || \ + (defined(_MSC_VER) && defined(__clang__)) #define LIBYUV_DISABLE_X86 #endif // True if compiling for SSSE3 as a requirement. @@ -47,7 +52,12 @@ extern "C" { #endif // Enable for NaCL pepper 33 for bundle and AVX2 support.
-// #define NEW_BINUTILS +#if defined(__native_client__) && PPAPI_RELEASE >= 33 +#define NEW_BINUTILS +#endif +#if defined(__native_client__) && defined(__arm__) && PPAPI_RELEASE < 37 +#define LIBYUV_DISABLE_NEON +#endif // The following are available on all x86 platforms: #if !defined(LIBYUV_DISABLE_X86) && \ @@ -152,6 +162,11 @@ extern "C" { #define HAS_YUY2TOYROW_SSE2 #endif +// The following are available on x64 Visual C: +#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) +#define HAS_I422TOARGBROW_SSSE3 +#endif + // GCC >= 4.7.0 required for AVX2. #if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) #if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) @@ -235,6 +250,10 @@ extern "C" { #define HAS_MIRRORROW_SSE2 #endif +// The following are available on arm64 platforms: +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) +#endif + // The following are available on Neon platforms: #if !defined(LIBYUV_DISABLE_NEON) && \ (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) @@ -330,7 +349,8 @@ extern "C" { #endif // The following are available on Mips platforms: -#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) +#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \ + (_MIPS_SIM == _MIPS_SIM_ABI32) #define HAS_COPYROW_MIPS #if defined(__mips_dsp) && (__mips_dsp_rev >= 2) #define HAS_I422TOABGRROW_MIPS_DSPR2 @@ -426,7 +446,7 @@ typedef uint8 uvec8[16]; "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ #opcode " (%%r15,%%r14),%" #arg "\n" \ BUNDLEUNLOCK -#else +#else // defined(__native_client__) && defined(__x86_64__) #define BUNDLEALIGN "\n" #define MEMACCESS(base) "(%" #base ")" #define MEMACCESS2(offset, base) #offset "(%" #base ")" @@ -443,6 +463,15 @@ typedef uint8 uvec8[16]; #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n" #define MEMOPARG(opcode, offset, base, index, scale, arg) \ #opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n" +#endif // defined(__native_client__) && defined(__x86_64__) + +#if defined(__arm__) +#undef MEMACCESS +#if defined(__native_client__) +#define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n" +#else +#define MEMACCESS(base) "\n" +#endif #endif void I444ToARGBRow_NEON(const uint8* src_y, diff --git a/third_party/libyuv/include/libyuv/scale.h b/third_party/libyuv/include/libyuv/scale.h index 973d46457..a3bc07e0f 100644 --- a/third_party/libyuv/include/libyuv/scale.h +++ b/third_party/libyuv/include/libyuv/scale.h @@ -11,7 +11,7 @@ #ifndef INCLUDE_LIBYUV_SCALE_H_ // NOLINT #define INCLUDE_LIBYUV_SCALE_H_ -#include "basic_types.h" +#include "libyuv/basic_types.h" #ifdef __cplusplus namespace libyuv { diff --git a/third_party/libyuv/include/libyuv/scale_argb.h b/third_party/libyuv/include/libyuv/scale_argb.h new file mode 100644 index 000000000..0c9b36257 --- /dev/null +++ b/third_party/libyuv/include/libyuv/scale_argb.h @@ -0,0 +1,57 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_ // NOLINT +#define INCLUDE_LIBYUV_SCALE_ARGB_H_ + +#include "libyuv/basic_types.h" +#include "libyuv/scale.h" // For FilterMode + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +LIBYUV_API +int ARGBScale(const uint8* src_argb, int src_stride_argb, + int src_width, int src_height, + uint8* dst_argb, int dst_stride_argb, + int dst_width, int dst_height, + enum FilterMode filtering); + +// Clipped scale takes destination rectangle coordinates for clip values. +LIBYUV_API +int ARGBScaleClip(const uint8* src_argb, int src_stride_argb, + int src_width, int src_height, + uint8* dst_argb, int dst_stride_argb, + int dst_width, int dst_height, + int clip_x, int clip_y, int clip_width, int clip_height, + enum FilterMode filtering); + +// TODO(fbarchard): Implement this. +// Scale with YUV conversion to ARGB and clipping. +LIBYUV_API +int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint32 src_fourcc, + int src_width, int src_height, + uint8* dst_argb, int dst_stride_argb, + uint32 dst_fourcc, + int dst_width, int dst_height, + int clip_x, int clip_y, int clip_width, int clip_height, + enum FilterMode filtering); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_SCALE_ARGB_H_ NOLINT diff --git a/third_party/libyuv/include/libyuv/scale_row.h b/third_party/libyuv/include/libyuv/scale_row.h index 5d91f8f76..8dc0762f2 100644 --- a/third_party/libyuv/include/libyuv/scale_row.h +++ b/third_party/libyuv/include/libyuv/scale_row.h @@ -11,7 +11,7 @@ #ifndef INCLUDE_LIBYUV_SCALE_ROW_H_ // NOLINT #define INCLUDE_LIBYUV_SCALE_ROW_H_ -#include "basic_types.h" +#include "libyuv/basic_types.h" #ifdef __cplusplus namespace libyuv { diff --git a/third_party/libyuv/include/libyuv/version.h b/third_party/libyuv/include/libyuv/version.h new file mode 100644 index 000000000..912c4c9e0 --- /dev/null +++ b/third_party/libyuv/include/libyuv/version.h @@ -0,0 +1,16 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT +#define INCLUDE_LIBYUV_VERSION_H_ + +#define LIBYUV_VERSION 1041 + +#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/third_party/libyuv/include/libyuv/video_common.h b/third_party/libyuv/include/libyuv/video_common.h new file mode 100644 index 000000000..91acc2ffc --- /dev/null +++ b/third_party/libyuv/include/libyuv/video_common.h @@ -0,0 +1,182 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// Common definitions for video, including fourcc and VideoFormat. 
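// A usage sketch for the clipped scaler declared in scale_argb.h above
// (sizes and buffer names are illustrative). Both calls map the same
// 1280x720 ARGB source onto a 640x360 destination, but the second writes
// only the 64x64 destination tile at (0, 0), which allows a frame to be
// scaled in independent pieces:
//
//   ARGBScale(src, 1280 * 4, 1280, 720,
//             dst, 640 * 4, 640, 360, kFilterBilinear);
//   ARGBScaleClip(src, 1280 * 4, 1280, 720,
//                 dst, 640 * 4, 640, 360,
//                 0, 0, 64, 64, kFilterBilinear);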
+ +#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_ // NOLINT +#define INCLUDE_LIBYUV_VIDEO_COMMON_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +////////////////////////////////////////////////////////////////////////////// +// Definition of FourCC codes +////////////////////////////////////////////////////////////////////////////// + +// Convert four characters to a FourCC code. +// Needs to be a macro otherwise the OS X compiler complains when the kFormat* +// constants are used in a switch. +#ifdef __cplusplus +#define FOURCC(a, b, c, d) ( \ + (static_cast<uint32>(a)) | (static_cast<uint32>(b) << 8) | \ + (static_cast<uint32>(c) << 16) | (static_cast<uint32>(d) << 24)) +#else +#define FOURCC(a, b, c, d) ( \ + ((uint32)(a)) | ((uint32)(b) << 8) | /* NOLINT */ \ + ((uint32)(c) << 16) | ((uint32)(d) << 24)) /* NOLINT */ +#endif + +// Some pages discussing FourCC codes: +// http://www.fourcc.org/yuv.php +// http://v4l2spec.bytesex.org/spec/book1.htm +// http://developer.apple.com/quicktime/icefloe/dispatch020.html +// http://msdn.microsoft.com/library/windows/desktop/dd206750.aspx#nv12 +// http://people.xiph.org/~xiphmont/containers/nut/nut4cc.txt + +// FourCC codes grouped according to implementation efficiency. +// Primary formats should convert in 1 efficient step. +// Secondary formats are converted in 2 steps. +// Auxiliary formats call primary converters. +enum FourCC { + // 9 Primary YUV formats: 5 planar, 2 biplanar, 2 packed. + FOURCC_I420 = FOURCC('I', '4', '2', '0'), + FOURCC_I422 = FOURCC('I', '4', '2', '2'), + FOURCC_I444 = FOURCC('I', '4', '4', '4'), + FOURCC_I411 = FOURCC('I', '4', '1', '1'), + FOURCC_I400 = FOURCC('I', '4', '0', '0'), + FOURCC_NV21 = FOURCC('N', 'V', '2', '1'), + FOURCC_NV12 = FOURCC('N', 'V', '1', '2'), + FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'), + FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'), + + // 2 Secondary YUV formats: row biplanar. + FOURCC_M420 = FOURCC('M', '4', '2', '0'), + FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), + + // 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp. + FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'), + FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'), + FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'), + FOURCC_24BG = FOURCC('2', '4', 'B', 'G'), + FOURCC_RAW = FOURCC('r', 'a', 'w', ' '), + FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'), + FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'), // rgb565 LE. + FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // argb1555 LE. + FOURCC_R444 = FOURCC('R', '4', '4', '4'), // argb4444 LE. + + // 4 Secondary RGB formats: 4 Bayer Patterns. + FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'), + FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'), + FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'), + FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'), + + // 1 Primary Compressed YUV format. + FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'), + + // 5 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias. + FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'), + FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'), + FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'), + FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Linux version of I420. + FOURCC_J420 = FOURCC('J', '4', '2', '0'), + FOURCC_J400 = FOURCC('J', '4', '0', '0'), + + // 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc. + FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420. + FOURCC_YU16 = FOURCC('Y', 'U', '1', '6'), // Alias for I422. + FOURCC_YU24 = FOURCC('Y', 'U', '2', '4'), // Alias for I444. + FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'), // Alias for YUY2.
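// Worked example of the FOURCC() packing defined above: 'I' is 0x49,
// '4' is 0x34, '2' is 0x32 and '0' is 0x30, so FOURCC('I', '4', '2', '0')
// evaluates to 0x49 | 0x34 << 8 | 0x32 << 16 | 0x30 << 24 = 0x30323449,
// i.e. the four characters stored in little-endian byte order.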
+ FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'), // Alias for YUY2 on Mac. + FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'), // Alias for UYVY. + FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'), // Alias for UYVY on Mac. + FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'), // Alias for MJPG. + FOURCC_DMB1 = FOURCC('d', 'm', 'b', '1'), // Alias for MJPG on Mac. + FOURCC_BA81 = FOURCC('B', 'A', '8', '1'), // Alias for BGGR. + FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'), // Alias for RAW. + FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'), // Alias for 24BG. + FOURCC_CM32 = FOURCC(0, 0, 0, 32), // Alias for BGRA kCMPixelFormat_32ARGB + FOURCC_CM24 = FOURCC(0, 0, 0, 24), // Alias for RAW kCMPixelFormat_24RGB + FOURCC_L555 = FOURCC('L', '5', '5', '5'), // Alias for RGBO. + FOURCC_L565 = FOURCC('L', '5', '6', '5'), // Alias for RGBP. + FOURCC_5551 = FOURCC('5', '5', '5', '1'), // Alias for RGBO. + + // 1 Auxiliary compressed YUV format set aside for capturer. + FOURCC_H264 = FOURCC('H', '2', '6', '4'), + + // Match any fourcc. + FOURCC_ANY = -1, +}; + +enum FourCCBpp { + // Canonical fourcc codes used in our code. + FOURCC_BPP_I420 = 12, + FOURCC_BPP_I422 = 16, + FOURCC_BPP_I444 = 24, + FOURCC_BPP_I411 = 12, + FOURCC_BPP_I400 = 8, + FOURCC_BPP_NV21 = 12, + FOURCC_BPP_NV12 = 12, + FOURCC_BPP_YUY2 = 16, + FOURCC_BPP_UYVY = 16, + FOURCC_BPP_M420 = 12, + FOURCC_BPP_Q420 = 12, + FOURCC_BPP_ARGB = 32, + FOURCC_BPP_BGRA = 32, + FOURCC_BPP_ABGR = 32, + FOURCC_BPP_RGBA = 32, + FOURCC_BPP_24BG = 24, + FOURCC_BPP_RAW = 24, + FOURCC_BPP_RGBP = 16, + FOURCC_BPP_RGBO = 16, + FOURCC_BPP_R444 = 16, + FOURCC_BPP_RGGB = 8, + FOURCC_BPP_BGGR = 8, + FOURCC_BPP_GRBG = 8, + FOURCC_BPP_GBRG = 8, + FOURCC_BPP_YV12 = 12, + FOURCC_BPP_YV16 = 16, + FOURCC_BPP_YV24 = 24, + FOURCC_BPP_YU12 = 12, + FOURCC_BPP_J420 = 12, + FOURCC_BPP_J400 = 8, + FOURCC_BPP_MJPG = 0, // 0 means unknown. + FOURCC_BPP_H264 = 0, + FOURCC_BPP_IYUV = 12, + FOURCC_BPP_YU16 = 16, + FOURCC_BPP_YU24 = 24, + FOURCC_BPP_YUYV = 16, + FOURCC_BPP_YUVS = 16, + FOURCC_BPP_HDYC = 16, + FOURCC_BPP_2VUY = 16, + FOURCC_BPP_JPEG = 1, + FOURCC_BPP_DMB1 = 1, + FOURCC_BPP_BA81 = 8, + FOURCC_BPP_RGB3 = 24, + FOURCC_BPP_BGR3 = 24, + FOURCC_BPP_CM32 = 32, + FOURCC_BPP_CM24 = 24, + + // Match any fourcc. + FOURCC_BPP_ANY = 0, // 0 means unknown. +}; + +// Converts fourcc aliases into canonical ones. +LIBYUV_API uint32 CanonicalFourCC(uint32 fourcc); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_VIDEO_COMMON_H_ NOLINT diff --git a/third_party/libyuv/source/compare.cc b/third_party/libyuv/source/compare.cc new file mode 100644 index 000000000..9ea81b4e2 --- /dev/null +++ b/third_party/libyuv/source/compare.cc @@ -0,0 +1,325 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/compare.h" + +#include <float.h> +#include <math.h> +#ifdef _OPENMP +#include <omp.h> +#endif + +#include "libyuv/basic_types.h" +#include "libyuv/cpu_id.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// hash seed of 5381 recommended. +// Internal C version of HashDjb2 with int sized count for efficiency.
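// The hash implemented below is djb2: hash = hash * 33 + byte, which the
// C version expresses as hash += (hash << 5) + src[i]. A scalar sketch:
//
//   uint32 hash = seed;  // 5381 gives the classic djb2 hash.
//   for (int i = 0; i < count; ++i)
//     hash = hash * 33 + src[i];
//
// The SIMD versions further down consume 16 bytes per step, using the fact
// that appending 16 bytes multiplies the old hash by 33^16 and adds the new
// bytes weighted by 33^15 .. 33^0; those powers are exactly the kHash16x33
// and kHashMul0..kHashMul3 constants in the SSE4.1 and AVX2 code.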
+uint32 HashDjb2_C(const uint8* src, int count, uint32 seed); + +// This module is for Visual C x86 +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || \ + (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))) +#define HAS_HASHDJB2_SSE41 +uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed); + +#if _MSC_VER >= 1700 +#define HAS_HASHDJB2_AVX2 +uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed); +#endif + +#endif // HAS_HASHDJB2_SSE41 + +// hash seed of 5381 recommended. +LIBYUV_API +uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) { + const int kBlockSize = 1 << 15; // 32768; + int remainder; + uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C; +#if defined(HAS_HASHDJB2_SSE41) + if (TestCpuFlag(kCpuHasSSE41)) { + HashDjb2_SSE = HashDjb2_SSE41; + } +#endif +#if defined(HAS_HASHDJB2_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + HashDjb2_SSE = HashDjb2_AVX2; + } +#endif + + while (count >= (uint64)(kBlockSize)) { + seed = HashDjb2_SSE(src, kBlockSize, seed); + src += kBlockSize; + count -= kBlockSize; + } + remainder = (int)(count) & ~15; + if (remainder) { + seed = HashDjb2_SSE(src, remainder, seed); + src += remainder; + count -= remainder; + } + remainder = (int)(count) & 15; + if (remainder) { + seed = HashDjb2_C(src, remainder, seed); + } + return seed; +} + +uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count); +#if !defined(LIBYUV_DISABLE_NEON) && \ + (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) +#define HAS_SUMSQUAREERROR_NEON +uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count); +#endif +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +#define HAS_SUMSQUAREERROR_SSE2 +uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count); +#endif +// Visual C 2012 required for AVX2. +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && _MSC_VER >= 1700 +#define HAS_SUMSQUAREERROR_AVX2 +uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count); +#endif + +// TODO(fbarchard): Refactor into row function. +LIBYUV_API +uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, + int count) { + // SumSquareError returns values 0 to 65535 for each squared difference. + // Up to 65536 of those can be summed and remain within a uint32. + // After each block of 65536 pixels, accumulate into a uint64. + const int kBlockSize = 65536; + int remainder = count & (kBlockSize - 1) & ~31; + uint64 sse = 0; + int i; + uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) = + SumSquareError_C; +#if defined(HAS_SUMSQUAREERROR_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SumSquareError = SumSquareError_NEON; + } +#endif +#if defined(HAS_SUMSQUAREERROR_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) { + // Note only used for multiples of 16 so count is not checked. + SumSquareError = SumSquareError_SSE2; + } +#endif +#if defined(HAS_SUMSQUAREERROR_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + // Note only used for multiples of 32 so count is not checked. 
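// (This assignment completes the runtime dispatch pattern used throughout
// libyuv: the function pointer starts at the portable C implementation,
// SumSquareError_C, and is promoted to the best variant TestCpuFlag()
// reports, NEON, SSE2 or AVX2 here, so one binary adapts to the CPU it
// runs on.)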
+ SumSquareError = SumSquareError_AVX2; + } +#endif +#ifdef _OPENMP +#pragma omp parallel for reduction(+: sse) +#endif + for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) { + sse += SumSquareError(src_a + i, src_b + i, kBlockSize); + } + src_a += count & ~(kBlockSize - 1); + src_b += count & ~(kBlockSize - 1); + if (remainder) { + sse += SumSquareError(src_a, src_b, remainder); + src_a += remainder; + src_b += remainder; + } + remainder = count & 31; + if (remainder) { + sse += SumSquareError_C(src_a, src_b, remainder); + } + return sse; +} + +LIBYUV_API +uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b, + int width, int height) { + uint64 sse = 0; + int h; + // Coalesce rows. + if (stride_a == width && + stride_b == width) { + width *= height; + height = 1; + stride_a = stride_b = 0; + } + for (h = 0; h < height; ++h) { + sse += ComputeSumSquareError(src_a, src_b, width); + src_a += stride_a; + src_b += stride_b; + } + return sse; +} + +LIBYUV_API +double SumSquareErrorToPsnr(uint64 sse, uint64 count) { + double psnr; + if (sse > 0) { + double mse = (double)(count) / (double)(sse); + psnr = 10.0 * log10(255.0 * 255.0 * mse); + } else { + psnr = kMaxPsnr; // Limit to prevent divide by 0 + } + + if (psnr > kMaxPsnr) + psnr = kMaxPsnr; + + return psnr; +} + +LIBYUV_API +double CalcFramePsnr(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b, + int width, int height) { + const uint64 samples = width * height; + const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a, + src_b, stride_b, + width, height); + return SumSquareErrorToPsnr(sse, samples); +} + +LIBYUV_API +double I420Psnr(const uint8* src_y_a, int stride_y_a, + const uint8* src_u_a, int stride_u_a, + const uint8* src_v_a, int stride_v_a, + const uint8* src_y_b, int stride_y_b, + const uint8* src_u_b, int stride_u_b, + const uint8* src_v_b, int stride_v_b, + int width, int height) { + const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a, + src_y_b, stride_y_b, + width, height); + const int width_uv = (width + 1) >> 1; + const int height_uv = (height + 1) >> 1; + const uint64 sse_u = ComputeSumSquareErrorPlane(src_u_a, stride_u_a, + src_u_b, stride_u_b, + width_uv, height_uv); + const uint64 sse_v = ComputeSumSquareErrorPlane(src_v_a, stride_v_a, + src_v_b, stride_v_b, + width_uv, height_uv); + const uint64 samples = width * height + 2 * (width_uv * height_uv); + const uint64 sse = sse_y + sse_u + sse_v; + return SumSquareErrorToPsnr(sse, samples); +} + +static const int64 cc1 = 26634; // (64^2*(.01*255)^2 +static const int64 cc2 = 239708; // (64^2*(.03*255)^2 + +static double Ssim8x8_C(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b) { + int64 sum_a = 0; + int64 sum_b = 0; + int64 sum_sq_a = 0; + int64 sum_sq_b = 0; + int64 sum_axb = 0; + + int i; + for (i = 0; i < 8; ++i) { + int j; + for (j = 0; j < 8; ++j) { + sum_a += src_a[j]; + sum_b += src_b[j]; + sum_sq_a += src_a[j] * src_a[j]; + sum_sq_b += src_b[j] * src_b[j]; + sum_axb += src_a[j] * src_b[j]; + } + + src_a += stride_a; + src_b += stride_b; + } + + { + const int64 count = 64; + // scale the constants by number of pixels + const int64 c1 = (cc1 * count * count) >> 12; + const int64 c2 = (cc2 * count * count) >> 12; + + const int64 sum_a_x_sum_b = sum_a * sum_b; + + const int64 ssim_n = (2 * sum_a_x_sum_b + c1) * + (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2); + + const int64 sum_a_sq = sum_a*sum_a; + const int64 sum_b_sq = sum_b*sum_b; + + const 
int64 ssim_d = (sum_a_sq + sum_b_sq + c1) * + (count * sum_sq_a - sum_a_sq + + count * sum_sq_b - sum_b_sq + c2); + + if (ssim_d == 0.0) { + return DBL_MAX; + } + return ssim_n * 1.0 / ssim_d; + } +} + +// We are using a 8x8 moving window with starting location of each 8x8 window +// on the 4x4 pixel grid. Such arrangement allows the windows to overlap +// block boundaries to penalize blocking artifacts. +LIBYUV_API +double CalcFrameSsim(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b, + int width, int height) { + int samples = 0; + double ssim_total = 0; + double (*Ssim8x8)(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b) = Ssim8x8_C; + + // sample point start with each 4x4 location + int i; + for (i = 0; i < height - 8; i += 4) { + int j; + for (j = 0; j < width - 8; j += 4) { + ssim_total += Ssim8x8(src_a + j, stride_a, src_b + j, stride_b); + samples++; + } + + src_a += stride_a * 4; + src_b += stride_b * 4; + } + + ssim_total /= samples; + return ssim_total; +} + +LIBYUV_API +double I420Ssim(const uint8* src_y_a, int stride_y_a, + const uint8* src_u_a, int stride_u_a, + const uint8* src_v_a, int stride_v_a, + const uint8* src_y_b, int stride_y_b, + const uint8* src_u_b, int stride_u_b, + const uint8* src_v_b, int stride_v_b, + int width, int height) { + const double ssim_y = CalcFrameSsim(src_y_a, stride_y_a, + src_y_b, stride_y_b, width, height); + const int width_uv = (width + 1) >> 1; + const int height_uv = (height + 1) >> 1; + const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a, + src_u_b, stride_u_b, + width_uv, height_uv); + const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a, + src_v_b, stride_v_b, + width_uv, height_uv); + return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v); +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/compare_common.cc b/third_party/libyuv/source/compare_common.cc new file mode 100644 index 000000000..c546b5182 --- /dev/null +++ b/third_party/libyuv/source/compare_common.cc @@ -0,0 +1,42 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) { + uint32 sse = 0u; + int i; + for (i = 0; i < count; ++i) { + int diff = src_a[i] - src_b[i]; + sse += (uint32)(diff * diff); + } + return sse; +} + +// hash seed of 5381 recommended. +// Internal C version of HashDjb2 with int sized count for efficiency. +uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) { + uint32 hash = seed; + int i; + for (i = 0; i < count; ++i) { + hash += (hash << 5) + src[i]; + } + return hash; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/compare_neon.cc b/third_party/libyuv/source/compare_neon.cc new file mode 100644 index 000000000..5e7b8e443 --- /dev/null +++ b/third_party/libyuv/source/compare_neon.cc @@ -0,0 +1,64 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) + +uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { + volatile uint32 sse; + asm volatile ( + "vmov.u8 q8, #0 \n" + "vmov.u8 q10, #0 \n" + "vmov.u8 q9, #0 \n" + "vmov.u8 q11, #0 \n" + + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" + MEMACCESS(1) + "vld1.8 {q1}, [%1]! \n" + "subs %2, %2, #16 \n" + "vsubl.u8 q2, d0, d2 \n" + "vsubl.u8 q3, d1, d3 \n" + "vmlal.s16 q8, d4, d4 \n" + "vmlal.s16 q9, d6, d6 \n" + "vmlal.s16 q10, d5, d5 \n" + "vmlal.s16 q11, d7, d7 \n" + "bgt 1b \n" + + "vadd.u32 q8, q8, q9 \n" + "vadd.u32 q10, q10, q11 \n" + "vadd.u32 q11, q8, q10 \n" + "vpaddl.u32 q1, q11 \n" + "vadd.u64 d0, d2, d3 \n" + "vmov.32 %3, d0[0] \n" + : "+r"(src_a), + "+r"(src_b), + "+r"(count), + "=r"(sse) + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); + return sse; +} + +#endif // __ARM_NEON__ + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/compare_posix.cc b/third_party/libyuv/source/compare_posix.cc new file mode 100644 index 000000000..ac361190e --- /dev/null +++ b/third_party/libyuv/source/compare_posix.cc @@ -0,0 +1,158 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/basic_types.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) + +uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { + uint32 sse; + asm volatile ( // NOLINT + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm5,%%xmm5 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm1 \n" + "lea " MEMLEA(0x10, 0) ",%0 \n" + "movdqa " MEMACCESS(1) ",%%xmm2 \n" + "lea " MEMLEA(0x10, 1) ",%1 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psubusb %%xmm2,%%xmm1 \n" + "psubusb %%xmm3,%%xmm2 \n" + "por %%xmm2,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpckhbw %%xmm5,%%xmm2 \n" + "pmaddwd %%xmm1,%%xmm1 \n" + "pmaddwd %%xmm2,%%xmm2 \n" + "paddd %%xmm1,%%xmm0 \n" + "paddd %%xmm2,%%xmm0 \n" + "jg 1b \n" + + "pshufd $0xee,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "pshufd $0x1,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "movd %%xmm0,%3 \n" + + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=g"(sse) // %3 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); // NOLINT + return sse; +} + +#endif // defined(__x86_64__) || defined(__i386__) + +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))) +#define HAS_HASHDJB2_SSE41 +static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16 +static uvec32 kHashMul0 = { + 0x0c3525e1, // 33 ^ 15 + 0xa3476dc1, // 33 ^ 14 + 0x3b4039a1, // 33 ^ 13 + 0x4f5f0981, // 33 ^ 12 +}; +static uvec32 kHashMul1 = { + 0x30f35d61, // 33 ^ 11 + 0x855cb541, // 33 ^ 10 + 0x040a9121, // 33 ^ 9 + 0x747c7101, // 33 ^ 8 +}; +static uvec32 kHashMul2 = { + 0xec41d4e1, // 33 ^ 7 + 0x4cfa3cc1, // 33 ^ 6 + 0x025528a1, // 33 ^ 5 + 0x00121881, // 33 ^ 4 +}; +static uvec32 kHashMul3 = { + 0x00008c61, // 33 ^ 3 + 0x00000441, // 33 ^ 2 + 0x00000021, // 33 ^ 1 + 0x00000001, // 33 ^ 0 +}; + +uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { + uint32 hash; + asm volatile ( // NOLINT + "movd %2,%%xmm0 \n" + "pxor %%xmm7,%%xmm7 \n" + "movdqa %4,%%xmm6 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" + "lea " MEMLEA(0x10, 0) ",%0 \n" + "pmulld %%xmm6,%%xmm0 \n" + "movdqa %5,%%xmm5 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm7,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm7,%%xmm3 \n" + "pmulld %%xmm5,%%xmm3 \n" + "movdqa %6,%%xmm5 \n" + "movdqa %%xmm2,%%xmm4 \n" + "punpckhwd %%xmm7,%%xmm4 \n" + "pmulld %%xmm5,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "punpckhbw %%xmm7,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklwd %%xmm7,%%xmm2 \n" + "pmulld %%xmm5,%%xmm2 \n" + "movdqa %8,%%xmm5 \n" + "punpckhwd %%xmm7,%%xmm1 \n" + "pmulld %%xmm5,%%xmm1 \n" + "paddd %%xmm4,%%xmm3 \n" + "paddd %%xmm2,%%xmm1 \n" + "sub $0x10,%1 \n" + "paddd %%xmm3,%%xmm1 \n" + "pshufd $0xe,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "pshufd $0x1,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "jg 1b \n" + "movd %%xmm0,%3 \n" + : "+r"(src), // %0 + "+r"(count), // %1 + "+rm"(seed), // %2 + "=g"(hash) // %3 + : "m"(kHash16x33), // %4 + "m"(kHashMul0), // %5 + "m"(kHashMul1), // %6 + "m"(kHashMul2), // %7 + "m"(kHashMul3) // %8 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); // NOLINT + return hash; +} +#endif // defined(__x86_64__) || (defined(__i386__) && 
!defined(__pic__))) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + diff --git a/third_party/libyuv/source/compare_win.cc b/third_party/libyuv/source/compare_win.cc new file mode 100644 index 000000000..99831651f --- /dev/null +++ b/third_party/libyuv/source/compare_win.cc @@ -0,0 +1,232 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) + +__declspec(naked) __declspec(align(16)) +uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { + __asm { + mov eax, [esp + 4] // src_a + mov edx, [esp + 8] // src_b + mov ecx, [esp + 12] // count + pxor xmm0, xmm0 + pxor xmm5, xmm5 + + align 4 + wloop: + movdqa xmm1, [eax] + lea eax, [eax + 16] + movdqa xmm2, [edx] + lea edx, [edx + 16] + sub ecx, 16 + movdqa xmm3, xmm1 // abs trick + psubusb xmm1, xmm2 + psubusb xmm2, xmm3 + por xmm1, xmm2 + movdqa xmm2, xmm1 + punpcklbw xmm1, xmm5 + punpckhbw xmm2, xmm5 + pmaddwd xmm1, xmm1 + pmaddwd xmm2, xmm2 + paddd xmm0, xmm1 + paddd xmm0, xmm2 + jg wloop + + pshufd xmm1, xmm0, 0xee + paddd xmm0, xmm1 + pshufd xmm1, xmm0, 0x01 + paddd xmm0, xmm1 + movd eax, xmm0 + ret + } +} + +// Visual C 2012 required for AVX2. +#if _MSC_VER >= 1700 +// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX. +#pragma warning(disable: 4752) +__declspec(naked) __declspec(align(16)) +uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { + __asm { + mov eax, [esp + 4] // src_a + mov edx, [esp + 8] // src_b + mov ecx, [esp + 12] // count + vpxor ymm0, ymm0, ymm0 // sum + vpxor ymm5, ymm5, ymm5 // constant 0 for unpck + sub edx, eax + + align 4 + wloop: + vmovdqu ymm1, [eax] + vmovdqu ymm2, [eax + edx] + lea eax, [eax + 32] + sub ecx, 32 + vpsubusb ymm3, ymm1, ymm2 // abs difference trick + vpsubusb ymm2, ymm2, ymm1 + vpor ymm1, ymm2, ymm3 + vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order. + vpunpckhbw ymm1, ymm1, ymm5 + vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32. + vpmaddwd ymm1, ymm1, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm0, ymm0, ymm2 + jg wloop + + vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes. + vpaddd ymm0, ymm0, ymm1 + vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes. + vpaddd ymm0, ymm0, ymm1 + vpermq ymm1, ymm0, 0x02 // high + low lane. 
+ vpaddd ymm0, ymm0, ymm1 + vmovd eax, xmm0 + vzeroupper + ret + } +} +#endif // _MSC_VER >= 1700 + +#define HAS_HASHDJB2_SSE41 +static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16 +static uvec32 kHashMul0 = { + 0x0c3525e1, // 33 ^ 15 + 0xa3476dc1, // 33 ^ 14 + 0x3b4039a1, // 33 ^ 13 + 0x4f5f0981, // 33 ^ 12 +}; +static uvec32 kHashMul1 = { + 0x30f35d61, // 33 ^ 11 + 0x855cb541, // 33 ^ 10 + 0x040a9121, // 33 ^ 9 + 0x747c7101, // 33 ^ 8 +}; +static uvec32 kHashMul2 = { + 0xec41d4e1, // 33 ^ 7 + 0x4cfa3cc1, // 33 ^ 6 + 0x025528a1, // 33 ^ 5 + 0x00121881, // 33 ^ 4 +}; +static uvec32 kHashMul3 = { + 0x00008c61, // 33 ^ 3 + 0x00000441, // 33 ^ 2 + 0x00000021, // 33 ^ 1 + 0x00000001, // 33 ^ 0 +}; + +// 27: 66 0F 38 40 C6 pmulld xmm0,xmm6 +// 44: 66 0F 38 40 DD pmulld xmm3,xmm5 +// 59: 66 0F 38 40 E5 pmulld xmm4,xmm5 +// 72: 66 0F 38 40 D5 pmulld xmm2,xmm5 +// 83: 66 0F 38 40 CD pmulld xmm1,xmm5 +#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \ + _asm _emit 0x40 _asm _emit reg + +__declspec(naked) __declspec(align(16)) +uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { + __asm { + mov eax, [esp + 4] // src + mov ecx, [esp + 8] // count + movd xmm0, [esp + 12] // seed + + pxor xmm7, xmm7 // constant 0 for unpck + movdqa xmm6, kHash16x33 + + align 4 + wloop: + movdqu xmm1, [eax] // src[0-15] + lea eax, [eax + 16] + pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16 + movdqa xmm5, kHashMul0 + movdqa xmm2, xmm1 + punpcklbw xmm2, xmm7 // src[0-7] + movdqa xmm3, xmm2 + punpcklwd xmm3, xmm7 // src[0-3] + pmulld(0xdd) // pmulld xmm3, xmm5 + movdqa xmm5, kHashMul1 + movdqa xmm4, xmm2 + punpckhwd xmm4, xmm7 // src[4-7] + pmulld(0xe5) // pmulld xmm4, xmm5 + movdqa xmm5, kHashMul2 + punpckhbw xmm1, xmm7 // src[8-15] + movdqa xmm2, xmm1 + punpcklwd xmm2, xmm7 // src[8-11] + pmulld(0xd5) // pmulld xmm2, xmm5 + movdqa xmm5, kHashMul3 + punpckhwd xmm1, xmm7 // src[12-15] + pmulld(0xcd) // pmulld xmm1, xmm5 + paddd xmm3, xmm4 // add 16 results + paddd xmm1, xmm2 + sub ecx, 16 + paddd xmm1, xmm3 + + pshufd xmm2, xmm1, 0x0e // upper 2 dwords + paddd xmm1, xmm2 + pshufd xmm2, xmm1, 0x01 + paddd xmm1, xmm2 + paddd xmm0, xmm1 + jg wloop + + movd eax, xmm0 // return hash + ret + } +} + +// Visual C 2012 required for AVX2. 
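// The pmulld() macro defined above works around older MSVC inline
// assemblers that lack SSE4.1 mnemonics: it emits the instruction's raw
// encoding, 66 0F 38 40 /r, byte by byte, and takes the ModRM byte as its
// argument. For example pmulld(0xc6) encodes ModRM 0xC6 (mod 11,
// reg 000 = xmm0, rm 110 = xmm6) and therefore assembles to
// pmulld xmm0, xmm6, matching the opcode listing above.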
+#if _MSC_VER >= 1700 +__declspec(naked) __declspec(align(16)) +uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { + __asm { + mov eax, [esp + 4] // src + mov ecx, [esp + 8] // count + movd xmm0, [esp + 12] // seed + movdqa xmm6, kHash16x33 + + align 4 + wloop: + vpmovzxbd xmm3, dword ptr [eax] // src[0-3] + pmulld xmm0, xmm6 // hash *= 33 ^ 16 + vpmovzxbd xmm4, dword ptr [eax + 4] // src[4-7] + pmulld xmm3, kHashMul0 + vpmovzxbd xmm2, dword ptr [eax + 8] // src[8-11] + pmulld xmm4, kHashMul1 + vpmovzxbd xmm1, dword ptr [eax + 12] // src[12-15] + pmulld xmm2, kHashMul2 + lea eax, [eax + 16] + pmulld xmm1, kHashMul3 + paddd xmm3, xmm4 // add 16 results + paddd xmm1, xmm2 + sub ecx, 16 + paddd xmm1, xmm3 + pshufd xmm2, xmm1, 0x0e // upper 2 dwords + paddd xmm1, xmm2 + pshufd xmm2, xmm1, 0x01 + paddd xmm1, xmm2 + paddd xmm0, xmm1 + jg wloop + + movd eax, xmm0 // return hash + ret + } +} +#endif // _MSC_VER >= 1700 + +#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/convert.cc b/third_party/libyuv/source/convert.cc new file mode 100644 index 000000000..874a6cb7c --- /dev/null +++ b/third_party/libyuv/source/convert.cc @@ -0,0 +1,1513 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/convert.h" + +#include "libyuv/basic_types.h" +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" +#include "libyuv/scale.h" // For ScalePlane() +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) +static __inline int Abs(int v) { + return v >= 0 ? v : -v; +} + +// Any I4xx To I420 format with mirroring. +static int I4xxToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int src_y_width, int src_y_height, + int src_uv_width, int src_uv_height) { + const int dst_y_width = Abs(src_y_width); + const int dst_y_height = Abs(src_y_height); + const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1); + const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1); + if (src_y_width == 0 || src_y_height == 0 || + src_uv_width == 0 || src_uv_height == 0) { + return -1; + } + ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, + dst_y, dst_stride_y, dst_y_width, dst_y_height, + kFilterBilinear); + ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, + dst_u, dst_stride_u, dst_uv_width, dst_uv_height, + kFilterBilinear); + ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, + dst_v, dst_stride_v, dst_uv_width, dst_uv_height, + kFilterBilinear); + return 0; +} + +// Copy I420 with optional flipping +// TODO(fbarchard): Use Scale plane which supports mirroring, but ensure +// is does row coalescing. 
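// The SUBSAMPLE() macro defined above divides a dimension by 2^s with
// rounding away from zero, so chroma planes for odd sizes come out large
// enough: SUBSAMPLE(639, 1, 1) is (639 + 1) >> 1 = 320, and
// SUBSAMPLE(-639, 1, 1) is -((639 + 1) >> 1) = -320, where a plain
// arithmetic shift of -638 would give -319.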
+LIBYUV_API +int I420Copy(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_u || !src_v || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + // Copy UV planes. + CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); + CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); + return 0; +} + +// 422 chroma is 1/2 width, 1x height +// 420 chroma is 1/2 width, 1/2 height +LIBYUV_API +int I422ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + const int src_uv_width = SUBSAMPLE(width, 1, 1); + return I4xxToI420(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + src_uv_width, height); +} + +// 444 chroma is 1x width, 1x height +// 420 chroma is 1/2 width, 1/2 height +LIBYUV_API +int I444ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + return I4xxToI420(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + width, height); +} + +// 411 chroma is 1/4 width, 1x height +// 420 chroma is 1/2 width, 1/2 height +LIBYUV_API +int I411ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + const int src_uv_width = SUBSAMPLE(width, 3, 2); + return I4xxToI420(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + src_uv_width, height); +} + +// I400 is greyscale typically used in MJPG +LIBYUV_API +int I400ToI420(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
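// Inversion is done without copying extra rows: point the source at its
// last row and negate the stride so each step walks upward. The pattern,
// as used in the block below:
//
//   if (height < 0) {
//     height = -height;
//     src = src + (height - 1) * stride;  // Start at the last row.
//     stride = -stride;                   // Step toward the first row.
//   }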
+ if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + SetPlane(dst_u, dst_stride_u, halfwidth, halfheight, 128); + SetPlane(dst_v, dst_stride_v, halfwidth, halfheight, 128); + return 0; +} + +static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, + uint8* dst, int dst_stride, + int width, int height) { + int y; + void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; +#if defined(HAS_COPYROW_X86) + if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) { + CopyRow = CopyRow_X86; + } +#endif +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) && + IS_ALIGNED(src, 16) && + IS_ALIGNED(src_stride_0, 16) && IS_ALIGNED(src_stride_1, 16) && + IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { + CopyRow = CopyRow_SSE2; + } +#endif +#if defined(HAS_COPYROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_ERMS; + } +#endif +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) { + CopyRow = CopyRow_NEON; + } +#endif +#if defined(HAS_COPYROW_MIPS) + if (TestCpuFlag(kCpuHasMIPS)) { + CopyRow = CopyRow_MIPS; + } +#endif + + // Copy plane + for (y = 0; y < height - 1; y += 2) { + CopyRow(src, dst, width); + CopyRow(src + src_stride_0, dst + dst_stride, width); + src += src_stride_0 + src_stride_1; + dst += dst_stride * 2; + } + if (height & 1) { + CopyRow(src, dst, width); + } +} + +// Support converting from FOURCC_M420 +// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for +// easy conversion to I420. +// M420 format description: +// M420 is row biplanar 420: 2 rows of Y and 1 row of UV. +// Chroma is half width / half height. (420) +// src_stride_m420 is row planar. Normally this will be the width in pixels. +// The UV plane is half width, but 2 values, so src_stride_m420 applies to +// this as well as the two Y planes. +static int X420ToI420(const uint8* src_y, + int src_stride_y0, int src_stride_y1, + const uint8* src_uv, int src_stride_uv, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) = + SplitUVRow_C; + if (!src_y || !src_uv || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_u = dst_u + (halfheight - 1) * dst_stride_u; + dst_v = dst_v + (halfheight - 1) * dst_stride_v; + dst_stride_y = -dst_stride_y; + dst_stride_u = -dst_stride_u; + dst_stride_v = -dst_stride_v; + } + // Coalesce rows. + if (src_stride_y0 == width && + src_stride_y1 == width && + dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y0 = src_stride_y1 = dst_stride_y = 0; + } + // Coalesce rows. 
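// Coalescing works because a stride equal to the row width means the rows
// are packed end to end in memory, so the plane can be processed as one
// row of width * height (or halfwidth * halfheight for UV) pixels with a
// single row-function call instead of a per-row loop.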
+ if (src_stride_uv == halfwidth * 2 && + dst_stride_u == halfwidth && + dst_stride_v == halfwidth) { + halfwidth *= halfheight; + halfheight = 1; + src_stride_uv = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_SPLITUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) { + SplitUVRow = SplitUVRow_Any_SSE2; + if (IS_ALIGNED(halfwidth, 16)) { + SplitUVRow = SplitUVRow_Unaligned_SSE2; + if (IS_ALIGNED(src_uv, 16) && IS_ALIGNED(src_stride_uv, 16) && + IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) && + IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) { + SplitUVRow = SplitUVRow_SSE2; + } + } + } +#endif +#if defined(HAS_SPLITUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) { + SplitUVRow = SplitUVRow_Any_AVX2; + if (IS_ALIGNED(halfwidth, 32)) { + SplitUVRow = SplitUVRow_AVX2; + } + } +#endif +#if defined(HAS_SPLITUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) { + SplitUVRow = SplitUVRow_Any_NEON; + if (IS_ALIGNED(halfwidth, 16)) { + SplitUVRow = SplitUVRow_NEON; + } + } +#endif +#if defined(HAS_SPLITUVROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && halfwidth >= 16) { + SplitUVRow = SplitUVRow_Any_MIPS_DSPR2; + if (IS_ALIGNED(halfwidth, 16)) { + SplitUVRow = SplitUVRow_Unaligned_MIPS_DSPR2; + if (IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) && + IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) && + IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) { + SplitUVRow = SplitUVRow_MIPS_DSPR2; + } + } + } +#endif + + if (dst_y) { + if (src_stride_y0 == src_stride_y1) { + CopyPlane(src_y, src_stride_y0, dst_y, dst_stride_y, width, height); + } else { + CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y, + width, height); + } + } + + for (y = 0; y < halfheight; ++y) { + // Copy a row of UV. + SplitUVRow(src_uv, dst_u, dst_v, halfwidth); + dst_u += dst_stride_u; + dst_v += dst_stride_v; + src_uv += src_stride_uv; + } + return 0; +} + +// Convert NV12 to I420. +LIBYUV_API +int NV12ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + return X420ToI420(src_y, src_stride_y, src_stride_y, + src_uv, src_stride_uv, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height); +} + +// Convert NV21 to I420. Same as NV12 but u and v pointers swapped. +LIBYUV_API +int NV21ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_vu, int src_stride_vu, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + return X420ToI420(src_y, src_stride_y, src_stride_y, + src_vu, src_stride_vu, + dst_y, dst_stride_y, + dst_v, dst_stride_v, + dst_u, dst_stride_u, + width, height); +} + +// Convert M420 to I420. +LIBYUV_API +int M420ToI420(const uint8* src_m420, int src_stride_m420, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2, + src_m420 + src_stride_m420 * 2, src_stride_m420 * 3, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height); +} + +// Convert Q420 to I420. 
+// Format is rows of YY/YUYV +LIBYUV_API +int Q420ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + int halfheight = (height + 1) >> 1; + void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; + void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, + int pix) = YUY2ToUV422Row_C; + void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix) = + YUY2ToYRow_C; + if (!src_y || !src_yuy2 || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_u = dst_u + (halfheight - 1) * dst_stride_u; + dst_v = dst_v + (halfheight - 1) * dst_stride_v; + dst_stride_y = -dst_stride_y; + dst_stride_u = -dst_stride_u; + dst_stride_v = -dst_stride_v; + } + // CopyRow for rows of just Y in Q420 copied to Y plane of I420. +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) { + CopyRow = CopyRow_NEON; + } +#endif +#if defined(HAS_COPYROW_X86) + if (IS_ALIGNED(width, 4)) { + CopyRow = CopyRow_X86; + } +#endif +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) && + IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) && + IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + CopyRow = CopyRow_SSE2; + } +#endif +#if defined(HAS_COPYROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_ERMS; + } +#endif +#if defined(HAS_COPYROW_MIPS) + if (TestCpuFlag(kCpuHasMIPS)) { + CopyRow = CopyRow_MIPS; + } +#endif + +#if defined(HAS_YUY2TOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 16) { + YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2; + YUY2ToYRow = YUY2ToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2; + YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2; + if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) { + YUY2ToUV422Row = YUY2ToUV422Row_SSE2; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + YUY2ToYRow = YUY2ToYRow_SSE2; + } + } + } + } +#endif +#if defined(HAS_YUY2TOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2; + YUY2ToYRow = YUY2ToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + YUY2ToUV422Row = YUY2ToUV422Row_AVX2; + YUY2ToYRow = YUY2ToYRow_AVX2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + YUY2ToYRow = YUY2ToYRow_Any_NEON; + if (width >= 16) { + YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON; + } + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_NEON; + YUY2ToUV422Row = YUY2ToUV422Row_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + CopyRow(src_y, dst_y, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + + YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width); + YUY2ToYRow(src_yuy2, dst_y, width); + src_yuy2 += src_stride_yuy2; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + CopyRow(src_y, dst_y, width); + YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width); + } + return 0; +} + +// Convert YUY2 to I420. 
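// YUY2 is packed 4:2:2: each 4-byte group holds Y0 U Y1 V, two luma
// samples sharing one chroma pair (16 bits per pixel). To reach 4:2:0,
// the converter below hands YUY2ToUVRow the source stride so it can
// average the U and V samples of two adjacent rows into one chroma row,
// while YUY2ToYRow extracts a full-width Y row from every input row; the
// odd final row passes a stride of 0 so chroma is taken from that row
// alone.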
+LIBYUV_API +int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) = YUY2ToUVRow_C; + void (*YUY2ToYRow)(const uint8* src_yuy2, + uint8* dst_y, int pix) = YUY2ToYRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } +#if defined(HAS_YUY2TOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 16) { + YUY2ToUVRow = YUY2ToUVRow_Any_SSE2; + YUY2ToYRow = YUY2ToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + YUY2ToUVRow = YUY2ToUVRow_Unaligned_SSE2; + YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2; + if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) { + YUY2ToUVRow = YUY2ToUVRow_SSE2; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + YUY2ToYRow = YUY2ToYRow_SSE2; + } + } + } + } +#endif +#if defined(HAS_YUY2TOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + YUY2ToUVRow = YUY2ToUVRow_Any_AVX2; + YUY2ToYRow = YUY2ToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + YUY2ToUVRow = YUY2ToUVRow_AVX2; + YUY2ToYRow = YUY2ToYRow_AVX2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + YUY2ToYRow = YUY2ToYRow_Any_NEON; + if (width >= 16) { + YUY2ToUVRow = YUY2ToUVRow_Any_NEON; + } + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_NEON; + YUY2ToUVRow = YUY2ToUVRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width); + YUY2ToYRow(src_yuy2, dst_y, width); + YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width); + src_yuy2 += src_stride_yuy2 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width); + YUY2ToYRow(src_yuy2, dst_y, width); + } + return 0; +} + +// Convert UYVY to I420. +LIBYUV_API +int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) = UYVYToUVRow_C; + void (*UYVYToYRow)(const uint8* src_uyvy, + uint8* dst_y, int pix) = UYVYToYRow_C; + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; + src_stride_uyvy = -src_stride_uyvy; + } +#if defined(HAS_UYVYTOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 16) { + UYVYToUVRow = UYVYToUVRow_Any_SSE2; + UYVYToYRow = UYVYToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + UYVYToUVRow = UYVYToUVRow_Unaligned_SSE2; + UYVYToYRow = UYVYToYRow_Unaligned_SSE2; + if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) { + UYVYToUVRow = UYVYToUVRow_SSE2; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + UYVYToYRow = UYVYToYRow_SSE2; + } + } + } + } +#endif +#if defined(HAS_UYVYTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + UYVYToUVRow = UYVYToUVRow_Any_AVX2; + UYVYToYRow = UYVYToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + UYVYToUVRow = UYVYToUVRow_AVX2; + UYVYToYRow = UYVYToYRow_AVX2; + } + } +#endif +#if defined(HAS_UYVYTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + UYVYToYRow = UYVYToYRow_Any_NEON; + if (width >= 16) { + UYVYToUVRow = UYVYToUVRow_Any_NEON; + } + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_NEON; + UYVYToUVRow = UYVYToUVRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width); + UYVYToYRow(src_uyvy, dst_y, width); + UYVYToYRow(src_uyvy + src_stride_uyvy, dst_y + dst_stride_y, width); + src_uyvy += src_stride_uyvy * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width); + UYVYToYRow(src_uyvy, dst_y, width); + } + return 0; +} + +// Convert ARGB to I420. +LIBYUV_API +int ARGBToI420(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + if (!src_argb || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3; + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + if (width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); + src_argb += src_stride_argb * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + } + return 0; +} + +// Convert BGRA to I420. +LIBYUV_API +int BGRAToI420(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int width) = BGRAToUVRow_C; + void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int pix) = + BGRAToYRow_C; + if (!src_bgra || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_bgra = src_bgra + (height - 1) * src_stride_bgra; + src_stride_bgra = -src_stride_bgra; + } +#if defined(HAS_BGRATOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + BGRAToUVRow = BGRAToUVRow_Any_SSSE3; + BGRAToYRow = BGRAToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + BGRAToUVRow = BGRAToUVRow_Unaligned_SSSE3; + BGRAToYRow = BGRAToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16)) { + BGRAToUVRow = BGRAToUVRow_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + BGRAToYRow = BGRAToYRow_SSSE3; + } + } + } + } +#elif defined(HAS_BGRATOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + BGRAToYRow = BGRAToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + BGRAToYRow = BGRAToYRow_NEON; + } + if (width >= 16) { + BGRAToUVRow = BGRAToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + BGRAToUVRow = BGRAToUVRow_NEON; + } + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width); + BGRAToYRow(src_bgra, dst_y, width); + BGRAToYRow(src_bgra + src_stride_bgra, dst_y + dst_stride_y, width); + src_bgra += src_stride_bgra * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + BGRAToUVRow(src_bgra, 0, dst_u, dst_v, width); + BGRAToYRow(src_bgra, dst_y, width); + } + return 0; +} + +// Convert ABGR to I420. +LIBYUV_API +int ABGRToI420(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int width) = ABGRToUVRow_C; + void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int pix) = + ABGRToYRow_C; + if (!src_abgr || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; + } +#if defined(HAS_ABGRTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ABGRToUVRow = ABGRToUVRow_Any_SSSE3; + ABGRToYRow = ABGRToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_Unaligned_SSSE3; + ABGRToYRow = ABGRToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_abgr, 16) && IS_ALIGNED(src_stride_abgr, 16)) { + ABGRToUVRow = ABGRToUVRow_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ABGRToYRow = ABGRToYRow_SSSE3; + } + } + } + } +#elif defined(HAS_ABGRTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ABGRToYRow = ABGRToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ABGRToYRow = ABGRToYRow_NEON; + } + if (width >= 16) { + ABGRToUVRow = ABGRToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_NEON; + } + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width); + ABGRToYRow(src_abgr, dst_y, width); + ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width); + src_abgr += src_stride_abgr * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ABGRToUVRow(src_abgr, 0, dst_u, dst_v, width); + ABGRToYRow(src_abgr, dst_y, width); + } + return 0; +} + +// Convert RGBA to I420. 
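+// A minimal usage sketch for this converter (buffer names are hypothetical):
+//   uint8 y[64 * 4], u[32 * 2], v[32 * 2];
+//   RGBAToI420(rgba, 64 * 4, y, 64, u, 32, v, 32, 64, 4);
+// Each chroma plane is ((width + 1) / 2) x ((height + 1) / 2) bytes, since
+// I420 subsamples U and V by two in both axes.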
+LIBYUV_API +int RGBAToI420(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int width) = RGBAToUVRow_C; + void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int pix) = + RGBAToYRow_C; + if (!src_rgba || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgba = src_rgba + (height - 1) * src_stride_rgba; + src_stride_rgba = -src_stride_rgba; + } +#if defined(HAS_RGBATOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + RGBAToUVRow = RGBAToUVRow_Any_SSSE3; + RGBAToYRow = RGBAToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RGBAToUVRow = RGBAToUVRow_Unaligned_SSSE3; + RGBAToYRow = RGBAToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_rgba, 16) && IS_ALIGNED(src_stride_rgba, 16)) { + RGBAToUVRow = RGBAToUVRow_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + RGBAToYRow = RGBAToYRow_SSSE3; + } + } + } + } +#elif defined(HAS_RGBATOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + RGBAToYRow = RGBAToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGBAToYRow = RGBAToYRow_NEON; + } + if (width >= 16) { + RGBAToUVRow = RGBAToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RGBAToUVRow = RGBAToUVRow_NEON; + } + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width); + RGBAToYRow(src_rgba, dst_y, width); + RGBAToYRow(src_rgba + src_stride_rgba, dst_y + dst_stride_y, width); + src_rgba += src_stride_rgba * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + RGBAToUVRow(src_rgba, 0, dst_u, dst_v, width); + RGBAToYRow(src_rgba, dst_y, width); + } + return 0; +} + +// Convert RGB24 to I420. +LIBYUV_API +int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; +#if defined(HAS_RGB24TOYROW_NEON) + void (*RGB24ToUVRow)(const uint8* src_rgb24, int src_stride_rgb24, + uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C; + void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int pix) = + RGB24ToYRow_C; +#else + void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = + RGB24ToARGBRow_C; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); +#endif + if (!src_rgb24 || !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; + src_stride_rgb24 = -src_stride_rgb24; + } + +#if defined(HAS_RGB24TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + RGB24ToYRow = RGB24ToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB24ToYRow = RGB24ToYRow_NEON; + } + if (width >= 16) { + RGB24ToUVRow = RGB24ToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RGB24ToUVRow = RGB24ToUVRow_NEON; + } + } + } +#else // HAS_RGB24TOYROW_NEON + +#if defined(HAS_RGB24TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } +#endif // HAS_ARGBTOUVROW_SSSE3 +#endif // HAS_RGB24TOYROW_NEON + + for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_RGB24TOYROW_NEON) + RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width); + RGB24ToYRow(src_rgb24, dst_y, width); + RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); +#else + RGB24ToARGBRow(src_rgb24, row, width); + RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_rgb24 += src_stride_rgb24 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if defined(HAS_RGB24TOYROW_NEON) + RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width); + RGB24ToYRow(src_rgb24, dst_y, width); +#else + RGB24ToARGBRow(src_rgb24, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !defined(HAS_RGB24TOYROW_NEON) + free_aligned_buffer_64(row); +#endif + return 0; +} + +// Convert RAW to I420. +LIBYUV_API +int RAWToI420(const uint8* src_raw, int src_stride_raw, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; +#if defined(HAS_RAWTOYROW_NEON) + void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw, + uint8* dst_u, uint8* dst_v, int width) = RAWToUVRow_C; + void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int pix) = + RAWToYRow_C; +#else + void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = + RAWToARGBRow_C; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); +#endif + if (!src_raw || !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_raw = src_raw + (height - 1) * src_stride_raw; + src_stride_raw = -src_stride_raw; + } + +#if defined(HAS_RAWTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + RAWToYRow = RAWToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RAWToYRow = RAWToYRow_NEON; + } + if (width >= 16) { + RAWToUVRow = RAWToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RAWToUVRow = RAWToUVRow_NEON; + } + } + } +#else // HAS_RAWTOYROW_NEON + +#if defined(HAS_RAWTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + RAWToARGBRow = RAWToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RAWToARGBRow = RAWToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } +#endif // HAS_ARGBTOUVROW_SSSE3 +#endif // HAS_RAWTOYROW_NEON + + for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_RAWTOYROW_NEON) + RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width); + RAWToYRow(src_raw, dst_y, width); + RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); +#else + RAWToARGBRow(src_raw, row, width); + RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_raw += src_stride_raw * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if defined(HAS_RAWTOYROW_NEON) + RAWToUVRow(src_raw, 0, dst_u, dst_v, width); + RAWToYRow(src_raw, dst_y, width); +#else + RAWToARGBRow(src_raw, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !defined(HAS_RAWTOYROW_NEON) + free_aligned_buffer_64(row); +#endif + return 0; +} + +// Convert RGB565 to I420. +LIBYUV_API +int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; +#if defined(HAS_RGB565TOYROW_NEON) + void (*RGB565ToUVRow)(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_u, uint8* dst_v, int width) = RGB565ToUVRow_C; + void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int pix) = + RGB565ToYRow_C; +#else + void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = + RGB565ToARGBRow_C; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); +#endif + if (!src_rgb565 || !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565; + src_stride_rgb565 = -src_stride_rgb565; + } + +#if defined(HAS_RGB565TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + RGB565ToYRow = RGB565ToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB565ToYRow = RGB565ToYRow_NEON; + } + if (width >= 16) { + RGB565ToUVRow = RGB565ToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RGB565ToUVRow = RGB565ToUVRow_NEON; + } + } + } +#else // HAS_RGB565TOYROW_NEON + +#if defined(HAS_RGB565TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 8) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + RGB565ToARGBRow = RGB565ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } +#endif // HAS_ARGBTOUVROW_SSSE3 +#endif // HAS_RGB565TOYROW_NEON + + for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_RGB565TOYROW_NEON) + RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width); + RGB565ToYRow(src_rgb565, dst_y, width); + RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width); +#else + RGB565ToARGBRow(src_rgb565, row, width); + RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_rgb565 += src_stride_rgb565 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if defined(HAS_RGB565TOYROW_NEON) + RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width); + RGB565ToYRow(src_rgb565, dst_y, width); +#else + RGB565ToARGBRow(src_rgb565, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !defined(HAS_RGB565TOYROW_NEON) + free_aligned_buffer_64(row); +#endif + return 0; +} + +// Convert ARGB1555 to I420. +LIBYUV_API +int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; +#if defined(HAS_ARGB1555TOYROW_NEON) + void (*ARGB1555ToUVRow)(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_u, uint8* dst_v, int width) = ARGB1555ToUVRow_C; + void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int pix) = + ARGB1555ToYRow_C; +#else + void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = + ARGB1555ToARGBRow_C; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); +#endif + if (!src_argb1555 || !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555; + src_stride_argb1555 = -src_stride_argb1555; + } + +#if defined(HAS_ARGB1555TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToYRow = ARGB1555ToYRow_NEON; + } + if (width >= 16) { + ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToUVRow = ARGB1555ToUVRow_NEON; + } + } + } +#else // HAS_ARGB1555TOYROW_NEON + +#if defined(HAS_ARGB1555TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 8) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } +#endif // HAS_ARGBTOUVROW_SSSE3 +#endif // HAS_ARGB1555TOYROW_NEON + + for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_ARGB1555TOYROW_NEON) + ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width); + ARGB1555ToYRow(src_argb1555, dst_y, width); + ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y, + width); +#else + ARGB1555ToARGBRow(src_argb1555, row, width); + ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize, + width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_argb1555 += src_stride_argb1555 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if defined(HAS_ARGB1555TOYROW_NEON) + ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width); + ARGB1555ToYRow(src_argb1555, dst_y, width); +#else + ARGB1555ToARGBRow(src_argb1555, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !defined(HAS_ARGB1555TOYROW_NEON) + free_aligned_buffer_64(row); +#endif + return 0; +} + +// Convert ARGB4444 to I420. +LIBYUV_API +int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; +#if defined(HAS_ARGB4444TOYROW_NEON) + void (*ARGB4444ToUVRow)(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_u, uint8* dst_v, int width) = ARGB4444ToUVRow_C; + void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int pix) = + ARGB4444ToYRow_C; +#else + void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = + ARGB4444ToARGBRow_C; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + // Allocate 2 rows of ARGB. 
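+  // The row size is rounded up to a multiple of 16 bytes so the SSE row
+  // functions can issue full 16-byte stores; e.g. width 101 gives
+  // 101 * 4 = 404, rounded up to 416.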
+ const int kRowSize = (width * 4 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); +#endif + if (!src_argb4444 || !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444; + src_stride_argb4444 = -src_stride_argb4444; + } + +#if defined(HAS_ARGB4444TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGB4444ToYRow = ARGB4444ToYRow_NEON; + } + if (width >= 16) { + ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToUVRow = ARGB4444ToUVRow_NEON; + } + } + } +#else // HAS_ARGB4444TOYROW_NEON + +#if defined(HAS_ARGB4444TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 8) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } +#endif // HAS_ARGBTOUVROW_SSSE3 +#endif // HAS_ARGB4444TOYROW_NEON + + for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_ARGB4444TOYROW_NEON) + ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width); + ARGB4444ToYRow(src_argb4444, dst_y, width); + ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y, + width); +#else + ARGB4444ToARGBRow(src_argb4444, row, width); + ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize, + width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_argb4444 += src_stride_argb4444 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if defined(HAS_ARGB4444TOYROW_NEON) + ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width); + ARGB4444ToYRow(src_argb4444, dst_y, width); +#else + ARGB4444ToARGBRow(src_argb4444, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !defined(HAS_ARGB4444TOYROW_NEON) + free_aligned_buffer_64(row); +#endif + return 0; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/convert_argb.cc b/third_party/libyuv/source/convert_argb.cc new file mode 100644 index 000000000..ac0bc3d15 --- /dev/null +++ b/third_party/libyuv/source/convert_argb.cc @@ -0,0 +1,938 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/convert_argb.h" + +#include "libyuv/cpu_id.h" +#include "libyuv/format_conversion.h" +#ifdef HAVE_JPEG +#include "libyuv/mjpeg_decoder.h" +#endif +#include "libyuv/rotate_argb.h" +#include "libyuv/row.h" +#include "libyuv/video_common.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Copy ARGB with optional flipping +LIBYUV_API +int ARGBCopy(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_argb || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + + CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width * 4, height); + return 0; +} + +// Convert I444 to ARGB. +LIBYUV_API +int I444ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*I444ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I444ToARGBRow_C; + if (!src_y || !src_u || !src_v || + !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_y == width && + src_stride_u == width && + src_stride_v == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; + } +#if defined(HAS_I444TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I444ToARGBRow = I444ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + I444ToARGBRow = I444ToARGBRow_SSSE3; + } + } + } +#elif defined(HAS_I444TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I444ToARGBRow = I444ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I444ToARGBRow(src_y, src_u, src_v, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I422 to ARGB. +LIBYUV_API +int I422ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*I422ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToARGBRow_C; + if (!src_y || !src_u || !src_v || + !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. 
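+  // When every plane is stored contiguously (each stride equals its row
+  // width), the whole image can be treated as a single row of
+  // width * height pixels, e.g. a 64x4 frame becomes one 256-pixel row,
+  // which cuts per-row call overhead.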
+ if (src_stride_y == width && + src_stride_u * 2 == width && + src_stride_v * 2 == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; + } +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 16) { + I422ToARGBRow = I422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + } +#endif + + for (y = 0; y < height; ++y) { + I422ToARGBRow(src_y, src_u, src_v, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I411 to ARGB. +LIBYUV_API +int I411ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*I411ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I411ToARGBRow_C; + if (!src_y || !src_u || !src_v || + !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_y == width && + src_stride_u * 4 == width && + src_stride_v * 4 == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; + } +#if defined(HAS_I411TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I411ToARGBRow = I411ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I411ToARGBRow = I411ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + I411ToARGBRow = I411ToARGBRow_SSSE3; + } + } + } +#elif defined(HAS_I411TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I411ToARGBRow = I411ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I411ToARGBRow = I411ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I411ToARGBRow(src_y, src_u, src_v, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I400 to ARGB. 
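+// I400 is a Y-only (greyscale) format. This reference path converts through
+// YToARGBRow, i.e. the full YUV matrix with neutral chroma; the plain
+// I400ToARGB below uses I400ToARGBRow, which replicates Y into R, G and B.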
+LIBYUV_API +int I400ToARGB_Reference(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*YToARGBRow)(const uint8* y_buf, + uint8* rgb_buf, + int width) = YToARGBRow_C; + if (!src_y || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_y == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = dst_stride_argb = 0; + } +#if defined(HAS_YTOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 8 && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + YToARGBRow = YToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + YToARGBRow = YToARGBRow_SSE2; + } + } +#elif defined(HAS_YTOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + YToARGBRow = YToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + YToARGBRow = YToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + YToARGBRow(src_y, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + } + return 0; +} + +// Convert I400 to ARGB. +LIBYUV_API +int I400ToARGB(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) = + I400ToARGBRow_C; + if (!src_y || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = dst_stride_argb = 0; + } +#if defined(HAS_I400TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 8) { + I400ToARGBRow = I400ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + I400ToARGBRow = I400ToARGBRow_Unaligned_SSE2; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + I400ToARGBRow = I400ToARGBRow_SSE2; + } + } + } +#elif defined(HAS_I400TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I400ToARGBRow = I400ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I400ToARGBRow = I400ToARGBRow_NEON; + } + } +#endif + for (y = 0; y < height; ++y) { + I400ToARGBRow(src_y, dst_argb, width); + src_y += src_stride_y; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Shuffle table for converting BGRA to ARGB. +static uvec8 kShuffleMaskBGRAToARGB = { + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u +}; + +// Shuffle table for converting ABGR to ARGB. +static uvec8 kShuffleMaskABGRToARGB = { + 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u +}; + +// Shuffle table for converting RGBA to ARGB. +static uvec8 kShuffleMaskRGBAToARGB = { + 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u +}; + +// Convert BGRA to ARGB. +LIBYUV_API +int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + return ARGBShuffle(src_bgra, src_stride_bgra, + dst_argb, dst_stride_argb, + (const uint8*)(&kShuffleMaskBGRAToARGB), + width, height); +} + +// Convert ARGB to BGRA (same as BGRAToARGB). 
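+// These byte swizzles are involutions: applying the same shuffle mask twice
+// restores the original order, so each direction pair shares one mask and
+// one ARGBShuffle call.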
+LIBYUV_API
+int ARGBToBGRA(const uint8* src_bgra, int src_stride_bgra,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return ARGBShuffle(src_bgra, src_stride_bgra,
+                     dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskBGRAToARGB),
+                     width, height);
+}
+
+// Convert ABGR to ARGB.
+LIBYUV_API
+int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return ARGBShuffle(src_abgr, src_stride_abgr,
+                     dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskABGRToARGB),
+                     width, height);
+}
+
+// Convert ARGB to ABGR (same as ABGRToARGB).
+LIBYUV_API
+int ARGBToABGR(const uint8* src_abgr, int src_stride_abgr,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return ARGBShuffle(src_abgr, src_stride_abgr,
+                     dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskABGRToARGB),
+                     width, height);
+}
+
+// Convert RGBA to ARGB.
+LIBYUV_API
+int RGBAToARGB(const uint8* src_rgba, int src_stride_rgba,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return ARGBShuffle(src_rgba, src_stride_rgba,
+                     dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskRGBAToARGB),
+                     width, height);
+}
+
+// Convert RGB24 to ARGB.
+LIBYUV_API
+int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
+                uint8* dst_argb, int dst_stride_argb,
+                int width, int height) {
+  int y;
+  void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+      RGB24ToARGBRow_C;
+  if (!src_rgb24 || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+    src_stride_rgb24 = -src_stride_rgb24;
+  }
+  // Coalesce rows.
+  if (src_stride_rgb24 == width * 3 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_rgb24 = dst_stride_argb = 0;
+  }
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+    }
+  }
+#elif defined(HAS_RGB24TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    RGB24ToARGBRow(src_rgb24, dst_argb, width);
+    src_rgb24 += src_stride_rgb24;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert RAW to ARGB.
+LIBYUV_API
+int RAWToARGB(const uint8* src_raw, int src_stride_raw,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height) {
+  int y;
+  void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+      RAWToARGBRow_C;
+  if (!src_raw || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_raw = src_raw + (height - 1) * src_stride_raw;
+    src_stride_raw = -src_stride_raw;
+  }
+  // Coalesce rows.
+ if (src_stride_raw == width * 3 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_raw = dst_stride_argb = 0; + } +#if defined(HAS_RAWTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + RAWToARGBRow = RAWToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RAWToARGBRow = RAWToARGBRow_SSSE3; + } + } +#elif defined(HAS_RAWTOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + RAWToARGBRow = RAWToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RAWToARGBRow = RAWToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + RAWToARGBRow(src_raw, dst_argb, width); + src_raw += src_stride_raw; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert RGB565 to ARGB. +LIBYUV_API +int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int pix) = + RGB565ToARGBRow_C; + if (!src_rgb565 || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565; + src_stride_rgb565 = -src_stride_rgb565; + } + // Coalesce rows. + if (src_stride_rgb565 == width * 2 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_rgb565 = dst_stride_argb = 0; + } +#if defined(HAS_RGB565TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 8 && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + RGB565ToARGBRow = RGB565ToARGBRow_SSE2; + } + } +#elif defined(HAS_RGB565TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB565ToARGBRow = RGB565ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + RGB565ToARGBRow(src_rgb565, dst_argb, width); + src_rgb565 += src_stride_rgb565; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert ARGB1555 to ARGB. +LIBYUV_API +int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb, + int pix) = ARGB1555ToARGBRow_C; + if (!src_argb1555 || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555; + src_stride_argb1555 = -src_stride_argb1555; + } + // Coalesce rows. 
+ if (src_stride_argb1555 == width * 2 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb1555 = dst_stride_argb = 0; + } +#if defined(HAS_ARGB1555TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 8 && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2; + } + } +#elif defined(HAS_ARGB1555TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGB1555ToARGBRow(src_argb1555, dst_argb, width); + src_argb1555 += src_stride_argb1555; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert ARGB4444 to ARGB. +LIBYUV_API +int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb, + int pix) = ARGB4444ToARGBRow_C; + if (!src_argb4444 || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444; + src_stride_argb4444 = -src_stride_argb4444; + } + // Coalesce rows. + if (src_stride_argb4444 == width * 2 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb4444 = dst_stride_argb = 0; + } +#if defined(HAS_ARGB4444TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 8 && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2; + } + } +#elif defined(HAS_ARGB4444TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGB4444ToARGBRow(src_argb4444, dst_argb, width); + src_argb4444 += src_stride_argb4444; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert NV12 to ARGB. +LIBYUV_API +int NV12ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*NV12ToARGBRow)(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width) = NV12ToARGBRow_C; + if (!src_y || !src_uv || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
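+  // For biplanar input the destination is inverted rather than the source,
+  // so the Y/UV row pairing below can still walk the source top-down.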
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_NV12TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + NV12ToARGBRow = NV12ToARGBRow_SSSE3; + } + } + } +#elif defined(HAS_NV12TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + NV12ToARGBRow = NV12ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + NV12ToARGBRow(src_y, src_uv, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; +} + +// Convert NV21 to ARGB. +LIBYUV_API +int NV21ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*NV21ToARGBRow)(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width) = NV21ToARGBRow_C; + if (!src_y || !src_uv || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_NV21TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV21ToARGBRow = NV21ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + NV21ToARGBRow = NV21ToARGBRow_SSSE3; + } + } + } +#endif +#if defined(HAS_NV21TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + NV21ToARGBRow = NV21ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV21ToARGBRow = NV21ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + NV21ToARGBRow(src_y, src_uv, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; +} + +// Convert M420 to ARGB. +LIBYUV_API +int M420ToARGB(const uint8* src_m420, int src_stride_m420, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*NV12ToARGBRow)(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width) = NV12ToARGBRow_C; + if (!src_m420 || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
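+  // As with NV12, the destination rows are flipped. M420 groups two Y rows
+  // with one interleaved UV row, so the loop below advances the source by
+  // src_stride_m420 * 3 for every two output rows.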
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_NV12TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + NV12ToARGBRow = NV12ToARGBRow_SSSE3; + } + } + } +#elif defined(HAS_NV12TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + NV12ToARGBRow = NV12ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width); + NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2, + dst_argb + dst_stride_argb, width); + dst_argb += dst_stride_argb * 2; + src_m420 += src_stride_m420 * 3; + } + if (height & 1) { + NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width); + } + return 0; +} + +// Convert YUY2 to ARGB. +LIBYUV_API +int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*YUY2ToARGBRow)(const uint8* src_yuy2, uint8* dst_argb, int pix) = + YUY2ToARGBRow_C; + if (!src_yuy2 || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } + // Coalesce rows. + if (src_stride_yuy2 == width * 2 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_yuy2 = dst_stride_argb = 0; + } +#if defined(HAS_YUY2TOARGBROW_SSSE3) + // Posix is 16, Windows is 8. + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + YUY2ToARGBRow = YUY2ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + YUY2ToARGBRow = YUY2ToARGBRow_SSSE3; + } + } + } +#elif defined(HAS_YUY2TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + YUY2ToARGBRow = YUY2ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + YUY2ToARGBRow = YUY2ToARGBRow_NEON; + } + } +#endif + for (y = 0; y < height; ++y) { + YUY2ToARGBRow(src_yuy2, dst_argb, width); + src_yuy2 += src_stride_yuy2; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert UYVY to ARGB. +LIBYUV_API +int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*UYVYToARGBRow)(const uint8* src_uyvy, uint8* dst_argb, int pix) = + UYVYToARGBRow_C; + if (!src_uyvy || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; + src_stride_uyvy = -src_stride_uyvy; + } + // Coalesce rows. + if (src_stride_uyvy == width * 2 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_uyvy = dst_stride_argb = 0; + } +#if defined(HAS_UYVYTOARGBROW_SSSE3) + // Posix is 16, Windows is 8. 
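+  // Dispatch is three-tiered: the Any_ variant handles any width above the
+  // minimum, the Unaligned_ variant requires a multiple-of-16 width, and the
+  // fastest variant additionally requires 16-byte aligned pointers and
+  // strides.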
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + UYVYToARGBRow = UYVYToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + UYVYToARGBRow = UYVYToARGBRow_SSSE3; + } + } + } +#elif defined(HAS_UYVYTOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + UYVYToARGBRow = UYVYToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + UYVYToARGBRow = UYVYToARGBRow_NEON; + } + } +#endif + for (y = 0; y < height; ++y) { + UYVYToARGBRow(src_uyvy, dst_argb, width); + src_uyvy += src_stride_uyvy; + dst_argb += dst_stride_argb; + } + return 0; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/convert_from.cc b/third_party/libyuv/source/convert_from.cc new file mode 100644 index 000000000..c1a2f62f0 --- /dev/null +++ b/third_party/libyuv/source/convert_from.cc @@ -0,0 +1,1210 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/convert_from.h" + +#include "libyuv/basic_types.h" +#include "libyuv/convert.h" // For I420Copy +#include "libyuv/cpu_id.h" +#include "libyuv/format_conversion.h" +#include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" +#include "libyuv/scale.h" // For ScalePlane() +#include "libyuv/video_common.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) +static __inline int Abs(int v) { + return v >= 0 ? v : -v; +} + +// I420 To any I4xx YUV format with mirroring. 
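+// The I4xx family differs only in chroma subsampling: 422 halves the chroma
+// width, 444 keeps full resolution and 411 quarters the width. The Y plane
+// passes through at full size while the chroma planes are rescaled
+// bilinearly to the requested dst_uv_width x dst_uv_height.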
+static int I420ToI4xx(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int src_y_width, int src_y_height, + int dst_uv_width, int dst_uv_height) { + const int dst_y_width = Abs(src_y_width); + const int dst_y_height = Abs(src_y_height); + const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1); + const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1); + if (src_y_width == 0 || src_y_height == 0 || + dst_uv_width <= 0 || dst_uv_height <= 0) { + return -1; + } + ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, + dst_y, dst_stride_y, dst_y_width, dst_y_height, + kFilterBilinear); + ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, + dst_u, dst_stride_u, dst_uv_width, dst_uv_height, + kFilterBilinear); + ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, + dst_v, dst_stride_v, dst_uv_width, dst_uv_height, + kFilterBilinear); + return 0; +} + +// 420 chroma is 1/2 width, 1/2 height +// 422 chroma is 1/2 width, 1x height +LIBYUV_API +int I420ToI422(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + const int dst_uv_width = (Abs(width) + 1) >> 1; + const int dst_uv_height = Abs(height); + return I420ToI4xx(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + dst_uv_width, dst_uv_height); +} + +// 420 chroma is 1/2 width, 1/2 height +// 444 chroma is 1x width, 1x height +LIBYUV_API +int I420ToI444(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + const int dst_uv_width = Abs(width); + const int dst_uv_height = Abs(height); + return I420ToI4xx(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + dst_uv_width, dst_uv_height); +} + +// 420 chroma is 1/2 width, 1/2 height +// 411 chroma is 1/4 width, 1x height +LIBYUV_API +int I420ToI411(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + const int dst_uv_width = (Abs(width) + 3) >> 2; + const int dst_uv_height = Abs(height); + return I420ToI4xx(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + dst_uv_width, dst_uv_height); +} + +// Copy to I400. Source can be I420,422,444,400,NV12,NV21 +LIBYUV_API +int I400Copy(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height) { + if (!src_y || !dst_y || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + return 0; +} + +LIBYUV_API +int I422ToYUY2(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_yuy2, int dst_stride_yuy2, + int width, int height) { + int y; + void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u, + const uint8* src_v, uint8* dst_yuy2, int width) = + I422ToYUY2Row_C; + if (!src_y || !src_u || !src_v || !dst_yuy2 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; + dst_stride_yuy2 = -dst_stride_yuy2; + } + // Coalesce rows. + if (src_stride_y == width && + src_stride_u * 2 == width && + src_stride_v * 2 == width && + dst_stride_yuy2 == width * 2) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0; + } +#if defined(HAS_I422TOYUY2ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 16) { + I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_SSE2; + } + } +#elif defined(HAS_I422TOYUY2ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + I422ToYUY2Row = I422ToYUY2Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + dst_yuy2 += dst_stride_yuy2; + } + return 0; +} + +LIBYUV_API +int I420ToYUY2(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_yuy2, int dst_stride_yuy2, + int width, int height) { + int y; + void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u, + const uint8* src_v, uint8* dst_yuy2, int width) = + I422ToYUY2Row_C; + if (!src_y || !src_u || !src_v || !dst_yuy2 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
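+  // The packed YUY2 output is inverted rather than the planar source; each
+  // source chroma row feeds two packed output rows in the loop below.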
+ if (height < 0) { + height = -height; + dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; + dst_stride_yuy2 = -dst_stride_yuy2; + } +#if defined(HAS_I422TOYUY2ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 16) { + I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_SSE2; + } + } +#elif defined(HAS_I422TOYUY2ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + I422ToYUY2Row = I422ToYUY2Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); + I422ToYUY2Row(src_y + src_stride_y, src_u, src_v, + dst_yuy2 + dst_stride_yuy2, width); + src_y += src_stride_y * 2; + src_u += src_stride_u; + src_v += src_stride_v; + dst_yuy2 += dst_stride_yuy2 * 2; + } + if (height & 1) { + I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); + } + return 0; +} + +LIBYUV_API +int I422ToUYVY(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_uyvy, int dst_stride_uyvy, + int width, int height) { + int y; + void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u, + const uint8* src_v, uint8* dst_uyvy, int width) = + I422ToUYVYRow_C; + if (!src_y || !src_u || !src_v || !dst_uyvy || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy; + dst_stride_uyvy = -dst_stride_uyvy; + } + // Coalesce rows. + if (src_stride_y == width && + src_stride_u * 2 == width && + src_stride_v * 2 == width && + dst_stride_uyvy == width * 2) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0; + } +#if defined(HAS_I422TOUYVYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 16) { + I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_SSE2; + } + } +#elif defined(HAS_I422TOUYVYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + I422ToUYVYRow = I422ToUYVYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + dst_uyvy += dst_stride_uyvy; + } + return 0; +} + +LIBYUV_API +int I420ToUYVY(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_uyvy, int dst_stride_uyvy, + int width, int height) { + int y; + void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u, + const uint8* src_v, uint8* dst_uyvy, int width) = + I422ToUYVYRow_C; + if (!src_y || !src_u || !src_v || !dst_uyvy || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+  if (height < 0) {
+    height = -height;
+    dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+    dst_stride_uyvy = -dst_stride_uyvy;
+  }
+#if defined(HAS_I422TOUYVYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_SSE2;
+    }
+  }
+#elif defined(HAS_I422TOUYVYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {
+    I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+    I422ToUYVYRow(src_y + src_stride_y, src_u, src_v,
+                  dst_uyvy + dst_stride_uyvy, width);
+    src_y += src_stride_y * 2;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_uyvy += dst_stride_uyvy * 2;
+  }
+  if (height & 1) {
+    I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+  }
+  return 0;
+}
+
+LIBYUV_API
+int I420ToNV12(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height) {
+  int y;
+  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                      int width) = MergeUVRow_C;
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_y || !src_u || !src_v || !dst_y || !dst_uv ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_uv = dst_uv + (halfheight - 1) * dst_stride_uv;
+    dst_stride_y = -dst_stride_y;
+    dst_stride_uv = -dst_stride_uv;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width &&
+      dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+  // Coalesce rows.
+  if (src_stride_u == halfwidth &&
+      src_stride_v == halfwidth &&
+      dst_stride_uv == halfwidth * 2) {
+    halfwidth *= halfheight;
+    halfheight = 1;
+    src_stride_u = src_stride_v = dst_stride_uv = 0;
+  }
+#if defined(HAS_MERGEUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
+    MergeUVRow_ = MergeUVRow_Any_SSE2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
+      if (IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
+          IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
+          IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
+        MergeUVRow_ = MergeUVRow_SSE2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
+    MergeUVRow_ = MergeUVRow_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUVRow_ = MergeUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
+    MergeUVRow_ = MergeUVRow_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_NEON;
+    }
+  }
+#endif
+
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  for (y = 0; y < halfheight; ++y) {
+    // Merge a row of U and V into a row of UV.
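+    // NV12 interleaves the two chroma sources byte-wise as U0 V0 U1 V1 ...,
+    // so each merged row holds halfwidth UV pairs (halfwidth * 2 bytes).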
+    MergeUVRow_(src_u, src_v, dst_uv, halfwidth);
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_uv += dst_stride_uv;
+  }
+  return 0;
+}
+
+LIBYUV_API
+int I420ToNV21(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_vu, int dst_stride_vu,
+               int width, int height) {
+  return I420ToNV12(src_y, src_stride_y,
+                    src_v, src_stride_v,
+                    src_u, src_stride_u,
+                    dst_y, dst_stride_y,
+                    dst_vu, dst_stride_vu,
+                    width, height);
+}
+
+// Convert I420 to ARGB.
+LIBYUV_API
+int I420ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  int y;
+  void (*I422ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I422ToARGBRow_C;
+  if (!src_y || !src_u || !src_v || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+        I422ToARGBRow = I422ToARGBRow_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
+    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToARGBRow = I422ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    I422ToARGBRow = I422ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+    I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to BGRA.
+LIBYUV_API
+int I420ToBGRA(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_bgra, int dst_stride_bgra,
+               int width, int height) {
+  int y;
+  void (*I422ToBGRARow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I422ToBGRARow_C;
+  if (!src_y || !src_u || !src_v || !dst_bgra ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+ if (height < 0) { + height = -height; + dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra; + dst_stride_bgra = -dst_stride_bgra; + } +#if defined(HAS_I422TOBGRAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToBGRARow = I422ToBGRARow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) { + I422ToBGRARow = I422ToBGRARow_SSSE3; + } + } + } +#elif defined(HAS_I422TOBGRAROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToBGRARow = I422ToBGRARow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToBGRARow = I422ToBGRARow_NEON; + } + } +#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) { + I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2; + } +#endif + + for (y = 0; y < height; ++y) { + I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width); + dst_bgra += dst_stride_bgra; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to ABGR. +LIBYUV_API +int I420ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_abgr, int dst_stride_abgr, + int width, int height) { + int y; + void (*I422ToABGRRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToABGRRow_C; + if (!src_y || !src_u || !src_v || !dst_abgr || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr; + dst_stride_abgr = -dst_stride_abgr; + } +#if defined(HAS_I422TOABGRROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToABGRRow = I422ToABGRRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) { + I422ToABGRRow = I422ToABGRRow_SSSE3; + } + } + } +#elif defined(HAS_I422TOABGRROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToABGRRow = I422ToABGRRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToABGRRow = I422ToABGRRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width); + dst_abgr += dst_stride_abgr; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RGBA. +LIBYUV_API +int I420ToRGBA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_rgba, int dst_stride_rgba, + int width, int height) { + int y; + void (*I422ToRGBARow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToRGBARow_C; + if (!src_y || !src_u || !src_v || !dst_rgba || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; + dst_stride_rgba = -dst_stride_rgba; + } +#if defined(HAS_I422TORGBAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToRGBARow = I422ToRGBARow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) { + I422ToRGBARow = I422ToRGBARow_SSSE3; + } + } + } +#elif defined(HAS_I422TORGBAROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToRGBARow = I422ToRGBARow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width); + dst_rgba += dst_stride_rgba; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RGB24. +LIBYUV_API +int I420ToRGB24(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_rgb24, int dst_stride_rgb24, + int width, int height) { + int y; + void (*I422ToRGB24Row)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToRGB24Row_C; + if (!src_y || !src_u || !src_v || !dst_rgb24 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; + } +#if defined(HAS_I422TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGB24Row = I422ToRGB24Row_SSSE3; + } + } +#elif defined(HAS_I422TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToRGB24Row = I422ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGB24Row = I422ToRGB24Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RAW. +LIBYUV_API +int I420ToRAW(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_raw, int dst_stride_raw, + int width, int height) { + int y; + void (*I422ToRAWRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToRAWRow_C; + if (!src_y || !src_u || !src_v || !dst_raw || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_raw = dst_raw + (height - 1) * dst_stride_raw; + dst_stride_raw = -dst_stride_raw; + } +#if defined(HAS_I422TORAWROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToRAWRow = I422ToRAWRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRAWRow = I422ToRAWRow_SSSE3; + } + } +#elif defined(HAS_I422TORAWROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToRAWRow = I422ToRAWRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRAWRow = I422ToRAWRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRAWRow(src_y, src_u, src_v, dst_raw, width); + dst_raw += dst_stride_raw; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to ARGB1555. +LIBYUV_API +int I420ToARGB1555(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb1555, int dst_stride_argb1555, + int width, int height) { + int y; + void (*I422ToARGB1555Row)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToARGB1555Row_C; + if (!src_y || !src_u || !src_v || !dst_argb1555 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555; + dst_stride_argb1555 = -dst_stride_argb1555; + } +#if defined(HAS_I422TOARGB1555ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_SSSE3; + } + } +#elif defined(HAS_I422TOARGB1555ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, width); + dst_argb1555 += dst_stride_argb1555; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + + +// Convert I420 to ARGB4444. +LIBYUV_API +int I420ToARGB4444(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb4444, int dst_stride_argb4444, + int width, int height) { + int y; + void (*I422ToARGB4444Row)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToARGB4444Row_C; + if (!src_y || !src_u || !src_v || !dst_argb4444 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+  if (height < 0) {
+    height = -height;
+    dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444;
+    dst_stride_argb4444 = -dst_stride_argb4444;
+  }
+#if defined(HAS_I422TOARGB4444ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGB4444Row = I422ToARGB4444Row_SSSE3;
+    }
+  }
+#elif defined(HAS_I422TOARGB4444ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGB4444Row = I422ToARGB4444Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, width);
+    dst_argb4444 += dst_stride_argb4444;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to RGB565.
+LIBYUV_API
+int I420ToRGB565(const uint8* src_y, int src_stride_y,
+                 const uint8* src_u, int src_stride_u,
+                 const uint8* src_v, int src_stride_v,
+                 uint8* dst_rgb565, int dst_stride_rgb565,
+                 int width, int height) {
+  int y;
+  void (*I422ToRGB565Row)(const uint8* y_buf,
+                          const uint8* u_buf,
+                          const uint8* v_buf,
+                          uint8* rgb_buf,
+                          int width) = I422ToRGB565Row_C;
+  if (!src_y || !src_u || !src_v || !dst_rgb565 ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+    dst_stride_rgb565 = -dst_stride_rgb565;
+  }
+#if defined(HAS_I422TORGB565ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB565Row = I422ToRGB565Row_SSSE3;
+    }
+  }
+#elif defined(HAS_I422TORGB565ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB565Row = I422ToRGB565Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, width);
+    dst_rgb565 += dst_stride_rgb565;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to specified format
+LIBYUV_API
+int ConvertFromI420(const uint8* y, int y_stride,
+                    const uint8* u, int u_stride,
+                    const uint8* v, int v_stride,
+                    uint8* dst_sample, int dst_sample_stride,
+                    int width, int height,
+                    uint32 fourcc) {
+  uint32 format = CanonicalFourCC(fourcc);
+  int r = 0;
+  if (!y || !u || !v || !dst_sample ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  switch (format) {
+    // Single plane formats
+    case FOURCC_YUY2:
+      r = I420ToYUY2(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 2,
+                     width, height);
+      break;
+    case FOURCC_UYVY:
+      r = I420ToUYVY(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 2,
+                     width, height);
+      break;
+    case FOURCC_RGBP:
+      r = I420ToRGB565(y, y_stride,
+                       u, u_stride,
+                       v, v_stride,
+                       dst_sample,
+                       dst_sample_stride ? dst_sample_stride : width * 2,
+                       width, height);
+      break;
+    case FOURCC_RGBO:
+      r = I420ToARGB1555(y, y_stride,
+                         u, u_stride,
+                         v, v_stride,
+                         dst_sample,
+                         dst_sample_stride ? dst_sample_stride : width * 2,
+                         width, height);
+      break;
+    case FOURCC_R444:
+      r = I420ToARGB4444(y, y_stride,
+                         u, u_stride,
+                         v, v_stride,
+                         dst_sample,
+                         dst_sample_stride ?
dst_sample_stride : width * 2, + width, height); + break; + case FOURCC_24BG: + r = I420ToRGB24(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 3, + width, height); + break; + case FOURCC_RAW: + r = I420ToRAW(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 3, + width, height); + break; + case FOURCC_ARGB: + r = I420ToARGB(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, + width, height); + break; + case FOURCC_BGRA: + r = I420ToBGRA(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, + width, height); + break; + case FOURCC_ABGR: + r = I420ToABGR(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, + width, height); + break; + case FOURCC_RGBA: + r = I420ToRGBA(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, + width, height); + break; + case FOURCC_BGGR: + r = I420ToBayerBGGR(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width, + width, height); + break; + case FOURCC_GBRG: + r = I420ToBayerGBRG(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width, + width, height); + break; + case FOURCC_GRBG: + r = I420ToBayerGRBG(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width, + width, height); + break; + case FOURCC_RGGB: + r = I420ToBayerRGGB(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width, + width, height); + break; + case FOURCC_I400: + r = I400Copy(y, y_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width, + width, height); + break; + case FOURCC_NV12: { + uint8* dst_uv = dst_sample + width * height; + r = I420ToNV12(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width, + dst_uv, + dst_sample_stride ? dst_sample_stride : width, + width, height); + break; + } + case FOURCC_NV21: { + uint8* dst_vu = dst_sample + width * height; + r = I420ToNV21(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width, + dst_vu, + dst_sample_stride ? dst_sample_stride : width, + width, height); + break; + } + // TODO(fbarchard): Add M420 and Q420. 
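+    // Note: the NV12/NV21 cases above assume a packed destination buffer in
+    // which the interleaved chroma plane starts at dst_sample +
+    // width * height, immediately after the luma plane.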
+ // Triplanar formats + // TODO(fbarchard): halfstride instead of halfwidth + case FOURCC_I420: + case FOURCC_YU12: + case FOURCC_YV12: { + int halfwidth = (width + 1) / 2; + int halfheight = (height + 1) / 2; + uint8* dst_u; + uint8* dst_v; + if (format == FOURCC_YV12) { + dst_v = dst_sample + width * height; + dst_u = dst_v + halfwidth * halfheight; + } else { + dst_u = dst_sample + width * height; + dst_v = dst_u + halfwidth * halfheight; + } + r = I420Copy(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, width, + dst_u, halfwidth, + dst_v, halfwidth, + width, height); + break; + } + case FOURCC_I422: + case FOURCC_YV16: { + int halfwidth = (width + 1) / 2; + uint8* dst_u; + uint8* dst_v; + if (format == FOURCC_YV16) { + dst_v = dst_sample + width * height; + dst_u = dst_v + halfwidth * height; + } else { + dst_u = dst_sample + width * height; + dst_v = dst_u + halfwidth * height; + } + r = I420ToI422(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, width, + dst_u, halfwidth, + dst_v, halfwidth, + width, height); + break; + } + case FOURCC_I444: + case FOURCC_YV24: { + uint8* dst_u; + uint8* dst_v; + if (format == FOURCC_YV24) { + dst_v = dst_sample + width * height; + dst_u = dst_v + width * height; + } else { + dst_u = dst_sample + width * height; + dst_v = dst_u + width * height; + } + r = I420ToI444(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, width, + dst_u, width, + dst_v, width, + width, height); + break; + } + case FOURCC_I411: { + int quarterwidth = (width + 3) / 4; + uint8* dst_u = dst_sample + width * height; + uint8* dst_v = dst_u + quarterwidth * height; + r = I420ToI411(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, width, + dst_u, quarterwidth, + dst_v, quarterwidth, + width, height); + break; + } + + // Formats not supported - MJPG, biplanar, some rgb formats. + default: + return -1; // unknown fourcc - return failure code. + } + return r; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/convert_from_argb.cc b/third_party/libyuv/source/convert_from_argb.cc new file mode 100644 index 000000000..121a41611 --- /dev/null +++ b/third_party/libyuv/source/convert_from_argb.cc @@ -0,0 +1,1113 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/convert_from_argb.h" + +#include "libyuv/basic_types.h" +#include "libyuv/cpu_id.h" +#include "libyuv/format_conversion.h" +#include "libyuv/planar_functions.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// ARGB little endian (bgra in memory) to I444 +LIBYUV_API +int ARGBToI444(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) = ARGBToUV444Row_C; + if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_y == width && + dst_stride_u == width && + dst_stride_v == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_ARGBTOUV444ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUV444Row = ARGBToUV444Row_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToUV444Row = ARGBToUV444Row_SSSE3; + } + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } + +#elif defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToYRow = ARGBToYRow_Any_NEON; + ARGBToUV444Row = ARGBToUV444Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + ARGBToUV444Row = ARGBToUV444Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToUV444Row(src_argb, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + src_argb += src_stride_argb; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +// ARGB little endian (bgra in memory) to I422 +LIBYUV_API +int ARGBToI422(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) = ARGBToUV422Row_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
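+  // When each plane's stride equals its row width (half width for the
+  // horizontally subsampled U and V planes here), rows are contiguous in
+  // memory and the image can be processed as one long row: width becomes
+  // width * height and height becomes 1, avoiding per-row loop overhead.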
+ if (src_stride_argb == width * 4 && + dst_stride_y == width && + dst_stride_u * 2 == width && + dst_stride_v * 2 == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_ARGBTOUV422ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToUV422Row = ARGBToUV422Row_SSSE3; + } + } + } +#endif + +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } +#elif defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + if (width >= 16) { + ARGBToUV422Row = ARGBToUV422Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUV422Row = ARGBToUV422Row_NEON; + } + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToUV422Row(src_argb, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + src_argb += src_stride_argb; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +// ARGB little endian (bgra in memory) to I411 +LIBYUV_API +int ARGBToI411(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*ARGBToUV411Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) = ARGBToUV411Row_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
+  if (src_stride_argb == width * 4 &&
+      dst_stride_y == width &&
+      dst_stride_u * 4 == width &&
+      dst_stride_v * 4 == width) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+  }
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+          IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+        ARGBToYRow = ARGBToYRow_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+    if (width >= 32) {
+      ARGBToUV411Row = ARGBToUV411Row_Any_NEON;
+      if (IS_ALIGNED(width, 32)) {
+        ARGBToUV411Row = ARGBToUV411Row_NEON;
+      }
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBToUV411Row(src_argb, dst_u, dst_v, width);
+    ARGBToYRow(src_argb, dst_y, width);
+    src_argb += src_stride_argb;
+    dst_y += dst_stride_y;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  return 0;
+}
+
+LIBYUV_API
+int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                      int width) = MergeUVRow_C;
+  if (!src_argb ||
+      !dst_y || !dst_uv ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Allocate temporary rows of U and V, after the parameter check so that
+  // an early return cannot leak the buffer.
+  align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
+  uint8* row_v = row_u + ((halfwidth + 15) & ~15);
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
+      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+        ARGBToUVRow = ARGBToUVRow_SSSE3;
+        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+          ARGBToYRow = ARGBToYRow_SSSE3;
+        }
+      }
+    }
+  }
+#elif defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+    if (width >= 16) {
+      ARGBToUVRow = ARGBToUVRow_Any_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        ARGBToUVRow = ARGBToUVRow_NEON;
+      }
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
+    MergeUVRow_ = MergeUVRow_Any_SSE2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
+      if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
+        MergeUVRow_ = MergeUVRow_SSE2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
+    MergeUVRow_ = MergeUVRow_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUVRow_ = MergeUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
+    MergeUVRow_ = MergeUVRow_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {
+    ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+    MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+    ARGBToYRow(src_argb, dst_y, width);
+    ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+    src_argb += src_stride_argb * 2;
+    dst_y += dst_stride_y * 2;
+    dst_uv += dst_stride_uv;
+  }
+  if (height & 1) {
+    ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+    MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+    ARGBToYRow(src_argb, dst_y, width);
+  }
+  free_aligned_buffer_64(row_u);
+  return 0;
+}
+
+// Same as NV12 but U and V swapped.
+LIBYUV_API
+int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                      int width) = MergeUVRow_C;
+  if (!src_argb ||
+      !dst_y || !dst_uv ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Allocate temporary rows of U and V, after the parameter check so that
+  // an early return cannot leak the buffer.
+  align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
+  uint8* row_v = row_u + ((halfwidth + 15) & ~15);
+  // Negative height means invert the image.
+ if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3; + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } + } +#elif defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + if (width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } + } +#endif +#if defined(HAS_MERGEUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) { + MergeUVRow_ = MergeUVRow_Any_SSE2; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_Unaligned_SSE2; + if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) { + MergeUVRow_ = MergeUVRow_SSE2; + } + } + } +#endif +#if defined(HAS_MERGEUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) { + MergeUVRow_ = MergeUVRow_Any_AVX2; + if (IS_ALIGNED(halfwidth, 32)) { + MergeUVRow_ = MergeUVRow_AVX2; + } + } +#endif +#if defined(HAS_MERGEUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) { + MergeUVRow_ = MergeUVRow_Any_NEON; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); + MergeUVRow_(row_v, row_u, dst_uv, halfwidth); + ARGBToYRow(src_argb, dst_y, width); + ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); + src_argb += src_stride_argb * 2; + dst_y += dst_stride_y * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + ARGBToUVRow(src_argb, 0, row_u, row_v, width); + MergeUVRow_(row_v, row_u, dst_uv, halfwidth); + ARGBToYRow(src_argb, dst_y, width); + } + free_aligned_buffer_64(row_u); + return 0; +} + +// Convert ARGB to YUY2. +LIBYUV_API +int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, + uint8* dst_yuy2, int dst_stride_yuy2, + int width, int height) { + int y; + void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) = ARGBToUV422Row_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u, + const uint8* src_v, uint8* dst_yuy2, int width) = I422ToYUY2Row_C; + + if (!src_argb || !dst_yuy2 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; + dst_stride_yuy2 = -dst_stride_yuy2; + } + // Coalesce rows. 
+  if (src_stride_argb == width * 4 &&
+      dst_stride_yuy2 == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_yuy2 = 0;
+  }
+#if defined(HAS_ARGBTOUV422ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+        ARGBToUV422Row = ARGBToUV422Row_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+        ARGBToYRow = ARGBToYRow_SSSE3;
+      }
+    }
+  }
+#elif defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+    if (width >= 16) {
+      ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        ARGBToUV422Row = ARGBToUV422Row_NEON;
+      }
+    }
+  }
+#endif
+
+#if defined(HAS_I422TOYUY2ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToYUY2Row = I422ToYUY2Row_SSE2;
+    }
+  }
+#elif defined(HAS_I422TOYUY2ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToYUY2Row = I422ToYUY2Row_NEON;
+    }
+  }
+#endif
+
+  {
+    // Allocate temporary rows of Y, U and V.
+    align_buffer_64(row_y, ((width + 63) & ~63) * 2);
+    uint8* row_u = row_y + ((width + 63) & ~63);
+    uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+
+    for (y = 0; y < height; ++y) {
+      ARGBToUV422Row(src_argb, row_u, row_v, width);
+      ARGBToYRow(src_argb, row_y, width);
+      I422ToYUY2Row(row_y, row_u, row_v, dst_yuy2, width);
+      src_argb += src_stride_argb;
+      dst_yuy2 += dst_stride_yuy2;
+    }
+
+    free_aligned_buffer_64(row_y);
+  }
+  return 0;
+}
+
+// Convert ARGB to UYVY.
+LIBYUV_API
+int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_uyvy, int dst_stride_uyvy,
+               int width, int height) {
+  int y;
+  void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                         int pix) = ARGBToUV422Row_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
+                        const uint8* src_v, uint8* dst_uyvy, int width) = I422ToUYVYRow_C;
+
+  if (!src_argb || !dst_uyvy ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+    dst_stride_uyvy = -dst_stride_uyvy;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_uyvy == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_uyvy = 0;
+  }
+#if defined(HAS_ARGBTOUV422ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+        ARGBToUV422Row = ARGBToUV422Row_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+        ARGBToYRow = ARGBToYRow_SSSE3;
+      }
+    }
+  }
+#elif defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+    if (width >= 16) {
+      ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        ARGBToUV422Row = ARGBToUV422Row_NEON;
+      }
+    }
+  }
+#endif
+
+#if defined(HAS_I422TOUYVYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_SSE2;
+    }
+  }
+#elif defined(HAS_I422TOUYVYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_NEON;
+    }
+  }
+#endif
+
+  {
+    // Allocate temporary rows of Y, U and V.
+    align_buffer_64(row_y, ((width + 63) & ~63) * 2);
+    uint8* row_u = row_y + ((width + 63) & ~63);
+    uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+
+    for (y = 0; y < height; ++y) {
+      ARGBToUV422Row(src_argb, row_u, row_v, width);
+      ARGBToYRow(src_argb, row_y, width);
+      I422ToUYVYRow(row_y, row_u, row_v, dst_uyvy, width);
+      src_argb += src_stride_argb;
+      dst_uyvy += dst_stride_uyvy;
+    }
+
+    free_aligned_buffer_64(row_y);
+  }
+  return 0;
+}
+
+// Convert ARGB to I400.
+LIBYUV_API
+int ARGBToI400(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height) {
+  int y;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  if (!src_argb || !dst_y || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+ if (src_stride_argb == width * 4 && + dst_stride_y == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_y = 0; + } +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToYRow(src_argb, dst_y, width); + src_argb += src_stride_argb; + dst_y += dst_stride_y; + } + return 0; +} + +// Shuffle table for converting ARGB to RGBA. +static uvec8 kShuffleMaskARGBToRGBA = { + 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u +}; + +// Convert ARGB to RGBA. +LIBYUV_API +int ARGBToRGBA(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgba, int dst_stride_rgba, + int width, int height) { + return ARGBShuffle(src_argb, src_stride_argb, + dst_rgba, dst_stride_rgba, + (const uint8*)(&kShuffleMaskARGBToRGBA), + width, height); +} + +// Convert ARGB To RGB24. +LIBYUV_API +int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb24, int dst_stride_rgb24, + int width, int height) { + int y; + void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = + ARGBToRGB24Row_C; + if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_rgb24 == width * 3) { + width *= height; + height = 1; + src_stride_argb = dst_stride_rgb24 = 0; + } +#if defined(HAS_ARGBTORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToRGB24Row = ARGBToRGB24Row_SSSE3; + } + } +#elif defined(HAS_ARGBTORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB24Row = ARGBToRGB24Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToRGB24Row(src_argb, dst_rgb24, width); + src_argb += src_stride_argb; + dst_rgb24 += dst_stride_rgb24; + } + return 0; +} + +// Convert ARGB To RAW. +LIBYUV_API +int ARGBToRAW(const uint8* src_argb, int src_stride_argb, + uint8* dst_raw, int dst_stride_raw, + int width, int height) { + int y; + void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) = + ARGBToRAWRow_C; + if (!src_argb || !dst_raw || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && + dst_stride_raw == width * 3) { + width *= height; + height = 1; + src_stride_argb = dst_stride_raw = 0; + } +#if defined(HAS_ARGBTORAWROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToRAWRow = ARGBToRAWRow_SSSE3; + } + } +#elif defined(HAS_ARGBTORAWROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToRAWRow = ARGBToRAWRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToRAWRow = ARGBToRAWRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToRAWRow(src_argb, dst_raw, width); + src_argb += src_stride_argb; + dst_raw += dst_stride_raw; + } + return 0; +} + +// Convert ARGB To RGB565. +LIBYUV_API +int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height) { + int y; + void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = + ARGBToRGB565Row_C; + if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_rgb565 == width * 2) { + width *= height; + height = 1; + src_stride_argb = dst_stride_rgb565 = 0; + } +#if defined(HAS_ARGBTORGB565ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 4 && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToRGB565Row = ARGBToRGB565Row_SSE2; + } + } +#elif defined(HAS_ARGBTORGB565ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565Row = ARGBToRGB565Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToRGB565Row(src_argb, dst_rgb565, width); + src_argb += src_stride_argb; + dst_rgb565 += dst_stride_rgb565; + } + return 0; +} + +// Convert ARGB To ARGB1555. +LIBYUV_API +int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb1555, int dst_stride_argb1555, + int width, int height) { + int y; + void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = + ARGBToARGB1555Row_C; + if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb1555 == width * 2) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb1555 = 0; + } +#if defined(HAS_ARGBTOARGB1555ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 4 && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2; + } + } +#elif defined(HAS_ARGBTOARGB1555ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToARGB1555Row(src_argb, dst_argb1555, width); + src_argb += src_stride_argb; + dst_argb1555 += dst_stride_argb1555; + } + return 0; +} + +// Convert ARGB To ARGB4444. 
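+// ARGB4444 keeps only the high 4 bits of each 8-bit channel, packing a
+// pixel into 16 bits: 4 bits each of A, R, G and B.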
+LIBYUV_API +int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb4444, int dst_stride_argb4444, + int width, int height) { + int y; + void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = + ARGBToARGB4444Row_C; + if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb4444 == width * 2) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb4444 = 0; + } +#if defined(HAS_ARGBTOARGB4444ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 4 && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2; + } + } +#elif defined(HAS_ARGBTOARGB4444ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToARGB4444Row(src_argb, dst_argb4444, width); + src_argb += src_stride_argb; + dst_argb4444 += dst_stride_argb4444; + } + return 0; +} + +// Convert ARGB to J420. (JPeg full range I420). +LIBYUV_API +int ARGBToJ420(const uint8* src_argb, int src_stride_argb, + uint8* dst_yj, int dst_stride_yj, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C; + void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) = + ARGBToYJRow_C; + if (!src_argb || + !dst_yj || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_Unaligned_SSSE3; + ARGBToYJRow = ARGBToYJRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToUVJRow = ARGBToUVJRow_SSSE3; + if (IS_ALIGNED(dst_yj, 16) && IS_ALIGNED(dst_stride_yj, 16)) { + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYJRow = ARGBToYJRow_NEON; + } + if (width >= 16) { + ARGBToUVJRow = ARGBToUVJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_NEON; + } + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width); + ARGBToYJRow(src_argb, dst_yj, width); + ARGBToYJRow(src_argb + src_stride_argb, dst_yj + dst_stride_yj, width); + src_argb += src_stride_argb * 2; + dst_yj += dst_stride_yj * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width); + ARGBToYJRow(src_argb, dst_yj, width); + } + return 0; +} + +// Convert ARGB to J400. +LIBYUV_API +int ARGBToJ400(const uint8* src_argb, int src_stride_argb, + uint8* dst_yj, int dst_stride_yj, + int width, int height) { + int y; + void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) = + ARGBToYJRow_C; + if (!src_argb || !dst_yj || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && + dst_stride_yj == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_yj = 0; + } +#if defined(HAS_ARGBTOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_yj, 16) && IS_ALIGNED(dst_stride_yj, 16)) { + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYJRow = ARGBToYJRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToYJRow(src_argb, dst_yj, width); + src_argb += src_stride_argb; + dst_yj += dst_stride_yj; + } + return 0; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/convert_jpeg.cc b/third_party/libyuv/source/convert_jpeg.cc new file mode 100644 index 000000000..bcb980f7f --- /dev/null +++ b/third_party/libyuv/source/convert_jpeg.cc @@ -0,0 +1,392 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/convert.h" + +#ifdef HAVE_JPEG +#include "libyuv/mjpeg_decoder.h" +#endif + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#ifdef HAVE_JPEG +struct I420Buffers { + uint8* y; + int y_stride; + uint8* u; + int u_stride; + uint8* v; + int v_stride; + int w; + int h; +}; + +static void JpegCopyI420(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + I420Buffers* dest = (I420Buffers*)(opaque); + I420Copy(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->y, dest->y_stride, + dest->u, dest->u_stride, + dest->v, dest->v_stride, + dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +static void JpegI422ToI420(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + I420Buffers* dest = (I420Buffers*)(opaque); + I422ToI420(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->y, dest->y_stride, + dest->u, dest->u_stride, + dest->v, dest->v_stride, + dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +static void JpegI444ToI420(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + I420Buffers* dest = (I420Buffers*)(opaque); + I444ToI420(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->y, dest->y_stride, + dest->u, dest->u_stride, + dest->v, dest->v_stride, + dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +static void JpegI411ToI420(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + I420Buffers* dest = (I420Buffers*)(opaque); + I411ToI420(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->y, dest->y_stride, + dest->u, dest->u_stride, + dest->v, dest->v_stride, + dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +static void JpegI400ToI420(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + I420Buffers* dest = (I420Buffers*)(opaque); + I400ToI420(data[0], strides[0], + dest->y, dest->y_stride, + dest->u, dest->u_stride, + dest->v, dest->v_stride, + dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +// Query size of MJPG in pixels. +LIBYUV_API +int MJPGSize(const uint8* sample, size_t sample_size, + int* width, int* height) { + MJpegDecoder mjpeg_decoder; + LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); + if (ret) { + *width = mjpeg_decoder.GetWidth(); + *height = mjpeg_decoder.GetHeight(); + } + mjpeg_decoder.UnloadFrame(); + return ret ? 0 : -1; // -1 for runtime failure. +} + +// MJPG (Motion JPeg) to I420 +// TODO(fbarchard): review w and h requirement. dw and dh may be enough. +LIBYUV_API +int MJPGToI420(const uint8* sample, + size_t sample_size, + uint8* y, int y_stride, + uint8* u, int u_stride, + uint8* v, int v_stride, + int w, int h, + int dw, int dh) { + if (sample_size == kUnknownDataSize) { + // ERROR: MJPEG frame size unknown + return -1; + } + + // TODO(fbarchard): Port MJpeg to C. 
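+  // The decoder's per-component sampling factors (horizontal x vertical)
+  // select the converter below: luma sampled at 2x2 relative to chroma is
+  // already I420 and is copied; 2x1 takes the I422 path, 1x1 the I444 path,
+  // 4x1 the I411 path, and a single grayscale component the I400 path.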
+ MJpegDecoder mjpeg_decoder; + LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); + if (ret && (mjpeg_decoder.GetWidth() != w || + mjpeg_decoder.GetHeight() != h)) { + // ERROR: MJPEG frame has unexpected dimensions + mjpeg_decoder.UnloadFrame(); + return 1; // runtime failure + } + if (ret) { + I420Buffers bufs = { y, y_stride, u, u_stride, v, v_stride, dw, dh }; + // YUV420 + if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 2 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dw, dh); + // YUV422 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dw, dh); + // YUV444 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dw, dh); + // YUV411 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 4 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToI420, &bufs, dw, dh); + // YUV400 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceGrayscale && + mjpeg_decoder.GetNumComponents() == 1 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh); + } else { + // TODO(fbarchard): Implement conversion for any other colorspace/sample + // factors that occur in practice. 411 is supported by libjpeg + // ERROR: Unable to convert MJPEG frame because format is not supported + mjpeg_decoder.UnloadFrame(); + return 1; + } + } + return ret ? 
0 : 1; +} + +#ifdef HAVE_JPEG +struct ARGBBuffers { + uint8* argb; + int argb_stride; + int w; + int h; +}; + +static void JpegI420ToARGB(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = (ARGBBuffers*)(opaque); + I420ToARGB(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->argb, dest->argb_stride, + dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +static void JpegI422ToARGB(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = (ARGBBuffers*)(opaque); + I422ToARGB(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->argb, dest->argb_stride, + dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +static void JpegI444ToARGB(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = (ARGBBuffers*)(opaque); + I444ToARGB(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->argb, dest->argb_stride, + dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +static void JpegI411ToARGB(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = (ARGBBuffers*)(opaque); + I411ToARGB(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->argb, dest->argb_stride, + dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +static void JpegI400ToARGB(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = (ARGBBuffers*)(opaque); + I400ToARGB(data[0], strides[0], + dest->argb, dest->argb_stride, + dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +// MJPG (Motion JPeg) to ARGB +// TODO(fbarchard): review w and h requirement. dw and dh may be enough. +LIBYUV_API +int MJPGToARGB(const uint8* sample, + size_t sample_size, + uint8* argb, int argb_stride, + int w, int h, + int dw, int dh) { + if (sample_size == kUnknownDataSize) { + // ERROR: MJPEG frame size unknown + return -1; + } + + // TODO(fbarchard): Port MJpeg to C. 
+ MJpegDecoder mjpeg_decoder; + LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); + if (ret && (mjpeg_decoder.GetWidth() != w || + mjpeg_decoder.GetHeight() != h)) { + // ERROR: MJPEG frame has unexpected dimensions + mjpeg_decoder.UnloadFrame(); + return 1; // runtime failure + } + if (ret) { + ARGBBuffers bufs = { argb, argb_stride, dw, dh }; + // YUV420 + if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 2 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dw, dh); + // YUV422 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dw, dh); + // YUV444 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dw, dh); + // YUV411 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 4 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToARGB, &bufs, dw, dh); + // YUV400 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceGrayscale && + mjpeg_decoder.GetNumComponents() == 1 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh); + } else { + // TODO(fbarchard): Implement conversion for any other colorspace/sample + // factors that occur in practice. 411 is supported by libjpeg + // ERROR: Unable to convert MJPEG frame because format is not supported + mjpeg_decoder.UnloadFrame(); + return 1; + } + } + return ret ? 0 : 1; +} +#endif + +#endif + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/convert_to_argb.cc b/third_party/libyuv/source/convert_to_argb.cc new file mode 100644 index 000000000..1b228a7b4 --- /dev/null +++ b/third_party/libyuv/source/convert_to_argb.cc @@ -0,0 +1,327 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_argb.h"
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/format_conversion.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/rotate_argb.h"
+#include "libyuv/row.h"
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert camera sample to ARGB with cropping, rotation and vertical flip.
+// src_width is used for source stride computation.
+// src_height is used to compute location of planes, and indicate inversion.
+// sample_size is measured in bytes and is the size of the frame.
+//   With MJPEG it is the compressed size of the frame.
+LIBYUV_API
+int ConvertToARGB(const uint8* sample, size_t sample_size,
+                  uint8* crop_argb, int argb_stride,
+                  int crop_x, int crop_y,
+                  int src_width, int src_height,
+                  int crop_width, int crop_height,
+                  enum RotationMode rotation,
+                  uint32 fourcc) {
+  uint32 format = CanonicalFourCC(fourcc);
+  int aligned_src_width = (src_width + 1) & ~1;
+  const uint8* src;
+  const uint8* src_uv;
+  int abs_src_height = (src_height < 0) ? -src_height : src_height;
+  int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+  int r = 0;
+
+  // One-pass rotation is available for some formats. For the rest, convert
+  // to ARGB (with optional vertical flipping) into a temporary ARGB buffer,
+  // and then rotate the ARGB to the final destination buffer.
+  // For in-place conversion, if the destination crop_argb is the same as the
+  // source sample, also enable the temporary buffer.
+  LIBYUV_BOOL need_buf = (rotation && format != FOURCC_ARGB) ||
+      crop_argb == sample;
+  uint8* tmp_argb = crop_argb;
+  int tmp_argb_stride = argb_stride;
+  uint8* rotate_buffer = NULL;
+  int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+
+  if (crop_argb == NULL || sample == NULL ||
+      src_width <= 0 || crop_width <= 0 ||
+      src_height == 0 || crop_height == 0) {
+    return -1;
+  }
+  if (src_height < 0) {
+    inv_crop_height = -inv_crop_height;
+  }
+
+  if (need_buf) {
+    int argb_size = crop_width * abs_crop_height * 4;
+    rotate_buffer = (uint8*)malloc(argb_size);
+    if (!rotate_buffer) {
+      return 1;  // Out of memory runtime error.
+ } + crop_argb = rotate_buffer; + argb_stride = crop_width; + } + + switch (format) { + // Single plane formats + case FOURCC_YUY2: + src = sample + (aligned_src_width * crop_y + crop_x) * 2; + r = YUY2ToARGB(src, aligned_src_width * 2, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_UYVY: + src = sample + (aligned_src_width * crop_y + crop_x) * 2; + r = UYVYToARGB(src, aligned_src_width * 2, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_24BG: + src = sample + (src_width * crop_y + crop_x) * 3; + r = RGB24ToARGB(src, src_width * 3, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_RAW: + src = sample + (src_width * crop_y + crop_x) * 3; + r = RAWToARGB(src, src_width * 3, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_ARGB: + src = sample + (src_width * crop_y + crop_x) * 4; + r = ARGBToARGB(src, src_width * 4, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_BGRA: + src = sample + (src_width * crop_y + crop_x) * 4; + r = BGRAToARGB(src, src_width * 4, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_ABGR: + src = sample + (src_width * crop_y + crop_x) * 4; + r = ABGRToARGB(src, src_width * 4, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_RGBA: + src = sample + (src_width * crop_y + crop_x) * 4; + r = RGBAToARGB(src, src_width * 4, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_RGBP: + src = sample + (src_width * crop_y + crop_x) * 2; + r = RGB565ToARGB(src, src_width * 2, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_RGBO: + src = sample + (src_width * crop_y + crop_x) * 2; + r = ARGB1555ToARGB(src, src_width * 2, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_R444: + src = sample + (src_width * crop_y + crop_x) * 2; + r = ARGB4444ToARGB(src, src_width * 2, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + // TODO(fbarchard): Support cropping Bayer by odd numbers + // by adjusting fourcc. + case FOURCC_BGGR: + src = sample + (src_width * crop_y + crop_x); + r = BayerBGGRToARGB(src, src_width, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + + case FOURCC_GBRG: + src = sample + (src_width * crop_y + crop_x); + r = BayerGBRGToARGB(src, src_width, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + + case FOURCC_GRBG: + src = sample + (src_width * crop_y + crop_x); + r = BayerGRBGToARGB(src, src_width, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + + case FOURCC_RGGB: + src = sample + (src_width * crop_y + crop_x); + r = BayerRGGBToARGB(src, src_width, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + + case FOURCC_I400: + src = sample + src_width * crop_y + crop_x; + r = I400ToARGB(src, src_width, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + + // Biplanar formats + case FOURCC_NV12: + src = sample + (src_width * crop_y + crop_x); + src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x; + r = NV12ToARGB(src, src_width, + src_uv, aligned_src_width, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_NV21: + src = sample + (src_width * crop_y + crop_x); + src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x; + // Call NV12 but with u and v parameters swapped. 
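+      // (A dedicated NV21ToARGB entry point exists for ARGB output, so the
+      // VU ordering is handled inside the call below rather than by swapping
+      // the chroma arguments at this call site.)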
+ r = NV21ToARGB(src, src_width, + src_uv, aligned_src_width, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_M420: + src = sample + (src_width * crop_y) * 12 / 8 + crop_x; + r = M420ToARGB(src, src_width, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; +// case FOURCC_Q420: +// src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x; +// src_uv = sample + (src_width + aligned_src_width * 2) * crop_y + +// src_width + crop_x * 2; +// r = Q420ToARGB(src, src_width * 3, +// src_uv, src_width * 3, +// crop_argb, argb_stride, +// crop_width, inv_crop_height); +// break; + // Triplanar formats + case FOURCC_I420: + case FOURCC_YU12: + case FOURCC_YV12: { + const uint8* src_y = sample + (src_width * crop_y + crop_x); + const uint8* src_u; + const uint8* src_v; + int halfwidth = (src_width + 1) / 2; + int halfheight = (abs_src_height + 1) / 2; + if (format == FOURCC_YV12) { + src_v = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_u = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + } else { + src_u = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_v = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + } + r = I420ToARGB(src_y, src_width, + src_u, halfwidth, + src_v, halfwidth, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + } + case FOURCC_I422: + case FOURCC_YV16: { + const uint8* src_y = sample + src_width * crop_y + crop_x; + const uint8* src_u; + const uint8* src_v; + int halfwidth = (src_width + 1) / 2; + if (format == FOURCC_YV16) { + src_v = sample + src_width * abs_src_height + + halfwidth * crop_y + crop_x / 2; + src_u = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + } else { + src_u = sample + src_width * abs_src_height + + halfwidth * crop_y + crop_x / 2; + src_v = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + } + r = I422ToARGB(src_y, src_width, + src_u, halfwidth, + src_v, halfwidth, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + } + case FOURCC_I444: + case FOURCC_YV24: { + const uint8* src_y = sample + src_width * crop_y + crop_x; + const uint8* src_u; + const uint8* src_v; + if (format == FOURCC_YV24) { + src_v = sample + src_width * (abs_src_height + crop_y) + crop_x; + src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; + } else { + src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; + src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; + } + r = I444ToARGB(src_y, src_width, + src_u, src_width, + src_v, src_width, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + } + case FOURCC_I411: { + int quarterwidth = (src_width + 3) / 4; + const uint8* src_y = sample + src_width * crop_y + crop_x; + const uint8* src_u = sample + src_width * abs_src_height + + quarterwidth * crop_y + crop_x / 4; + const uint8* src_v = sample + src_width * abs_src_height + + quarterwidth * (abs_src_height + crop_y) + crop_x / 4; + r = I411ToARGB(src_y, src_width, + src_u, quarterwidth, + src_v, quarterwidth, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + } +#ifdef HAVE_JPEG + case FOURCC_MJPG: + r = MJPGToARGB(sample, sample_size, + crop_argb, argb_stride, + src_width, abs_src_height, crop_width, inv_crop_height); + break; +#endif + default: + r = -1; // unknown fourcc 
- return failure code.
+  }
+
+  if (need_buf) {
+    if (!r) {
+      r = ARGBRotate(crop_argb, argb_stride,
+                     tmp_argb, tmp_argb_stride,
+                     crop_width, abs_crop_height, rotation);
+    }
+    free(rotate_buffer);
+  }
+
+  return r;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/convert_to_i420.cc b/third_party/libyuv/source/convert_to_i420.cc
new file mode 100644
index 000000000..7b194fff7
--- /dev/null
+++ b/third_party/libyuv/source/convert_to_i420.cc
@@ -0,0 +1,383 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "libyuv/convert.h"
+
+#include "libyuv/format_conversion.h"
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert camera sample to I420 with cropping, rotation and vertical flip.
+// src_width is used for source stride computation.
+// src_height is used to compute location of planes, and indicate inversion.
+// sample_size is measured in bytes and is the size of the frame.
+//   With MJPEG it is the compressed size of the frame.
+LIBYUV_API
+int ConvertToI420(const uint8* sample,
+                  size_t sample_size,
+                  uint8* y, int y_stride,
+                  uint8* u, int u_stride,
+                  uint8* v, int v_stride,
+                  int crop_x, int crop_y,
+                  int src_width, int src_height,
+                  int crop_width, int crop_height,
+                  enum RotationMode rotation,
+                  uint32 fourcc) {
+  uint32 format = CanonicalFourCC(fourcc);
+  int aligned_src_width = (src_width + 1) & ~1;
+  const uint8* src;
+  const uint8* src_uv;
+  int abs_src_height = (src_height < 0) ? -src_height : src_height;
+  int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+  int r = 0;
+  LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 &&
+      format != FOURCC_NV12 && format != FOURCC_NV21 &&
+      format != FOURCC_YU12 && format != FOURCC_YV12) || y == sample;
+  uint8* tmp_y = y;
+  uint8* tmp_u = u;
+  uint8* tmp_v = v;
+  int tmp_y_stride = y_stride;
+  int tmp_u_stride = u_stride;
+  int tmp_v_stride = v_stride;
+  uint8* rotate_buffer = NULL;
+  int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+
+  if (!y || !u || !v || !sample ||
+      src_width <= 0 || crop_width <= 0 ||
+      src_height == 0 || crop_height == 0) {
+    return -1;
+  }
+  if (src_height < 0) {
+    inv_crop_height = -inv_crop_height;
+  }
+
+  // One-pass rotation is available for some formats. For the rest, convert
+  // to I420 (with optional vertical flipping) into a temporary I420 buffer,
+  // and then rotate the I420 to the final destination buffer.
+  // For in-place conversion, if the destination y is the same as the source
+  // sample, also enable the temporary buffer.
+  if (need_buf) {
+    int y_size = crop_width * abs_crop_height;
+    int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2);
+    rotate_buffer = (uint8*)malloc(y_size + uv_size * 2);
+    if (!rotate_buffer) {
+      return 1;  // Out of memory runtime error.
+ } + y = rotate_buffer; + u = y + y_size; + v = u + uv_size; + y_stride = crop_width; + u_stride = v_stride = ((crop_width + 1) / 2); + } + + switch (format) { + // Single plane formats + case FOURCC_YUY2: + src = sample + (aligned_src_width * crop_y + crop_x) * 2; + r = YUY2ToI420(src, aligned_src_width * 2, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_UYVY: + src = sample + (aligned_src_width * crop_y + crop_x) * 2; + r = UYVYToI420(src, aligned_src_width * 2, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_RGBP: + src = sample + (src_width * crop_y + crop_x) * 2; + r = RGB565ToI420(src, src_width * 2, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_RGBO: + src = sample + (src_width * crop_y + crop_x) * 2; + r = ARGB1555ToI420(src, src_width * 2, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_R444: + src = sample + (src_width * crop_y + crop_x) * 2; + r = ARGB4444ToI420(src, src_width * 2, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_24BG: + src = sample + (src_width * crop_y + crop_x) * 3; + r = RGB24ToI420(src, src_width * 3, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_RAW: + src = sample + (src_width * crop_y + crop_x) * 3; + r = RAWToI420(src, src_width * 3, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_ARGB: + src = sample + (src_width * crop_y + crop_x) * 4; + r = ARGBToI420(src, src_width * 4, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_BGRA: + src = sample + (src_width * crop_y + crop_x) * 4; + r = BGRAToI420(src, src_width * 4, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_ABGR: + src = sample + (src_width * crop_y + crop_x) * 4; + r = ABGRToI420(src, src_width * 4, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_RGBA: + src = sample + (src_width * crop_y + crop_x) * 4; + r = RGBAToI420(src, src_width * 4, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + // TODO(fbarchard): Support cropping Bayer by odd numbers + // by adjusting fourcc. 
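+    // (An odd crop_x or crop_y shifts the 2x2 mosaic phase, so the fourcc
+    // would have to be remapped to the neighbouring pattern; e.g. BGGR
+    // cropped one pixel to the right reads back as GBRG.)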
+ case FOURCC_BGGR: + src = sample + (src_width * crop_y + crop_x); + r = BayerBGGRToI420(src, src_width, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_GBRG: + src = sample + (src_width * crop_y + crop_x); + r = BayerGBRGToI420(src, src_width, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_GRBG: + src = sample + (src_width * crop_y + crop_x); + r = BayerGRBGToI420(src, src_width, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_RGGB: + src = sample + (src_width * crop_y + crop_x); + r = BayerRGGBToI420(src, src_width, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_I400: + src = sample + src_width * crop_y + crop_x; + r = I400ToI420(src, src_width, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + // Biplanar formats + case FOURCC_NV12: + src = sample + (src_width * crop_y + crop_x); + src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x; + r = NV12ToI420Rotate(src, src_width, + src_uv, aligned_src_width, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height, rotation); + break; + case FOURCC_NV21: + src = sample + (src_width * crop_y + crop_x); + src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x; + // Call NV12 but with u and v parameters swapped. + r = NV12ToI420Rotate(src, src_width, + src_uv, aligned_src_width, + y, y_stride, + v, v_stride, + u, u_stride, + crop_width, inv_crop_height, rotation); + break; + case FOURCC_M420: + src = sample + (src_width * crop_y) * 12 / 8 + crop_x; + r = M420ToI420(src, src_width, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_Q420: + src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x; + src_uv = sample + (src_width + aligned_src_width * 2) * crop_y + + src_width + crop_x * 2; + r = Q420ToI420(src, src_width * 3, + src_uv, src_width * 3, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + // Triplanar formats + case FOURCC_I420: + case FOURCC_YU12: + case FOURCC_YV12: { + const uint8* src_y = sample + (src_width * crop_y + crop_x); + const uint8* src_u; + const uint8* src_v; + int halfwidth = (src_width + 1) / 2; + int halfheight = (abs_src_height + 1) / 2; + if (format == FOURCC_YV12) { + src_v = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_u = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + } else { + src_u = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_v = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + } + r = I420Rotate(src_y, src_width, + src_u, halfwidth, + src_v, halfwidth, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height, rotation); + break; + } + case FOURCC_I422: + case FOURCC_YV16: { + const uint8* src_y = sample + src_width * crop_y + crop_x; + const uint8* src_u; + const uint8* src_v; + int halfwidth = (src_width + 1) / 2; + if (format == FOURCC_YV16) { + src_v = sample + src_width * abs_src_height + + halfwidth * crop_y + crop_x / 2; + src_u = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + } else { + src_u = sample + src_width * abs_src_height + + halfwidth * crop_y + crop_x / 2; + src_v = 
sample + src_width * abs_src_height +
+            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+      }
+      r = I422ToI420(src_y, src_width,
+                     src_u, halfwidth,
+                     src_v, halfwidth,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+    case FOURCC_I444:
+    case FOURCC_YV24: {
+      const uint8* src_y = sample + src_width * crop_y + crop_x;
+      const uint8* src_u;
+      const uint8* src_v;
+      if (format == FOURCC_YV24) {
+        src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
+        src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+      } else {
+        src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+        src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+      }
+      r = I444ToI420(src_y, src_width,
+                     src_u, src_width,
+                     src_v, src_width,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+    case FOURCC_I411: {
+      int quarterwidth = (src_width + 3) / 4;
+      const uint8* src_y = sample + src_width * crop_y + crop_x;
+      const uint8* src_u = sample + src_width * abs_src_height +
+          quarterwidth * crop_y + crop_x / 4;
+      const uint8* src_v = sample + src_width * abs_src_height +
+          quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
+      r = I411ToI420(src_y, src_width,
+                     src_u, quarterwidth,
+                     src_v, quarterwidth,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+#ifdef HAVE_JPEG
+    case FOURCC_MJPG:
+      r = MJPGToI420(sample, sample_size,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     src_width, abs_src_height, crop_width, inv_crop_height);
+      break;
+#endif
+    default:
+      r = -1;  // unknown fourcc - return failure code.
+  }
+
+  if (need_buf) {
+    if (!r) {
+      r = I420Rotate(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     tmp_y, tmp_y_stride,
+                     tmp_u, tmp_u_stride,
+                     tmp_v, tmp_v_stride,
+                     crop_width, abs_crop_height, rotation);
+    }
+    free(rotate_buffer);
+  }
+
+  return r;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/cpu_id.cc b/third_party/libyuv/source/cpu_id.cc
index 520cfe510..2e0d61d20 100644
--- a/third_party/libyuv/source/cpu_id.cc
+++ b/third_party/libyuv/source/cpu_id.cc
@@ -8,9 +8,9 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "third_party/libyuv/include/libyuv/cpu_id.h"
+#include "libyuv/cpu_id.h"
 
-#ifdef _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
 #include <intrin.h>  // For __cpuidex()
 #endif
 #if !defined(__pnacl__) && !defined(__CLR_VER) && \
@@ -27,7 +27,7 @@
 #include <stdio.h>
 #include <string.h>
 
-#include "third_party/libyuv/include/libyuv/basic_types.h"  // For CPU_X86
+#include "libyuv/basic_types.h"  // For CPU_X86
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -48,7 +48,7 @@ extern "C" {
     defined(__i386__) || defined(__x86_64__))
 LIBYUV_API
 void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) && !defined(__clang__)
 #if (_MSC_FULL_VER >= 160040219)
   __cpuidex((int*)(cpu_info), info_eax, info_ecx);
 #elif defined(_M_IX86)
@@ -188,10 +188,14 @@ LIBYUV_API SAFEBUFFERS
 int InitCpuFlags(void) {
 #if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86)
+  uint32 cpu_info0[4] = { 0, 0, 0, 0 };
   uint32 cpu_info1[4] = { 0, 0, 0, 0 };
   uint32 cpu_info7[4] = { 0, 0, 0, 0 };
+  CpuId(0, 0, cpu_info0);
   CpuId(1, 0, cpu_info1);
-  CpuId(7, 0, cpu_info7);
+  if (cpu_info0[0] >= 7) {
+    CpuId(7, 0, cpu_info7);
+  }
   cpu_info_ = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
               ((cpu_info1[2] & 0x00000200) ?
kCpuHasSSSE3 : 0) | ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) | @@ -199,6 +203,7 @@ int InitCpuFlags(void) { ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) | ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) | kCpuHasX86; + #ifdef HAS_XGETBV if ((cpu_info1[2] & 0x18000000) == 0x18000000 && // AVX and OSSave TestOsSaveYmm()) { // Saves YMM. diff --git a/third_party/libyuv/source/format_conversion.cc b/third_party/libyuv/source/format_conversion.cc new file mode 100644 index 000000000..a3daf96a9 --- /dev/null +++ b/third_party/libyuv/source/format_conversion.cc @@ -0,0 +1,552 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/format_conversion.h" + +#include "libyuv/basic_types.h" +#include "libyuv/cpu_id.h" +#include "libyuv/video_common.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// generate a selector mask useful for pshufb +static uint32 GenerateSelector(int select0, int select1) { + return (uint32)(select0) | + (uint32)((select1 + 4) << 8) | + (uint32)((select0 + 8) << 16) | + (uint32)((select1 + 12) << 24); +} + +static int MakeSelectors(const int blue_index, + const int green_index, + const int red_index, + uint32 dst_fourcc_bayer, + uint32* index_map) { + // Now build a lookup table containing the indices for the four pixels in each + // 2x2 Bayer grid. + switch (dst_fourcc_bayer) { + case FOURCC_BGGR: + index_map[0] = GenerateSelector(blue_index, green_index); + index_map[1] = GenerateSelector(green_index, red_index); + break; + case FOURCC_GBRG: + index_map[0] = GenerateSelector(green_index, blue_index); + index_map[1] = GenerateSelector(red_index, green_index); + break; + case FOURCC_RGGB: + index_map[0] = GenerateSelector(red_index, green_index); + index_map[1] = GenerateSelector(green_index, blue_index); + break; + case FOURCC_GRBG: + index_map[0] = GenerateSelector(green_index, red_index); + index_map[1] = GenerateSelector(blue_index, green_index); + break; + default: + return -1; // Bad FourCC + } + return 0; +} + +// Converts 32 bit ARGB to Bayer RGB formats. 
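+// For example (values follow from GenerateSelector above): with the ARGB
+// byte offsets B=0, G=1, R=2, a BGGR target uses
+//   index_map[0] = GenerateSelector(0, 1);  // 0x0d080500: B,G,B,G row
+//   index_map[1] = GenerateSelector(1, 2);  // 0x0e090601: G,R,G,R row
+// where each selector byte picks one channel from each of four ARGB pixels.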
+LIBYUV_API +int ARGBToBayer(const uint8* src_argb, int src_stride_argb, + uint8* dst_bayer, int dst_stride_bayer, + int width, int height, + uint32 dst_fourcc_bayer) { + int y; + const int blue_index = 0; // Offsets for ARGB format + const int green_index = 1; + const int red_index = 2; + uint32 index_map[2]; + void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix) = ARGBToBayerRow_C; + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOBAYERROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + ARGBToBayerRow = ARGBToBayerRow_SSSE3; + } + } +#elif defined(HAS_ARGBTOBAYERROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToBayerRow = ARGBToBayerRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToBayerRow = ARGBToBayerRow_NEON; + } + } +#endif + if (MakeSelectors(blue_index, green_index, red_index, + dst_fourcc_bayer, index_map)) { + return -1; // Bad FourCC + } + + for (y = 0; y < height; ++y) { + ARGBToBayerRow(src_argb, dst_bayer, index_map[y & 1], width); + src_argb += src_stride_argb; + dst_bayer += dst_stride_bayer; + } + return 0; +} + +#define AVG(a, b) (((a) + (b)) >> 1) + +static void BayerRowBG(const uint8* src_bayer0, int src_stride_bayer, + uint8* dst_argb, int pix) { + const uint8* src_bayer1 = src_bayer0 + src_stride_bayer; + uint8 g = src_bayer0[1]; + uint8 r = src_bayer1[1]; + int x; + for (x = 0; x < pix - 2; x += 2) { + dst_argb[0] = src_bayer0[0]; + dst_argb[1] = AVG(g, src_bayer0[1]); + dst_argb[2] = AVG(r, src_bayer1[1]); + dst_argb[3] = 255U; + dst_argb[4] = AVG(src_bayer0[0], src_bayer0[2]); + dst_argb[5] = src_bayer0[1]; + dst_argb[6] = src_bayer1[1]; + dst_argb[7] = 255U; + g = src_bayer0[1]; + r = src_bayer1[1]; + src_bayer0 += 2; + src_bayer1 += 2; + dst_argb += 8; + } + dst_argb[0] = src_bayer0[0]; + dst_argb[1] = AVG(g, src_bayer0[1]); + dst_argb[2] = AVG(r, src_bayer1[1]); + dst_argb[3] = 255U; + if (!(pix & 1)) { + dst_argb[4] = src_bayer0[0]; + dst_argb[5] = src_bayer0[1]; + dst_argb[6] = src_bayer1[1]; + dst_argb[7] = 255U; + } +} + +static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer, + uint8* dst_argb, int pix) { + const uint8* src_bayer1 = src_bayer0 + src_stride_bayer; + uint8 g = src_bayer0[1]; + uint8 b = src_bayer1[1]; + int x; + for (x = 0; x < pix - 2; x += 2) { + dst_argb[0] = AVG(b, src_bayer1[1]); + dst_argb[1] = AVG(g, src_bayer0[1]); + dst_argb[2] = src_bayer0[0]; + dst_argb[3] = 255U; + dst_argb[4] = src_bayer1[1]; + dst_argb[5] = src_bayer0[1]; + dst_argb[6] = AVG(src_bayer0[0], src_bayer0[2]); + dst_argb[7] = 255U; + g = src_bayer0[1]; + b = src_bayer1[1]; + src_bayer0 += 2; + src_bayer1 += 2; + dst_argb += 8; + } + dst_argb[0] = AVG(b, src_bayer1[1]); + dst_argb[1] = AVG(g, src_bayer0[1]); + dst_argb[2] = src_bayer0[0]; + dst_argb[3] = 255U; + if (!(pix & 1)) { + dst_argb[4] = src_bayer1[1]; + dst_argb[5] = src_bayer0[1]; + dst_argb[6] = src_bayer0[0]; + dst_argb[7] = 255U; + } +} + +static void BayerRowGB(const uint8* src_bayer0, int src_stride_bayer, + uint8* dst_argb, int pix) { + const uint8* src_bayer1 = src_bayer0 + src_stride_bayer; + uint8 b = src_bayer0[1]; + int x; + for (x = 0; x < pix - 2; x += 2) { + dst_argb[0] = AVG(b, src_bayer0[1]); + dst_argb[1] = src_bayer0[0]; + dst_argb[2] = src_bayer1[0]; + 
dst_argb[3] = 255U; + dst_argb[4] = src_bayer0[1]; + dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]); + dst_argb[6] = AVG(src_bayer1[0], src_bayer1[2]); + dst_argb[7] = 255U; + b = src_bayer0[1]; + src_bayer0 += 2; + src_bayer1 += 2; + dst_argb += 8; + } + dst_argb[0] = AVG(b, src_bayer0[1]); + dst_argb[1] = src_bayer0[0]; + dst_argb[2] = src_bayer1[0]; + dst_argb[3] = 255U; + if (!(pix & 1)) { + dst_argb[4] = src_bayer0[1]; + dst_argb[5] = src_bayer0[0]; + dst_argb[6] = src_bayer1[0]; + dst_argb[7] = 255U; + } +} + +static void BayerRowGR(const uint8* src_bayer0, int src_stride_bayer, + uint8* dst_argb, int pix) { + const uint8* src_bayer1 = src_bayer0 + src_stride_bayer; + uint8 r = src_bayer0[1]; + int x; + for (x = 0; x < pix - 2; x += 2) { + dst_argb[0] = src_bayer1[0]; + dst_argb[1] = src_bayer0[0]; + dst_argb[2] = AVG(r, src_bayer0[1]); + dst_argb[3] = 255U; + dst_argb[4] = AVG(src_bayer1[0], src_bayer1[2]); + dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]); + dst_argb[6] = src_bayer0[1]; + dst_argb[7] = 255U; + r = src_bayer0[1]; + src_bayer0 += 2; + src_bayer1 += 2; + dst_argb += 8; + } + dst_argb[0] = src_bayer1[0]; + dst_argb[1] = src_bayer0[0]; + dst_argb[2] = AVG(r, src_bayer0[1]); + dst_argb[3] = 255U; + if (!(pix & 1)) { + dst_argb[4] = src_bayer1[0]; + dst_argb[5] = src_bayer0[0]; + dst_argb[6] = src_bayer0[1]; + dst_argb[7] = 255U; + } +} + +// Converts any Bayer RGB format to ARGB. +LIBYUV_API +int BayerToARGB(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_argb, int dst_stride_argb, + int width, int height, + uint32 src_fourcc_bayer) { + int y; + void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_argb, int pix); + void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_argb, int pix); + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + switch (src_fourcc_bayer) { + case FOURCC_BGGR: + BayerRow0 = BayerRowBG; + BayerRow1 = BayerRowGR; + break; + case FOURCC_GBRG: + BayerRow0 = BayerRowGB; + BayerRow1 = BayerRowRG; + break; + case FOURCC_GRBG: + BayerRow0 = BayerRowGR; + BayerRow1 = BayerRowBG; + break; + case FOURCC_RGGB: + BayerRow0 = BayerRowRG; + BayerRow1 = BayerRowGB; + break; + default: + return -1; // Bad FourCC + } + + for (y = 0; y < height - 1; y += 2) { + BayerRow0(src_bayer, src_stride_bayer, dst_argb, width); + BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer, + dst_argb + dst_stride_argb, width); + src_bayer += src_stride_bayer * 2; + dst_argb += dst_stride_argb * 2; + } + if (height & 1) { + BayerRow0(src_bayer, src_stride_bayer, dst_argb, width); + } + return 0; +} + +// Converts any Bayer RGB format to ARGB. +LIBYUV_API +int BayerToI420(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height, + uint32 src_fourcc_bayer) { + void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_argb, int pix); + void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_argb, int pix); + + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + // Negative height means invert the image. 
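+  // (Negative height is the libyuv convention for bottom-up buffers: the
+  // destination pointers are moved to the last row of each plane and the
+  // strides are negated, so rows are written in reverse order.)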
+ if (height < 0) { + int halfheight; + height = -height; + halfheight = (height + 1) >> 1; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_u = dst_u + (halfheight - 1) * dst_stride_u; + dst_v = dst_v + (halfheight - 1) * dst_stride_v; + dst_stride_y = -dst_stride_y; + dst_stride_u = -dst_stride_u; + dst_stride_v = -dst_stride_v; + } +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + ARGBToUVRow = ARGBToUVRow_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } +#elif defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + if (width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } + } +#endif + + switch (src_fourcc_bayer) { + case FOURCC_BGGR: + BayerRow0 = BayerRowBG; + BayerRow1 = BayerRowGR; + break; + case FOURCC_GBRG: + BayerRow0 = BayerRowGB; + BayerRow1 = BayerRowRG; + break; + case FOURCC_GRBG: + BayerRow0 = BayerRowGR; + BayerRow1 = BayerRowBG; + break; + case FOURCC_RGGB: + BayerRow0 = BayerRowRG; + BayerRow1 = BayerRowGB; + break; + default: + return -1; // Bad FourCC + } + + { + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); + int y; + for (y = 0; y < height - 1; y += 2) { + BayerRow0(src_bayer, src_stride_bayer, row, width); + BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer, + row + kRowSize, width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); + src_bayer += src_stride_bayer * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + BayerRow0(src_bayer, src_stride_bayer, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + } + free_aligned_buffer_64(row); + } + return 0; +} + +// Convert I420 to Bayer. +LIBYUV_API +int I420ToBayer(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_bayer, int dst_stride_bayer, + int width, int height, + uint32 dst_fourcc_bayer) { + void (*I422ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToARGBRow_C; + void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix) = ARGBToBayerRow_C; + const int blue_index = 0; // Offsets for ARGB format + const int green_index = 1; + const int red_index = 2; + uint32 index_map[2]; + // Negative height means invert the image. 
+ if (height < 0) { + int halfheight; + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 16) { + I422ToARGBRow = I422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) { + I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + } +#endif + +#if defined(HAS_ARGBTOBAYERROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + ARGBToBayerRow = ARGBToBayerRow_SSSE3; + } + } +#elif defined(HAS_ARGBTOBAYERROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToBayerRow = ARGBToBayerRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToBayerRow = ARGBToBayerRow_NEON; + } + } +#endif + + if (MakeSelectors(blue_index, green_index, red_index, + dst_fourcc_bayer, index_map)) { + return -1; // Bad FourCC + } + { + // Allocate a row of ARGB. 
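+    // (A single scratch row suffices: each row is expanded to ARGB and
+    // immediately reduced to Bayer, with index_map[y & 1] alternating the
+    // selector to track the 2x2 mosaic phase.)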
+    align_buffer_64(row, width * 4);
+    int y;
+    for (y = 0; y < height; ++y) {
+      I422ToARGBRow(src_y, src_u, src_v, row, width);
+      ARGBToBayerRow(row, dst_bayer, index_map[y & 1], width);
+      dst_bayer += dst_stride_bayer;
+      src_y += src_stride_y;
+      if (y & 1) {
+        src_u += src_stride_u;
+        src_v += src_stride_v;
+      }
+    }
+    free_aligned_buffer_64(row);
+  }
+  return 0;
+}
+
+#define MAKEBAYERFOURCC(BAYER) \
+LIBYUV_API \
+int Bayer##BAYER##ToI420(const uint8* src_bayer, int src_stride_bayer, \
+                         uint8* dst_y, int dst_stride_y, \
+                         uint8* dst_u, int dst_stride_u, \
+                         uint8* dst_v, int dst_stride_v, \
+                         int width, int height) { \
+  return BayerToI420(src_bayer, src_stride_bayer, \
+                     dst_y, dst_stride_y, \
+                     dst_u, dst_stride_u, \
+                     dst_v, dst_stride_v, \
+                     width, height, \
+                     FOURCC_##BAYER); \
+} \
+ \
+LIBYUV_API \
+int I420ToBayer##BAYER(const uint8* src_y, int src_stride_y, \
+                       const uint8* src_u, int src_stride_u, \
+                       const uint8* src_v, int src_stride_v, \
+                       uint8* dst_bayer, int dst_stride_bayer, \
+                       int width, int height) { \
+  return I420ToBayer(src_y, src_stride_y, \
+                     src_u, src_stride_u, \
+                     src_v, src_stride_v, \
+                     dst_bayer, dst_stride_bayer, \
+                     width, height, \
+                     FOURCC_##BAYER); \
+} \
+ \
+LIBYUV_API \
+int ARGBToBayer##BAYER(const uint8* src_argb, int src_stride_argb, \
+                       uint8* dst_bayer, int dst_stride_bayer, \
+                       int width, int height) { \
+  return ARGBToBayer(src_argb, src_stride_argb, \
+                     dst_bayer, dst_stride_bayer, \
+                     width, height, \
+                     FOURCC_##BAYER); \
+} \
+ \
+LIBYUV_API \
+int Bayer##BAYER##ToARGB(const uint8* src_bayer, int src_stride_bayer, \
+                         uint8* dst_argb, int dst_stride_argb, \
+                         int width, int height) { \
+  return BayerToARGB(src_bayer, src_stride_bayer, \
+                     dst_argb, dst_stride_argb, \
+                     width, height, \
+                     FOURCC_##BAYER); \
+}
+
+MAKEBAYERFOURCC(BGGR)
+MAKEBAYERFOURCC(GBRG)
+MAKEBAYERFOURCC(GRBG)
+MAKEBAYERFOURCC(RGGB)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/mjpeg_decoder.cc b/third_party/libyuv/source/mjpeg_decoder.cc
new file mode 100644
index 000000000..15b0ed88a
--- /dev/null
+++ b/third_party/libyuv/source/mjpeg_decoder.cc
@@ -0,0 +1,566 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/mjpeg_decoder.h"
+
+#ifdef HAVE_JPEG
+#include <assert.h>
+
+#if !defined(__pnacl__) && !defined(__CLR_VER) && !defined(COVERAGE_ENABLED) &&\
+    !defined(TARGET_IPHONE_SIMULATOR)
+// Must be included before jpeglib.
+#include <setjmp.h>
+#define HAVE_SETJMP
+#endif
+struct FILE;  // For jpeglib.h.
+
+// C++ build requires extern C for jpeg internals.
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <jpeglib.h>
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#include "libyuv/planar_functions.h"  // For CopyPlane().
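+
+// Typical decode sequence for this class (illustrative sketch only; plane
+// allocation and error handling elided, names are placeholders):
+//   MJpegDecoder decoder;
+//   if (decoder.LoadFrame(jpg_data, jpg_size)) {
+//     int w = decoder.GetWidth();
+//     int h = decoder.GetHeight();
+//     decoder.DecodeToBuffers(planes, w, h);  // or DecodeToCallback().
+//   }
+//   decoder.UnloadFrame();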
+ +namespace libyuv { + +#ifdef HAVE_SETJMP +struct SetJmpErrorMgr { + jpeg_error_mgr base; // Must be at the top + jmp_buf setjmp_buffer; +}; +#endif + +const int MJpegDecoder::kColorSpaceUnknown = JCS_UNKNOWN; +const int MJpegDecoder::kColorSpaceGrayscale = JCS_GRAYSCALE; +const int MJpegDecoder::kColorSpaceRgb = JCS_RGB; +const int MJpegDecoder::kColorSpaceYCbCr = JCS_YCbCr; +const int MJpegDecoder::kColorSpaceCMYK = JCS_CMYK; +const int MJpegDecoder::kColorSpaceYCCK = JCS_YCCK; + +// Methods that are passed to jpeglib. +boolean fill_input_buffer(jpeg_decompress_struct* cinfo); +void init_source(jpeg_decompress_struct* cinfo); +void skip_input_data(jpeg_decompress_struct* cinfo, + long num_bytes); // NOLINT +void term_source(jpeg_decompress_struct* cinfo); +void ErrorHandler(jpeg_common_struct* cinfo); + +MJpegDecoder::MJpegDecoder() + : has_scanline_padding_(LIBYUV_FALSE), + num_outbufs_(0), + scanlines_(NULL), + scanlines_sizes_(NULL), + databuf_(NULL), + databuf_strides_(NULL) { + decompress_struct_ = new jpeg_decompress_struct; + source_mgr_ = new jpeg_source_mgr; +#ifdef HAVE_SETJMP + error_mgr_ = new SetJmpErrorMgr; + decompress_struct_->err = jpeg_std_error(&error_mgr_->base); + // Override standard exit()-based error handler. + error_mgr_->base.error_exit = &ErrorHandler; +#endif + decompress_struct_->client_data = NULL; + source_mgr_->init_source = &init_source; + source_mgr_->fill_input_buffer = &fill_input_buffer; + source_mgr_->skip_input_data = &skip_input_data; + source_mgr_->resync_to_restart = &jpeg_resync_to_restart; + source_mgr_->term_source = &term_source; + jpeg_create_decompress(decompress_struct_); + decompress_struct_->src = source_mgr_; + buf_vec_.buffers = &buf_; + buf_vec_.len = 1; +} + +MJpegDecoder::~MJpegDecoder() { + jpeg_destroy_decompress(decompress_struct_); + delete decompress_struct_; + delete source_mgr_; +#ifdef HAVE_SETJMP + delete error_mgr_; +#endif + DestroyOutputBuffers(); +} + +LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) { + if (!ValidateJpeg(src, src_len)) { + return LIBYUV_FALSE; + } + + buf_.data = src; + buf_.len = (int)(src_len); + buf_vec_.pos = 0; + decompress_struct_->client_data = &buf_vec_; +#ifdef HAVE_SETJMP + if (setjmp(error_mgr_->setjmp_buffer)) { + // We called jpeg_read_header, it experienced an error, and we called + // longjmp() and rewound the stack to here. Return error. + return LIBYUV_FALSE; + } +#endif + if (jpeg_read_header(decompress_struct_, TRUE) != JPEG_HEADER_OK) { + // ERROR: Bad MJPEG header + return LIBYUV_FALSE; + } + AllocOutputBuffers(GetNumComponents()); + for (int i = 0; i < num_outbufs_; ++i) { + int scanlines_size = GetComponentScanlinesPerImcuRow(i); + if (scanlines_sizes_[i] != scanlines_size) { + if (scanlines_[i]) { + delete scanlines_[i]; + } + scanlines_[i] = new uint8* [scanlines_size]; + scanlines_sizes_[i] = scanlines_size; + } + + // We allocate padding for the final scanline to pad it up to DCTSIZE bytes + // to avoid memory errors, since jpeglib only reads full MCUs blocks. For + // the preceding scanlines, the padding is not needed/wanted because the + // following addresses will already be valid (they are the initial bytes of + // the next scanline) and will be overwritten when jpeglib writes out that + // next scanline. 
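+    // For example, with DCTSIZE 8 a 100-pixel-wide component is given a
+    // 104-byte stride by GetComponentStride() below.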
+ int databuf_stride = GetComponentStride(i); + int databuf_size = scanlines_size * databuf_stride; + if (databuf_strides_[i] != databuf_stride) { + if (databuf_[i]) { + delete databuf_[i]; + } + databuf_[i] = new uint8[databuf_size]; + databuf_strides_[i] = databuf_stride; + } + + if (GetComponentStride(i) != GetComponentWidth(i)) { + has_scanline_padding_ = LIBYUV_TRUE; + } + } + return LIBYUV_TRUE; +} + +static int DivideAndRoundUp(int numerator, int denominator) { + return (numerator + denominator - 1) / denominator; +} + +static int DivideAndRoundDown(int numerator, int denominator) { + return numerator / denominator; +} + +// Returns width of the last loaded frame. +int MJpegDecoder::GetWidth() { + return decompress_struct_->image_width; +} + +// Returns height of the last loaded frame. +int MJpegDecoder::GetHeight() { + return decompress_struct_->image_height; +} + +// Returns format of the last loaded frame. The return value is one of the +// kColorSpace* constants. +int MJpegDecoder::GetColorSpace() { + return decompress_struct_->jpeg_color_space; +} + +// Number of color components in the color space. +int MJpegDecoder::GetNumComponents() { + return decompress_struct_->num_components; +} + +// Sample factors of the n-th component. +int MJpegDecoder::GetHorizSampFactor(int component) { + return decompress_struct_->comp_info[component].h_samp_factor; +} + +int MJpegDecoder::GetVertSampFactor(int component) { + return decompress_struct_->comp_info[component].v_samp_factor; +} + +int MJpegDecoder::GetHorizSubSampFactor(int component) { + return decompress_struct_->max_h_samp_factor / + GetHorizSampFactor(component); +} + +int MJpegDecoder::GetVertSubSampFactor(int component) { + return decompress_struct_->max_v_samp_factor / + GetVertSampFactor(component); +} + +int MJpegDecoder::GetImageScanlinesPerImcuRow() { + return decompress_struct_->max_v_samp_factor * DCTSIZE; +} + +int MJpegDecoder::GetComponentScanlinesPerImcuRow(int component) { + int vs = GetVertSubSampFactor(component); + return DivideAndRoundUp(GetImageScanlinesPerImcuRow(), vs); +} + +int MJpegDecoder::GetComponentWidth(int component) { + int hs = GetHorizSubSampFactor(component); + return DivideAndRoundUp(GetWidth(), hs); +} + +int MJpegDecoder::GetComponentHeight(int component) { + int vs = GetVertSubSampFactor(component); + return DivideAndRoundUp(GetHeight(), vs); +} + +// Get width in bytes padded out to a multiple of DCTSIZE +int MJpegDecoder::GetComponentStride(int component) { + return (GetComponentWidth(component) + DCTSIZE - 1) & ~(DCTSIZE - 1); +} + +int MJpegDecoder::GetComponentSize(int component) { + return GetComponentWidth(component) * GetComponentHeight(component); +} + +LIBYUV_BOOL MJpegDecoder::UnloadFrame() { +#ifdef HAVE_SETJMP + if (setjmp(error_mgr_->setjmp_buffer)) { + // We called jpeg_abort_decompress, it experienced an error, and we called + // longjmp() and rewound the stack to here. Return error. + return LIBYUV_FALSE; + } +#endif + jpeg_abort_decompress(decompress_struct_); + return LIBYUV_TRUE; +} + +// TODO(fbarchard): Allow rectangle to be specified: x, y, width, height. +LIBYUV_BOOL MJpegDecoder::DecodeToBuffers( + uint8** planes, int dst_width, int dst_height) { + if (dst_width != GetWidth() || + dst_height > GetHeight()) { + // ERROR: Bad dimensions + return LIBYUV_FALSE; + } +#ifdef HAVE_SETJMP + if (setjmp(error_mgr_->setjmp_buffer)) { + // We called into jpeglib, it experienced an error sometime during this + // function call, and we called longjmp() and rewound the stack to here. 
+ // Return error. + return LIBYUV_FALSE; + } +#endif + if (!StartDecode()) { + return LIBYUV_FALSE; + } + SetScanlinePointers(databuf_); + int lines_left = dst_height; + // Compute amount of lines to skip to implement vertical crop. + // TODO(fbarchard): Ensure skip is a multiple of maximum component + // subsample. ie 2 + int skip = (GetHeight() - dst_height) / 2; + if (skip > 0) { + // There is no API to skip lines in the output data, so we read them + // into the temp buffer. + while (skip >= GetImageScanlinesPerImcuRow()) { + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + skip -= GetImageScanlinesPerImcuRow(); + } + if (skip > 0) { + // Have a partial iMCU row left over to skip. Must read it and then + // copy the parts we want into the destination. + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + for (int i = 0; i < num_outbufs_; ++i) { + // TODO(fbarchard): Compute skip to avoid this + assert(skip % GetVertSubSampFactor(i) == 0); + int rows_to_skip = + DivideAndRoundDown(skip, GetVertSubSampFactor(i)); + int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i) - + rows_to_skip; + int data_to_skip = rows_to_skip * GetComponentStride(i); + CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i), + planes[i], GetComponentWidth(i), + GetComponentWidth(i), scanlines_to_copy); + planes[i] += scanlines_to_copy * GetComponentWidth(i); + } + lines_left -= (GetImageScanlinesPerImcuRow() - skip); + } + } + + // Read full MCUs but cropped horizontally + for (; lines_left > GetImageScanlinesPerImcuRow(); + lines_left -= GetImageScanlinesPerImcuRow()) { + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + for (int i = 0; i < num_outbufs_; ++i) { + int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i); + CopyPlane(databuf_[i], GetComponentStride(i), + planes[i], GetComponentWidth(i), + GetComponentWidth(i), scanlines_to_copy); + planes[i] += scanlines_to_copy * GetComponentWidth(i); + } + } + + if (lines_left > 0) { + // Have a partial iMCU row left over to decode. + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + for (int i = 0; i < num_outbufs_; ++i) { + int scanlines_to_copy = + DivideAndRoundUp(lines_left, GetVertSubSampFactor(i)); + CopyPlane(databuf_[i], GetComponentStride(i), + planes[i], GetComponentWidth(i), + GetComponentWidth(i), scanlines_to_copy); + planes[i] += scanlines_to_copy * GetComponentWidth(i); + } + } + return FinishDecode(); +} + +LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque, + int dst_width, int dst_height) { + if (dst_width != GetWidth() || + dst_height > GetHeight()) { + // ERROR: Bad dimensions + return LIBYUV_FALSE; + } +#ifdef HAVE_SETJMP + if (setjmp(error_mgr_->setjmp_buffer)) { + // We called into jpeglib, it experienced an error sometime during this + // function call, and we called longjmp() and rewound the stack to here. + // Return error. + return LIBYUV_FALSE; + } +#endif + if (!StartDecode()) { + return LIBYUV_FALSE; + } + SetScanlinePointers(databuf_); + int lines_left = dst_height; + // TODO(fbarchard): Compute amount of lines to skip to implement vertical crop + int skip = (GetHeight() - dst_height) / 2; + if (skip > 0) { + while (skip >= GetImageScanlinesPerImcuRow()) { + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + skip -= GetImageScanlinesPerImcuRow(); + } + if (skip > 0) { + // Have a partial iMCU row left over to skip. 
+ if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + for (int i = 0; i < num_outbufs_; ++i) { + // TODO(fbarchard): Compute skip to avoid this + assert(skip % GetVertSubSampFactor(i) == 0); + int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i)); + int data_to_skip = rows_to_skip * GetComponentStride(i); + // Change our own data buffer pointers so we can pass them to the + // callback. + databuf_[i] += data_to_skip; + } + int scanlines_to_copy = GetImageScanlinesPerImcuRow() - skip; + (*fn)(opaque, databuf_, databuf_strides_, scanlines_to_copy); + // Now change them back. + for (int i = 0; i < num_outbufs_; ++i) { + int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i)); + int data_to_skip = rows_to_skip * GetComponentStride(i); + databuf_[i] -= data_to_skip; + } + lines_left -= scanlines_to_copy; + } + } + // Read full MCUs until we get to the crop point. + for (; lines_left >= GetImageScanlinesPerImcuRow(); + lines_left -= GetImageScanlinesPerImcuRow()) { + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + (*fn)(opaque, databuf_, databuf_strides_, GetImageScanlinesPerImcuRow()); + } + if (lines_left > 0) { + // Have a partial iMCU row left over to decode. + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + (*fn)(opaque, databuf_, databuf_strides_, lines_left); + } + return FinishDecode(); +} + +void init_source(j_decompress_ptr cinfo) { + fill_input_buffer(cinfo); +} + +boolean fill_input_buffer(j_decompress_ptr cinfo) { + BufferVector* buf_vec = (BufferVector*)(cinfo->client_data); + if (buf_vec->pos >= buf_vec->len) { + assert(0 && "No more data"); + // ERROR: No more data + return FALSE; + } + cinfo->src->next_input_byte = buf_vec->buffers[buf_vec->pos].data; + cinfo->src->bytes_in_buffer = buf_vec->buffers[buf_vec->pos].len; + ++buf_vec->pos; + return TRUE; +} + +void skip_input_data(j_decompress_ptr cinfo, + long num_bytes) { // NOLINT + cinfo->src->next_input_byte += num_bytes; +} + +void term_source(j_decompress_ptr cinfo) { + // Nothing to do. +} + +#ifdef HAVE_SETJMP +void ErrorHandler(j_common_ptr cinfo) { + // This is called when a jpeglib command experiences an error. Unfortunately + // jpeglib's error handling model is not very flexible, because it expects the + // error handler to not return--i.e., it wants the program to terminate. To + // recover from errors we use setjmp() as shown in their example. setjmp() is + // C's implementation for the "call with current continuation" functionality + // seen in some functional programming languages. + // A formatted message can be output, but is unsafe for release. +#ifdef DEBUG + char buf[JMSG_LENGTH_MAX]; + (*cinfo->err->format_message)(cinfo, buf); + // ERROR: Error in jpeglib: buf +#endif + + SetJmpErrorMgr* mgr = (SetJmpErrorMgr*)(cinfo->err); + // This rewinds the call stack to the point of the corresponding setjmp() + // and causes it to return (for a second time) with value 1. + longjmp(mgr->setjmp_buffer, 1); +} +#endif + +void MJpegDecoder::AllocOutputBuffers(int num_outbufs) { + if (num_outbufs != num_outbufs_) { + // We could perhaps optimize this case to resize the output buffers without + // necessarily having to delete and recreate each one, but it's not worth + // it. 
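+    // Rough shape of what gets (re)allocated here, assuming the usual raw
+    // decode setup (see SetScanlinePointers() below): for each component i,
+    // databuf_[i] holds one iMCU row, i.e. GetComponentScanlinesPerImcuRow(i)
+    // scanlines of GetComponentStride(i) bytes, and scanlines_[i] is the
+    // array of row pointers into it that jpeg_read_raw_data() consumes. For
+    // 4:2:0 that is 16 Y rows plus 8 U and 8 V rows per pass.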
+    DestroyOutputBuffers();
+
+    scanlines_ = new uint8** [num_outbufs];
+    scanlines_sizes_ = new int[num_outbufs];
+    databuf_ = new uint8* [num_outbufs];
+    databuf_strides_ = new int[num_outbufs];
+
+    for (int i = 0; i < num_outbufs; ++i) {
+      scanlines_[i] = NULL;
+      scanlines_sizes_[i] = 0;
+      databuf_[i] = NULL;
+      databuf_strides_[i] = 0;
+    }
+
+    num_outbufs_ = num_outbufs;
+  }
+}
+
+void MJpegDecoder::DestroyOutputBuffers() {
+  for (int i = 0; i < num_outbufs_; ++i) {
+    delete [] scanlines_[i];
+    delete [] databuf_[i];
+  }
+  delete [] scanlines_;
+  delete [] databuf_;
+  delete [] scanlines_sizes_;
+  delete [] databuf_strides_;
+  scanlines_ = NULL;
+  databuf_ = NULL;
+  scanlines_sizes_ = NULL;
+  databuf_strides_ = NULL;
+  num_outbufs_ = 0;
+}
+
+// JDCT_IFAST and disabling block smoothing improve performance substantially.
+LIBYUV_BOOL MJpegDecoder::StartDecode() {
+  decompress_struct_->raw_data_out = TRUE;
+  decompress_struct_->dct_method = JDCT_IFAST;  // JDCT_ISLOW is default
+  decompress_struct_->dither_mode = JDITHER_NONE;
+  // Not applicable to 'raw':
+  decompress_struct_->do_fancy_upsampling = (boolean)(LIBYUV_FALSE);
+  // Only for buffered mode:
+  decompress_struct_->enable_2pass_quant = (boolean)(LIBYUV_FALSE);
+  // Blocky but fast:
+  decompress_struct_->do_block_smoothing = (boolean)(LIBYUV_FALSE);
+
+  if (!jpeg_start_decompress(decompress_struct_)) {
+    // ERROR: Couldn't start JPEG decompressor
+    return LIBYUV_FALSE;
+  }
+  return LIBYUV_TRUE;
+}
+
+LIBYUV_BOOL MJpegDecoder::FinishDecode() {
+  // jpeglib considers it an error if we finish without decoding the whole
+  // image, so we call "abort" rather than "finish".
+  jpeg_abort_decompress(decompress_struct_);
+  return LIBYUV_TRUE;
+}
+
+void MJpegDecoder::SetScanlinePointers(uint8** data) {
+  for (int i = 0; i < num_outbufs_; ++i) {
+    uint8* data_i = data[i];
+    for (int j = 0; j < scanlines_sizes_[i]; ++j) {
+      scanlines_[i][j] = data_i;
+      data_i += GetComponentStride(i);
+    }
+  }
+}
+
+inline LIBYUV_BOOL MJpegDecoder::DecodeImcuRow() {
+  return (unsigned int)(GetImageScanlinesPerImcuRow()) ==
+      jpeg_read_raw_data(decompress_struct_,
+                         scanlines_,
+                         GetImageScanlinesPerImcuRow());
+}
+
+// Helper function that recognizes the jpeg sub-sampling type.
+JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper(
+    int* subsample_x, int* subsample_y, int number_of_components) {
+  if (number_of_components == 3) {  // Color images.
+    if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
+        subsample_x[1] == 2 && subsample_y[1] == 2 &&
+        subsample_x[2] == 2 && subsample_y[2] == 2) {
+      return kJpegYuv420;
+    } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
+               subsample_x[1] == 2 && subsample_y[1] == 1 &&
+               subsample_x[2] == 2 && subsample_y[2] == 1) {
+      return kJpegYuv422;
+    } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
+               subsample_x[1] == 1 && subsample_y[1] == 1 &&
+               subsample_x[2] == 1 && subsample_y[2] == 1) {
+      return kJpegYuv444;
+    }
+  } else if (number_of_components == 1) {  // Grey-scale images.
+    if (subsample_x[0] == 1 && subsample_y[0] == 1) {
+      return kJpegYuv400;
+    }
+  }
+  return kJpegUnknown;
+}
+
+}  // namespace libyuv
+#endif  // HAVE_JPEG
+
diff --git a/third_party/libyuv/source/mjpeg_validate.cc b/third_party/libyuv/source/mjpeg_validate.cc
new file mode 100644
index 000000000..23d22d099
--- /dev/null
+++ b/third_party/libyuv/source/mjpeg_validate.cc
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/mjpeg_decoder.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Helper function to validate the jpeg appears intact.
+// TODO(fbarchard): Optimize case where SOI is found but EOI is not.
+LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
+  size_t i;
+  if (sample_size < 64) {
+    // ERROR: Invalid jpeg size: sample_size
+    return LIBYUV_FALSE;
+  }
+  if (sample[0] != 0xff || sample[1] != 0xd8) {  // Start Of Image
+    // ERROR: Invalid jpeg initial start code
+    return LIBYUV_FALSE;
+  }
+  for (i = sample_size - 2; i > 1;) {
+    if (sample[i] != 0xd9) {
+      if (sample[i] == 0xff && sample[i + 1] == 0xd9) {  // End Of Image
+        return LIBYUV_TRUE;  // Success: Valid jpeg.
+      }
+      --i;
+    }
+    --i;
+  }
+  // ERROR: Invalid jpeg end code not found. Size sample_size
+  return LIBYUV_FALSE;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
diff --git a/third_party/libyuv/source/planar_functions.cc b/third_party/libyuv/source/planar_functions.cc
index 68b8f46e4..3857008ca 100644
--- a/third_party/libyuv/source/planar_functions.cc
+++ b/third_party/libyuv/source/planar_functions.cc
@@ -8,15 +8,15 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "third_party/libyuv/include/libyuv/planar_functions.h"
+#include "libyuv/planar_functions.h"
 
 #include <string.h>  // for memset()
 
-#include "third_party/libyuv/include/libyuv/cpu_id.h"
+#include "libyuv/cpu_id.h"
 #ifdef HAVE_JPEG
-#include "third_party/libyuv/include/libyuv/mjpeg_decoder.h"
+#include "libyuv/mjpeg_decoder.h"
 #endif
-#include "third_party/libyuv/include/libyuv/row.h"
+#include "libyuv/row.h"
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -37,6 +37,10 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
     height = 1;
     src_stride_y = dst_stride_y = 0;
   }
+  // Nothing to do.
+  if (src_y == dst_y && src_stride_y == dst_stride_y) {
+    return;
+  }
 #if defined(HAS_COPYROW_X86)
   if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
     CopyRow = CopyRow_X86;
diff --git a/third_party/libyuv/source/rotate.cc b/third_party/libyuv/source/rotate.cc
new file mode 100644
index 000000000..2ef3228cb
--- /dev/null
+++ b/third_party/libyuv/source/rotate.cc
@@ -0,0 +1,1301 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */ + +#include "libyuv/rotate.h" + +#include "libyuv/cpu_id.h" +#include "libyuv/convert.h" +#include "libyuv/planar_functions.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +#if defined(__APPLE__) && defined(__i386__) +#define DECLARE_FUNCTION(name) \ + ".text \n" \ + ".private_extern _" #name " \n" \ + ".align 4,0x90 \n" \ +"_" #name ": \n" +#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__) +#define DECLARE_FUNCTION(name) \ + ".text \n" \ + ".align 4,0x90 \n" \ +"_" #name ": \n" +#else +#define DECLARE_FUNCTION(name) \ + ".text \n" \ + ".align 4,0x90 \n" \ +#name ": \n" +#endif +#endif + +#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ + (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) +#define HAS_MIRRORROW_NEON +void MirrorRow_NEON(const uint8* src, uint8* dst, int width); +#define HAS_MIRRORROW_UV_NEON +void MirrorUVRow_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width); +#define HAS_TRANSPOSE_WX8_NEON +void TransposeWx8_NEON(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); +#define HAS_TRANSPOSE_UVWX8_NEON +void TransposeUVWx8_NEON(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width); +#endif // defined(__ARM_NEON__) + +#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \ + defined(__mips__) && \ + defined(__mips_dsp) && (__mips_dsp_rev >= 2) +#define HAS_TRANSPOSE_WX8_MIPS_DSPR2 +void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); + +void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); +#define HAS_TRANSPOSE_UVWx8_MIPS_DSPR2 +void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width); +#endif // defined(__mips__) + +#if !defined(LIBYUV_DISABLE_X86) && \ + defined(_M_IX86) && defined(_MSC_VER) +#define HAS_TRANSPOSE_WX8_SSSE3 +__declspec(naked) __declspec(align(16)) +static void TransposeWx8_SSSE3(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width) { + __asm { + push edi + push esi + push ebp + mov eax, [esp + 12 + 4] // src + mov edi, [esp + 12 + 8] // src_stride + mov edx, [esp + 12 + 12] // dst + mov esi, [esp + 12 + 16] // dst_stride + mov ecx, [esp + 12 + 20] // width + + // Read in the data from the source pointer. + // First round of bit swap. + align 4 + convertloop: + movq xmm0, qword ptr [eax] + lea ebp, [eax + 8] + movq xmm1, qword ptr [eax + edi] + lea eax, [eax + 2 * edi] + punpcklbw xmm0, xmm1 + movq xmm2, qword ptr [eax] + movdqa xmm1, xmm0 + palignr xmm1, xmm1, 8 + movq xmm3, qword ptr [eax + edi] + lea eax, [eax + 2 * edi] + punpcklbw xmm2, xmm3 + movdqa xmm3, xmm2 + movq xmm4, qword ptr [eax] + palignr xmm3, xmm3, 8 + movq xmm5, qword ptr [eax + edi] + punpcklbw xmm4, xmm5 + lea eax, [eax + 2 * edi] + movdqa xmm5, xmm4 + movq xmm6, qword ptr [eax] + palignr xmm5, xmm5, 8 + movq xmm7, qword ptr [eax + edi] + punpcklbw xmm6, xmm7 + mov eax, ebp + movdqa xmm7, xmm6 + palignr xmm7, xmm7, 8 + // Second round of bit swap. 
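+    // The scalar equivalent (TransposeWx8_C later in this file) is:
+    //   for (c = 0; c < width; ++c)
+    //     for (r = 0; r < 8; ++r)
+    //       dst[c * dst_stride + r] = src[r * src_stride + c];
+    // The SIMD version reaches the same result with three interleave passes:
+    // punpcklbw above paired bytes of adjacent rows, this pass pairs 16-bit
+    // words, and punpckldq below pairs 32-bit dwords, at which point each
+    // qword holds one transposed output row.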
+ punpcklwd xmm0, xmm2 + punpcklwd xmm1, xmm3 + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + palignr xmm2, xmm2, 8 + palignr xmm3, xmm3, 8 + punpcklwd xmm4, xmm6 + punpcklwd xmm5, xmm7 + movdqa xmm6, xmm4 + movdqa xmm7, xmm5 + palignr xmm6, xmm6, 8 + palignr xmm7, xmm7, 8 + // Third round of bit swap. + // Write to the destination pointer. + punpckldq xmm0, xmm4 + movq qword ptr [edx], xmm0 + movdqa xmm4, xmm0 + palignr xmm4, xmm4, 8 + movq qword ptr [edx + esi], xmm4 + lea edx, [edx + 2 * esi] + punpckldq xmm2, xmm6 + movdqa xmm6, xmm2 + palignr xmm6, xmm6, 8 + movq qword ptr [edx], xmm2 + punpckldq xmm1, xmm5 + movq qword ptr [edx + esi], xmm6 + lea edx, [edx + 2 * esi] + movdqa xmm5, xmm1 + movq qword ptr [edx], xmm1 + palignr xmm5, xmm5, 8 + punpckldq xmm3, xmm7 + movq qword ptr [edx + esi], xmm5 + lea edx, [edx + 2 * esi] + movq qword ptr [edx], xmm3 + movdqa xmm7, xmm3 + palignr xmm7, xmm7, 8 + sub ecx, 8 + movq qword ptr [edx + esi], xmm7 + lea edx, [edx + 2 * esi] + jg convertloop + + pop ebp + pop esi + pop edi + ret + } +} + +#define HAS_TRANSPOSE_UVWX8_SSE2 +__declspec(naked) __declspec(align(16)) +static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int w) { + __asm { + push ebx + push esi + push edi + push ebp + mov eax, [esp + 16 + 4] // src + mov edi, [esp + 16 + 8] // src_stride + mov edx, [esp + 16 + 12] // dst_a + mov esi, [esp + 16 + 16] // dst_stride_a + mov ebx, [esp + 16 + 20] // dst_b + mov ebp, [esp + 16 + 24] // dst_stride_b + mov ecx, esp + sub esp, 4 + 16 + and esp, ~15 + mov [esp + 16], ecx + mov ecx, [ecx + 16 + 28] // w + + align 4 + convertloop: + // Read in the data from the source pointer. + // First round of bit swap. + movdqa xmm0, [eax] + movdqa xmm1, [eax + edi] + lea eax, [eax + 2 * edi] + movdqa xmm7, xmm0 // use xmm7 as temp register. + punpcklbw xmm0, xmm1 + punpckhbw xmm7, xmm1 + movdqa xmm1, xmm7 + movdqa xmm2, [eax] + movdqa xmm3, [eax + edi] + lea eax, [eax + 2 * edi] + movdqa xmm7, xmm2 + punpcklbw xmm2, xmm3 + punpckhbw xmm7, xmm3 + movdqa xmm3, xmm7 + movdqa xmm4, [eax] + movdqa xmm5, [eax + edi] + lea eax, [eax + 2 * edi] + movdqa xmm7, xmm4 + punpcklbw xmm4, xmm5 + punpckhbw xmm7, xmm5 + movdqa xmm5, xmm7 + movdqa xmm6, [eax] + movdqa xmm7, [eax + edi] + lea eax, [eax + 2 * edi] + movdqa [esp], xmm5 // backup xmm5 + neg edi + movdqa xmm5, xmm6 // use xmm5 as temp register. + punpcklbw xmm6, xmm7 + punpckhbw xmm5, xmm7 + movdqa xmm7, xmm5 + lea eax, [eax + 8 * edi + 16] + neg edi + // Second round of bit swap. + movdqa xmm5, xmm0 + punpcklwd xmm0, xmm2 + punpckhwd xmm5, xmm2 + movdqa xmm2, xmm5 + movdqa xmm5, xmm1 + punpcklwd xmm1, xmm3 + punpckhwd xmm5, xmm3 + movdqa xmm3, xmm5 + movdqa xmm5, xmm4 + punpcklwd xmm4, xmm6 + punpckhwd xmm5, xmm6 + movdqa xmm6, xmm5 + movdqa xmm5, [esp] // restore xmm5 + movdqa [esp], xmm6 // backup xmm6 + movdqa xmm6, xmm5 // use xmm6 as temp register. + punpcklwd xmm5, xmm7 + punpckhwd xmm6, xmm7 + movdqa xmm7, xmm6 + // Third round of bit swap. + // Write to the destination pointer. + movdqa xmm6, xmm0 + punpckldq xmm0, xmm4 + punpckhdq xmm6, xmm4 + movdqa xmm4, xmm6 + movdqa xmm6, [esp] // restore xmm6 + movlpd qword ptr [edx], xmm0 + movhpd qword ptr [ebx], xmm0 + movlpd qword ptr [edx + esi], xmm4 + lea edx, [edx + 2 * esi] + movhpd qword ptr [ebx + ebp], xmm4 + lea ebx, [ebx + 2 * ebp] + movdqa xmm0, xmm2 // use xmm0 as the temp register. 
+ punpckldq xmm2, xmm6 + movlpd qword ptr [edx], xmm2 + movhpd qword ptr [ebx], xmm2 + punpckhdq xmm0, xmm6 + movlpd qword ptr [edx + esi], xmm0 + lea edx, [edx + 2 * esi] + movhpd qword ptr [ebx + ebp], xmm0 + lea ebx, [ebx + 2 * ebp] + movdqa xmm0, xmm1 // use xmm0 as the temp register. + punpckldq xmm1, xmm5 + movlpd qword ptr [edx], xmm1 + movhpd qword ptr [ebx], xmm1 + punpckhdq xmm0, xmm5 + movlpd qword ptr [edx + esi], xmm0 + lea edx, [edx + 2 * esi] + movhpd qword ptr [ebx + ebp], xmm0 + lea ebx, [ebx + 2 * ebp] + movdqa xmm0, xmm3 // use xmm0 as the temp register. + punpckldq xmm3, xmm7 + movlpd qword ptr [edx], xmm3 + movhpd qword ptr [ebx], xmm3 + punpckhdq xmm0, xmm7 + sub ecx, 8 + movlpd qword ptr [edx + esi], xmm0 + lea edx, [edx + 2 * esi] + movhpd qword ptr [ebx + ebp], xmm0 + lea ebx, [ebx + 2 * ebp] + jg convertloop + + mov esp, [esp + 16] + pop ebp + pop edi + pop esi + pop ebx + ret + } +} +#elif !defined(LIBYUV_DISABLE_X86) && \ + (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__))) +#define HAS_TRANSPOSE_WX8_SSSE3 +static void TransposeWx8_SSSE3(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width) { + asm volatile ( + // Read in the data from the source pointer. + // First round of bit swap. + ".p2align 2 \n" + "1: \n" + "movq (%0),%%xmm0 \n" + "movq (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "movq (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "movq (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movq (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "movq (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movq (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "lea 0x8(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "neg %3 \n" + // Second round of bit swap. + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + // Third round of bit swap. + // Write to the destination pointer. 
+ "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "sub $0x8,%2 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc" + #if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + #endif + ); +} + +#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__) +#define HAS_TRANSPOSE_UVWX8_SSE2 +void TransposeUVWx8_SSE2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int w); + asm ( + DECLARE_FUNCTION(TransposeUVWx8_SSE2) + "push %ebx \n" + "push %esi \n" + "push %edi \n" + "push %ebp \n" + "mov 0x14(%esp),%eax \n" + "mov 0x18(%esp),%edi \n" + "mov 0x1c(%esp),%edx \n" + "mov 0x20(%esp),%esi \n" + "mov 0x24(%esp),%ebx \n" + "mov 0x28(%esp),%ebp \n" + "mov %esp,%ecx \n" + "sub $0x14,%esp \n" + "and $0xfffffff0,%esp \n" + "mov %ecx,0x10(%esp) \n" + "mov 0x2c(%ecx),%ecx \n" + +"1: \n" + "movdqa (%eax),%xmm0 \n" + "movdqa (%eax,%edi,1),%xmm1 \n" + "lea (%eax,%edi,2),%eax \n" + "movdqa %xmm0,%xmm7 \n" + "punpcklbw %xmm1,%xmm0 \n" + "punpckhbw %xmm1,%xmm7 \n" + "movdqa %xmm7,%xmm1 \n" + "movdqa (%eax),%xmm2 \n" + "movdqa (%eax,%edi,1),%xmm3 \n" + "lea (%eax,%edi,2),%eax \n" + "movdqa %xmm2,%xmm7 \n" + "punpcklbw %xmm3,%xmm2 \n" + "punpckhbw %xmm3,%xmm7 \n" + "movdqa %xmm7,%xmm3 \n" + "movdqa (%eax),%xmm4 \n" + "movdqa (%eax,%edi,1),%xmm5 \n" + "lea (%eax,%edi,2),%eax \n" + "movdqa %xmm4,%xmm7 \n" + "punpcklbw %xmm5,%xmm4 \n" + "punpckhbw %xmm5,%xmm7 \n" + "movdqa %xmm7,%xmm5 \n" + "movdqa (%eax),%xmm6 \n" + "movdqa (%eax,%edi,1),%xmm7 \n" + "lea (%eax,%edi,2),%eax \n" + "movdqa %xmm5,(%esp) \n" + "neg %edi \n" + "movdqa %xmm6,%xmm5 \n" + "punpcklbw %xmm7,%xmm6 \n" + "punpckhbw %xmm7,%xmm5 \n" + "movdqa %xmm5,%xmm7 \n" + "lea 0x10(%eax,%edi,8),%eax \n" + "neg %edi \n" + "movdqa %xmm0,%xmm5 \n" + "punpcklwd %xmm2,%xmm0 \n" + "punpckhwd %xmm2,%xmm5 \n" + "movdqa %xmm5,%xmm2 \n" + "movdqa %xmm1,%xmm5 \n" + "punpcklwd %xmm3,%xmm1 \n" + "punpckhwd %xmm3,%xmm5 \n" + "movdqa %xmm5,%xmm3 \n" + "movdqa %xmm4,%xmm5 \n" + "punpcklwd %xmm6,%xmm4 \n" + "punpckhwd %xmm6,%xmm5 \n" + "movdqa %xmm5,%xmm6 \n" + "movdqa (%esp),%xmm5 \n" + "movdqa %xmm6,(%esp) \n" + "movdqa %xmm5,%xmm6 \n" + "punpcklwd %xmm7,%xmm5 \n" + "punpckhwd %xmm7,%xmm6 \n" + "movdqa %xmm6,%xmm7 \n" + "movdqa %xmm0,%xmm6 \n" + "punpckldq %xmm4,%xmm0 \n" + "punpckhdq %xmm4,%xmm6 \n" + "movdqa %xmm6,%xmm4 \n" + "movdqa (%esp),%xmm6 \n" + "movlpd %xmm0,(%edx) \n" + "movhpd %xmm0,(%ebx) \n" + "movlpd %xmm4,(%edx,%esi,1) \n" + "lea (%edx,%esi,2),%edx \n" + "movhpd %xmm4,(%ebx,%ebp,1) \n" + "lea (%ebx,%ebp,2),%ebx \n" + "movdqa %xmm2,%xmm0 \n" + "punpckldq %xmm6,%xmm2 \n" + "movlpd %xmm2,(%edx) \n" + "movhpd %xmm2,(%ebx) \n" + "punpckhdq %xmm6,%xmm0 \n" + "movlpd %xmm0,(%edx,%esi,1) \n" + "lea (%edx,%esi,2),%edx \n" + "movhpd %xmm0,(%ebx,%ebp,1) \n" 
+ "lea (%ebx,%ebp,2),%ebx \n" + "movdqa %xmm1,%xmm0 \n" + "punpckldq %xmm5,%xmm1 \n" + "movlpd %xmm1,(%edx) \n" + "movhpd %xmm1,(%ebx) \n" + "punpckhdq %xmm5,%xmm0 \n" + "movlpd %xmm0,(%edx,%esi,1) \n" + "lea (%edx,%esi,2),%edx \n" + "movhpd %xmm0,(%ebx,%ebp,1) \n" + "lea (%ebx,%ebp,2),%ebx \n" + "movdqa %xmm3,%xmm0 \n" + "punpckldq %xmm7,%xmm3 \n" + "movlpd %xmm3,(%edx) \n" + "movhpd %xmm3,(%ebx) \n" + "punpckhdq %xmm7,%xmm0 \n" + "sub $0x8,%ecx \n" + "movlpd %xmm0,(%edx,%esi,1) \n" + "lea (%edx,%esi,2),%edx \n" + "movhpd %xmm0,(%ebx,%ebp,1) \n" + "lea (%ebx,%ebp,2),%ebx \n" + "jg 1b \n" + "mov 0x10(%esp),%esp \n" + "pop %ebp \n" + "pop %edi \n" + "pop %esi \n" + "pop %ebx \n" +#if defined(__native_client__) + "pop %ecx \n" + "and $0xffffffe0,%ecx \n" + "jmp *%ecx \n" +#else + "ret \n" +#endif +); +#elif !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \ + defined(__x86_64__) +// 64 bit version has enough registers to do 16x8 to 8x16 at a time. +#define HAS_TRANSPOSE_WX8_FAST_SSSE3 +static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width) { + asm volatile ( + // Read in the data from the source pointer. + // First round of bit swap. + ".p2align 2 \n" +"1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqa (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm8,%%xmm9 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "palignr $0x8,%%xmm9,%%xmm9 \n" + "movdqa (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm2,%%xmm10 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm10 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movdqa %%xmm10,%%xmm11 \n" + "movdqa (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "movdqa (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm4,%%xmm12 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm12 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movdqa %%xmm12,%%xmm13 \n" + "movdqa (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movdqa (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm6,%%xmm14 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "punpckhbw %%xmm7,%%xmm14 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "movdqa %%xmm14,%%xmm15 \n" + "lea 0x10(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "neg %3 \n" + // Second round of bit swap. + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "punpcklwd %%xmm10,%%xmm8 \n" + "punpcklwd %%xmm11,%%xmm9 \n" + "movdqa %%xmm8,%%xmm10 \n" + "movdqa %%xmm9,%%xmm11 \n" + "palignr $0x8,%%xmm10,%%xmm10 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "punpcklwd %%xmm14,%%xmm12 \n" + "punpcklwd %%xmm15,%%xmm13 \n" + "movdqa %%xmm12,%%xmm14 \n" + "movdqa %%xmm13,%%xmm15 \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + // Third round of bit swap. + // Write to the destination pointer. 
+ "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm12,%%xmm8 \n" + "movq %%xmm8,(%1) \n" + "movdqa %%xmm8,%%xmm12 \n" + "palignr $0x8,%%xmm12,%%xmm12 \n" + "movq %%xmm12,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm14,%%xmm10 \n" + "movdqa %%xmm10,%%xmm14 \n" + "movq %%xmm10,(%1) \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "punpckldq %%xmm13,%%xmm9 \n" + "movq %%xmm14,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm9,%%xmm13 \n" + "movq %%xmm9,(%1) \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movq %%xmm13,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm15,%%xmm11 \n" + "movq %%xmm11,(%1) \n" + "movdqa %%xmm11,%%xmm15 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "sub $0x10,%2 \n" + "movq %%xmm15,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15" +); +} + +#define HAS_TRANSPOSE_UVWX8_SSE2 +static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int w) { + asm volatile ( + // Read in the data from the source pointer. + // First round of bit swap. + ".p2align 2 \n" +"1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa (%0,%4),%%xmm1 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqa %%xmm8,%%xmm1 \n" + "movdqa (%0),%%xmm2 \n" + "movdqa (%0,%4),%%xmm3 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm8 \n" + "movdqa %%xmm8,%%xmm3 \n" + "movdqa (%0),%%xmm4 \n" + "movdqa (%0,%4),%%xmm5 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm4,%%xmm8 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm8 \n" + "movdqa %%xmm8,%%xmm5 \n" + "movdqa (%0),%%xmm6 \n" + "movdqa (%0,%4),%%xmm7 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm6,%%xmm8 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %4 \n" + "lea 0x10(%0,%4,8),%0 \n" + "punpckhbw %%xmm7,%%xmm8 \n" + "movdqa %%xmm8,%%xmm7 \n" + "neg %4 \n" + // Second round of bit swap. + "movdqa %%xmm0,%%xmm8 \n" + "movdqa %%xmm1,%%xmm9 \n" + "punpckhwd %%xmm2,%%xmm8 \n" + "punpckhwd %%xmm3,%%xmm9 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm8,%%xmm2 \n" + "movdqa %%xmm9,%%xmm3 \n" + "movdqa %%xmm4,%%xmm8 \n" + "movdqa %%xmm5,%%xmm9 \n" + "punpckhwd %%xmm6,%%xmm8 \n" + "punpckhwd %%xmm7,%%xmm9 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm8,%%xmm6 \n" + "movdqa %%xmm9,%%xmm7 \n" + // Third round of bit swap. + // Write to the destination pointer. 
+ "movdqa %%xmm0,%%xmm8 \n" + "punpckldq %%xmm4,%%xmm0 \n" + "movlpd %%xmm0,(%1) \n" // Write back U channel + "movhpd %%xmm0,(%2) \n" // Write back V channel + "punpckhdq %%xmm4,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movlpd %%xmm2,(%1) \n" + "movhpd %%xmm2,(%2) \n" + "punpckhdq %%xmm6,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm1,%%xmm8 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movlpd %%xmm1,(%1) \n" + "movhpd %%xmm1,(%2) \n" + "punpckhdq %%xmm5,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm3,%%xmm8 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movlpd %%xmm3,(%1) \n" + "movhpd %%xmm3,(%2) \n" + "punpckhdq %%xmm7,%%xmm8 \n" + "sub $0x8,%3 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst_a), // %1 + "+r"(dst_b), // %2 + "+r"(w) // %3 + : "r"((intptr_t)(src_stride)), // %4 + "r"((intptr_t)(dst_stride_a)), // %5 + "r"((intptr_t)(dst_stride_b)) // %6 + : "memory", "cc", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9" +); +} +#endif +#endif + +static void TransposeWx8_C(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width) { + int i; + for (i = 0; i < width; ++i) { + dst[0] = src[0 * src_stride]; + dst[1] = src[1 * src_stride]; + dst[2] = src[2 * src_stride]; + dst[3] = src[3 * src_stride]; + dst[4] = src[4 * src_stride]; + dst[5] = src[5 * src_stride]; + dst[6] = src[6 * src_stride]; + dst[7] = src[7 * src_stride]; + ++src; + dst += dst_stride; + } +} + +static void TransposeWxH_C(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + int i; + for (i = 0; i < width; ++i) { + int j; + for (j = 0; j < height; ++j) { + dst[i * dst_stride + j] = src[j * src_stride + i]; + } + } +} + +LIBYUV_API +void TransposePlane(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + int i = height; + void (*TransposeWx8)(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width) = TransposeWx8_C; +#if defined(HAS_TRANSPOSE_WX8_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + TransposeWx8 = TransposeWx8_NEON; + } +#endif +#if defined(HAS_TRANSPOSE_WX8_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { + TransposeWx8 = TransposeWx8_SSSE3; + } +#endif +#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && + IS_ALIGNED(width, 16) && + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) { + TransposeWx8 = TransposeWx8_FAST_SSSE3; + } +#endif +#if defined(HAS_TRANSPOSE_WX8_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2)) { + if (IS_ALIGNED(width, 4) && + IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { + TransposeWx8 = TransposeWx8_FAST_MIPS_DSPR2; + } else { + TransposeWx8 = TransposeWx8_MIPS_DSPR2; + } + } +#endif + + // Work across the source in 8x8 tiles + while (i >= 8) { + TransposeWx8(src, src_stride, dst, dst_stride, width); + src += 8 * src_stride; // Go down 8 rows. + dst += 8; // Move over 8 columns. 
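+    // A transpose maps src(r, c) to dst(c, r), so the 8 source rows consumed
+    // by each pass land as 8 new destination columns; any remainder of fewer
+    // than 8 rows is finished by the TransposeWxH_C call below.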
+ i -= 8; + } + + TransposeWxH_C(src, src_stride, dst, dst_stride, width, i); +} + +LIBYUV_API +void RotatePlane90(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + // Rotate by 90 is a transpose with the source read + // from bottom to top. So set the source pointer to the end + // of the buffer and flip the sign of the source stride. + src += src_stride * (height - 1); + src_stride = -src_stride; + TransposePlane(src, src_stride, dst, dst_stride, width, height); +} + +LIBYUV_API +void RotatePlane270(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + // Rotate by 270 is a transpose with the destination written + // from bottom to top. So set the destination pointer to the end + // of the buffer and flip the sign of the destination stride. + dst += dst_stride * (width - 1); + dst_stride = -dst_stride; + TransposePlane(src, src_stride, dst, dst_stride, width, height); +} + +LIBYUV_API +void RotatePlane180(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + // Swap first and last row and mirror the content. Uses a temporary row. + align_buffer_64(row, width); + const uint8* src_bot = src + src_stride * (height - 1); + uint8* dst_bot = dst + dst_stride * (height - 1); + int half_height = (height + 1) >> 1; + int y; + void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C; + void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; +#if defined(HAS_MIRRORROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_NEON; + } +#endif +#if defined(HAS_MIRRORROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { + MirrorRow = MirrorRow_SSE2; + } +#endif +#if defined(HAS_MIRRORROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) && + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { + MirrorRow = MirrorRow_SSSE3; + } +#endif +#if defined(HAS_MIRRORROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) { + MirrorRow = MirrorRow_AVX2; + } +#endif +#if defined(HAS_MIRRORROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && + IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) { + MirrorRow = MirrorRow_MIPS_DSPR2; + } +#endif +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) { + CopyRow = CopyRow_NEON; + } +#endif +#if defined(HAS_COPYROW_X86) + if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) { + CopyRow = CopyRow_X86; + } +#endif +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) && + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { + CopyRow = CopyRow_SSE2; + } +#endif +#if defined(HAS_COPYROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_ERMS; + } +#endif +#if defined(HAS_COPYROW_MIPS) + if (TestCpuFlag(kCpuHasMIPS)) { + CopyRow = CopyRow_MIPS; + } +#endif + + // Odd height will harmlessly mirror the middle row twice. 
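+  // For example, height = 5 gives half_height = (5 + 1) >> 1 = 3: iterations
+  // y = 0 and y = 1 exchange rows 0<->4 and 1<->3 through 'row', and y = 2
+  // writes the mirrored middle row onto itself.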
+ for (y = 0; y < half_height; ++y) { + MirrorRow(src, row, width); // Mirror first row into a buffer + src += src_stride; + MirrorRow(src_bot, dst, width); // Mirror last row into first row + dst += dst_stride; + CopyRow(row, dst_bot, width); // Copy first mirrored row into last + src_bot -= src_stride; + dst_bot -= dst_stride; + } + free_aligned_buffer_64(row); +} + +static void TransposeUVWx8_C(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width) { + int i; + for (i = 0; i < width; ++i) { + dst_a[0] = src[0 * src_stride + 0]; + dst_b[0] = src[0 * src_stride + 1]; + dst_a[1] = src[1 * src_stride + 0]; + dst_b[1] = src[1 * src_stride + 1]; + dst_a[2] = src[2 * src_stride + 0]; + dst_b[2] = src[2 * src_stride + 1]; + dst_a[3] = src[3 * src_stride + 0]; + dst_b[3] = src[3 * src_stride + 1]; + dst_a[4] = src[4 * src_stride + 0]; + dst_b[4] = src[4 * src_stride + 1]; + dst_a[5] = src[5 * src_stride + 0]; + dst_b[5] = src[5 * src_stride + 1]; + dst_a[6] = src[6 * src_stride + 0]; + dst_b[6] = src[6 * src_stride + 1]; + dst_a[7] = src[7 * src_stride + 0]; + dst_b[7] = src[7 * src_stride + 1]; + src += 2; + dst_a += dst_stride_a; + dst_b += dst_stride_b; + } +} + +static void TransposeUVWxH_C(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height) { + int i; + for (i = 0; i < width * 2; i += 2) { + int j; + for (j = 0; j < height; ++j) { + dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)]; + dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1]; + } + } +} + +LIBYUV_API +void TransposeUV(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height) { + int i = height; + void (*TransposeUVWx8)(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width) = TransposeUVWx8_C; +#if defined(HAS_TRANSPOSE_UVWX8_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + TransposeUVWx8 = TransposeUVWx8_NEON; + } +#elif defined(HAS_TRANSPOSE_UVWX8_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(width, 8) && + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) { + TransposeUVWx8 = TransposeUVWx8_SSE2; + } +#elif defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) && + IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { + TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2; + } +#endif + + // Work through the source in 8x8 tiles. + while (i >= 8) { + TransposeUVWx8(src, src_stride, + dst_a, dst_stride_a, + dst_b, dst_stride_b, + width); + src += 8 * src_stride; // Go down 8 rows. + dst_a += 8; // Move over 8 columns. + dst_b += 8; // Move over 8 columns. 
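+    // Same tiling as TransposePlane, but each source row carries interleaved
+    // U/V byte pairs and width counts pairs: every pass splits 8 such rows
+    // into 8 new columns of the separate U and V planes, one pair at a time
+    // in the TransposeUVWx8_C reference above.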
+ i -= 8; + } + + TransposeUVWxH_C(src, src_stride, + dst_a, dst_stride_a, + dst_b, dst_stride_b, + width, i); +} + +LIBYUV_API +void RotateUV90(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height) { + src += src_stride * (height - 1); + src_stride = -src_stride; + + TransposeUV(src, src_stride, + dst_a, dst_stride_a, + dst_b, dst_stride_b, + width, height); +} + +LIBYUV_API +void RotateUV270(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height) { + dst_a += dst_stride_a * (width - 1); + dst_b += dst_stride_b * (width - 1); + dst_stride_a = -dst_stride_a; + dst_stride_b = -dst_stride_b; + + TransposeUV(src, src_stride, + dst_a, dst_stride_a, + dst_b, dst_stride_b, + width, height); +} + +// Rotate 180 is a horizontal and vertical flip. +LIBYUV_API +void RotateUV180(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height) { + int i; + void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) = + MirrorUVRow_C; +#if defined(HAS_MIRRORUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + MirrorRowUV = MirrorUVRow_NEON; + } +#elif defined(HAS_MIRRORROW_UV_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) && + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) { + MirrorRowUV = MirrorUVRow_SSSE3; + } +#elif defined(HAS_MIRRORUVROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && + IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { + MirrorRowUV = MirrorUVRow_MIPS_DSPR2; + } +#endif + + dst_a += dst_stride_a * (height - 1); + dst_b += dst_stride_b * (height - 1); + + for (i = 0; i < height; ++i) { + MirrorRowUV(src, dst_a, dst_b, width); + src += src_stride; + dst_a -= dst_stride_a; + dst_b -= dst_stride_b; + } +} + +LIBYUV_API +int RotatePlane(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height, + enum RotationMode mode) { + if (!src || width <= 0 || height == 0 || !dst) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + src = src + (height - 1) * src_stride; + src_stride = -src_stride; + } + + switch (mode) { + case kRotate0: + // copy frame + CopyPlane(src, src_stride, + dst, dst_stride, + width, height); + return 0; + case kRotate90: + RotatePlane90(src, src_stride, + dst, dst_stride, + width, height); + return 0; + case kRotate270: + RotatePlane270(src, src_stride, + dst, dst_stride, + width, height); + return 0; + case kRotate180: + RotatePlane180(src, src_stride, + dst, dst_stride, + width, height); + return 0; + default: + break; + } + return -1; +} + +LIBYUV_API +int I420Rotate(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height, + enum RotationMode mode) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || + !dst_y || !dst_u || !dst_v) { + return -1; + } + + // Negative height means invert the image. 
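+  // For example, height = -480 flips the source: height becomes 480,
+  // halfheight (480 + 1) >> 1 = 240, the Y plane is repointed at its last
+  // row (479), the half-height U and V planes at row 239, and all three
+  // source strides are negated so reads walk bottom-up.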
+ if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + switch (mode) { + case kRotate0: + // copy frame + return I420Copy(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height); + case kRotate90: + RotatePlane90(src_y, src_stride_y, + dst_y, dst_stride_y, + width, height); + RotatePlane90(src_u, src_stride_u, + dst_u, dst_stride_u, + halfwidth, halfheight); + RotatePlane90(src_v, src_stride_v, + dst_v, dst_stride_v, + halfwidth, halfheight); + return 0; + case kRotate270: + RotatePlane270(src_y, src_stride_y, + dst_y, dst_stride_y, + width, height); + RotatePlane270(src_u, src_stride_u, + dst_u, dst_stride_u, + halfwidth, halfheight); + RotatePlane270(src_v, src_stride_v, + dst_v, dst_stride_v, + halfwidth, halfheight); + return 0; + case kRotate180: + RotatePlane180(src_y, src_stride_y, + dst_y, dst_stride_y, + width, height); + RotatePlane180(src_u, src_stride_u, + dst_u, dst_stride_u, + halfwidth, halfheight); + RotatePlane180(src_v, src_stride_v, + dst_v, dst_stride_v, + halfwidth, halfheight); + return 0; + default: + break; + } + return -1; +} + +LIBYUV_API +int NV12ToI420Rotate(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height, + enum RotationMode mode) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_uv || width <= 0 || height == 0 || + !dst_y || !dst_u || !dst_v) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_uv = src_uv + (halfheight - 1) * src_stride_uv; + src_stride_y = -src_stride_y; + src_stride_uv = -src_stride_uv; + } + + switch (mode) { + case kRotate0: + // copy frame + return NV12ToI420(src_y, src_stride_y, + src_uv, src_stride_uv, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height); + case kRotate90: + RotatePlane90(src_y, src_stride_y, + dst_y, dst_stride_y, + width, height); + RotateUV90(src_uv, src_stride_uv, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + halfwidth, halfheight); + return 0; + case kRotate270: + RotatePlane270(src_y, src_stride_y, + dst_y, dst_stride_y, + width, height); + RotateUV270(src_uv, src_stride_uv, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + halfwidth, halfheight); + return 0; + case kRotate180: + RotatePlane180(src_y, src_stride_y, + dst_y, dst_stride_y, + width, height); + RotateUV180(src_uv, src_stride_uv, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + halfwidth, halfheight); + return 0; + default: + break; + } + return -1; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/rotate_argb.cc b/third_party/libyuv/source/rotate_argb.cc new file mode 100644 index 000000000..ab0f9ce07 --- /dev/null +++ b/third_party/libyuv/source/rotate_argb.cc @@ -0,0 +1,209 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. 
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate.h"
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/convert.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// ARGBScale has a function to copy pixels to a row, striding each source
+// pixel by a constant.
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || \
+    (defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__))
+#define HAS_SCALEARGBROWDOWNEVEN_SSE2
+void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
+                               int src_stepx,
+                               uint8* dst_ptr, int dst_width);
+#endif
+#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define HAS_SCALEARGBROWDOWNEVEN_NEON
+void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride,
+                               int src_stepx,
+                               uint8* dst_ptr, int dst_width);
+#endif
+
+void ScaleARGBRowDownEven_C(const uint8* src_ptr, int,
+                            int src_stepx,
+                            uint8* dst_ptr, int dst_width);
+
+static void ARGBTranspose(const uint8* src, int src_stride,
+                          uint8* dst, int dst_stride,
+                          int width, int height) {
+  int i;
+  int src_pixel_step = src_stride >> 2;
+  void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
+      int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C;
+#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4) &&  // Width of dest.
+      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+    ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
+  }
+#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4) &&  // Width of dest.
+      IS_ALIGNED(src, 4)) {
+    ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
+  }
+#endif
+
+  for (i = 0; i < width; ++i) {  // column of source to row of dest.
+    ScaleARGBRowDownEven(src, 0, src_pixel_step, dst, height);
+    dst += dst_stride;
+    src += 4;
+  }
+}
+
+void ARGBRotate90(const uint8* src, int src_stride,
+                  uint8* dst, int dst_stride,
+                  int width, int height) {
+  // Rotate by 90 is an ARGBTranspose with the source read
+  // from bottom to top. So set the source pointer to the end
+  // of the buffer and flip the sign of the source stride.
+  src += src_stride * (height - 1);
+  src_stride = -src_stride;
+  ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
+}
+
+void ARGBRotate270(const uint8* src, int src_stride,
+                   uint8* dst, int dst_stride,
+                   int width, int height) {
+  // Rotate by 270 is an ARGBTranspose with the destination written
+  // from bottom to top. So set the destination pointer to the end
+  // of the buffer and flip the sign of the destination stride.
+  dst += dst_stride * (width - 1);
+  dst_stride = -dst_stride;
+  ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
+}
+
+void ARGBRotate180(const uint8* src, int src_stride,
+                   uint8* dst, int dst_stride,
+                   int width, int height) {
+  // Swap first and last row and mirror the content. Uses a temporary row.
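+  // An ARGB row is width * 4 bytes, hence the buffer size and the width * 4
+  // passed to CopyRow below; ARGBMirrorRow still takes width in pixels since
+  // it must reverse whole 4-byte pixels rather than bytes.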
+ align_buffer_64(row, width * 4); + const uint8* src_bot = src + src_stride * (height - 1); + uint8* dst_bot = dst + dst_stride * (height - 1); + int half_height = (height + 1) >> 1; + int y; + void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) = + ARGBMirrorRow_C; + void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; +#if defined(HAS_ARGBMIRRORROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { + ARGBMirrorRow = ARGBMirrorRow_SSSE3; + } +#endif +#if defined(HAS_ARGBMIRRORROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) { + ARGBMirrorRow = ARGBMirrorRow_AVX2; + } +#endif +#if defined(HAS_ARGBMIRRORROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) { + ARGBMirrorRow = ARGBMirrorRow_NEON; + } +#endif +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width * 4, 32)) { + CopyRow = CopyRow_NEON; + } +#endif +#if defined(HAS_COPYROW_X86) + if (TestCpuFlag(kCpuHasX86)) { + CopyRow = CopyRow_X86; + } +#endif +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width * 4, 32) && + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { + CopyRow = CopyRow_SSE2; + } +#endif +#if defined(HAS_COPYROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_ERMS; + } +#endif +#if defined(HAS_COPYROW_MIPS) + if (TestCpuFlag(kCpuHasMIPS)) { + CopyRow = CopyRow_MIPS; + } +#endif + + // Odd height will harmlessly mirror the middle row twice. + for (y = 0; y < half_height; ++y) { + ARGBMirrorRow(src, row, width); // Mirror first row into a buffer + ARGBMirrorRow(src_bot, dst, width); // Mirror last row into first row + CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last + src += src_stride; + dst += dst_stride; + src_bot -= src_stride; + dst_bot -= dst_stride; + } + free_aligned_buffer_64(row); +} + +LIBYUV_API +int ARGBRotate(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height, + enum RotationMode mode) { + if (!src_argb || width <= 0 || height == 0 || !dst_argb) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + + switch (mode) { + case kRotate0: + // copy frame + return ARGBCopy(src_argb, src_stride_argb, + dst_argb, dst_stride_argb, + width, height); + case kRotate90: + ARGBRotate90(src_argb, src_stride_argb, + dst_argb, dst_stride_argb, + width, height); + return 0; + case kRotate270: + ARGBRotate270(src_argb, src_stride_argb, + dst_argb, dst_stride_argb, + width, height); + return 0; + case kRotate180: + ARGBRotate180(src_argb, src_stride_argb, + dst_argb, dst_stride_argb, + width, height); + return 0; + default: + break; + } + return -1; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/rotate_mips.cc b/third_party/libyuv/source/rotate_mips.cc new file mode 100644 index 000000000..70770fd06 --- /dev/null +++ b/third_party/libyuv/source/rotate_mips.cc @@ -0,0 +1,485 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_MIPS) && \ + defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \ + (_MIPS_SIM == _MIPS_SIM_ABI32) + +void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 + "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 + "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 + "addu $t3, $t2, %[src_stride] \n" + "addu $t5, $t4, %[src_stride] \n" + "addu $t6, $t2, $t4 \n" + "andi $t0, %[dst], 0x3 \n" + "andi $t1, %[dst_stride], 0x3 \n" + "or $t0, $t0, $t1 \n" + "bnez $t0, 11f \n" + " subu $t7, $t9, %[src_stride] \n" +//dst + dst_stride word aligned + "1: \n" + "lbu $t0, 0(%[src]) \n" + "lbux $t1, %[src_stride](%[src]) \n" + "lbux $t8, $t2(%[src]) \n" + "lbux $t9, $t3(%[src]) \n" + "sll $t1, $t1, 16 \n" + "sll $t9, $t9, 16 \n" + "or $t0, $t0, $t1 \n" + "or $t8, $t8, $t9 \n" + "precr.qb.ph $s0, $t8, $t0 \n" + "lbux $t0, $t4(%[src]) \n" + "lbux $t1, $t5(%[src]) \n" + "lbux $t8, $t6(%[src]) \n" + "lbux $t9, $t7(%[src]) \n" + "sll $t1, $t1, 16 \n" + "sll $t9, $t9, 16 \n" + "or $t0, $t0, $t1 \n" + "or $t8, $t8, $t9 \n" + "precr.qb.ph $s1, $t8, $t0 \n" + "sw $s0, 0(%[dst]) \n" + "addiu %[width], -1 \n" + "addiu %[src], 1 \n" + "sw $s1, 4(%[dst]) \n" + "bnez %[width], 1b \n" + " addu %[dst], %[dst], %[dst_stride] \n" + "b 2f \n" +//dst + dst_stride unaligned + "11: \n" + "lbu $t0, 0(%[src]) \n" + "lbux $t1, %[src_stride](%[src]) \n" + "lbux $t8, $t2(%[src]) \n" + "lbux $t9, $t3(%[src]) \n" + "sll $t1, $t1, 16 \n" + "sll $t9, $t9, 16 \n" + "or $t0, $t0, $t1 \n" + "or $t8, $t8, $t9 \n" + "precr.qb.ph $s0, $t8, $t0 \n" + "lbux $t0, $t4(%[src]) \n" + "lbux $t1, $t5(%[src]) \n" + "lbux $t8, $t6(%[src]) \n" + "lbux $t9, $t7(%[src]) \n" + "sll $t1, $t1, 16 \n" + "sll $t9, $t9, 16 \n" + "or $t0, $t0, $t1 \n" + "or $t8, $t8, $t9 \n" + "precr.qb.ph $s1, $t8, $t0 \n" + "swr $s0, 0(%[dst]) \n" + "swl $s0, 3(%[dst]) \n" + "addiu %[width], -1 \n" + "addiu %[src], 1 \n" + "swr $s1, 4(%[dst]) \n" + "swl $s1, 7(%[dst]) \n" + "bnez %[width], 11b \n" + "addu %[dst], %[dst], %[dst_stride] \n" + "2: \n" + ".set pop \n" + :[src] "+r" (src), + [dst] "+r" (dst), + [width] "+r" (width) + :[src_stride] "r" (src_stride), + [dst_stride] "r" (dst_stride) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9", + "s0", "s1" + ); +} + +void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width) { + __asm__ __volatile__ ( + ".set noat \n" + ".set push \n" + ".set noreorder \n" + "beqz %[width], 2f \n" + " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 + "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 + "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 + "addu $t3, $t2, %[src_stride] \n" + "addu $t5, $t4, %[src_stride] \n" + "addu $t6, $t2, $t4 \n" + + "srl $AT, %[width], 0x2 \n" + "andi $t0, %[dst], 0x3 \n" + "andi $t1, %[dst_stride], 0x3 \n" + "or $t0, $t0, $t1 \n" + "bnez $t0, 11f \n" + " subu $t7, $t9, %[src_stride] \n" +//dst + dst_stride word aligned + "1: \n" + "lw $t0, 0(%[src]) \n" + "lwx $t1, %[src_stride](%[src]) \n" + "lwx $t8, $t2(%[src]) \n" + "lwx $t9, 
$t3(%[src]) \n" + +// t0 = | 30 | 20 | 10 | 00 | +// t1 = | 31 | 21 | 11 | 01 | +// t8 = | 32 | 22 | 12 | 02 | +// t9 = | 33 | 23 | 13 | 03 | + + "precr.qb.ph $s0, $t1, $t0 \n" + "precr.qb.ph $s1, $t9, $t8 \n" + "precrq.qb.ph $s2, $t1, $t0 \n" + "precrq.qb.ph $s3, $t9, $t8 \n" + + // s0 = | 21 | 01 | 20 | 00 | + // s1 = | 23 | 03 | 22 | 02 | + // s2 = | 31 | 11 | 30 | 10 | + // s3 = | 33 | 13 | 32 | 12 | + + "precr.qb.ph $s4, $s1, $s0 \n" + "precrq.qb.ph $s5, $s1, $s0 \n" + "precr.qb.ph $s6, $s3, $s2 \n" + "precrq.qb.ph $s7, $s3, $s2 \n" + + // s4 = | 03 | 02 | 01 | 00 | + // s5 = | 23 | 22 | 21 | 20 | + // s6 = | 13 | 12 | 11 | 10 | + // s7 = | 33 | 32 | 31 | 30 | + + "lwx $t0, $t4(%[src]) \n" + "lwx $t1, $t5(%[src]) \n" + "lwx $t8, $t6(%[src]) \n" + "lwx $t9, $t7(%[src]) \n" + +// t0 = | 34 | 24 | 14 | 04 | +// t1 = | 35 | 25 | 15 | 05 | +// t8 = | 36 | 26 | 16 | 06 | +// t9 = | 37 | 27 | 17 | 07 | + + "precr.qb.ph $s0, $t1, $t0 \n" + "precr.qb.ph $s1, $t9, $t8 \n" + "precrq.qb.ph $s2, $t1, $t0 \n" + "precrq.qb.ph $s3, $t9, $t8 \n" + + // s0 = | 25 | 05 | 24 | 04 | + // s1 = | 27 | 07 | 26 | 06 | + // s2 = | 35 | 15 | 34 | 14 | + // s3 = | 37 | 17 | 36 | 16 | + + "precr.qb.ph $t0, $s1, $s0 \n" + "precrq.qb.ph $t1, $s1, $s0 \n" + "precr.qb.ph $t8, $s3, $s2 \n" + "precrq.qb.ph $t9, $s3, $s2 \n" + + // t0 = | 07 | 06 | 05 | 04 | + // t1 = | 27 | 26 | 25 | 24 | + // t8 = | 17 | 16 | 15 | 14 | + // t9 = | 37 | 36 | 35 | 34 | + + "addu $s0, %[dst], %[dst_stride] \n" + "addu $s1, $s0, %[dst_stride] \n" + "addu $s2, $s1, %[dst_stride] \n" + + "sw $s4, 0(%[dst]) \n" + "sw $t0, 4(%[dst]) \n" + "sw $s6, 0($s0) \n" + "sw $t8, 4($s0) \n" + "sw $s5, 0($s1) \n" + "sw $t1, 4($s1) \n" + "sw $s7, 0($s2) \n" + "sw $t9, 4($s2) \n" + + "addiu $AT, -1 \n" + "addiu %[src], 4 \n" + + "bnez $AT, 1b \n" + " addu %[dst], $s2, %[dst_stride] \n" + "b 2f \n" +//dst + dst_stride unaligned + "11: \n" + "lw $t0, 0(%[src]) \n" + "lwx $t1, %[src_stride](%[src]) \n" + "lwx $t8, $t2(%[src]) \n" + "lwx $t9, $t3(%[src]) \n" + +// t0 = | 30 | 20 | 10 | 00 | +// t1 = | 31 | 21 | 11 | 01 | +// t8 = | 32 | 22 | 12 | 02 | +// t9 = | 33 | 23 | 13 | 03 | + + "precr.qb.ph $s0, $t1, $t0 \n" + "precr.qb.ph $s1, $t9, $t8 \n" + "precrq.qb.ph $s2, $t1, $t0 \n" + "precrq.qb.ph $s3, $t9, $t8 \n" + + // s0 = | 21 | 01 | 20 | 00 | + // s1 = | 23 | 03 | 22 | 02 | + // s2 = | 31 | 11 | 30 | 10 | + // s3 = | 33 | 13 | 32 | 12 | + + "precr.qb.ph $s4, $s1, $s0 \n" + "precrq.qb.ph $s5, $s1, $s0 \n" + "precr.qb.ph $s6, $s3, $s2 \n" + "precrq.qb.ph $s7, $s3, $s2 \n" + + // s4 = | 03 | 02 | 01 | 00 | + // s5 = | 23 | 22 | 21 | 20 | + // s6 = | 13 | 12 | 11 | 10 | + // s7 = | 33 | 32 | 31 | 30 | + + "lwx $t0, $t4(%[src]) \n" + "lwx $t1, $t5(%[src]) \n" + "lwx $t8, $t6(%[src]) \n" + "lwx $t9, $t7(%[src]) \n" + +// t0 = | 34 | 24 | 14 | 04 | +// t1 = | 35 | 25 | 15 | 05 | +// t8 = | 36 | 26 | 16 | 06 | +// t9 = | 37 | 27 | 17 | 07 | + + "precr.qb.ph $s0, $t1, $t0 \n" + "precr.qb.ph $s1, $t9, $t8 \n" + "precrq.qb.ph $s2, $t1, $t0 \n" + "precrq.qb.ph $s3, $t9, $t8 \n" + + // s0 = | 25 | 05 | 24 | 04 | + // s1 = | 27 | 07 | 26 | 06 | + // s2 = | 35 | 15 | 34 | 14 | + // s3 = | 37 | 17 | 36 | 16 | + + "precr.qb.ph $t0, $s1, $s0 \n" + "precrq.qb.ph $t1, $s1, $s0 \n" + "precr.qb.ph $t8, $s3, $s2 \n" + "precrq.qb.ph $t9, $s3, $s2 \n" + + // t0 = | 07 | 06 | 05 | 04 | + // t1 = | 27 | 26 | 25 | 24 | + // t8 = | 17 | 16 | 15 | 14 | + // t9 = | 37 | 36 | 35 | 34 | + + "addu $s0, %[dst], %[dst_stride] \n" + "addu $s1, $s0, %[dst_stride] \n" + "addu $s2, $s1, 
%[dst_stride] \n" + + "swr $s4, 0(%[dst]) \n" + "swl $s4, 3(%[dst]) \n" + "swr $t0, 4(%[dst]) \n" + "swl $t0, 7(%[dst]) \n" + "swr $s6, 0($s0) \n" + "swl $s6, 3($s0) \n" + "swr $t8, 4($s0) \n" + "swl $t8, 7($s0) \n" + "swr $s5, 0($s1) \n" + "swl $s5, 3($s1) \n" + "swr $t1, 4($s1) \n" + "swl $t1, 7($s1) \n" + "swr $s7, 0($s2) \n" + "swl $s7, 3($s2) \n" + "swr $t9, 4($s2) \n" + "swl $t9, 7($s2) \n" + + "addiu $AT, -1 \n" + "addiu %[src], 4 \n" + + "bnez $AT, 11b \n" + " addu %[dst], $s2, %[dst_stride] \n" + "2: \n" + ".set pop \n" + ".set at \n" + :[src] "+r" (src), + [dst] "+r" (dst), + [width] "+r" (width) + :[src_stride] "r" (src_stride), + [dst_stride] "r" (dst_stride) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9", + "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7" + ); +} + +void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "beqz %[width], 2f \n" + " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 + "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 + "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 + "addu $t3, $t2, %[src_stride] \n" + "addu $t5, $t4, %[src_stride] \n" + "addu $t6, $t2, $t4 \n" + "subu $t7, $t9, %[src_stride] \n" + "srl $t1, %[width], 1 \n" + +// check word alignment for dst_a, dst_b, dst_stride_a and dst_stride_b + "andi $t0, %[dst_a], 0x3 \n" + "andi $t8, %[dst_b], 0x3 \n" + "or $t0, $t0, $t8 \n" + "andi $t8, %[dst_stride_a], 0x3 \n" + "andi $s5, %[dst_stride_b], 0x3 \n" + "or $t8, $t8, $s5 \n" + "or $t0, $t0, $t8 \n" + "bnez $t0, 11f \n" + " nop \n" +// dst + dst_stride word aligned (both, a & b dst addresses) + "1: \n" + "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0| + "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1| + "addu $s5, %[dst_a], %[dst_stride_a] \n" + "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2| + "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3| + "addu $s6, %[dst_b], %[dst_stride_b] \n" + + "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0| + "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2| + "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0| + "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0| + + "sll $t0, $t0, 16 \n" + "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0| + "sll $t9, $t9, 16 \n" + "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2| + + "sw $s3, 0($s5) \n" + "sw $s4, 0($s6) \n" + + "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0| + "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0| + + "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4| + "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5| + "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6| + "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7| + "sw $s3, 0(%[dst_a]) \n" + "sw $s4, 0(%[dst_b]) \n" + + "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4| + "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7| + "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4| + "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4| + + "sll $t0, $t0, 16 \n" + "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4| + "sll $t9, $t9, 16 \n" + "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6| + "sw $s3, 4($s5) \n" + "sw $s4, 4($s6) \n" + + "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4| + "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4| + + "addiu %[src], 4 \n" + "addiu $t1, -1 \n" + "sll $t0, %[dst_stride_a], 1 \n" + "sll $t8, %[dst_stride_b], 1 \n" + "sw $s3, 4(%[dst_a]) \n" + "sw $s4, 4(%[dst_b]) \n" + "addu %[dst_a], %[dst_a], $t0 \n" + "bnez $t1, 1b \n" + " addu %[dst_b], %[dst_b], $t8 \n" + "b 2f \n" + " nop \n" + +// dst_a 
or dst_b or dst_stride_a or dst_stride_b not word aligned + "11: \n" + "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0| + "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1| + "addu $s5, %[dst_a], %[dst_stride_a] \n" + "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2| + "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3| + "addu $s6, %[dst_b], %[dst_stride_b] \n" + + "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0| + "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2| + "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0| + "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0| + + "sll $t0, $t0, 16 \n" + "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0| + "sll $t9, $t9, 16 \n" + "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2| + + "swr $s3, 0($s5) \n" + "swl $s3, 3($s5) \n" + "swr $s4, 0($s6) \n" + "swl $s4, 3($s6) \n" + + "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0| + "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0| + + "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4| + "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5| + "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6| + "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7| + "swr $s3, 0(%[dst_a]) \n" + "swl $s3, 3(%[dst_a]) \n" + "swr $s4, 0(%[dst_b]) \n" + "swl $s4, 3(%[dst_b]) \n" + + "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4| + "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7| + "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4| + "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4| + + "sll $t0, $t0, 16 \n" + "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4| + "sll $t9, $t9, 16 \n" + "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6| + + "swr $s3, 4($s5) \n" + "swl $s3, 7($s5) \n" + "swr $s4, 4($s6) \n" + "swl $s4, 7($s6) \n" + + "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4| + "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4| + + "addiu %[src], 4 \n" + "addiu $t1, -1 \n" + "sll $t0, %[dst_stride_a], 1 \n" + "sll $t8, %[dst_stride_b], 1 \n" + "swr $s3, 4(%[dst_a]) \n" + "swl $s3, 7(%[dst_a]) \n" + "swr $s4, 4(%[dst_b]) \n" + "swl $s4, 7(%[dst_b]) \n" + "addu %[dst_a], %[dst_a], $t0 \n" + "bnez $t1, 11b \n" + " addu %[dst_b], %[dst_b], $t8 \n" + + "2: \n" + ".set pop \n" + : [src] "+r" (src), + [dst_a] "+r" (dst_a), + [dst_b] "+r" (dst_b), + [width] "+r" (width), + [src_stride] "+r" (src_stride) + : [dst_stride_a] "r" (dst_stride_a), + [dst_stride_b] "r" (dst_stride_b) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9", + "s0", "s1", "s2", "s3", + "s4", "s5", "s6" + ); +} + +#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/rotate_neon.cc b/third_party/libyuv/source/rotate_neon.cc new file mode 100644 index 000000000..d354e11fa --- /dev/null +++ b/third_party/libyuv/source/rotate_neon.cc @@ -0,0 +1,533 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/row.h" + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) + +static uvec8 kVTbl4x4Transpose = + { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; + +void TransposeWx8_NEON(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width) { + const uint8* src_temp = NULL; + asm volatile ( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allows for this + "sub %5, #8 \n" + + // handle 8x8 blocks. this should be the majority of the plane + ".p2align 2 \n" + "1: \n" + "mov %0, %1 \n" + + MEMACCESS(0) + "vld1.8 {d0}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d1}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d2}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d3}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d4}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d5}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d6}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d7}, [%0] \n" + + "vtrn.8 d1, d0 \n" + "vtrn.8 d3, d2 \n" + "vtrn.8 d5, d4 \n" + "vtrn.8 d7, d6 \n" + + "vtrn.16 d1, d3 \n" + "vtrn.16 d0, d2 \n" + "vtrn.16 d5, d7 \n" + "vtrn.16 d4, d6 \n" + + "vtrn.32 d1, d5 \n" + "vtrn.32 d0, d4 \n" + "vtrn.32 d3, d7 \n" + "vtrn.32 d2, d6 \n" + + "vrev16.8 q0, q0 \n" + "vrev16.8 q1, q1 \n" + "vrev16.8 q2, q2 \n" + "vrev16.8 q3, q3 \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "vst1.8 {d1}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d0}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d3}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d2}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d5}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d4}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d7}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d6}, [%0] \n" + + "add %1, #8 \n" // src += 8 + "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride + "subs %5, #8 \n" // w -= 8 + "bge 1b \n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. + "adds %5, #8 \n" + "beq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %5, #2 \n" + "blt 3f \n" + + "cmp %5, #4 \n" + "blt 2f \n" + + // 4x8 block + "mov %0, %1 \n" + MEMACCESS(0) + "vld1.32 {d0[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d0[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d1[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d1[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d2[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d2[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d3[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d3[1]}, [%0] \n" + + "mov %0, %3 \n" + + MEMACCESS(6) + "vld1.8 {q3}, [%6] \n" + + "vtbl.8 d4, {d0, d1}, d6 \n" + "vtbl.8 d5, {d0, d1}, d7 \n" + "vtbl.8 d0, {d2, d3}, d6 \n" + "vtbl.8 d1, {d2, d3}, d7 \n" + + // TODO(frkoenig): Rework shuffle above to + // write out with 4 instead of 8 writes. 
+ MEMACCESS(0) + "vst1.32 {d4[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d4[1]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d5[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d5[1]}, [%0] \n" + + "add %0, %3, #4 \n" + MEMACCESS(0) + "vst1.32 {d0[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d0[1]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d1[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d1[1]}, [%0] \n" + + "add %1, #4 \n" // src += 4 + "add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride + "subs %5, #4 \n" // w -= 4 + "beq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %5, #2 \n" + "blt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + MEMACCESS(0) + "vld1.16 {d0[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d1[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d0[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d1[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d0[2]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d1[2]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d0[3]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d1[3]}, [%0] \n" + + "vtrn.8 d0, d1 \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "vst1.64 {d0}, [%0], %4 \n" + MEMACCESS(0) + "vst1.64 {d1}, [%0] \n" + + "add %1, #2 \n" // src += 2 + "add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride + "subs %5, #2 \n" // w -= 2 + "beq 4f \n" + + // 1x8 block + "3: \n" + MEMACCESS(1) + "vld1.8 {d0[0]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[1]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[2]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[3]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[4]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[5]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[6]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[7]}, [%1] \n" + + MEMACCESS(3) + "vst1.64 {d0}, [%3] \n" + + "4: \n" + + : "+r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(src_stride), // %2 + "+r"(dst), // %3 + "+r"(dst_stride), // %4 + "+r"(width) // %5 + : "r"(&kVTbl4x4Transpose) // %6 + : "memory", "cc", "q0", "q1", "q2", "q3" + ); +} + +static uvec8 kVTbl4x4TransposeDi = + { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 }; + +void TransposeUVWx8_NEON(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width) { + const uint8* src_temp = NULL; + asm volatile ( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allows for this + "sub %7, #8 \n" + + // handle 8x8 blocks. 
this should be the majority of the plane + ".p2align 2 \n" + "1: \n" + "mov %0, %1 \n" + + MEMACCESS(0) + "vld2.8 {d0, d1}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d2, d3}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d4, d5}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d6, d7}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d16, d17}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d18, d19}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d20, d21}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d22, d23}, [%0] \n" + + "vtrn.8 q1, q0 \n" + "vtrn.8 q3, q2 \n" + "vtrn.8 q9, q8 \n" + "vtrn.8 q11, q10 \n" + + "vtrn.16 q1, q3 \n" + "vtrn.16 q0, q2 \n" + "vtrn.16 q9, q11 \n" + "vtrn.16 q8, q10 \n" + + "vtrn.32 q1, q9 \n" + "vtrn.32 q0, q8 \n" + "vtrn.32 q3, q11 \n" + "vtrn.32 q2, q10 \n" + + "vrev16.8 q0, q0 \n" + "vrev16.8 q1, q1 \n" + "vrev16.8 q2, q2 \n" + "vrev16.8 q3, q3 \n" + "vrev16.8 q8, q8 \n" + "vrev16.8 q9, q9 \n" + "vrev16.8 q10, q10 \n" + "vrev16.8 q11, q11 \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "vst1.8 {d2}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d0}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d6}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d4}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d18}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d16}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d22}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d20}, [%0] \n" + + "mov %0, %5 \n" + + MEMACCESS(0) + "vst1.8 {d3}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d1}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d7}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d5}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d19}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d17}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d23}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d21}, [%0] \n" + + "add %1, #8*2 \n" // src += 8*2 + "add %3, %3, %4, lsl #3 \n" // dst_a += 8 * dst_stride_a + "add %5, %5, %6, lsl #3 \n" // dst_b += 8 * dst_stride_b + "subs %7, #8 \n" // w -= 8 + "bge 1b \n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. 
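+  // A scalar sketch of this counter idiom (names hypothetical): the kernels
+  // iterate "for (w = width - 8; w >= 0; w -= 8)" over full 8-wide blocks,
+  // then "w += 8" leaves 0..7 residual columns for the 4/2/1-wide paths.
+  // MEMACCESS(n), as far as I can tell, is a libyuv macro that expands to a
+  // Native Client address-masking sequence for operand n and to nothing in
+  // ordinary builds.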
+ "adds %7, #8 \n" + "beq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %7, #2 \n" + "blt 3f \n" + + "cmp %7, #4 \n" + "blt 2f \n" + + // TODO(frkoenig): Clean this up + // 4x8 block + "mov %0, %1 \n" + MEMACCESS(0) + "vld1.64 {d0}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d1}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d2}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d3}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d4}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d5}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d6}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d7}, [%0] \n" + + MEMACCESS(8) + "vld1.8 {q15}, [%8] \n" + + "vtrn.8 q0, q1 \n" + "vtrn.8 q2, q3 \n" + + "vtbl.8 d16, {d0, d1}, d30 \n" + "vtbl.8 d17, {d0, d1}, d31 \n" + "vtbl.8 d18, {d2, d3}, d30 \n" + "vtbl.8 d19, {d2, d3}, d31 \n" + "vtbl.8 d20, {d4, d5}, d30 \n" + "vtbl.8 d21, {d4, d5}, d31 \n" + "vtbl.8 d22, {d6, d7}, d30 \n" + "vtbl.8 d23, {d6, d7}, d31 \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "vst1.32 {d16[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d16[1]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d17[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d17[1]}, [%0], %4 \n" + + "add %0, %3, #4 \n" + MEMACCESS(0) + "vst1.32 {d20[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d20[1]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d21[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d21[1]}, [%0] \n" + + "mov %0, %5 \n" + + MEMACCESS(0) + "vst1.32 {d18[0]}, [%0], %6 \n" + MEMACCESS(0) + "vst1.32 {d18[1]}, [%0], %6 \n" + MEMACCESS(0) + "vst1.32 {d19[0]}, [%0], %6 \n" + MEMACCESS(0) + "vst1.32 {d19[1]}, [%0], %6 \n" + + "add %0, %5, #4 \n" + MEMACCESS(0) + "vst1.32 {d22[0]}, [%0], %6 \n" + MEMACCESS(0) + "vst1.32 {d22[1]}, [%0], %6 \n" + MEMACCESS(0) + "vst1.32 {d23[0]}, [%0], %6 \n" + MEMACCESS(0) + "vst1.32 {d23[1]}, [%0] \n" + + "add %1, #4*2 \n" // src += 4 * 2 + "add %3, %3, %4, lsl #2 \n" // dst_a += 4 * dst_stride_a + "add %5, %5, %6, lsl #2 \n" // dst_b += 4 * dst_stride_b + "subs %7, #4 \n" // w -= 4 + "beq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %7, #2 \n" + "blt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + MEMACCESS(0) + "vld2.16 {d0[0], d2[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d1[0], d3[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d0[1], d2[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d1[1], d3[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d0[2], d2[2]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d1[2], d3[2]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d0[3], d2[3]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d1[3], d3[3]}, [%0] \n" + + "vtrn.8 d0, d1 \n" + "vtrn.8 d2, d3 \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "vst1.64 {d0}, [%0], %4 \n" + MEMACCESS(0) + "vst1.64 {d2}, [%0] \n" + + "mov %0, %5 \n" + + MEMACCESS(0) + "vst1.64 {d1}, [%0], %6 \n" + MEMACCESS(0) + "vst1.64 {d3}, [%0] \n" + + "add %1, #2*2 \n" // src += 2 * 2 + "add %3, %3, %4, lsl #1 \n" // dst_a += 2 * dst_stride_a + "add %5, %5, %6, lsl #1 \n" // dst_b += 2 * dst_stride_b + "subs %7, #2 \n" // w -= 2 + "beq 4f \n" + + // 1x8 block + "3: \n" + MEMACCESS(1) + "vld2.8 {d0[0], d1[0]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[1], d1[1]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[2], d1[2]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[3], d1[3]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[4], d1[4]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[5], d1[5]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[6], d1[6]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[7], d1[7]}, [%1] \n" + + MEMACCESS(3) + "vst1.64 {d0}, [%3] 
\n" + MEMACCESS(5) + "vst1.64 {d1}, [%5] \n" + + "4: \n" + + : "+r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(src_stride), // %2 + "+r"(dst_a), // %3 + "+r"(dst_stride_a), // %4 + "+r"(dst_b), // %5 + "+r"(dst_stride_b), // %6 + "+r"(width) // %7 + : "r"(&kVTbl4x4TransposeDi) // %8 + : "memory", "cc", + "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" + ); +} +#endif + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/row_any.cc b/third_party/libyuv/source/row_any.cc index 27a0de119..97ef84417 100644 --- a/third_party/libyuv/source/row_any.cc +++ b/third_party/libyuv/source/row_any.cc @@ -8,9 +8,9 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "third_party/libyuv/include/libyuv/row.h" +#include "libyuv/row.h" -#include "third_party/libyuv/include/libyuv/basic_types.h" +#include "libyuv/basic_types.h" #ifdef __cplusplus namespace libyuv { @@ -35,10 +35,12 @@ extern "C" { } #ifdef HAS_I422TOARGBROW_SSSE3 -YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C, - 0, 4, 7) YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C, 1, 4, 7) +#endif // HAS_I422TOARGBROW_SSSE3 +#ifdef HAS_I444TOARGBROW_SSSE3 +YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C, + 0, 4, 7) YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C, 2, 4, 7) YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C, @@ -59,7 +61,7 @@ YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, I422ToRGB24Row_C, 1, 3, 7) YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, I422ToRAWRow_C, 1, 3, 7) YANY(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, I422ToYUY2Row_C, 1, 2, 15) YANY(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, I422ToUYVYRow_C, 1, 2, 15) -#endif // HAS_I422TOARGBROW_SSSE3 +#endif // HAS_I444TOARGBROW_SSSE3 #ifdef HAS_I422TOARGBROW_AVX2 YANY(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, I422ToARGBRow_C, 1, 4, 15) #endif // HAS_I422TOARGBROW_AVX2 diff --git a/third_party/libyuv/source/row_common.cc b/third_party/libyuv/source/row_common.cc index ceb3836cd..fa2b752a2 100644 --- a/third_party/libyuv/source/row_common.cc +++ b/third_party/libyuv/source/row_common.cc @@ -8,11 +8,11 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "third_party/libyuv/include/libyuv/row.h" +#include "libyuv/row.h" #include // For memcpy and memset. -#include "third_party/libyuv/include/libyuv/basic_types.h" +#include "libyuv/basic_types.h" #ifdef __cplusplus namespace libyuv { diff --git a/third_party/libyuv/source/row_mips.cc b/third_party/libyuv/source/row_mips.cc index a804670d3..ae9370c1b 100644 --- a/third_party/libyuv/source/row_mips.cc +++ b/third_party/libyuv/source/row_mips.cc @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include "third_party/libyuv/include/libyuv/row.h" +#include "libyuv/row.h" #ifdef __cplusplus namespace libyuv { @@ -16,7 +16,8 @@ extern "C" { #endif // The following are available on Mips platforms: -#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) +#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \ + (_MIPS_SIM == _MIPS_SIM_ABI32) #ifdef HAS_COPYROW_MIPS void CopyRow_MIPS(const uint8* src, uint8* dst, int count) { @@ -376,7 +377,9 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) { // MIPS DSPR2 functions #if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \ - (__mips_dsp_rev >= 2) + (__mips_dsp_rev >= 2) && \ + (_MIPS_SIM == _MIPS_SIM_ABI32) + void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { __asm__ __volatile__ ( diff --git a/third_party/libyuv/source/row_neon.cc b/third_party/libyuv/source/row_neon.cc index c5ae2c583..1392cf5fc 100644 --- a/third_party/libyuv/source/row_neon.cc +++ b/third_party/libyuv/source/row_neon.cc @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "third_party/libyuv/include/libyuv/row.h" +#include "libyuv/row.h" #ifdef __cplusplus namespace libyuv { @@ -20,34 +20,46 @@ extern "C" { // Read 8 Y, 4 U and 4 V from 422 #define READYUV422 \ + MEMACCESS(0) \ "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ "vld1.32 {d2[0]}, [%1]! \n" \ + MEMACCESS(2) \ "vld1.32 {d2[1]}, [%2]! \n" // Read 8 Y, 2 U and 2 V from 422 #define READYUV411 \ + MEMACCESS(0) \ "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ "vld1.16 {d2[0]}, [%1]! \n" \ + MEMACCESS(2) \ "vld1.16 {d2[1]}, [%2]! \n" \ "vmov.u8 d3, d2 \n" \ "vzip.u8 d2, d3 \n" // Read 8 Y, 8 U and 8 V from 444 #define READYUV444 \ + MEMACCESS(0) \ "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ "vld1.8 {d2}, [%1]! \n" \ + MEMACCESS(2) \ "vld1.8 {d3}, [%2]! \n" \ "vpaddl.u8 q1, q1 \n" \ "vrshrn.u16 d2, q1, #1 \n" // Read 8 Y, and set 4 U and 4 V to 128 #define READYUV400 \ + MEMACCESS(0) \ "vld1.8 {d0}, [%0]! \n" \ "vmov.u8 d2, #128 \n" // Read 8 Y and 4 UV from NV12 #define READNV12 \ + MEMACCESS(0) \ "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ "vld1.8 {d2}, [%1]! \n" \ "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ "vuzp.u8 d2, d3 \n" \ @@ -55,7 +67,9 @@ extern "C" { // Read 8 Y and 4 VU from NV21 #define READNV21 \ + MEMACCESS(0) \ "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ "vld1.8 {d2}, [%1]! \n" \ "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ "vuzp.u8 d3, d2 \n" \ @@ -63,6 +77,7 @@ extern "C" { // Read 8 YUY2 #define READYUY2 \ + MEMACCESS(0) \ "vld2.8 {d0, d2}, [%0]! \n" \ "vmov.u8 d3, d2 \n" \ "vuzp.u8 d2, d3 \n" \ @@ -70,6 +85,7 @@ extern "C" { // Read 8 UYVY #define READUYVY \ + MEMACCESS(0) \ "vld2.8 {d2, d3}, [%0]! \n" \ "vmov.u8 d0, d3 \n" \ "vmov.u8 d3, d2 \n" \ @@ -113,7 +129,9 @@ void I444ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -124,6 +142,7 @@ void I444ToARGBRow_NEON(const uint8* src_y, YUV422TORGB "subs %4, %4, #8 \n" "vmov.u8 d23, #255 \n" + MEMACCESS(3) "vst4.8 {d20, d21, d22, d23}, [%3]! 
\n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -144,7 +163,9 @@ void I422ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -155,6 +176,7 @@ void I422ToARGBRow_NEON(const uint8* src_y, YUV422TORGB "subs %4, %4, #8 \n" "vmov.u8 d23, #255 \n" + MEMACCESS(3) "vst4.8 {d20, d21, d22, d23}, [%3]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -175,7 +197,9 @@ void I411ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -186,6 +210,7 @@ void I411ToARGBRow_NEON(const uint8* src_y, YUV422TORGB "subs %4, %4, #8 \n" "vmov.u8 d23, #255 \n" + MEMACCESS(3) "vst4.8 {d20, d21, d22, d23}, [%3]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -206,7 +231,9 @@ void I422ToBGRARow_NEON(const uint8* src_y, uint8* dst_bgra, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -218,6 +245,7 @@ void I422ToBGRARow_NEON(const uint8* src_y, "subs %4, %4, #8 \n" "vswp.u8 d20, d22 \n" "vmov.u8 d19, #255 \n" + MEMACCESS(3) "vst4.8 {d19, d20, d21, d22}, [%3]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -238,7 +266,9 @@ void I422ToABGRRow_NEON(const uint8* src_y, uint8* dst_abgr, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -250,6 +280,7 @@ void I422ToABGRRow_NEON(const uint8* src_y, "subs %4, %4, #8 \n" "vswp.u8 d20, d22 \n" "vmov.u8 d23, #255 \n" + MEMACCESS(3) "vst4.8 {d20, d21, d22, d23}, [%3]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -270,7 +301,9 @@ void I422ToRGBARow_NEON(const uint8* src_y, uint8* dst_rgba, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -281,6 +314,7 @@ void I422ToRGBARow_NEON(const uint8* src_y, YUV422TORGB "subs %4, %4, #8 \n" "vmov.u8 d19, #255 \n" + MEMACCESS(3) "vst4.8 {d19, d20, d21, d22}, [%3]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -301,7 +335,9 @@ void I422ToRGB24Row_NEON(const uint8* src_y, uint8* dst_rgb24, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -311,6 +347,7 @@ void I422ToRGB24Row_NEON(const uint8* src_y, READYUV422 YUV422TORGB "subs %4, %4, #8 \n" + MEMACCESS(3) "vst3.8 {d20, d21, d22}, [%3]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -331,7 +368,9 @@ void I422ToRAWRow_NEON(const uint8* src_y, uint8* dst_raw, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -342,6 +381,7 @@ void I422ToRAWRow_NEON(const uint8* src_y, YUV422TORGB "subs %4, %4, #8 \n" "vswp.u8 d20, d22 \n" + MEMACCESS(3) "vst3.8 {d20, d21, d22}, [%3]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -374,7 +414,9 @@ void I422ToRGB565Row_NEON(const uint8* src_y, uint8* dst_rgb565, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -385,6 +427,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y, YUV422TORGB "subs %4, %4, #8 \n" ARGBTORGB565 + MEMACCESS(3) "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565. 
"bgt 1b \n" : "+r"(src_y), // %0 @@ -420,7 +463,9 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, uint8* dst_argb1555, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -432,6 +477,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, "subs %4, %4, #8 \n" "vmov.u8 d23, #255 \n" ARGBTOARGB1555 + MEMACCESS(3) "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555. "bgt 1b \n" : "+r"(src_y), // %0 @@ -461,7 +507,9 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, uint8* dst_argb4444, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -474,6 +522,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, "subs %4, %4, #8 \n" "vmov.u8 d23, #255 \n" ARGBTOARGB4444 + MEMACCESS(3) "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444. "bgt 1b \n" : "+r"(src_y), // %0 @@ -492,7 +541,9 @@ void YToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( + MEMACCESS(3) "vld1.8 {d24}, [%3] \n" + MEMACCESS(4) "vld1.8 {d25}, [%4] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -503,6 +554,7 @@ void YToARGBRow_NEON(const uint8* src_y, YUV422TORGB "subs %2, %2, #8 \n" "vmov.u8 d23, #255 \n" + MEMACCESS(1) "vst4.8 {d20, d21, d22, d23}, [%1]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -522,10 +574,12 @@ void I400ToARGBRow_NEON(const uint8* src_y, ".p2align 2 \n" "vmov.u8 d23, #255 \n" "1: \n" + MEMACCESS(0) "vld1.8 {d20}, [%0]! \n" "vmov d21, d20 \n" "vmov d22, d20 \n" "subs %2, %2, #8 \n" + MEMACCESS(1) "vst4.8 {d20, d21, d22, d23}, [%1]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -541,7 +595,9 @@ void NV12ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( + MEMACCESS(4) "vld1.8 {d24}, [%4] \n" + MEMACCESS(5) "vld1.8 {d25}, [%5] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -552,6 +608,7 @@ void NV12ToARGBRow_NEON(const uint8* src_y, YUV422TORGB "subs %3, %3, #8 \n" "vmov.u8 d23, #255 \n" + MEMACCESS(2) "vst4.8 {d20, d21, d22, d23}, [%2]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -570,7 +627,9 @@ void NV21ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( + MEMACCESS(4) "vld1.8 {d24}, [%4] \n" + MEMACCESS(5) "vld1.8 {d25}, [%5] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -581,6 +640,7 @@ void NV21ToARGBRow_NEON(const uint8* src_y, YUV422TORGB "subs %3, %3, #8 \n" "vmov.u8 d23, #255 \n" + MEMACCESS(2) "vst4.8 {d20, d21, d22, d23}, [%2]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -599,7 +659,9 @@ void NV12ToRGB565Row_NEON(const uint8* src_y, uint8* dst_rgb565, int width) { asm volatile ( + MEMACCESS(4) "vld1.8 {d24}, [%4] \n" + MEMACCESS(5) "vld1.8 {d25}, [%5] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -610,6 +672,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y, YUV422TORGB "subs %3, %3, #8 \n" ARGBTORGB565 + MEMACCESS(2) "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. "bgt 1b \n" : "+r"(src_y), // %0 @@ -628,7 +691,9 @@ void NV21ToRGB565Row_NEON(const uint8* src_y, uint8* dst_rgb565, int width) { asm volatile ( + MEMACCESS(4) "vld1.8 {d24}, [%4] \n" + MEMACCESS(5) "vld1.8 {d25}, [%5] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -639,6 +704,7 @@ void NV21ToRGB565Row_NEON(const uint8* src_y, YUV422TORGB "subs %3, %3, #8 \n" ARGBTORGB565 + MEMACCESS(2) "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. 
"bgt 1b \n" : "+r"(src_y), // %0 @@ -656,7 +722,9 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, uint8* dst_argb, int width) { asm volatile ( + MEMACCESS(3) "vld1.8 {d24}, [%3] \n" + MEMACCESS(4) "vld1.8 {d25}, [%4] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -667,6 +735,7 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, YUV422TORGB "subs %2, %2, #8 \n" "vmov.u8 d23, #255 \n" + MEMACCESS(1) "vst4.8 {d20, d21, d22, d23}, [%1]! \n" "bgt 1b \n" : "+r"(src_yuy2), // %0 @@ -683,7 +752,9 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, uint8* dst_argb, int width) { asm volatile ( + MEMACCESS(3) "vld1.8 {d24}, [%3] \n" + MEMACCESS(4) "vld1.8 {d25}, [%4] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -694,6 +765,7 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, YUV422TORGB "subs %2, %2, #8 \n" "vmov.u8 d23, #255 \n" + MEMACCESS(1) "vst4.8 {d20, d21, d22, d23}, [%1]! \n" "bgt 1b \n" : "+r"(src_uyvy), // %0 @@ -712,9 +784,12 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV "subs %3, %3, #16 \n" // 16 processed per loop + MEMACCESS(1) "vst1.8 {q0}, [%1]! \n" // store U + MEMACCESS(2) "vst1.8 {q1}, [%2]! \n" // store V "bgt 1b \n" : "+r"(src_uv), // %0 @@ -732,9 +807,12 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load U + MEMACCESS(1) "vld1.8 {q1}, [%1]! \n" // load V "subs %3, %3, #16 \n" // 16 processed per loop + MEMACCESS(2) "vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV "bgt 1b \n" : @@ -752,8 +830,10 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) { asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 "subs %2, %2, #32 \n" // 32 processed per loop + MEMACCESS(1) "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 "bgt 1b \n" : "+r"(src), // %0 @@ -770,6 +850,7 @@ void SetRow_NEON(uint8* dst, uint32 v32, int count) { "vdup.u32 q0, %2 \n" // duplicate 4 ints "1: \n" "subs %1, %1, #16 \n" // 16 bytes per loop + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" // store "bgt 1b \n" : "+r"(dst), // %0 @@ -798,10 +879,13 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0], r3 \n" // src -= 16 "subs %2, #16 \n" // 16 pixels per loop. "vrev64.8 q0, q0 \n" + MEMACCESS(1) "vst1.8 {d1}, [%1]! \n" // dst += 16 + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" "bgt 1b \n" : "+r"(src), // %0 @@ -822,10 +906,13 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 "subs %3, #8 \n" // 8 pixels per loop. "vrev64.8 q0, q0 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // dst += 8 + MEMACCESS(2) "vst1.8 {d1}, [%2]! \n" "bgt 1b \n" : "+r"(src_uv), // %0 @@ -846,10 +933,13 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0], r3 \n" // src -= 16 "subs %2, #4 \n" // 4 pixels per loop. "vrev64.32 q0, q0 \n" + MEMACCESS(1) "vst1.8 {d1}, [%1]! \n" // dst += 16 + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" "bgt 1b \n" : "+r"(src), // %0 @@ -865,8 +955,10 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { "vmov.u8 d4, #255 \n" // Alpha ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. 
"subs %2, %2, #8 \n" // 8 processed per loop. + MEMACCESS(1) "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(src_rgb24), // %0 @@ -882,9 +974,11 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { "vmov.u8 d4, #255 \n" // Alpha ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. "subs %2, %2, #8 \n" // 8 processed per loop. "vswp.u8 d1, d3 \n" // swap R, B + MEMACCESS(1) "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(src_raw), // %0 @@ -912,9 +1006,11 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { "vmov.u8 d3, #255 \n" // Alpha ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. RGB565TOARGB + MEMACCESS(1) "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(src_rgb565), // %0 @@ -958,9 +1054,11 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, "vmov.u8 d3, #255 \n" // Alpha ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. ARGB1555TOARGB + MEMACCESS(1) "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(src_argb1555), // %0 @@ -987,9 +1085,11 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, "vmov.u8 d3, #255 \n" // Alpha ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. ARGB4444TOARGB + MEMACCESS(1) "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(src_argb4444), // %0 @@ -1004,8 +1104,10 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. + MEMACCESS(1) "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1020,9 +1122,11 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. "vswp.u8 d1, d3 \n" // swap R, B + MEMACCESS(1) "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1037,8 +1141,10 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. "subs %2, %2, #16 \n" // 16 processed per loop. + MEMACCESS(1) "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. "bgt 1b \n" : "+r"(src_yuy2), // %0 @@ -1053,8 +1159,10 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. "subs %2, %2, #16 \n" // 16 processed per loop. + MEMACCESS(1) "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. "bgt 1b \n" : "+r"(src_uyvy), // %0 @@ -1070,9 +1178,12 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) "vst1.8 {d1}, [%1]! \n" // store 8 U. 
+ MEMACCESS(2) "vst1.8 {d3}, [%2]! \n" // store 8 V. "bgt 1b \n" : "+r"(src_yuy2), // %0 @@ -1089,9 +1200,12 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 U. + MEMACCESS(2) "vst1.8 {d2}, [%2]! \n" // store 8 V. "bgt 1b \n" : "+r"(src_uyvy), // %0 @@ -1109,12 +1223,16 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, "add %1, %0, %1 \n" // stride + src_yuy2 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. "vrhadd.u8 d1, d1, d5 \n" // average rows of U "vrhadd.u8 d3, d3, d7 \n" // average rows of V + MEMACCESS(2) "vst1.8 {d1}, [%2]! \n" // store 8 U. + MEMACCESS(3) "vst1.8 {d3}, [%3]! \n" // store 8 V. "bgt 1b \n" : "+r"(src_yuy2), // %0 @@ -1133,12 +1251,16 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, "add %1, %0, %1 \n" // stride + src_uyvy ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. "vrhadd.u8 d0, d0, d4 \n" // average rows of U "vrhadd.u8 d2, d2, d6 \n" // average rows of V + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 U. + MEMACCESS(3) "vst1.8 {d2}, [%3]! \n" // store 8 V. "bgt 1b \n" : "+r"(src_uyvy), // %0 @@ -1157,10 +1279,13 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, // change the stride to row 2 pointer "add %1, %0 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels. "subs %3, %3, #16 \n" // 16 processed per loop + MEMACCESS(1) "vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels. "vrhadd.u8 q0, q1 \n" // average row 1 and 2 + MEMACCESS(2) "vst1.8 {q0}, [%2]! \n" "bgt 1b \n" : "+r"(src_uv), // %0 @@ -1178,11 +1303,13 @@ void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, asm volatile ( "vmov.u32 d6[0], %3 \n" // selector "1: \n" + MEMACCESS(0) "vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels. "subs %2, %2, #8 \n" // 8 processed per loop "vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels "vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels "vtrn.u32 d4, d5 \n" // combine 8 pixels + MEMACCESS(1) "vst1.8 {d4}, [%1]! \n" // store 8. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1198,8 +1325,10 @@ void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, uint32 /*selector*/, int pix) { asm volatile ( "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels. "subs %2, %2, #8 \n" // 8 processed per loop + MEMACCESS(1) "vst1.8 {d1}, [%1]! \n" // store 8 G's. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1214,12 +1343,15 @@ void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { asm volatile ( + MEMACCESS(3) "vld1.8 {q2}, [%3] \n" // shuffler "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 4 pixels. "subs %2, %2, #4 \n" // 4 processed per loop "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels + MEMACCESS(1) "vst1.8 {q1}, [%1]! \n" // store 4. 
"bgt 1b \n" : "+r"(src_argb), // %0 @@ -1237,10 +1369,14 @@ void I422ToYUY2Row_NEON(const uint8* src_y, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys + MEMACCESS(1) "vld1.8 {d1}, [%1]! \n" // load 8 Us + MEMACCESS(2) "vld1.8 {d3}, [%2]! \n" // load 8 Vs "subs %4, %4, #16 \n" // 16 pixels + MEMACCESS(3) "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. "bgt 1b \n" : "+r"(src_y), // %0 @@ -1260,10 +1396,14 @@ void I422ToUYVYRow_NEON(const uint8* src_y, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys + MEMACCESS(1) "vld1.8 {d0}, [%1]! \n" // load 8 Us + MEMACCESS(2) "vld1.8 {d2}, [%2]! \n" // load 8 Vs "subs %4, %4, #16 \n" // 16 pixels + MEMACCESS(3) "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. "bgt 1b \n" : "+r"(src_y), // %0 @@ -1280,9 +1420,11 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTORGB565 + MEMACCESS(1) "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1298,9 +1440,11 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTOARGB1555 + MEMACCESS(1) "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1317,9 +1461,11 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTOARGB4444 + MEMACCESS(1) "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1338,6 +1484,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { "vmov.u8 d27, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q2, d0, d24 \n" // B @@ -1345,6 +1492,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { "vmlal.u8 q2, d2, d26 \n" // R "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d27 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1362,12 +1510,14 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q2, d0, d24 \n" // B "vmlal.u8 q2, d1, d25 \n" // G "vmlal.u8 q2, d2, d26 \n" // R "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1390,6 +1540,7 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. 
"vmull.u8 q2, d0, d24 \n" // B @@ -1405,7 +1556,9 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + MEMACCESS(2) "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1429,7 +1582,9 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. @@ -1450,7 +1605,9 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + MEMACCESS(2) "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1475,12 +1632,16 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(0) "vld4.8 {d8, d10, d12, d14}, [%0]! \n" // load 8 more ARGB pixels. + MEMACCESS(0) "vld4.8 {d9, d11, d13, d15}, [%0]! \n" // load last 8 ARGB pixels. "vpaddl.u8 q4, q4 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q5, q5 \n" // G 16 bytes -> 8 shorts. @@ -1508,7 +1669,9 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + MEMACCESS(2) "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1547,12 +1710,16 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. + MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1564,7 +1731,9 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q0, q1, q2) + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1591,12 +1760,16 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. 
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. + MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1608,7 +1781,9 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q0, q1, q2) + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1634,12 +1809,16 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. + MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels. + MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels. "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts. @@ -1651,7 +1830,9 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q3, q2, q1) + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_bgra), // %0 @@ -1677,12 +1858,16 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. + MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. + MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1694,7 +1879,9 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q2, q1, q0) + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_abgr), // %0 @@ -1720,12 +1907,16 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. + MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels. + MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels. "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts. 
@@ -1737,7 +1928,9 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q0, q1, q2) + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_rgba), // %0 @@ -1763,12 +1956,16 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. + MEMACCESS(0) "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels. + MEMACCESS(1) "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1780,7 +1977,9 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q0, q1, q2) + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_rgb24), // %0 @@ -1806,12 +2005,16 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. + MEMACCESS(0) "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels. + MEMACCESS(1) "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1823,7 +2026,9 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q2, q1, q0) + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_raw), // %0 @@ -1850,22 +2055,26 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. RGB565TOARGB "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels. RGB565TOARGB "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels. RGB565TOARGB "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels. RGB565TOARGB "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. @@ -1887,7 +2096,9 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) "vst1.8 {d0}, [%2]! 
\n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_rgb565), // %0 @@ -1914,22 +2125,26 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. RGB555TOARGB "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. RGB555TOARGB "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. RGB555TOARGB "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. RGB555TOARGB "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. @@ -1951,7 +2166,9 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb1555), // %0 @@ -1978,22 +2195,26 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. ARGB4444TOARGB "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. ARGB4444TOARGB "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. ARGB4444TOARGB "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. ARGB4444TOARGB "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. @@ -2015,7 +2236,9 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb4444), // %0 @@ -2037,6 +2260,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { "vmov.u8 d27, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. RGB565TOARGB @@ -2045,6 +2269,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { "vmlal.u8 q2, d2, d26 \n" // R "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d27 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
"bgt 1b \n" : "+r"(src_rgb565), // %0 @@ -2063,6 +2288,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { "vmov.u8 d27, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. ARGB1555TOARGB @@ -2071,6 +2297,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { "vmlal.u8 q2, d2, d26 \n" // R "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d27 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_argb1555), // %0 @@ -2089,6 +2316,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { "vmov.u8 d27, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. ARGB4444TOARGB @@ -2097,6 +2325,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { "vmlal.u8 q2, d2, d26 \n" // R "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d27 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_argb4444), // %0 @@ -2115,6 +2344,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { "vmov.u8 d7, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q8, d1, d4 \n" // R @@ -2122,6 +2352,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { "vmlal.u8 q8, d3, d6 \n" // B "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d7 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_bgra), // %0 @@ -2140,6 +2371,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { "vmov.u8 d7, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q8, d0, d4 \n" // R @@ -2147,6 +2379,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { "vmlal.u8 q8, d2, d6 \n" // B "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d7 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_abgr), // %0 @@ -2165,6 +2398,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { "vmov.u8 d7, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q8, d1, d4 \n" // B @@ -2172,6 +2406,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { "vmlal.u8 q8, d3, d6 \n" // R "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d7 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_rgba), // %0 @@ -2190,6 +2425,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { "vmov.u8 d7, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q8, d0, d4 \n" // B @@ -2197,6 +2433,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { "vmlal.u8 q8, d2, d6 \n" // R "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d7 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! 
\n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_rgb24), // %0 @@ -2215,6 +2452,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { "vmov.u8 d7, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q8, d0, d4 \n" // B @@ -2222,6 +2460,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { "vmlal.u8 q8, d2, d6 \n" // R "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d7 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_raw), // %0 @@ -2252,7 +2491,9 @@ void InterpolateRow_NEON(uint8* dst_ptr, "vdup.8 d4, %4 \n" // General purpose row blend. "1: \n" + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) "vld1.8 {q1}, [%2]! \n" "subs %3, %3, #16 \n" "vmull.u8 q13, d0, d4 \n" @@ -2261,46 +2502,58 @@ void InterpolateRow_NEON(uint8* dst_ptr, "vmlal.u8 q14, d3, d5 \n" "vrshrn.u16 d0, q13, #8 \n" "vrshrn.u16 d1, q14, #8 \n" + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" "bgt 1b \n" "b 99f \n" // Blend 25 / 75. "25: \n" + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) "vld1.8 {q1}, [%2]! \n" "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" "bgt 25b \n" "b 99f \n" // Blend 50 / 50. "50: \n" + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) "vld1.8 {q1}, [%2]! \n" "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" "bgt 50b \n" "b 99f \n" // Blend 75 / 25. "75: \n" + MEMACCESS(1) "vld1.8 {q1}, [%1]! \n" + MEMACCESS(2) "vld1.8 {q0}, [%2]! \n" "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" "bgt 75b \n" "b 99f \n" // Blend 100 / 0 - Copy row unchanged. "100: \n" + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" "subs %3, %3, #16 \n" + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" "bgt 100b \n" @@ -2323,7 +2576,9 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, "blt 89f \n" // Blend 8 pixels. "8: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. + MEMACCESS(1) "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. "subs %3, %3, #8 \n" // 8 processed per loop. "vmull.u8 q10, d4, d3 \n" // db * a @@ -2337,6 +2592,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, "vqadd.u8 q0, q0, q2 \n" // + sbg "vqadd.u8 d2, d2, d6 \n" // + sr "vmov.u8 d3, #255 \n" // a = 255 + MEMACCESS(2) "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. "bge 8b \n" @@ -2346,7 +2602,9 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, // Blend 1 pixels. "1: \n" + MEMACCESS(0) "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. + MEMACCESS(1) "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. "subs %3, %3, #1 \n" // 1 processed per loop. "vmull.u8 q10, d4, d3 \n" // db * a @@ -2360,6 +2618,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, "vqadd.u8 q0, q0, q2 \n" // + sbg "vqadd.u8 d2, d2, d6 \n" // + sr "vmov.u8 d3, #255 \n" // a = 255 + MEMACCESS(2) "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. "bge 1b \n" @@ -2379,6 +2638,7 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { asm volatile ( // Attenuate 8 pixels. "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. 
"vmull.u8 q10, d0, d3 \n" // b * a @@ -2387,6 +2647,7 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 + MEMACCESS(1) "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -2410,6 +2671,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, // 8 pixel loop. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. "subs %1, %1, #8 \n" // 8 processed per loop. "vmovl.u8 q0, d0 \n" // b (0 .. 255) @@ -2427,6 +2689,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, "vqmovn.u16 d0, q0 \n" "vqmovn.u16 d2, q1 \n" "vqmovn.u16 d4, q2 \n" + MEMACCESS(0) "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(dst_argb), // %0 @@ -2451,6 +2714,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, // 8 pixel loop. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. "vmovl.u8 q10, d20 \n" // b (0 .. 255) @@ -2465,6 +2729,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, "vqmovn.u16 d22, q11 \n" "vqmovn.u16 d24, q12 \n" "vqmovn.u16 d26, q13 \n" + MEMACCESS(1) "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -2485,6 +2750,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q2, d0, d24 \n" // B @@ -2493,6 +2759,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B "vmov d1, d0 \n" // G "vmov d2, d0 \n" // R + MEMACCESS(1) "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -2520,6 +2787,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { "vmov.u8 d30, #50 \n" // BR coefficient ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. "subs %1, %1, #8 \n" // 8 processed per loop. "vmull.u8 q2, d0, d20 \n" // B to Sepia B @@ -2534,6 +2802,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R + MEMACCESS(0) "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(dst_argb), // %0 @@ -2550,12 +2819,14 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, const int8* matrix_argb, int width) { asm volatile ( + MEMACCESS(3) "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. "vmovl.s8 q0, d4 \n" // B,G coefficients s16. "vmovl.s8 q1, d5 \n" // R,A coefficients s16. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. "vmovl.u8 q8, d16 \n" // b (0 .. 
255) 16 bit @@ -2594,6 +2865,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A + MEMACCESS(1) "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -2614,7 +2886,9 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, // 8 pixel loop. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(1) "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. "vmull.u8 q0, d0, d1 \n" // multiply B @@ -2625,6 +2899,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A + MEMACCESS(2) "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "bgt 1b \n" @@ -2645,11 +2920,14 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, // 8 pixel loop. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(1) "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. "vqadd.u8 q0, q0, q2 \n" // add B, G "vqadd.u8 q1, q1, q3 \n" // add R, A + MEMACCESS(2) "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "bgt 1b \n" @@ -2669,11 +2947,14 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, // 8 pixel loop. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(1) "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. "vqsub.u8 q0, q0, q2 \n" // subtract B, G "vqsub.u8 q1, q1, q3 \n" // subtract R, A + MEMACCESS(2) "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "bgt 1b \n" @@ -2698,12 +2979,15 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, // 8 pixel loop. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. + MEMACCESS(1) "vld1.8 {d1}, [%1]! \n" // load 8 sobely. "subs %3, %3, #8 \n" // 8 processed per loop. "vqadd.u8 d0, d0, d1 \n" // add "vmov.u8 d1, d0 \n" "vmov.u8 d2, d0 \n" + MEMACCESS(2) "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(src_sobelx), // %0 @@ -2722,10 +3006,13 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, // 16 pixel loop. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. + MEMACCESS(1) "vld1.8 {q1}, [%1]! \n" // load 16 sobely. "subs %3, %3, #16 \n" // 16 processed per loop. "vqadd.u8 q0, q0, q1 \n" // add + MEMACCESS(2) "vst1.8 {q0}, [%2]! \n" // store 16 pixels. "bgt 1b \n" : "+r"(src_sobelx), // %0 @@ -2749,10 +3036,13 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, // 8 pixel loop. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. + MEMACCESS(1) "vld1.8 {d0}, [%1]! \n" // load 8 sobely. "subs %3, %3, #8 \n" // 8 processed per loop. "vqadd.u8 d1, d0, d2 \n" // add + MEMACCESS(2) "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. 
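+ // Output mapping of the vst4.8 above: B = sobely (d0), G = sobelx +
+ // sobely (d1), R = sobelx (d2); d3 presumably holds the alpha constant
+ // set before this loop.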
"bgt 1b \n" : "+r"(src_sobelx), // %0 @@ -2773,21 +3063,28 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {d0}, [%0],%5 \n" // top + MEMACCESS(0) "vld1.8 {d1}, [%0],%6 \n" "vsubl.u8 q0, d0, d1 \n" + MEMACCESS(1) "vld1.8 {d2}, [%1],%5 \n" // center * 2 + MEMACCESS(1) "vld1.8 {d3}, [%1],%6 \n" "vsubl.u8 q1, d2, d3 \n" "vadd.s16 q0, q0, q1 \n" "vadd.s16 q0, q0, q1 \n" + MEMACCESS(2) "vld1.8 {d2}, [%2],%5 \n" // bottom + MEMACCESS(2) "vld1.8 {d3}, [%2],%6 \n" "subs %4, %4, #8 \n" // 8 pixels "vsubl.u8 q1, d2, d3 \n" "vadd.s16 q0, q0, q1 \n" "vabs.s16 q0, q0 \n" "vqmovn.u16 d0, q0 \n" + MEMACCESS(3) "vst1.8 {d0}, [%3]! \n" // store 8 sobelx "bgt 1b \n" : "+r"(src_y0), // %0 @@ -2810,21 +3107,28 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {d0}, [%0],%4 \n" // left + MEMACCESS(1) "vld1.8 {d1}, [%1],%4 \n" "vsubl.u8 q0, d0, d1 \n" + MEMACCESS(0) "vld1.8 {d2}, [%0],%4 \n" // center * 2 + MEMACCESS(1) "vld1.8 {d3}, [%1],%4 \n" "vsubl.u8 q1, d2, d3 \n" "vadd.s16 q0, q0, q1 \n" "vadd.s16 q0, q0, q1 \n" + MEMACCESS(0) "vld1.8 {d2}, [%0],%5 \n" // right + MEMACCESS(1) "vld1.8 {d3}, [%1],%5 \n" "subs %3, %3, #8 \n" // 8 pixels "vsubl.u8 q1, d2, d3 \n" "vadd.s16 q0, q0, q1 \n" "vabs.s16 q0, q0 \n" "vqmovn.u16 d0, q0 \n" + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 sobely "bgt 1b \n" : "+r"(src_y0), // %0 diff --git a/third_party/libyuv/source/row_neon64.cc b/third_party/libyuv/source/row_neon64.cc new file mode 100644 index 000000000..46e9ceb33 --- /dev/null +++ b/third_party/libyuv/source/row_neon64.cc @@ -0,0 +1,3323 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +// Read 8 Y, 4 U and 4 V from 422 +#define READYUV422 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ + "vld1.32 {d2[0]}, [%1]! \n" \ + MEMACCESS(2) \ + "vld1.32 {d2[1]}, [%2]! \n" + +// Read 8 Y, 2 U and 2 V from 422 +#define READYUV411 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ + "vld1.16 {d2[0]}, [%1]! \n" \ + MEMACCESS(2) \ + "vld1.16 {d2[1]}, [%2]! \n" \ + "vmov.u8 d3, d2 \n" \ + "vzip.u8 d2, d3 \n" + +// Read 8 Y, 8 U and 8 V from 444 +#define READYUV444 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ + "vld1.8 {d2}, [%1]! \n" \ + MEMACCESS(2) \ + "vld1.8 {d3}, [%2]! \n" \ + "vpaddl.u8 q1, q1 \n" \ + "vrshrn.u16 d2, q1, #1 \n" + +// Read 8 Y, and set 4 U and 4 V to 128 +#define READYUV400 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ + "vmov.u8 d2, #128 \n" + +// Read 8 Y and 4 UV from NV12 +#define READNV12 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ + "vld1.8 {d2}, [%1]! \n" \ + "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" + +// Read 8 Y and 4 VU from NV21 +#define READNV21 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ + "vld1.8 {d2}, [%1]! 
\n" \ + "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ + "vuzp.u8 d3, d2 \n" \ + "vtrn.u32 d2, d3 \n" + +// Read 8 YUY2 +#define READYUY2 \ + MEMACCESS(0) \ + "vld2.8 {d0, d2}, [%0]! \n" \ + "vmov.u8 d3, d2 \n" \ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" + +// Read 8 UYVY +#define READUYVY \ + MEMACCESS(0) \ + "vld2.8 {d2, d3}, [%0]! \n" \ + "vmov.u8 d0, d3 \n" \ + "vmov.u8 d3, d2 \n" \ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" + +#define YUV422TORGB \ + "veor.u8 d2, d26 \n"/*subtract 128 from u and v*/\ + "vmull.s8 q8, d2, d24 \n"/* u/v B/R component */\ + "vmull.s8 q9, d2, d25 \n"/* u/v G component */\ + "vmov.u8 d1, #0 \n"/* split odd/even y apart */\ + "vtrn.u8 d0, d1 \n" \ + "vsub.s16 q0, q0, q15 \n"/* offset y */\ + "vmul.s16 q0, q0, q14 \n" \ + "vadd.s16 d18, d19 \n" \ + "vqadd.s16 d20, d0, d16 \n" /* B */ \ + "vqadd.s16 d21, d1, d16 \n" \ + "vqadd.s16 d22, d0, d17 \n" /* R */ \ + "vqadd.s16 d23, d1, d17 \n" \ + "vqadd.s16 d16, d0, d18 \n" /* G */ \ + "vqadd.s16 d17, d1, d18 \n" \ + "vqshrun.s16 d0, q10, #6 \n" /* B */ \ + "vqshrun.s16 d1, q11, #6 \n" /* G */ \ + "vqshrun.s16 d2, q8, #6 \n" /* R */ \ + "vmovl.u8 q10, d0 \n"/* set up for reinterleave*/\ + "vmovl.u8 q11, d1 \n" \ + "vmovl.u8 q8, d2 \n" \ + "vtrn.u8 d20, d21 \n" \ + "vtrn.u8 d22, d23 \n" \ + "vtrn.u8 d16, d17 \n" \ + "vmov.u8 d21, d16 \n" + +static vec8 kUVToRB = { 127, 127, 127, 127, 102, 102, 102, 102, + 0, 0, 0, 0, 0, 0, 0, 0 }; +static vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52, + 0, 0, 0, 0, 0, 0, 0, 0 }; + +#ifdef HAS_I444TOARGBROW_NEON +void I444ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV444 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(3) + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I444TOARGBROW_NEON + +#ifdef HAS_I422TOARGBROW_NEON +void I422ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(3) + "vst4.8 {d20, d21, d22, d23}, [%3]! 
\n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I422TOARGBROW_NEON + +#ifdef HAS_I411TOARGBROW_NEON +void I411ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV411 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(3) + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I411TOARGBROW_NEON + +#ifdef HAS_I422TOBGRAROW_NEON +void I422ToBGRARow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_bgra, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vswp.u8 d20, d22 \n" + "vmov.u8 d19, #255 \n" + MEMACCESS(3) + "vst4.8 {d19, d20, d21, d22}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_bgra), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I422TOBGRAROW_NEON + +#ifdef HAS_I422TOABGRROW_NEON +void I422ToABGRRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_abgr, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vswp.u8 d20, d22 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(3) + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_abgr), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I422TOABGRROW_NEON + +#ifdef HAS_I422TORGBAROW_NEON +void I422ToRGBARow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d19, #255 \n" + MEMACCESS(3) + "vst4.8 {d19, d20, d21, d22}, [%3]! 
\n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgba), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I422TORGBAROW_NEON + +#ifdef HAS_I422TORGB24ROW_NEON +void I422ToRGB24Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb24, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + MEMACCESS(3) + "vst3.8 {d20, d21, d22}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb24), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I422TORGB24ROW_NEON + +#ifdef HAS_I422TORAWROW_NEON +void I422ToRAWRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_raw, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vswp.u8 d20, d22 \n" + MEMACCESS(3) + "vst3.8 {d20, d21, d22}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_raw), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I422TORAWROW_NEON + +#define ARGBTORGB565 \ + "vshr.u8 d20, d20, #3 \n" /* B */ \ + "vshr.u8 d21, d21, #2 \n" /* G */ \ + "vshr.u8 d22, d22, #3 \n" /* R */ \ + "vmovl.u8 q8, d20 \n" /* B */ \ + "vmovl.u8 q9, d21 \n" /* G */ \ + "vmovl.u8 q10, d22 \n" /* R */ \ + "vshl.u16 q9, q9, #5 \n" /* G */ \ + "vshl.u16 q10, q10, #11 \n" /* R */ \ + "vorr q0, q8, q9 \n" /* BG */ \ + "vorr q0, q0, q10 \n" /* BGR */ + +#ifdef HAS_I422TORGB565ROW_NEON +void I422ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb565, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + ARGBTORGB565 + MEMACCESS(3) + "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565. 
+ "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb565), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I422TORGB565ROW_NEON + +#define ARGBTOARGB1555 \ + "vshr.u8 q10, q10, #3 \n" /* B */ \ + "vshr.u8 d22, d22, #3 \n" /* R */ \ + "vshr.u8 d23, d23, #7 \n" /* A */ \ + "vmovl.u8 q8, d20 \n" /* B */ \ + "vmovl.u8 q9, d21 \n" /* G */ \ + "vmovl.u8 q10, d22 \n" /* R */ \ + "vmovl.u8 q11, d23 \n" /* A */ \ + "vshl.u16 q9, q9, #5 \n" /* G */ \ + "vshl.u16 q10, q10, #10 \n" /* R */ \ + "vshl.u16 q11, q11, #15 \n" /* A */ \ + "vorr q0, q8, q9 \n" /* BG */ \ + "vorr q1, q10, q11 \n" /* RA */ \ + "vorr q0, q0, q1 \n" /* BGRA */ + +#ifdef HAS_I422TOARGB1555ROW_NEON +void I422ToARGB1555Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb1555, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + ARGBTOARGB1555 + MEMACCESS(3) + "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb1555), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I422TOARGB1555ROW_NEON + +#define ARGBTOARGB4444 \ + "vshr.u8 d20, d20, #4 \n" /* B */ \ + "vbic.32 d21, d21, d4 \n" /* G */ \ + "vshr.u8 d22, d22, #4 \n" /* R */ \ + "vbic.32 d23, d23, d4 \n" /* A */ \ + "vorr d0, d20, d21 \n" /* BG */ \ + "vorr d1, d22, d23 \n" /* RA */ \ + "vzip.u8 d0, d1 \n" /* BGRA */ + +#ifdef HAS_I422TOARGB4444ROW_NEON +void I422ToARGB4444Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb4444, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + ARGBTOARGB4444 + MEMACCESS(3) + "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb4444), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I422TOARGB4444ROW_NEON + +#ifdef HAS_YTOARGBROW_NEON +void YToARGBRow_NEON(const uint8* src_y, + uint8* dst_argb, + int width) { + asm volatile ( + MEMACCESS(3) + "vld1.8 {d24}, [%3] \n" + MEMACCESS(4) + "vld1.8 {d25}, [%4] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV400 + YUV422TORGB + "subs %2, %2, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(1) + "vst4.8 {d20, d21, d22, d23}, [%1]! 
\n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(&kUVToRB), // %3 + "r"(&kUVToG) // %4 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_YTOARGBROW_NEON + +#ifdef HAS_I400TOARGBROW_NEON +void I400ToARGBRow_NEON(const uint8* src_y, + uint8* dst_argb, + int width) { + asm volatile ( + ".p2align 2 \n" + "vmov.u8 d23, #255 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d20}, [%0]! \n" + "vmov d21, d20 \n" + "vmov d22, d20 \n" + "subs %2, %2, #8 \n" + MEMACCESS(1) + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d20", "d21", "d22", "d23" + ); +} +#endif // HAS_I400TOARGBROW_NEON + +#ifdef HAS_NV12TOARGBROW_NEON +void NV12ToARGBRow_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width) { + asm volatile ( + MEMACCESS(4) + "vld1.8 {d24}, [%4] \n" + MEMACCESS(5) + "vld1.8 {d25}, [%5] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READNV12 + YUV422TORGB + "subs %3, %3, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(2) + "vst4.8 {d20, d21, d22, d23}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : "r"(&kUVToRB), // %4 + "r"(&kUVToG) // %5 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_NV12TOARGBROW_NEON + +#ifdef HAS_NV21TOARGBROW_NEON +void NV21ToARGBRow_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width) { + asm volatile ( + MEMACCESS(4) + "vld1.8 {d24}, [%4] \n" + MEMACCESS(5) + "vld1.8 {d25}, [%5] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READNV21 + YUV422TORGB + "subs %3, %3, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(2) + "vst4.8 {d20, d21, d22, d23}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : "r"(&kUVToRB), // %4 + "r"(&kUVToG) // %5 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_NV21TOARGBROW_NEON + +#ifdef HAS_NV12TORGB565ROW_NEON +void NV12ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_rgb565, + int width) { + asm volatile ( + MEMACCESS(4) + "vld1.8 {d24}, [%4] \n" + MEMACCESS(5) + "vld1.8 {d25}, [%5] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READNV12 + YUV422TORGB + "subs %3, %3, #8 \n" + ARGBTORGB565 + MEMACCESS(2) + "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb565), // %2 + "+r"(width) // %3 + : "r"(&kUVToRB), // %4 + "r"(&kUVToG) // %5 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_NV12TORGB565ROW_NEON + +#ifdef HAS_NV21TORGB565ROW_NEON +void NV21ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_rgb565, + int width) { + asm volatile ( + MEMACCESS(4) + "vld1.8 {d24}, [%4] \n" + MEMACCESS(5) + "vld1.8 {d25}, [%5] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READNV21 + YUV422TORGB + "subs %3, %3, #8 \n" + ARGBTORGB565 + MEMACCESS(2) + "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. 
+ "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb565), // %2 + "+r"(width) // %3 + : "r"(&kUVToRB), // %4 + "r"(&kUVToG) // %5 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_NV21TORGB565ROW_NEON + +#ifdef HAS_YUY2TOARGBROW_NEON +void YUY2ToARGBRow_NEON(const uint8* src_yuy2, + uint8* dst_argb, + int width) { + asm volatile ( + MEMACCESS(3) + "vld1.8 {d24}, [%3] \n" + MEMACCESS(4) + "vld1.8 {d25}, [%4] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUY2 + YUV422TORGB + "subs %2, %2, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(1) + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(&kUVToRB), // %3 + "r"(&kUVToG) // %4 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_YUY2TOARGBROW_NEON + +#ifdef HAS_UYVYTOARGBROW_NEON +void UYVYToARGBRow_NEON(const uint8* src_uyvy, + uint8* dst_argb, + int width) { + asm volatile ( + MEMACCESS(3) + "vld1.8 {d24}, [%3] \n" + MEMACCESS(4) + "vld1.8 {d25}, [%4] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READUYVY + YUV422TORGB + "subs %2, %2, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(1) + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(&kUVToRB), // %3 + "r"(&kUVToG) // %4 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_UYVYTOARGBROW_NEON + +// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. +#ifdef HAS_SPLITUVROW_NEON +void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV + "subs %3, %3, #16 \n" // 16 processed per loop + MEMACCESS(1) + "vst1.8 {q0}, [%1]! \n" // store U + MEMACCESS(2) + "vst1.8 {q1}, [%2]! \n" // store V + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); +} +#endif // HAS_SPLITUVROW_NEON + +// Reads 16 U's and V's and writes out 16 pairs of UV. +#ifdef HAS_MERGEUVROW_NEON +void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load U + MEMACCESS(1) + "vld1.8 {q1}, [%1]! \n" // load V + "subs %3, %3, #16 \n" // 16 processed per loop + MEMACCESS(2) + "vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV + "bgt 1b \n" + : + "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); +} +#endif // HAS_MERGEUVROW_NEON + +// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. +#ifdef HAS_COPYROW_NEON +void CopyRow_NEON(const uint8* src, uint8* dst, int count) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 + "subs %2, %2, #32 \n" // 32 processed per loop + MEMACCESS(1) + "vst1.8 {d0, d1, d2, d3}, [%1]! 
\n" // store 32 + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(count) // %2 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); +} +#endif // HAS_COPYROW_NEON + +// SetRow8 writes 'count' bytes using a 32 bit value repeated. +#ifdef HAS_SETROW_NEON +void SetRow_NEON(uint8* dst, uint32 v32, int count) { + asm volatile ( + "vdup.u32 q0, %2 \n" // duplicate 4 ints + "1: \n" + "subs %1, %1, #16 \n" // 16 bytes per loop + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" // store + "bgt 1b \n" + : "+r"(dst), // %0 + "+r"(count) // %1 + : "r"(v32) // %2 + : "cc", "memory", "q0" + ); +} +#endif // HAS_SETROW_NEON + +// TODO(fbarchard): Make fully assembler +// SetRow32 writes 'count' words using a 32 bit value repeated. +#ifdef HAS_ARGBSETROWS_NEON +void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width, + int dst_stride, int height) { + for (int y = 0; y < height; ++y) { + SetRow_NEON(dst, v32, width << 2); + dst += dst_stride; + } +} +#endif // HAS_ARGBSETROWS_NEON + +#ifdef HAS_MIRRORROW_NEON +void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { + asm volatile ( + // Start at end of source row. + "mov r3, #-16 \n" + "add %0, %0, %2 \n" + "sub %0, #16 \n" + + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0], r3 \n" // src -= 16 + "subs %2, #16 \n" // 16 pixels per loop. + "vrev64.8 q0, q0 \n" + MEMACCESS(1) + "vst1.8 {d1}, [%1]! \n" // dst += 16 + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "r3", "q0" + ); +} +#endif // HAS_MIRRORROW_NEON + +#ifdef HAS_MIRRORUVROW_NEON +void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width) { + asm volatile ( + // Start at end of source row. + "mov r12, #-16 \n" + "add %0, %0, %3, lsl #1 \n" + "sub %0, #16 \n" + + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 + "subs %3, #8 \n" // 8 pixels per loop. + "vrev64.8 q0, q0 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // dst += 8 + MEMACCESS(2) + "vst1.8 {d1}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "r12", "q0" + ); +} +#endif // HAS_MIRRORUVROW_NEON + +#ifdef HAS_ARGBMIRRORROW_NEON +void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { + asm volatile ( + // Start at end of source row. + "mov r3, #-16 \n" + "add %0, %0, %2, lsl #2 \n" + "sub %0, #16 \n" + + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0], r3 \n" // src -= 16 + "subs %2, #4 \n" // 4 pixels per loop. + "vrev64.32 q0, q0 \n" + MEMACCESS(1) + "vst1.8 {d1}, [%1]! \n" // dst += 16 + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "r3", "q0" + ); +} +#endif // HAS_ARGBMIRRORROW_NEON + +#ifdef HAS_RGB24TOARGBROW_NEON +void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { + asm volatile ( + "vmov.u8 d4, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. + "subs %2, %2, #8 \n" // 8 processed per loop. + MEMACCESS(1) + "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. 
+ "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} +#endif // HAS_RGB24TOARGBROW_NEON + +#ifdef HAS_RAWTOARGBROW_NEON +void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { + asm volatile ( + "vmov.u8 d4, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + MEMACCESS(1) + "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} +#endif // HAS_RAWTOARGBROW_NEON + +#define RGB565TOARGB \ + "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \ + "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \ + "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \ + "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \ + "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ + "vorr.u8 d0, d0, d4 \n" /* B */ \ + "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \ + "vorr.u8 d2, d1, d5 \n" /* R */ \ + "vorr.u8 d1, d4, d6 \n" /* G */ + +#ifdef HAS_RGB565TOARGBROW_NEON +void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { + asm volatile ( + "vmov.u8 d3, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + RGB565TOARGB + MEMACCESS(1) + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} +#endif // HAS_RGB565TOARGBROW_NEON + +#define ARGB1555TOARGB \ + "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \ + "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \ + "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \ + "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \ + "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \ + "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \ + "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \ + "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \ + "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \ + "vorr.u8 q1, q1, q3 \n" /* R,A */ \ + "vorr.u8 q0, q0, q2 \n" /* B,G */ \ + +// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. +#define RGB555TOARGB \ + "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \ + "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \ + "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \ + "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \ + "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ + "vorr.u8 d0, d0, d4 \n" /* B */ \ + "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \ + "vorr.u8 d2, d1, d5 \n" /* R */ \ + "vorr.u8 d1, d4, d6 \n" /* G */ + +#ifdef HAS_ARGB1555TOARGBROW_NEON +void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, + int pix) { + asm volatile ( + "vmov.u8 d3, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + MEMACCESS(1) + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. 
+ "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} +#endif // HAS_ARGB1555TOARGBROW_NEON + +#define ARGB4444TOARGB \ + "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \ + "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \ + "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \ + "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \ + "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \ + "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \ + "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \ + "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */ + +#ifdef HAS_ARGB4444TOARGBROW_NEON +void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, + int pix) { + asm volatile ( + "vmov.u8 d3, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + MEMACCESS(1) + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2" // Clobber List + ); +} +#endif // HAS_ARGB4444TOARGBROW_NEON + +#ifdef HAS_ARGBTORGB24ROW_NEON +void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + MEMACCESS(1) + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb24), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} +#endif // HAS_ARGBTORGB24ROW_NEON + +#ifdef HAS_ARGBTORAWROW_NEON +void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + MEMACCESS(1) + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_raw), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} +#endif // HAS_ARGBTORAWROW_NEON + +#ifdef HAS_YUY2TOYROW_NEON +void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. + "subs %2, %2, #16 \n" // 16 processed per loop. + MEMACCESS(1) + "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1" // Clobber List + ); +} +#endif // HAS_YUY2TOYROW_NEON + +#ifdef HAS_UYVYTOYROW_NEON +void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. + "subs %2, %2, #16 \n" // 16 processed per loop. + MEMACCESS(1) + "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1" // Clobber List + ); +} +#endif // HAS_UYVYTOYROW_NEON + +#ifdef HAS_YUY2TOUV422ROW_NEON +void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! 
\n" // load 16 pixels of YUY2. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) + "vst1.8 {d1}, [%1]! \n" // store 8 U. + MEMACCESS(2) + "vst1.8 {d3}, [%2]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List + ); +} +#endif // HAS_YUY2TOUV422ROW_NEON + +#ifdef HAS_UYVYTOUV422ROW_NEON +void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 U. + MEMACCESS(2) + "vst1.8 {d2}, [%2]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List + ); +} +#endif // HAS_UYVYTOUV422ROW_NEON + +#ifdef HAS_YUY2TOUVROW_NEON +void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // stride + src_yuy2 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. + "vrhadd.u8 d1, d1, d5 \n" // average rows of U + "vrhadd.u8 d3, d3, d7 \n" // average rows of V + MEMACCESS(2) + "vst1.8 {d1}, [%2]! \n" // store 8 U. + MEMACCESS(3) + "vst1.8 {d3}, [%3]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(stride_yuy2), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List + ); +} +#endif // HAS_YUY2TOUVROW_NEON + +#ifdef HAS_UYVYTOUVROW_NEON +void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // stride + src_uyvy + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. + "vrhadd.u8 d0, d0, d4 \n" // average rows of U + "vrhadd.u8 d2, d2, d6 \n" // average rows of V + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 U. + MEMACCESS(3) + "vst1.8 {d2}, [%3]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(stride_uyvy), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List + ); +} +#endif // HAS_UYVYTOUVROW_NEON + +#ifdef HAS_HALFROW_NEON +void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, + uint8* dst_uv, int pix) { + asm volatile ( + // change the stride to row 2 pointer + "add %1, %0 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels. + "subs %3, %3, #16 \n" // 16 processed per loop + MEMACCESS(1) + "vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels. + "vrhadd.u8 q0, q1 \n" // average row 1 and 2 + MEMACCESS(2) + "vst1.8 {q0}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(src_uv_stride), // %1 + "+r"(dst_uv), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "q0", "q1" // Clobber List + ); +} +#endif // HAS_HALFROW_NEON + +// Select 2 channels from ARGB on alternating pixels. e.g. 
BGBGBGBG +#ifdef HAS_ARGBTOBAYERROW_NEON +void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix) { + asm volatile ( + "vmov.u32 d6[0], %3 \n" // selector + "1: \n" + MEMACCESS(0) + "vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop + "vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels + "vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels + "vtrn.u32 d4, d5 \n" // combine 8 pixels + MEMACCESS(1) + "vst1.8 {d4}, [%1]! \n" // store 8. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_bayer), // %1 + "+r"(pix) // %2 + : "r"(selector) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} +#endif // HAS_ARGBTOBAYERROW_NEON + +// Select G channels from ARGB. e.g. GGGGGGGG +#ifdef HAS_ARGBTOBAYERGGROW_NEON +void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, + uint32 /*selector*/, int pix) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop + MEMACCESS(1) + "vst1.8 {d1}, [%1]! \n" // store 8 G's. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_bayer), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1" // Clobber List + ); +} +#endif // HAS_ARGBTOBAYERGGROW_NEON + +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +#ifdef HAS_ARGBSHUFFLEROW_NEON +void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + asm volatile ( + MEMACCESS(3) + "vld1.8 {q2}, [%3] \n" // shuffler + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 4 pixels. + "subs %2, %2, #4 \n" // 4 processed per loop + "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels + "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels + MEMACCESS(1) + "vst1.8 {q1}, [%1]! \n" // store 4. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "r"(shuffler) // %3 + : "cc", "memory", "q0", "q1", "q2" // Clobber List + ); +} +#endif // HAS_ARGBSHUFFLEROW_NEON + +#ifdef HAS_I422TOYUY2ROW_NEON +void I422ToYUY2Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_yuy2, int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys + MEMACCESS(1) + "vld1.8 {d1}, [%1]! \n" // load 8 Us + MEMACCESS(2) + "vld1.8 {d3}, [%2]! \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + MEMACCESS(3) + "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3" + ); +} +#endif // HAS_I422TOYUY2ROW_NEON + +#ifdef HAS_I422TOUYVYROW_NEON +void I422ToUYVYRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_uyvy, int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys + MEMACCESS(1) + "vld1.8 {d0}, [%1]! \n" // load 8 Us + MEMACCESS(2) + "vld1.8 {d2}, [%2]! \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + MEMACCESS(3) + "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. 
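+ // With U in d0, Y0 in d1, V in d2 and Y1 in d3, vst4.8 interleaves the
+ // bytes as U Y V Y, exactly the UYVY macropixel layout.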
+ "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3" + ); +} +#endif // HAS_I422TOUYVYROW_NEON + +#ifdef HAS_ARGBTORGB565ROW_NEON +void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTORGB565 + MEMACCESS(1) + "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb565), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11" + ); +} +#endif // HAS_ARGBTORGB565ROW_NEON + +#ifdef HAS_ARGBTOARGB1555ROW_NEON +void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, + int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTOARGB1555 + MEMACCESS(1) + "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb1555), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11" + ); +} +#endif // HAS_ARGBTOARGB1555ROW_NEON + +#ifdef HAS_ARGBTOARGB4444ROW_NEON +void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, + int pix) { + asm volatile ( + "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTOARGB4444 + MEMACCESS(1) + "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb4444), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11" + ); +} +#endif // HAS_ARGBTOARGB4444ROW_NEON + +#ifdef HAS_ARGBTOYROW_NEON +void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q12", "q13" + ); +} +#endif // HAS_ARGBTOYROW_NEON + +#ifdef HAS_ARGBTOYJROW_NEON +void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient + "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient + "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
+ "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q12", "q13" + ); +} +#endif // HAS_ARGBTOYJROW_NEON + +// 8x1 pixels. +#ifdef HAS_ARGBTOUV444ROW_NEON +void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + "vmov.u8 d24, #112 \n" // UB / VR 0.875 coefficient + "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient + "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient + "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient + "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlsl.u8 q2, d1, d25 \n" // G + "vmlsl.u8 q2, d2, d26 \n" // R + "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned + + "vmull.u8 q3, d2, d24 \n" // R + "vmlsl.u8 q3, d1, d28 \n" // G + "vmlsl.u8 q3, d0, d27 \n" // B + "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned + + "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V + + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + MEMACCESS(2) + "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_ARGBTOUV444ROW_NEON + +// 16x1 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +#ifdef HAS_ARGBTOUV422ROW_NEON +void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + + "subs %3, %3, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q0, q10 \n" // B + "vmls.s16 q8, q1, q11 \n" // G + "vmls.s16 q8, q2, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + + "vmul.s16 q9, q2, q10 \n" // R + "vmls.s16 q9, q1, q14 \n" // G + "vmls.s16 q9, q0, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + MEMACCESS(2) + "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_ARGBTOUV422ROW_NEON + +// 32x1 pixels -> 8x1. pix is number of argb pixels. e.g. 32. 
+#ifdef HAS_ARGBTOUV411ROW_NEON +void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(0) + "vld4.8 {d8, d10, d12, d14}, [%0]! \n" // load 8 more ARGB pixels. + MEMACCESS(0) + "vld4.8 {d9, d11, d13, d15}, [%0]! \n" // load last 8 ARGB pixels. + "vpaddl.u8 q4, q4 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q5, q5 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q6, q6 \n" // R 16 bytes -> 8 shorts. + + "vpadd.u16 d0, d0, d1 \n" // B 16 shorts -> 8 shorts. + "vpadd.u16 d1, d8, d9 \n" // B + "vpadd.u16 d2, d2, d3 \n" // G 16 shorts -> 8 shorts. + "vpadd.u16 d3, d10, d11 \n" // G + "vpadd.u16 d4, d4, d5 \n" // R 16 shorts -> 8 shorts. + "vpadd.u16 d5, d12, d13 \n" // R + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %3, %3, #32 \n" // 32 processed per loop. + "vmul.s16 q8, q0, q10 \n" // B + "vmls.s16 q8, q1, q11 \n" // G + "vmls.s16 q8, q2, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q2, q10 \n" // R + "vmls.s16 q9, q1, q14 \n" // G + "vmls.s16 q9, q0, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + MEMACCESS(2) + "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_ARGBTOUV411ROW_NEON + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +#define RGBTOUV(QB, QG, QR) \ + "vmul.s16 q8, " #QB ", q10 \n" /* B */ \ + "vmls.s16 q8, " #QG ", q11 \n" /* G */ \ + "vmls.s16 q8, " #QR ", q12 \n" /* R */ \ + "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \ + "vmul.s16 q9, " #QR ", q10 \n" /* R */ \ + "vmls.s16 q9, " #QG ", q14 \n" /* G */ \ + "vmls.s16 q9, " #QB ", q13 \n" /* B */ \ + "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \ + "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \ + "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */ + +// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. 
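For orientation, the RGBTOUV macro above is the NEON form of the fixed-point math libyuv uses in its scalar fallbacks: the coefficients are 8-bit fixed point, and the 0x8080 bias in q15 shifts the signed result into the unsigned 0..255 range before the narrowing shift. The /2 in the vmov constants (e.g. #112 / 2) compensates for inputs that are sums of two pixels rather than averages. A minimal scalar sketch on averaged B/G/R values (helper names illustrative; the NEON vqshrn also saturates):

static int RGBToU(int r, int g, int b) {
  return (112 * b - 74 * g - 38 * r + 0x8080) >> 8;  // q10/q11/q12 plus q15 bias
}
static int RGBToV(int r, int g, int b) {
  return (112 * r - 94 * g - 18 * b + 0x8080) >> 8;  // q10/q14/q13 plus q15 bias
}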
+#ifdef HAS_ARGBTOUVROW_NEON +void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. + MEMACCESS(1) + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride_argb), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_ARGBTOUVROW_NEON + +// TODO(fbarchard): Subsample match C code. +#ifdef HAS_ARGBTOUVJROW_NEON +void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient + "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient + "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient + "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient + "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. + MEMACCESS(1) + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
+ "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride_argb), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_ARGBTOUVJROW_NEON + +#ifdef HAS_BGRATOUVROW_NEON +void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_bgra + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. + "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels. + MEMACCESS(1) + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels. + "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q1, q1, #1 \n" // 2x average + "vrshr.u16 q2, q2, #1 \n" + "vrshr.u16 q3, q3, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q3, q2, q1) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(src_stride_bgra), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_BGRATOUVROW_NEON + +#ifdef HAS_ABGRTOUVROW_NEON +void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_abgr + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. + "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. + MEMACCESS(1) + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. + "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q2, q1, q0) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
+ "bgt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(src_stride_abgr), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_ABGRTOUVROW_NEON + +#ifdef HAS_RGBATOUVROW_NEON +void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_rgba + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. + "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels. + MEMACCESS(1) + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels. + "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(src_stride_rgba), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_RGBATOUVROW_NEON + +#ifdef HAS_RGB24TOUVROW_NEON +void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_rgb24 + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. + MEMACCESS(0) + "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels. + MEMACCESS(1) + "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
+ "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(src_stride_rgb24), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_RGB24TOUVROW_NEON + +#ifdef HAS_RAWTOUVROW_NEON +void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_raw + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. + MEMACCESS(0) + "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. + "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels. + MEMACCESS(1) + "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. + "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q2, q1, q0) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(src_stride_raw), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_RAWTOUVROW_NEON + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +#ifdef HAS_RGB565TOUVROW_NEON +void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + RGB565TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels. + RGB565TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels. + RGB565TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels. + RGB565TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. 
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(src_stride_rgb565), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_RGB565TOUVROW_NEON + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +#ifdef HAS_ARGB1555TOUVROW_NEON +void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
+ "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(src_stride_argb1555), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_ARGB1555TOUVROW_NEON + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +#ifdef HAS_ARGB4444TOUVROW_NEON +void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(src_stride_argb4444), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_ARGB4444TOUVROW_NEON + +#ifdef HAS_RGB565TOYROW_NEON +void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + RGB565TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! 
\n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" + ); +} +#endif // HAS_RGB565TOYROW_NEON + +#ifdef HAS_ARGB1555TOYROW_NEON +void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" + ); +} +#endif // HAS_ARGB1555TOYROW_NEON + +#ifdef HAS_ARGB4444TOYROW_NEON +void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" + ); +} +#endif // HAS_ARGB4444TOYROW_NEON + +#ifdef HAS_BGRATOYROW_NEON +void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d1, d4 \n" // R + "vmlal.u8 q8, d2, d5 \n" // G + "vmlal.u8 q8, d3, d6 \n" // B + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + ); +} +#endif // HAS_BGRATOYROW_NEON + +#ifdef HAS_ABGRTOYROW_NEON +void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. + "subs %2, %2, #8 \n" // 8 processed per loop. 
+ "vmull.u8 q8, d0, d4 \n" // R + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // B + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + ); +} +#endif // HAS_ABGRTOYROW_NEON + +#ifdef HAS_RGBATOYROW_NEON +void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d1, d4 \n" // B + "vmlal.u8 q8, d2, d5 \n" // G + "vmlal.u8 q8, d3, d6 \n" // R + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + ); +} +#endif // HAS_RGBATOYROW_NEON + +#ifdef HAS_RGB24TOYROW_NEON +void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // B + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // R + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + ); +} +#endif // HAS_RGB24TOYROW_NEON + +#ifdef HAS_RAWTOYROW_NEON +void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // B + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // R + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + ); +} +#endif // HAS_RAWTOYROW_NEON + +// Bilinear filter 16x2 -> 16x1 +#ifdef HAS_INTERPOLATEROW_NEON +void InterpolateRow_NEON(uint8* dst_ptr, + const uint8* src_ptr, ptrdiff_t src_stride, + int dst_width, int source_y_fraction) { + asm volatile ( + "cmp %4, #0 \n" + "beq 100f \n" + "add %2, %1 \n" + "cmp %4, #64 \n" + "beq 75f \n" + "cmp %4, #128 \n" + "beq 50f \n" + "cmp %4, #192 \n" + "beq 25f \n" + + "vdup.8 d5, %4 \n" + "rsb %4, #256 \n" + "vdup.8 d4, %4 \n" + // General purpose row blend. 
+ "1: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vmull.u8 q13, d0, d4 \n" + "vmull.u8 q14, d1, d4 \n" + "vmlal.u8 q13, d2, d5 \n" + "vmlal.u8 q14, d3, d5 \n" + "vrshrn.u16 d0, q13, #8 \n" + "vrshrn.u16 d1, q14, #8 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 1b \n" + "b 99f \n" + + // Blend 25 / 75. + "25: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 25b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 50b \n" + "b 99f \n" + + // Blend 75 / 25. + "75: \n" + MEMACCESS(1) + "vld1.8 {q1}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q0}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 75b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + "subs %3, %3, #16 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(source_y_fraction) // %4 + : + : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14" + ); +} +#endif // HAS_INTERPOLATEROW_NEON + +// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr +#ifdef HAS_ARGBBLENDROW_NEON +void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + "subs %3, #8 \n" + "blt 89f \n" + // Blend 8 pixels. + "8: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. + MEMACCESS(1) + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q10, d4, d3 \n" // db * a + "vmull.u8 q11, d5, d3 \n" // dg * a + "vmull.u8 q12, d6, d3 \n" // dr * a + "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 + "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 + "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 + "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 + "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 + "vqadd.u8 q0, q0, q2 \n" // + sbg + "vqadd.u8 d2, d2, d6 \n" // + sr + "vmov.u8 d3, #255 \n" // a = 255 + MEMACCESS(2) + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. + "bge 8b \n" + + "89: \n" + "adds %3, #8-1 \n" + "blt 99f \n" + + // Blend 1 pixels. + "1: \n" + MEMACCESS(0) + "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. + MEMACCESS(1) + "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. + "subs %3, %3, #1 \n" // 1 processed per loop. + "vmull.u8 q10, d4, d3 \n" // db * a + "vmull.u8 q11, d5, d3 \n" // dg * a + "vmull.u8 q12, d6, d3 \n" // dr * a + "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 + "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 + "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 + "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 + "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 + "vqadd.u8 q0, q0, q2 \n" // + sbg + "vqadd.u8 d2, d2, d6 \n" // + sr + "vmov.u8 d3, #255 \n" // a = 255 + MEMACCESS(2) + "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. 
+ "bge 1b \n" + + "99: \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12" + ); +} +#endif // HAS_ARGBBLENDROW_NEON + +// Attenuate 8 pixels at a time. +#ifdef HAS_ARGBATTENUATEROW_NEON +void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { + asm volatile ( + // Attenuate 8 pixels. + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q10, d0, d3 \n" // b * a + "vmull.u8 q11, d1, d3 \n" // g * a + "vmull.u8 q12, d2, d3 \n" // r * a + "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 + "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 + "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 + MEMACCESS(1) + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q10", "q11", "q12" + ); +} +#endif // HAS_ARGBATTENUATEROW_NEON + +// Quantize 8 ARGB pixels (32 bytes). +// dst = (dst * scale >> 16) * interval_size + interval_offset; +#ifdef HAS_ARGBQUANTIZEROW_NEON +void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width) { + asm volatile ( + "vdup.u16 q8, %2 \n" + "vshr.u16 q8, q8, #1 \n" // scale >>= 1 + "vdup.u16 q9, %3 \n" // interval multiply. + "vdup.u16 q10, %4 \n" // interval add + + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. + "subs %1, %1, #8 \n" // 8 processed per loop. + "vmovl.u8 q0, d0 \n" // b (0 .. 255) + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q2, d4 \n" + "vqdmulh.s16 q0, q0, q8 \n" // b * scale + "vqdmulh.s16 q1, q1, q8 \n" // g + "vqdmulh.s16 q2, q2, q8 \n" // r + "vmul.u16 q0, q0, q9 \n" // b * interval_size + "vmul.u16 q1, q1, q9 \n" // g + "vmul.u16 q2, q2, q9 \n" // r + "vadd.u16 q0, q0, q10 \n" // b + interval_offset + "vadd.u16 q1, q1, q10 \n" // g + "vadd.u16 q2, q2, q10 \n" // r + "vqmovn.u16 d0, q0 \n" + "vqmovn.u16 d2, q1 \n" + "vqmovn.u16 d4, q2 \n" + MEMACCESS(0) + "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(scale), // %2 + "r"(interval_size), // %3 + "r"(interval_offset) // %4 + : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10" + ); +} +#endif // HAS_ARGBQUANTIZEROW_NEON + +// Shade 8 pixels at a time by specified value. +// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. +// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. +#ifdef HAS_ARGBSHADEROW_NEON +void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value) { + asm volatile ( + "vdup.u32 q0, %3 \n" // duplicate scale value. + "vzip.u8 d0, d1 \n" // d0 aarrggbb. + "vshr.u16 q0, q0, #1 \n" // scale / 2. + + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmovl.u8 q10, d20 \n" // b (0 .. 
255)
+ "vmovl.u8 q11, d22 \n"
+ "vmovl.u8 q12, d24 \n"
+ "vmovl.u8 q13, d26 \n"
+ "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2
+ "vqrdmulh.s16 q11, q11, d0[1] \n" // g
+ "vqrdmulh.s16 q12, q12, d0[2] \n" // r
+ "vqrdmulh.s16 q13, q13, d0[3] \n" // a
+ "vqmovn.u16 d20, q10 \n"
+ "vqmovn.u16 d22, q11 \n"
+ "vqmovn.u16 d24, q12 \n"
+ "vqmovn.u16 d26, q13 \n"
+ MEMACCESS(1)
+ "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(value) // %3
+ : "cc", "memory", "q0", "q10", "q11", "q12", "q13"
+ );
+}
+#endif // HAS_ARGBSHADEROW_NEON
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
+// Similar to ARGBToYJ but stores ARGB.
+// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
+#ifdef HAS_ARGBGRAYROW_NEON
+void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
+ "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
+ "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B
+ "vmov d1, d0 \n" // G
+ "vmov d2, d0 \n" // R
+ MEMACCESS(1)
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+ );
+}
+#endif // HAS_ARGBGRAYROW_NEON
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+
+#ifdef HAS_ARGBSEPIAROW_NEON
+void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
+ asm volatile (
+ "vmov.u8 d20, #17 \n" // BB coefficient
+ "vmov.u8 d21, #68 \n" // BG coefficient
+ "vmov.u8 d22, #35 \n" // BR coefficient
+ "vmov.u8 d24, #22 \n" // GB coefficient
+ "vmov.u8 d25, #88 \n" // GG coefficient
+ "vmov.u8 d26, #45 \n" // GR coefficient
+ "vmov.u8 d28, #24 \n" // RB coefficient
+ "vmov.u8 d29, #98 \n" // RG coefficient
+ "vmov.u8 d30, #50 \n" // RR coefficient
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
+ "subs %1, %1, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d20 \n" // B to Sepia B
+ "vmlal.u8 q2, d1, d21 \n" // G
+ "vmlal.u8 q2, d2, d22 \n" // R
+ "vmull.u8 q3, d0, d24 \n" // B to Sepia G
+ "vmlal.u8 q3, d1, d25 \n" // G
+ "vmlal.u8 q3, d2, d26 \n" // R
+ "vmull.u8 q8, d0, d28 \n" // B to Sepia R
+ "vmlal.u8 q8, d1, d29 \n" // G
+ "vmlal.u8 q8, d2, d30 \n" // R
+ "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
+ "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
+ "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
+ MEMACCESS(0)
+ "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_ARGBSEPIAROW_NEON
+
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
+// needs to saturate. Consider doing a non-saturating version.
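Before the NEON implementation below, the intended transform is clearer in scalar form: each output channel is the dot product of the source (B, G, R, A) with one 4-byte row of matrix_argb, scaled by 1/64 and saturated, as vqshrun.s16 #6 does. A sketch (helper names illustrative, not libyuv API); note that the code below appears to overwrite q15 (the widened source alpha) with the A accumulator before the final set of multiplies, so its A-column terms use the accumulated value rather than the source alpha:

static unsigned char Clamp0To255(int v) {
  return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
static void ColorMatrixPixel(const unsigned char bgra[4],
                             const signed char matrix_argb[16],
                             unsigned char out[4]) {
  int i;
  for (i = 0; i < 4; ++i) {  // i selects the output B, G, R, A channel.
    int sum = bgra[0] * matrix_argb[i * 4 + 0] +  // B term
              bgra[1] * matrix_argb[i * 4 + 1] +  // G term
              bgra[2] * matrix_argb[i * 4 + 2] +  // R term
              bgra[3] * matrix_argb[i * 4 + 3];   // A term
    out[i] = Clamp0To255(sum >> 6);  // 6-bit fixed point, saturated.
  }
}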
+#ifdef HAS_ARGBCOLORMATRIXROW_NEON +void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width) { + asm volatile ( + MEMACCESS(3) + "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. + "vmovl.s8 q0, d4 \n" // B,G coefficients s16. + "vmovl.s8 q1, d5 \n" // R,A coefficients s16. + + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit + "vmovl.u8 q9, d18 \n" // g + "vmovl.u8 q10, d20 \n" // r + "vmovl.u8 q15, d22 \n" // a + "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B + "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G + "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R + "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A + "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B + "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G + "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R + "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B + "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G + "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R + "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vmul.s16 q4, q15, d0[3] \n" // B += A * Matrix B + "vmul.s16 q5, q15, d1[3] \n" // G += A * Matrix G + "vmul.s16 q6, q15, d2[3] \n" // R += A * Matrix R + "vmul.s16 q7, q15, d3[3] \n" // A += A * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B + "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G + "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R + "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A + MEMACCESS(1) + "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_ARGBCOLORMATRIXROW_NEON + +// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. +// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. +#ifdef HAS_ARGBMULTIPLYROW_NEON +void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(1) + "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q0, d0, d1 \n" // multiply B + "vmull.u8 q1, d2, d3 \n" // multiply G + "vmull.u8 q2, d4, d5 \n" // multiply R + "vmull.u8 q3, d6, d7 \n" // multiply A + "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B + "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G + "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R + "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A + MEMACCESS(2) + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. 
+ "bgt 1b \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3" + ); +} +#endif // HAS_ARGBMULTIPLYROW_NEON + +// Add 2 rows of ARGB pixels together, 8 pixels at a time. +#ifdef HAS_ARGBADDROW_NEON +void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(1) + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 q0, q0, q2 \n" // add B, G + "vqadd.u8 q1, q1, q3 \n" // add R, A + MEMACCESS(2) + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3" + ); +} +#endif // HAS_ARGBADDROW_NEON + +// Subtract 2 rows of ARGB pixels, 8 pixels at a time. +#ifdef HAS_ARGBSUBTRACTROW_NEON +void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(1) + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqsub.u8 q0, q0, q2 \n" // subtract B, G + "vqsub.u8 q1, q1, q3 \n" // subtract R, A + MEMACCESS(2) + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3" + ); +} +#endif // HAS_ARGBSUBTRACTROW_NEON + +// Adds Sobel X and Sobel Y and stores Sobel into ARGB. +// A = 255 +// R = Sobel +// G = Sobel +// B = Sobel +#ifdef HAS_SOBELROW_NEON +void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + asm volatile ( + "vmov.u8 d3, #255 \n" // alpha + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. + MEMACCESS(1) + "vld1.8 {d1}, [%1]! \n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d0, d0, d1 \n" // add + "vmov.u8 d1, d0 \n" + "vmov.u8 d2, d0 \n" + MEMACCESS(2) + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1" + ); +} +#endif // HAS_SOBELROW_NEON + +// Adds Sobel X and Sobel Y and stores Sobel into plane. +#ifdef HAS_SOBELTOPLANEROW_NEON +void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width) { + asm volatile ( + // 16 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. + MEMACCESS(1) + "vld1.8 {q1}, [%1]! \n" // load 16 sobely. + "subs %3, %3, #16 \n" // 16 processed per loop. + "vqadd.u8 q0, q0, q1 \n" // add + MEMACCESS(2) + "vst1.8 {q0}, [%2]! \n" // store 16 pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1" + ); +} +#endif // HAS_SOBELTOPLANEROW_NEON + +// Mixes Sobel X, Sobel Y and Sobel into ARGB. 
+// A = 255 +// R = Sobel X +// G = Sobel +// B = Sobel Y +#ifdef HAS_SOBELXYROW_NEON +void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + asm volatile ( + "vmov.u8 d3, #255 \n" // alpha + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. + MEMACCESS(1) + "vld1.8 {d0}, [%1]! \n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d1, d0, d2 \n" // add + MEMACCESS(2) + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1" + ); +} +#endif // HAS_SOBELXYROW_NEON + +// SobelX as a matrix is +// -1 0 1 +// -2 0 2 +// -1 0 1 +#ifdef HAS_SOBELXROW_NEON +void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobelx, int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d0}, [%0],%5 \n" // top + MEMACCESS(0) + "vld1.8 {d1}, [%0],%6 \n" + "vsubl.u8 q0, d0, d1 \n" + MEMACCESS(1) + "vld1.8 {d2}, [%1],%5 \n" // center * 2 + MEMACCESS(1) + "vld1.8 {d3}, [%1],%6 \n" + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vadd.s16 q0, q0, q1 \n" + MEMACCESS(2) + "vld1.8 {d2}, [%2],%5 \n" // bottom + MEMACCESS(2) + "vld1.8 {d3}, [%2],%6 \n" + "subs %4, %4, #8 \n" // 8 pixels + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vabs.s16 q0, q0 \n" + "vqmovn.u16 d0, q0 \n" + MEMACCESS(3) + "vst1.8 {d0}, [%3]! \n" // store 8 sobelx + "bgt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(src_y2), // %2 + "+r"(dst_sobelx), // %3 + "+r"(width) // %4 + : "r"(2), // %5 + "r"(6) // %6 + : "cc", "memory", "q0", "q1" // Clobber List + ); +} +#endif // HAS_SOBELXROW_NEON + +// SobelY as a matrix is +// -1 -2 -1 +// 0 0 0 +// 1 2 1 +#ifdef HAS_SOBELYROW_NEON +void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d0}, [%0],%4 \n" // left + MEMACCESS(1) + "vld1.8 {d1}, [%1],%4 \n" + "vsubl.u8 q0, d0, d1 \n" + MEMACCESS(0) + "vld1.8 {d2}, [%0],%4 \n" // center * 2 + MEMACCESS(1) + "vld1.8 {d3}, [%1],%4 \n" + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vadd.s16 q0, q0, q1 \n" + MEMACCESS(0) + "vld1.8 {d2}, [%0],%5 \n" // right + MEMACCESS(1) + "vld1.8 {d3}, [%1],%5 \n" + "subs %3, %3, #8 \n" // 8 pixels + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vabs.s16 q0, q0 \n" + "vqmovn.u16 d0, q0 \n" + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 sobely + "bgt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(dst_sobely), // %2 + "+r"(width) // %3 + : "r"(1), // %4 + "r"(6) // %5 + : "cc", "memory", "q0", "q1" // Clobber List + ); +} +#endif // HAS_SOBELYROW_NEON +#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/row_posix.cc b/third_party/libyuv/source/row_posix.cc index e47708802..106fda568 100644 --- a/third_party/libyuv/source/row_posix.cc +++ b/third_party/libyuv/source/row_posix.cc @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. 
 */
-#include "third_party/libyuv/include/libyuv/row.h"
+#include "libyuv/row.h"
 #ifdef __cplusplus
 namespace libyuv {
diff --git a/third_party/libyuv/source/row_win.cc b/third_party/libyuv/source/row_win.cc
index 2cfacad1b..8eb888926 100644
--- a/third_party/libyuv/source/row_win.cc
+++ b/third_party/libyuv/source/row_win.cc
@@ -8,15 +8,179 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
-#include "third_party/libyuv/include/libyuv/row.h"
+#include "libyuv/row.h"
+
+#if defined (_M_X64)
+#include <emmintrin.h>
+#include <tmmintrin.h>  // For _mm_maddubs_epi16
+#endif
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif
-// This module is for Visual C x86.
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+// This module is for Visual C.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
+
+#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
+
+#define UB 127 /* min(127,(int8)(2.018 * 64)) */
+#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
+#define UR 0
+
+#define VB 0
+#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
+#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
+
+// Bias
+#define BB UB * 128 + VB * 128
+#define BG UG * 128 + VG * 128
+#define BR UR * 128 + VR * 128
+
+static const vec8 kUVToB = {
+  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
+};
+
+static const vec8 kUVToR = {
+  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
+};
+
+static const vec8 kUVToG = {
+  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
+};
+
+static const vec8 kVUToB = {
+  VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
+};
+
+static const vec8 kVUToR = {
+  VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
+};
+
+static const vec8 kVUToG = {
+  VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+};
+
+static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
+static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
+static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
+static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
+static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
+
+// 64 bit
+#if defined(_M_X64)
+
+// Aligned destination version.
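Both the aligned routine below and the unaligned variant after it evaluate, per pixel, the 6-bit fixed-point conversion encoded by the YG/UB..VR constants and kUVBias* tables above. A scalar sketch of that math (Clamp0To255 is an illustrative helper; the intrinsics get the clamp for free from _mm_packus_epi16, and alpha is forced to 255 via _mm_set1_epi8(-1)):

static unsigned char Clamp0To255(int v) {
  return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
static void YuvToArgbPixel(int y, int u, int v,
                           unsigned char* b, unsigned char* g,
                           unsigned char* r) {
  const int y1 = (y - 16) * YG;  // kYSub16 then kYToRgb.
  *b = Clamp0To255((UB * u + VB * v - BB + y1) >> 6);  // kUVToB, kUVBiasB
  *g = Clamp0To255((UG * u + VG * v - BG + y1) >> 6);  // kUVToG, kUVBiasG
  *r = Clamp0To255((UR * u + VR * v - BR + y1) >> 6);  // kUVToR, kUVBiasR
}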
+__declspec(align(16)) +void I422ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + + __m128i xmm0, xmm1, xmm2, xmm3; + const __m128i xmm5 = _mm_set1_epi8(-1); + const __m128i xmm4 = _mm_setzero_si128(); + const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; + + while (width > 0) { + xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); + xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); + xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); + xmm1 = _mm_load_si128(&xmm0); + xmm2 = _mm_load_si128(&xmm0); + xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB); + xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG); + xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR); + xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB); + xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG); + xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR); + xmm3 = _mm_loadl_epi64((__m128i*)y_buf); + xmm3 = _mm_unpacklo_epi8(xmm3, xmm4); + xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16); + xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb); + xmm0 = _mm_adds_epi16(xmm0, xmm3); + xmm1 = _mm_adds_epi16(xmm1, xmm3); + xmm2 = _mm_adds_epi16(xmm2, xmm3); + xmm0 = _mm_srai_epi16(xmm0, 6); + xmm1 = _mm_srai_epi16(xmm1, 6); + xmm2 = _mm_srai_epi16(xmm2, 6); + xmm0 = _mm_packus_epi16(xmm0, xmm0); + xmm1 = _mm_packus_epi16(xmm1, xmm1); + xmm2 = _mm_packus_epi16(xmm2, xmm2); + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); + xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); + xmm1 = _mm_load_si128(&xmm0); + xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); + xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); + + _mm_store_si128((__m128i *)dst_argb, xmm0); + _mm_store_si128((__m128i *)(dst_argb + 16), xmm1); + + y_buf += 8; + u_buf += 4; + dst_argb += 32; + width -= 8; + } +} + +// Unaligned destination version. 
+void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + + __m128i xmm0, xmm1, xmm2, xmm3; + const __m128i xmm5 = _mm_set1_epi8(-1); + const __m128i xmm4 = _mm_setzero_si128(); + const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; + + while (width > 0) { + xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); + xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); + xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); + xmm1 = _mm_load_si128(&xmm0); + xmm2 = _mm_load_si128(&xmm0); + xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB); + xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG); + xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR); + xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB); + xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG); + xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR); + xmm3 = _mm_loadl_epi64((__m128i*)y_buf); + xmm3 = _mm_unpacklo_epi8(xmm3, xmm4); + xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16); + xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb); + xmm0 = _mm_adds_epi16(xmm0, xmm3); + xmm1 = _mm_adds_epi16(xmm1, xmm3); + xmm2 = _mm_adds_epi16(xmm2, xmm3); + xmm0 = _mm_srai_epi16(xmm0, 6); + xmm1 = _mm_srai_epi16(xmm1, 6); + xmm2 = _mm_srai_epi16(xmm2, 6); + xmm0 = _mm_packus_epi16(xmm0, xmm0); + xmm1 = _mm_packus_epi16(xmm1, xmm1); + xmm2 = _mm_packus_epi16(xmm2, xmm2); + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); + xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); + xmm1 = _mm_load_si128(&xmm0); + xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); + xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); + + _mm_storeu_si128((__m128i *)dst_argb, xmm0); + _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); + + y_buf += 8; + u_buf += 4; + dst_argb += 32; + width -= 8; + } +} +// 32 bit +#else // defined(_M_X64) #ifdef HAS_ARGBTOYROW_SSSE3 @@ -2030,21 +2194,6 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, } #endif // HAS_ARGBTOYROW_SSSE3 -#define YG 74 /* (int8)(1.164 * 64 + 0.5) */ - -#define UB 127 /* min(63,(int8)(2.018 * 64)) */ -#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */ -#define UR 0 - -#define VB 0 -#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */ -#define VR 102 /* (int8)(1.596 * 64 + 0.5) */ - -// Bias -#define BB UB * 128 + VB * 128 -#define BG UG * 128 + VG * 128 -#define BR UR * 128 + VR * 128 - #ifdef HAS_I422TOARGBROW_AVX2 static const lvec8 kUVToB_AVX = { @@ -2079,10 +2228,10 @@ static const lvec16 kUVBiasR_AVX = { // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
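// Editor's note: the AVX2 row below therefore produces 16 ARGB pixels per
// iteration, twice the 8 per iteration of the SSSE3 rows above.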
__declspec(naked) __declspec(align(16)) void I422ToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - int width) { + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { __asm { push esi push edi @@ -2150,36 +2299,6 @@ void I422ToARGBRow_AVX2(const uint8* y_buf, #ifdef HAS_I422TOARGBROW_SSSE3 -static const vec8 kUVToB = { - UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB -}; - -static const vec8 kUVToR = { - UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR -}; - -static const vec8 kUVToG = { - UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG -}; - -static const vec8 kVUToB = { - VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, -}; - -static const vec8 kVUToR = { - VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, -}; - -static const vec8 kVUToG = { - VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, -}; - -static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG }; -static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 }; -static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB }; -static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG }; -static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR }; - // TODO(fbarchard): Read that does half size on Y and treats 420 as 444. // Read 8 UV from 444. @@ -7276,7 +7395,8 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, } #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 -#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) +#endif // defined(_M_X64) +#endif // !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) #ifdef __cplusplus } // extern "C" diff --git a/third_party/libyuv/source/row_x86.asm b/third_party/libyuv/source/row_x86.asm new file mode 100644 index 000000000..0cb326f8e --- /dev/null +++ b/third_party/libyuv/source/row_x86.asm @@ -0,0 +1,146 @@ +; +; Copyright 2012 The LibYuv Project Authors. All rights reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%ifdef __YASM_VERSION_ID__ +%if __YASM_VERSION_ID__ < 01020000h +%error AVX2 is supported only by yasm 1.2.0 or later. +%endif +%endif +%include "x86inc.asm" + +SECTION .text + +; cglobal numeric constants are parameters, gpr regs, mm regs + +; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) + +%macro YUY2TOYROW 2-3 +cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix +%ifidn %1,YUY2 + pcmpeqb m2, m2, m2 ; generate mask 0x00ff00ff + psrlw m2, m2, 8 +%endif + + ALIGN 4 +.convertloop: + mov%2 m0, [src_yuy2q] + mov%2 m1, [src_yuy2q + mmsize] + lea src_yuy2q, [src_yuy2q + mmsize * 2] +%ifidn %1,YUY2 + pand m0, m0, m2 ; YUY2 even bytes are Y + pand m1, m1, m2 +%else + psrlw m0, m0, 8 ; UYVY odd bytes are Y + psrlw m1, m1, 8 +%endif + packuswb m0, m0, m1 +%if cpuflag(AVX2) + vpermq m0, m0, 0xd8 +%endif + sub pixd, mmsize + mov%2 [dst_yq], m0 + lea dst_yq, [dst_yq + mmsize] + jg .convertloop + REP_RET +%endmacro + +; TODO(fbarchard): Remove MMX. Add SSSE3 pshufb version. 
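+; Editor's note: per pixel, the YUY2TOYROW macro above is equivalent to this
+; scalar C sketch (a hypothetical reference helper, not part of the patch);
+; YUY2 stores Y in the even bytes, UYVY in the odd bytes:
+;   void YUY2ToYRow(const uint8* src_yuy2, uint8* dst_y, int pix) {
+;     int x;
+;     for (x = 0; x < pix; ++x) {
+;       dst_y[x] = src_yuy2[x * 2];  /* for UYVY: src_uyvy[x * 2 + 1] */
+;     }
+;   }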
+INIT_MMX MMX
+YUY2TOYROW YUY2,a,
+YUY2TOYROW YUY2,u,_Unaligned
+YUY2TOYROW UYVY,a,
+YUY2TOYROW UYVY,u,_Unaligned
+INIT_XMM SSE2
+YUY2TOYROW YUY2,a,
+YUY2TOYROW YUY2,u,_Unaligned
+YUY2TOYROW UYVY,a,
+YUY2TOYROW UYVY,u,_Unaligned
+INIT_YMM AVX2
+YUY2TOYROW YUY2,a,
+YUY2TOYROW UYVY,a,
+
+; void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix)
+
+%macro SplitUVRow 1-2
+cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
+ pcmpeqb m4, m4, m4 ; generate mask 0x00ff00ff
+ psrlw m4, m4, 8
+ sub dst_vq, dst_uq
+
+ ALIGN 4
+.convertloop:
+ mov%1 m0, [src_uvq]
+ mov%1 m1, [src_uvq + mmsize]
+ lea src_uvq, [src_uvq + mmsize * 2]
+ psrlw m2, m0, 8 ; odd bytes
+ psrlw m3, m1, 8
+ pand m0, m0, m4 ; even bytes
+ pand m1, m1, m4
+ packuswb m0, m0, m1
+ packuswb m2, m2, m3
+%if cpuflag(AVX2)
+ vpermq m0, m0, 0xd8
+ vpermq m2, m2, 0xd8
+%endif
+ mov%1 [dst_uq], m0
+ mov%1 [dst_uq + dst_vq], m2
+ lea dst_uq, [dst_uq + mmsize]
+ sub pixd, mmsize
+ jg .convertloop
+ REP_RET
+%endmacro

+INIT_MMX MMX
+SplitUVRow a,
+SplitUVRow u,_Unaligned
+INIT_XMM SSE2
+SplitUVRow a,
+SplitUVRow u,_Unaligned
+INIT_YMM AVX2
+SplitUVRow a,
+
+; void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+; int width);
+
+%macro MergeUVRow_ 1-2
+cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
+ sub src_vq, src_uq
+
+ ALIGN 4
+.convertloop:
+ mov%1 m0, [src_uq]
+ mov%1 m1, [src_vq]
+ lea src_uq, [src_uq + mmsize]
+ punpcklbw m2, m0, m1 // first 8 UV pairs
+ punpckhbw m0, m0, m1 // next 8 UV pairs
+%if cpuflag(AVX2)
+ vperm2i128 m1, m2, m0, 0x20 // low 128 of ymm2 and low 128 of ymm0
+ vperm2i128 m2, m2, m0, 0x31 // high 128 of ymm2 and high 128 of ymm0
+ mov%1 [dst_uvq], m1
+ mov%1 [dst_uvq + mmsize], m2
+%else
+ mov%1 [dst_uvq], m2
+ mov%1 [dst_uvq + mmsize], m0
+%endif
+ lea dst_uvq, [dst_uvq + mmsize * 2]
+ sub pixd, mmsize
+ jg .convertloop
+ REP_RET
+%endmacro
+
+INIT_MMX MMX
+MergeUVRow_ a,
+MergeUVRow_ u,_Unaligned
+INIT_XMM SSE2
+MergeUVRow_ a,
+MergeUVRow_ u,_Unaligned
+INIT_YMM AVX2
+MergeUVRow_ a,
+
diff --git a/third_party/libyuv/source/scale.cc b/third_party/libyuv/source/scale.cc
index 31cedf11d..5b33b5f04 100644
--- a/third_party/libyuv/source/scale.cc
+++ b/third_party/libyuv/source/scale.cc
@@ -8,15 +8,15 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "third_party/libyuv/include/libyuv/scale.h"
+#include "libyuv/scale.h"
#include <assert.h>
#include <string.h>
-#include "third_party/libyuv/include/libyuv/cpu_id.h"
-#include "third_party/libyuv/include/libyuv/planar_functions.h" // CopyPlane
-#include "third_party/libyuv/include/libyuv/row.h"
-#include "third_party/libyuv/include/libyuv/scale_row.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyPlane
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
#ifdef __cplusplus
namespace libyuv {
diff --git a/third_party/libyuv/source/scale_argb.cc b/third_party/libyuv/source/scale_argb.cc
new file mode 100644
index 000000000..e339cd7c7
--- /dev/null
+++ b/third_party/libyuv/source/scale_argb.cc
@@ -0,0 +1,809 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyARGB
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
+// ScaleARGB ARGB, 1/2
+// This is an optimized version for scaling down a ARGB to 1/2 of
+// its original size.
+static void ScaleARGBDown2(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_argb, uint8* dst_argb,
+ int x, int dx, int y, int dy,
+ enum FilterMode filtering) {
+ int j;
+ int row_stride = src_stride * (dy >> 16);
+ void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) =
+ filtering == kFilterNone ? ScaleARGBRowDown2_C :
+ (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C :
+ ScaleARGBRowDown2Box_C);
+ assert(dx == 65536 * 2); // Test scale factor of 2.
+ assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2.
+ // Advance to odd row, even column.
+ if (filtering == kFilterBilinear) {
+ src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+ } else {
+ src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4;
+ }
+
+#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
+ (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
+ ScaleARGBRowDown2Box_SSE2);
+ }
+#elif defined(HAS_SCALEARGBROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
+ IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
+ ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_NEON :
+ ScaleARGBRowDown2_NEON;
+ }
+#endif
+
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
+ for (j = 0; j < dst_height; ++j) {
+ ScaleARGBRowDown2(src_argb, src_stride, dst_argb, dst_width);
+ src_argb += row_stride;
+ dst_argb += dst_stride;
+ }
+}
+
+// ScaleARGB ARGB, 1/4
+// This is an optimized version for scaling down a ARGB to 1/4 of
+// its original size.
+static void ScaleARGBDown4Box(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_argb, uint8* dst_argb,
+ int x, int dx, int y, int dy) {
+ int j;
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (dst_width * 2 * 4 + 15) & ~15;
+ align_buffer_64(row, kRowSize * 2);
+ int row_stride = src_stride * (dy >> 16);
+ void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) = ScaleARGBRowDown2Box_C;
+ // Advance to odd row, even column.
+ src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+ assert(dx == 65536 * 4); // Test scale factor of 4.
+ assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4.
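+ // Editor's note: the loop below realizes the 1/4 box filter as two 1/2 box
+ // passes: two pairs of source rows are first averaged into the 2x-wide
+ // 'row' buffer, which is then averaged once more down to dst_width.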
+#if defined(HAS_SCALEARGBROWDOWN2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2; + } +#elif defined(HAS_SCALEARGBROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) { + ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON; + } +#endif + for (j = 0; j < dst_height; ++j) { + ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2); + ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, + row + kRowSize, dst_width * 2); + ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width); + src_argb += row_stride; + dst_argb += dst_stride; + } + free_aligned_buffer_64(row); +} + +// ScaleARGB ARGB Even +// This is an optimized version for scaling down a ARGB to even +// multiple of its original size. +static void ScaleARGBDownEven(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_argb, uint8* dst_argb, + int x, int dx, int y, int dy, + enum FilterMode filtering) { + int j; + int col_step = dx >> 16; + int row_stride = (dy >> 16) * src_stride; + void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride, + int src_step, uint8* dst_argb, int dst_width) = + filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C; + assert(IS_ALIGNED(src_width, 2)); + assert(IS_ALIGNED(src_height, 2)); + src_argb += (y >> 16) * src_stride + (x >> 16) * 4; +#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 : + ScaleARGBRowDownEven_SSE2; + } +#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 4) && + IS_ALIGNED(src_argb, 4)) { + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON : + ScaleARGBRowDownEven_NEON; + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + for (j = 0; j < dst_height; ++j) { + ScaleARGBRowDownEven(src_argb, src_stride, col_step, dst_argb, dst_width); + src_argb += row_stride; + dst_argb += dst_stride; + } +} + +// Scale ARGB down with bilinear interpolation. +static void ScaleARGBBilinearDown(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_argb, uint8* dst_argb, + int x, int dx, int y, int dy, + enum FilterMode filtering) { + int j; + void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_C; + void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) = + (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C; + int64 xlast = x + (int64)(dst_width - 1) * dx; + int64 xl = (dx >= 0) ? x : xlast; + int64 xr = (dx >= 0) ? xlast : x; + int clip_src_width; + xl = (xl >> 16) & ~3; // Left edge aligned. + xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels. + xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel. + if (xr > src_width) { + xr = src_width; + } + clip_src_width = (int)(xr - xl) * 4; // Width aligned to 4. 
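+ // Editor's note: x and dx are 16.16 fixed point, so, for example, mapping
+ // 1920 source columns onto 640 destination columns gives
+ // dx = (1920 << 16) / 640 = 0x30000, an advance of exactly 3 source pixels
+ // per output pixel.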
+ src_argb += xl * 4; + x -= (int)(xl << 16); +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && clip_src_width >= 16) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(clip_src_width, 16)) { + InterpolateRow = InterpolateRow_Unaligned_SSE2; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) { + InterpolateRow = InterpolateRow_SSE2; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && clip_src_width >= 16) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(clip_src_width, 16)) { + InterpolateRow = InterpolateRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && clip_src_width >= 32) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(clip_src_width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && clip_src_width >= 16) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(clip_src_width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && clip_src_width >= 4 && + IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) { + InterpolateRow = InterpolateRow_Any_MIPS_DSPR2; + if (IS_ALIGNED(clip_src_width, 4)) { + InterpolateRow = InterpolateRow_MIPS_DSPR2; + } + } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; + } +#endif + // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. + // Allocate a row of ARGB. + { + align_buffer_64(row, clip_src_width * 4); + + const int max_y = (src_height - 1) << 16; + if (y > max_y) { + y = max_y; + } + for (j = 0; j < dst_height; ++j) { + int yi = y >> 16; + const uint8* src = src_argb + yi * src_stride; + if (filtering == kFilterLinear) { + ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(row, src, src_stride, clip_src_width, yf); + ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx); + } + dst_argb += dst_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + } + free_aligned_buffer_64(row); + } +} + +// Scale ARGB up with bilinear interpolation. +static void ScaleARGBBilinearUp(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_argb, uint8* dst_argb, + int x, int dx, int y, int dy, + enum FilterMode filtering) { + int j; + void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_C; + void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) = + filtering ? 
ScaleARGBFilterCols_C : ScaleARGBCols_C; + const int max_y = (src_height - 1) << 16; +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_Unaligned_SSE2; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + InterpolateRow = InterpolateRow_SSE2; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 1 && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) { + InterpolateRow = InterpolateRow_MIPS_DSPR2; + } +#endif + if (src_width >= 32768) { + ScaleARGBFilterCols = filtering ? + ScaleARGBFilterCols64_C : ScaleARGBCols64_C; + } +#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) + if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEARGBCOLS_SSE2) + if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBCols_SSE2; + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleARGBFilterCols = ScaleARGBColsUp2_C; +#if defined(HAS_SCALEARGBCOLSUP2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; + } +#endif + } + + if (y > max_y) { + y = max_y; + } + + { + int yi = y >> 16; + const uint8* src = src_argb + yi * src_stride; + + // Allocate 2 rows of ARGB. + const int kRowSize = (dst_width * 4 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); + + uint8* rowptr = row; + int rowstride = kRowSize; + int lasty = yi; + + ScaleARGBFilterCols(rowptr, src, dst_width, x, dx); + if (src_height > 1) { + src += src_stride; + } + ScaleARGBFilterCols(rowptr + rowstride, src, dst_width, x, dx); + src += src_stride; + + for (j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lasty) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + src = src_argb + yi * src_stride; + } + if (yi != lasty) { + ScaleARGBFilterCols(rowptr, src, dst_width, x, dx); + rowptr += rowstride; + rowstride = -rowstride; + lasty = yi; + src += src_stride; + } + } + if (filtering == kFilterLinear) { + InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf); + } + dst_argb += dst_stride; + y += dy; + } + free_aligned_buffer_64(row); + } +} + +#ifdef YUVSCALEUP +// Scale YUV to ARGB up with bilinear interpolation. 
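+// Editor's note: this variant, compiled only when YUVSCALEUP is defined,
+// converts each newly needed source row to ARGB via I422ToARGBRow and then
+// feeds it through the same column-scale and row-interpolate machinery as
+// ScaleARGBBilinearUp above.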
+static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride_y, + int src_stride_u, + int src_stride_v, + int dst_stride_argb, + const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int x, int dx, int y, int dy, + enum FilterMode filtering) { + int j; + void (*I422ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToARGBRow_C; +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(src_width, 8)) { + I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && src_width >= 16) { + I422ToARGBRow = I422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(src_width, 16)) { + I422ToARGBRow = I422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && src_width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(src_width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + } +#endif + + void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_C; +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_Unaligned_SSE2; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + InterpolateRow = InterpolateRow_SSE2; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 1 && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + InterpolateRow = InterpolateRow_MIPS_DSPR2; + } +#endif + + void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) = + filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C; + if (src_width >= 32768) { + ScaleARGBFilterCols = filtering ? 
+ ScaleARGBFilterCols64_C : ScaleARGBCols64_C; + } +#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) + if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEARGBCOLS_SSE2) + if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBCols_SSE2; + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleARGBFilterCols = ScaleARGBColsUp2_C; +#if defined(HAS_SCALEARGBCOLSUP2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; + } +#endif + } + + const int max_y = (src_height - 1) << 16; + if (y > max_y) { + y = max_y; + } + const int kYShift = 1; // Shift Y by 1 to convert Y plane to UV coordinate. + int yi = y >> 16; + int uv_yi = yi >> kYShift; + const uint8* src_row_y = src_y + yi * src_stride_y; + const uint8* src_row_u = src_u + uv_yi * src_stride_u; + const uint8* src_row_v = src_v + uv_yi * src_stride_v; + + // Allocate 2 rows of ARGB. + const int kRowSize = (dst_width * 4 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); + + // Allocate 1 row of ARGB for source conversion. + align_buffer_64(argb_row, src_width * 4); + + uint8* rowptr = row; + int rowstride = kRowSize; + int lasty = yi; + + // TODO(fbarchard): Convert first 2 rows of YUV to ARGB. + ScaleARGBFilterCols(rowptr, src_row_y, dst_width, x, dx); + if (src_height > 1) { + src_row_y += src_stride_y; + if (yi & 1) { + src_row_u += src_stride_u; + src_row_v += src_stride_v; + } + } + ScaleARGBFilterCols(rowptr + rowstride, src_row_y, dst_width, x, dx); + if (src_height > 2) { + src_row_y += src_stride_y; + if (!(yi & 1)) { + src_row_u += src_stride_u; + src_row_v += src_stride_v; + } + } + + for (j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lasty) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + uv_yi = yi >> kYShift; + src_row_y = src_y + yi * src_stride_y; + src_row_u = src_u + uv_yi * src_stride_u; + src_row_v = src_v + uv_yi * src_stride_v; + } + if (yi != lasty) { + // TODO(fbarchard): Convert the clipped region of row. + I422ToARGBRow(src_row_y, src_row_u, src_row_v, argb_row, src_width); + ScaleARGBFilterCols(rowptr, argb_row, dst_width, x, dx); + rowptr += rowstride; + rowstride = -rowstride; + lasty = yi; + src_row_y += src_stride_y; + if (yi & 1) { + src_row_u += src_stride_u; + src_row_v += src_stride_v; + } + } + } + if (filtering == kFilterLinear) { + InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf); + } + dst_argb += dst_stride_argb; + y += dy; + } + free_aligned_buffer_64(row); + free_aligned_buffer_64(row_argb); +} +#endif + +// Scale ARGB to/from any dimensions, without interpolation. +// Fixed point math is used for performance: The upper 16 bits +// of x and dx is the integer part of the source position and +// the lower 16 bits are the fixed decimal part. + +static void ScaleARGBSimple(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_argb, uint8* dst_argb, + int x, int dx, int y, int dy) { + int j; + void (*ScaleARGBCols)(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) = + (src_width >= 32768) ? 
ScaleARGBCols64_C : ScaleARGBCols_C; +#if defined(HAS_SCALEARGBCOLS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { + ScaleARGBCols = ScaleARGBCols_SSE2; + } +#endif + if (src_width * 2 == dst_width && x < 0x8000) { + ScaleARGBCols = ScaleARGBColsUp2_C; +#if defined(HAS_SCALEARGBCOLSUP2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleARGBCols = ScaleARGBColsUp2_SSE2; + } +#endif + } + + for (j = 0; j < dst_height; ++j) { + ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride, + dst_width, x, dx); + dst_argb += dst_stride; + y += dy; + } +} + +// ScaleARGB a ARGB. +// This function in turn calls a scaling function +// suitable for handling the desired resolutions. +static void ScaleARGB(const uint8* src, int src_stride, + int src_width, int src_height, + uint8* dst, int dst_stride, + int dst_width, int dst_height, + int clip_x, int clip_y, int clip_width, int clip_height, + enum FilterMode filtering) { + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + // ARGB does not support box filter yet, but allow the user to pass it. + // Simplify filtering when possible. + filtering = ScaleFilterReduce(src_width, src_height, + dst_width, dst_height, + filtering); + + // Negative src_height means invert the image. + if (src_height < 0) { + src_height = -src_height; + src = src + (src_height - 1) * src_stride; + src_stride = -src_stride; + } + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, + &x, &y, &dx, &dy); + src_width = Abs(src_width); + if (clip_x) { + int64 clipf = (int64)(clip_x) * dx; + x += (clipf & 0xffff); + src += (clipf >> 16) * 4; + dst += clip_x * 4; + } + if (clip_y) { + int64 clipf = (int64)(clip_y) * dy; + y += (clipf & 0xffff); + src += (clipf >> 16) * src_stride; + dst += clip_y * dst_stride; + } + + // Special case for integer step values. + if (((dx | dy) & 0xffff) == 0) { + if (!dx || !dy) { // 1 pixel wide and/or tall. + filtering = kFilterNone; + } else { + // Optimized even scale down. ie 2, 4, 6, 8, 10x. + if (!(dx & 0x10000) && !(dy & 0x10000)) { + if (dx == 0x20000) { + // Optimized 1/2 downsample. + ScaleARGBDown2(src_width, src_height, + clip_width, clip_height, + src_stride, dst_stride, src, dst, + x, dx, y, dy, filtering); + return; + } + if (dx == 0x40000 && filtering == kFilterBox) { + // Optimized 1/4 box downsample. + ScaleARGBDown4Box(src_width, src_height, + clip_width, clip_height, + src_stride, dst_stride, src, dst, + x, dx, y, dy); + return; + } + ScaleARGBDownEven(src_width, src_height, + clip_width, clip_height, + src_stride, dst_stride, src, dst, + x, dx, y, dy, filtering); + return; + } + // Optimized odd scale down. ie 3, 5, 7, 9x. + if ((dx & 0x10000) && (dy & 0x10000)) { + filtering = kFilterNone; + if (dx == 0x10000 && dy == 0x10000) { + // Straight copy. + ARGBCopy(src + (y >> 16) * src_stride + (x >> 16) * 4, src_stride, + dst, dst_stride, clip_width, clip_height); + return; + } + } + } + } + if (dx == 0x10000 && (x & 0xffff) == 0) { + // Arbitrary scale vertically, but unscaled vertically. 
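+ // Editor's note: dx == 0x10000 is a horizontal step of exactly 1.0 in
+ // 16.16 fixed point, i.e. the width is already unscaled here and only the
+ // vertical direction needs work.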
+ ScalePlaneVertical(src_height,
+ clip_width, clip_height,
+ src_stride, dst_stride, src, dst,
+ x, y, dy, 4, filtering);
+ return;
+ }
+ if (filtering && dy < 65536) {
+ ScaleARGBBilinearUp(src_width, src_height,
+ clip_width, clip_height,
+ src_stride, dst_stride, src, dst,
+ x, dx, y, dy, filtering);
+ return;
+ }
+ if (filtering) {
+ ScaleARGBBilinearDown(src_width, src_height,
+ clip_width, clip_height,
+ src_stride, dst_stride, src, dst,
+ x, dx, y, dy, filtering);
+ return;
+ }
+ ScaleARGBSimple(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst,
+ x, dx, y, dy);
+}
+
+LIBYUV_API
+int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
+ int src_width, int src_height,
+ uint8* dst_argb, int dst_stride_argb,
+ int dst_width, int dst_height,
+ int clip_x, int clip_y, int clip_width, int clip_height,
+ enum FilterMode filtering) {
+ if (!src_argb || src_width == 0 || src_height == 0 ||
+ !dst_argb || dst_width <= 0 || dst_height <= 0 ||
+ clip_x < 0 || clip_y < 0 ||
+ (clip_x + clip_width) > dst_width ||
+ (clip_y + clip_height) > dst_height) {
+ return -1;
+ }
+ ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
+ dst_argb, dst_stride_argb, dst_width, dst_height,
+ clip_x, clip_y, clip_width, clip_height, filtering);
+ return 0;
+}
+
+// Scale an ARGB image.
+LIBYUV_API
+int ARGBScale(const uint8* src_argb, int src_stride_argb,
+ int src_width, int src_height,
+ uint8* dst_argb, int dst_stride_argb,
+ int dst_width, int dst_height,
+ enum FilterMode filtering) {
+ if (!src_argb || src_width == 0 || src_height == 0 ||
+ !dst_argb || dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+ ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
+ dst_argb, dst_stride_argb, dst_width, dst_height,
+ 0, 0, dst_width, dst_height, filtering);
+ return 0;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/scale_common.cc b/third_party/libyuv/source/scale_common.cc
index 595ad66ba..e4b2acc41 100644
--- a/third_party/libyuv/source/scale_common.cc
+++ b/third_party/libyuv/source/scale_common.cc
@@ -8,15 +8,15 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "third_party/libyuv/include/libyuv/scale.h"
+#include "libyuv/scale.h"
#include <assert.h>
#include <string.h>
-#include "third_party/libyuv/include/libyuv/cpu_id.h"
-#include "third_party/libyuv/include/libyuv/planar_functions.h" // CopyARGB
-#include "third_party/libyuv/include/libyuv/row.h"
-#include "third_party/libyuv/include/libyuv/scale_row.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyARGB
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
#ifdef __cplusplus
namespace libyuv {
diff --git a/third_party/libyuv/source/scale_mips.cc b/third_party/libyuv/source/scale_mips.cc
index 5722dea80..3eb4f27c4 100644
--- a/third_party/libyuv/source/scale_mips.cc
+++ b/third_party/libyuv/source/scale_mips.cc
@@ -8,8 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/ -#include "third_party/libyuv/include/libyuv/basic_types.h" -#include "third_party/libyuv/include/libyuv/row.h" +#include "libyuv/basic_types.h" +#include "libyuv/row.h" #ifdef __cplusplus namespace libyuv { @@ -18,7 +18,8 @@ extern "C" { // This module is for GCC MIPS DSPR2 #if !defined(LIBYUV_DISABLE_MIPS) && \ - defined(__mips_dsp) && (__mips_dsp_rev >= 2) + defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \ + (_MIPS_SIM == _MIPS_SIM_ABI32) void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width) { diff --git a/third_party/libyuv/source/scale_neon.cc b/third_party/libyuv/source/scale_neon.cc index 704cfd251..1b8a5ba58 100644 --- a/third_party/libyuv/source/scale_neon.cc +++ b/third_party/libyuv/source/scale_neon.cc @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "third_party/libyuv/include/libyuv/row.h" +#include "libyuv/row.h" #ifdef __cplusplus namespace libyuv { @@ -28,8 +28,10 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ".p2align 2 \n" "1: \n" // load even pixels into q0, odd into q1 + MEMACCESS(0) "vld2.8 {q0, q1}, [%0]! \n" "subs %2, %2, #16 \n" // 16 processed per loop + MEMACCESS(1) "vst1.8 {q1}, [%1]! \n" // store odd pixels "bgt 1b \n" : "+r"(src_ptr), // %0 @@ -48,7 +50,9 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, "add %1, %0 \n" ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc + MEMACCESS(1) "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc "subs %3, %3, #16 \n" // 16 processed per loop "vpaddl.u8 q0, q0 \n" // row 1 add adjacent @@ -57,6 +61,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, "vpadal.u8 q1, q3 \n" "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack "vrshrn.u16 d1, q1, #2 \n" + MEMACCESS(2) "vst1.8 {q0}, [%2]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 @@ -73,8 +78,10 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "subs %2, %2, #8 \n" // 8 processed per loop + MEMACCESS(1) "vst1.8 {d2}, [%1]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 @@ -87,16 +94,20 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { - asm volatile ( - "add r4, %0, %3 \n" - "add r5, r4, %3 \n" - "add %3, r5, %3 \n" + const uint8* src_ptr1 = src_ptr + src_stride; + const uint8* src_ptr2 = src_ptr + src_stride * 2; + const uint8* src_ptr3 = src_ptr + src_stride * 3; +asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load up 16x4 - "vld1.8 {q1}, [r4]! \n" - "vld1.8 {q2}, [r5]! \n" - "vld1.8 {q3}, [%3]! \n" + MEMACCESS(3) + "vld1.8 {q1}, [%3]! \n" + MEMACCESS(4) + "vld1.8 {q2}, [%4]! \n" + MEMACCESS(5) + "vld1.8 {q3}, [%5]! \n" "subs %2, %2, #4 \n" "vpaddl.u8 q0, q0 \n" "vpadal.u8 q0, q1 \n" @@ -105,13 +116,17 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, "vpaddl.u16 q0, q0 \n" "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding "vmovn.u16 d0, q0 \n" + MEMACCESS(1) "vst1.32 {d0[0]}, [%1]! 
\n" "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride) // %3 - : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_ptr1), // %3 + "+r"(src_ptr2), // %4 + "+r"(src_ptr3) // %5 + : + : "q0", "q1", "q2", "q3", "memory", "cc" ); } @@ -124,9 +139,11 @@ void ScaleRowDown34_NEON(const uint8* src_ptr, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "subs %2, %2, #24 \n" "vmov d2, d3 \n" // order d0, d1, d2 + MEMACCESS(1) "vst3.8 {d0, d1, d2}, [%1]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 @@ -145,7 +162,9 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, "add %3, %0 \n" ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + MEMACCESS(3) "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 "subs %2, %2, #24 \n" @@ -182,6 +201,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, "vmlal.u8 q8, d3, d24 \n" "vqrshrn.u16 d2, q8, #2 \n" + MEMACCESS(1) "vst3.8 {d0, d1, d2}, [%1]! \n" "bgt 1b \n" @@ -202,7 +222,9 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, "add %3, %0 \n" ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + MEMACCESS(3) "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 "subs %2, %2, #24 \n" // average src line 0 with src line 1 @@ -222,6 +244,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, "vmlal.u8 q3, d3, d24 \n" "vqrshrn.u16 d2, q3, #2 \n" + MEMACCESS(1) "vst3.8 {d0, d1, d2}, [%1]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 @@ -250,14 +273,18 @@ void ScaleRowDown38_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( + MEMACCESS(3) "vld1.8 {q3}, [%3] \n" ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {d0, d1, d2, d3}, [%0]! \n" "subs %2, %2, #12 \n" "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" + MEMACCESS(1) "vst1.8 {d4}, [%1]! \n" + MEMACCESS(1) "vst1.32 {d5[0]}, [%1]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 @@ -272,11 +299,15 @@ void ScaleRowDown38_NEON(const uint8* src_ptr, void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { + const uint8* src_ptr1 = src_ptr + src_stride * 2; + asm volatile ( - "vld1.16 {q13}, [%4] \n" - "vld1.8 {q14}, [%5] \n" - "vld1.8 {q15}, [%6] \n" - "add r4, %0, %3, lsl #1 \n" + MEMACCESS(5) + "vld1.16 {q13}, [%5] \n" + MEMACCESS(6) + "vld1.8 {q14}, [%6] \n" + MEMACCESS(7) + "vld1.8 {q15}, [%7] \n" "add %3, %0 \n" ".p2align 2 \n" "1: \n" @@ -285,9 +316,12 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, // d1 = 10 50 11 51 12 52 13 53 // d2 = 20 60 21 61 22 62 23 63 // d3 = 30 70 31 71 32 72 33 73 + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + MEMACCESS(3) "vld4.8 {d4, d5, d6, d7}, [%3]! \n" - "vld4.8 {d16, d17, d18, d19}, [r4]! \n" + MEMACCESS(4) + "vld4.8 {d16, d17, d18, d19}, [%4]! \n" "subs %2, %2, #12 \n" // Shuffle the input data around to get align the data @@ -364,18 +398,20 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, "vtbl.u8 d3, {d0, d1, d2}, d28 \n" "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + MEMACCESS(1) "vst1.8 {d3}, [%1]! \n" + MEMACCESS(1) "vst1.32 {d4[0]}, [%1]! 
\n" "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : "r"(&kMult38_Div6), // %4 - "r"(&kShuf38_2), // %5 - "r"(&kMult38_Div9) // %6 - : "r4", "q0", "q1", "q2", "q3", "q8", "q9", - "q13", "q14", "q15", "memory", "cc" + "+r"(src_stride), // %3 + "+r"(src_ptr1) // %4 + : "r"(&kMult38_Div6), // %5 + "r"(&kShuf38_2), // %6 + "r"(&kMult38_Div9) // %7 + : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc" ); } @@ -384,7 +420,9 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( + MEMACCESS(4) "vld1.16 {q13}, [%4] \n" + MEMACCESS(5) "vld1.8 {q14}, [%5] \n" "add %3, %0 \n" ".p2align 2 \n" @@ -394,7 +432,9 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, // d1 = 10 50 11 51 12 52 13 53 // d2 = 20 60 21 61 22 62 23 63 // d3 = 30 70 31 71 32 72 33 73 + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + MEMACCESS(3) "vld4.8 {d4, d5, d6, d7}, [%3]! \n" "subs %2, %2, #12 \n" @@ -461,7 +501,9 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, "vtbl.u8 d3, {d0, d1, d2}, d28 \n" "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + MEMACCESS(1) "vst1.8 {d3}, [%1]! \n" + MEMACCESS(1) "vst1.32 {d4[0]}, [%1]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 @@ -494,7 +536,9 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, "vdup.8 d4, %4 \n" // General purpose row blend. "1: \n" + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) "vld1.8 {q1}, [%2]! \n" "subs %3, %3, #16 \n" "vmull.u8 q13, d0, d4 \n" @@ -503,50 +547,63 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, "vmlal.u8 q14, d3, d5 \n" "vrshrn.u16 d0, q13, #8 \n" "vrshrn.u16 d1, q14, #8 \n" + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" "bgt 1b \n" "b 99f \n" // Blend 25 / 75. "25: \n" + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) "vld1.8 {q1}, [%2]! \n" "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" "bgt 25b \n" "b 99f \n" // Blend 50 / 50. "50: \n" + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) "vld1.8 {q1}, [%2]! \n" "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" "bgt 50b \n" "b 99f \n" // Blend 75 / 25. "75: \n" + MEMACCESS(1) "vld1.8 {q1}, [%1]! \n" + MEMACCESS(2) "vld1.8 {q0}, [%2]! \n" "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" "bgt 75b \n" "b 99f \n" // Blend 100 / 0 - Copy row unchanged. "100: \n" + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" "subs %3, %3, #16 \n" + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" "bgt 100b \n" "99: \n" + MEMACCESS(0) "vst1.8 {d1[7]}, [%0] \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 @@ -564,10 +621,14 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ".p2align 2 \n" "1: \n" // load even pixels into q0, odd into q1 + MEMACCESS(0) "vld2.32 {q0, q1}, [%0]! \n" + MEMACCESS(0) "vld2.32 {q2, q3}, [%0]! \n" "subs %2, %2, #8 \n" // 8 processed per loop + MEMACCESS(1) "vst1.8 {q1}, [%1]! \n" // store odd pixels + MEMACCESS(1) "vst1.8 {q3}, [%1]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 @@ -585,14 +646,18 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, "add %1, %1, %0 \n" ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. 
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. + MEMACCESS(1) "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels. + MEMACCESS(1) "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels. "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. @@ -602,6 +667,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, "vrshrn.u16 d1, q1, #2 \n" "vrshrn.u16 d2, q2, #2 \n" "vrshrn.u16 d3, q3, #2 \n" + MEMACCESS(2) "vst4.8 {d0, d1, d2, d3}, [%2]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 @@ -621,11 +687,16 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, "mov r12, %3, lsl #2 \n" ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.32 {d0[0]}, [%0], r12 \n" + MEMACCESS(0) "vld1.32 {d0[1]}, [%0], r12 \n" + MEMACCESS(0) "vld1.32 {d1[0]}, [%0], r12 \n" + MEMACCESS(0) "vld1.32 {d1[1]}, [%0], r12 \n" "subs %2, %2, #4 \n" // 4 pixels per loop. + MEMACCESS(1) "vst1.8 {q0}, [%1]! \n" "bgt 1b \n" : "+r"(src_argb), // %0 @@ -646,13 +717,21 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, "add %1, %1, %0 \n" ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1 + MEMACCESS(1) "vld1.8 {d1}, [%1], r12 \n" + MEMACCESS(0) "vld1.8 {d2}, [%0], r12 \n" + MEMACCESS(1) "vld1.8 {d3}, [%1], r12 \n" + MEMACCESS(0) "vld1.8 {d4}, [%0], r12 \n" + MEMACCESS(1) "vld1.8 {d5}, [%1], r12 \n" + MEMACCESS(0) "vld1.8 {d6}, [%0], r12 \n" + MEMACCESS(1) "vld1.8 {d7}, [%1], r12 \n" "vaddl.u8 q0, d0, d1 \n" "vaddl.u8 q1, d2, d3 \n" @@ -665,6 +744,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. "subs %3, %3, #4 \n" // 4 pixels per loop. + MEMACCESS(2) "vst1.8 {q0}, [%2]! \n" "bgt 1b \n" : "+r"(src_argb), // %0 diff --git a/third_party/libyuv/source/scale_posix.cc b/third_party/libyuv/source/scale_posix.cc index 18b081026..352e66782 100644 --- a/third_party/libyuv/source/scale_posix.cc +++ b/third_party/libyuv/source/scale_posix.cc @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "third_party/libyuv/include/libyuv/row.h" +#include "libyuv/row.h" #ifdef __cplusplus namespace libyuv { diff --git a/third_party/libyuv/source/scale_win.cc b/third_party/libyuv/source/scale_win.cc index bd5cca8af..840b9738d 100644 --- a/third_party/libyuv/source/scale_win.cc +++ b/third_party/libyuv/source/scale_win.cc @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "third_party/libyuv/include/libyuv/row.h" +#include "libyuv/row.h" #ifdef __cplusplus namespace libyuv { diff --git a/third_party/libyuv/source/video_common.cc b/third_party/libyuv/source/video_common.cc new file mode 100644 index 000000000..efbedf46e --- /dev/null +++ b/third_party/libyuv/source/video_common.cc @@ -0,0 +1,64 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#include "libyuv/video_common.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof(x[0])) + +struct FourCCAliasEntry { + uint32 alias; + uint32 canonical; +}; + +static const struct FourCCAliasEntry kFourCCAliases[] = { + {FOURCC_IYUV, FOURCC_I420}, + {FOURCC_YU16, FOURCC_I422}, + {FOURCC_YU24, FOURCC_I444}, + {FOURCC_YUYV, FOURCC_YUY2}, + {FOURCC_YUVS, FOURCC_YUY2}, // kCMPixelFormat_422YpCbCr8_yuvs + {FOURCC_HDYC, FOURCC_UYVY}, + {FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8 + {FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not. + {FOURCC_DMB1, FOURCC_MJPG}, + {FOURCC_BA81, FOURCC_BGGR}, + {FOURCC_RGB3, FOURCC_RAW }, + {FOURCC_BGR3, FOURCC_24BG}, + {FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB + {FOURCC_CM24, FOURCC_RAW }, // kCMPixelFormat_24RGB + {FOURCC_L555, FOURCC_RGBO}, // kCMPixelFormat_16LE555 + {FOURCC_L565, FOURCC_RGBP}, // kCMPixelFormat_16LE565 + {FOURCC_5551, FOURCC_RGBO}, // kCMPixelFormat_16LE5551 +}; +// TODO(fbarchard): Consider mapping kCMPixelFormat_32BGRA to FOURCC_ARGB. +// {FOURCC_BGRA, FOURCC_ARGB}, // kCMPixelFormat_32BGRA + +LIBYUV_API +uint32 CanonicalFourCC(uint32 fourcc) { + int i; + for (i = 0; i < ARRAY_SIZE(kFourCCAliases); ++i) { + if (kFourCCAliases[i].alias == fourcc) { + return kFourCCAliases[i].canonical; + } + } + // Not an alias, so return it as-is. + return fourcc; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + diff --git a/third_party/libyuv/source/x86inc.asm b/third_party/libyuv/source/x86inc.asm new file mode 100644 index 000000000..cb5c32df3 --- /dev/null +++ b/third_party/libyuv/source/x86inc.asm @@ -0,0 +1,1136 @@ +;***************************************************************************** +;* x86inc.asm: x264asm abstraction layer +;***************************************************************************** +;* Copyright (C) 2005-2012 x264 project +;* +;* Authors: Loren Merritt +;* Anton Mitrofanov +;* Jason Garrett-Glaser +;* Henrik Gramner +;* +;* Permission to use, copy, modify, and/or distribute this software for any +;* purpose with or without fee is hereby granted, provided that the above +;* copyright notice and this permission notice appear in all copies. +;* +;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +;***************************************************************************** + +; This is a header file for the x264ASM assembly language, which uses +; NASM/YASM syntax combined with a large number of macros to provide easy +; abstraction between different calling conventions (x86_32, win64, linux64). +; It also has various other useful features to simplify writing the kind of +; DSP functions that are most often used in x264. + +; Unlike the rest of x264, this file is available under an ISC license, as it +; has significant usefulness outside of x264 and we want it to be available +; to the largest audience possible. 
Of course, if you modify it for your own +; purposes to add a new feature, we strongly encourage contributing a patch +; as this feature might be useful for others as well. Send patches or ideas +; to x264-devel@videolan.org . + +; Local changes for libyuv: +; remove %define program_name and references in labels +; rename cpus to uppercase + +%define WIN64 0 +%define UNIX64 0 +%if ARCH_X86_64 + %ifidn __OUTPUT_FORMAT__,win32 + %define WIN64 1 + %elifidn __OUTPUT_FORMAT__,win64 + %define WIN64 1 + %else + %define UNIX64 1 + %endif +%endif + +%ifdef PREFIX + %define mangle(x) _ %+ x +%else + %define mangle(x) x +%endif + +; Name of the .rodata section. +; Kludge: Something on OS X fails to align .rodata even given an align attribute, +; so use a different read-only section. +%macro SECTION_RODATA 0-1 16 + %ifidn __OUTPUT_FORMAT__,macho64 + SECTION .text align=%1 + %elifidn __OUTPUT_FORMAT__,macho + SECTION .text align=%1 + fakegot: + %elifidn __OUTPUT_FORMAT__,aout + section .text + %else + SECTION .rodata align=%1 + %endif +%endmacro + +; aout does not support align= +%macro SECTION_TEXT 0-1 16 + %ifidn __OUTPUT_FORMAT__,aout + SECTION .text + %else + SECTION .text align=%1 + %endif +%endmacro + +%if WIN64 + %define PIC +%elif ARCH_X86_64 == 0 +; x86_32 doesn't require PIC. +; Some distros prefer shared objects to be PIC, but nothing breaks if +; the code contains a few textrels, so we'll skip that complexity. + %undef PIC +%endif +%ifdef PIC + default rel +%endif + +; Always use long nops (reduces 0x90 spam in disassembly on x86_32) +CPU amdnop + +; Macros to eliminate most code duplication between x86_32 and x86_64: +; Currently this works only for leaf functions which load all their arguments +; into registers at the start, and make no other use of the stack. Luckily that +; covers most of x264's asm. + +; PROLOGUE: +; %1 = number of arguments. loads them from stack if needed. +; %2 = number of registers used. pushes callee-saved regs if needed. +; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. +; %4 = list of names to define to registers +; PROLOGUE can also be invoked by adding the same options to cglobal + +; e.g. +; cglobal foo, 2,3,0, dst, src, tmp +; declares a function (foo), taking two args (dst and src) and one local variable (tmp) + +; TODO Some functions can use some args directly from the stack. If they're the +; last args then you can just not declare them, but if they're in the middle +; we need more flexible macro. + +; RET: +; Pops anything that was pushed by PROLOGUE, and returns. + +; REP_RET: +; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons +; which are slow when a normal ret follows a branch. 
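+; Editor's note: as a concrete illustration (reusing SplitUVRow from
+; row_x86.asm earlier in this patch),
+;   cglobal SplitUVRow, 4, 4, 5, src_uv, dst_u, dst_v, pix
+; instantiated under INIT_XMM SSE2 yields a symbol callable from C as
+;   void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u,
+;                        uint8* dst_v, int pix);
+; with all four named arguments loaded into registers by PROLOGUE regardless
+; of the host calling convention.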
+ +; registers: +; rN and rNq are the native-size register holding function argument N +; rNd, rNw, rNb are dword, word, and byte size +; rNh is the high 8 bits of the word size +; rNm is the original location of arg N (a register or on the stack), dword +; rNmp is native size + +%macro DECLARE_REG 2-3 + %define r%1q %2 + %define r%1d %2d + %define r%1w %2w + %define r%1b %2b + %define r%1h %2h + %if %0 == 2 + %define r%1m %2d + %define r%1mp %2 + %elif ARCH_X86_64 ; memory + %define r%1m [rsp + stack_offset + %3] + %define r%1mp qword r %+ %1m + %else + %define r%1m [esp + stack_offset + %3] + %define r%1mp dword r %+ %1m + %endif + %define r%1 %2 +%endmacro + +%macro DECLARE_REG_SIZE 3 + %define r%1q r%1 + %define e%1q r%1 + %define r%1d e%1 + %define e%1d e%1 + %define r%1w %1 + %define e%1w %1 + %define r%1h %3 + %define e%1h %3 + %define r%1b %2 + %define e%1b %2 +%if ARCH_X86_64 == 0 + %define r%1 e%1 +%endif +%endmacro + +DECLARE_REG_SIZE ax, al, ah +DECLARE_REG_SIZE bx, bl, bh +DECLARE_REG_SIZE cx, cl, ch +DECLARE_REG_SIZE dx, dl, dh +DECLARE_REG_SIZE si, sil, null +DECLARE_REG_SIZE di, dil, null +DECLARE_REG_SIZE bp, bpl, null + +; t# defines for when per-arch register allocation is more complex than just function arguments + +%macro DECLARE_REG_TMP 1-* + %assign %%i 0 + %rep %0 + CAT_XDEFINE t, %%i, r%1 + %assign %%i %%i+1 + %rotate 1 + %endrep +%endmacro + +%macro DECLARE_REG_TMP_SIZE 0-* + %rep %0 + %define t%1q t%1 %+ q + %define t%1d t%1 %+ d + %define t%1w t%1 %+ w + %define t%1h t%1 %+ h + %define t%1b t%1 %+ b + %rotate 1 + %endrep +%endmacro + +DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 + +%if ARCH_X86_64 + %define gprsize 8 +%else + %define gprsize 4 +%endif + +%macro PUSH 1 + push %1 + %assign stack_offset stack_offset+gprsize +%endmacro + +%macro POP 1 + pop %1 + %assign stack_offset stack_offset-gprsize +%endmacro + +%macro PUSH_IF_USED 1-* + %rep %0 + %if %1 < regs_used + PUSH r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro POP_IF_USED 1-* + %rep %0 + %if %1 < regs_used + pop r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro LOAD_IF_USED 1-* + %rep %0 + %if %1 < num_args + mov r%1, r %+ %1 %+ mp + %endif + %rotate 1 + %endrep +%endmacro + +%macro SUB 2 + sub %1, %2 + %ifidn %1, rsp + %assign stack_offset stack_offset+(%2) + %endif +%endmacro + +%macro ADD 2 + add %1, %2 + %ifidn %1, rsp + %assign stack_offset stack_offset-(%2) + %endif +%endmacro + +%macro movifnidn 2 + %ifnidn %1, %2 + mov %1, %2 + %endif +%endmacro + +%macro movsxdifnidn 2 + %ifnidn %1, %2 + movsxd %1, %2 + %endif +%endmacro + +%macro ASSERT 1 + %if (%1) == 0 + %error assert failed + %endif +%endmacro + +%macro DEFINE_ARGS 0-* + %ifdef n_arg_names + %assign %%i 0 + %rep n_arg_names + CAT_UNDEF arg_name %+ %%i, q + CAT_UNDEF arg_name %+ %%i, d + CAT_UNDEF arg_name %+ %%i, w + CAT_UNDEF arg_name %+ %%i, h + CAT_UNDEF arg_name %+ %%i, b + CAT_UNDEF arg_name %+ %%i, m + CAT_UNDEF arg_name %+ %%i, mp + CAT_UNDEF arg_name, %%i + %assign %%i %%i+1 + %endrep + %endif + + %xdefine %%stack_offset stack_offset + %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine + %assign %%i 0 + %rep %0 + %xdefine %1q r %+ %%i %+ q + %xdefine %1d r %+ %%i %+ d + %xdefine %1w r %+ %%i %+ w + %xdefine %1h r %+ %%i %+ h + %xdefine %1b r %+ %%i %+ b + %xdefine %1m r %+ %%i %+ m + %xdefine %1mp r %+ %%i %+ mp + CAT_XDEFINE arg_name, %%i, %1 + %assign %%i %%i+1 + %rotate 1 + %endrep + %xdefine stack_offset %%stack_offset + %assign n_arg_names %0 +%endmacro + 
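+; Editor's note: the three branches below bind argument registers per ABI:
+; Win64 passes the first four integer args in rcx/rdx/r8/r9, System V x86-64
+; passes the first six in rdi/rsi/rdx/rcx/r8/r9, and x86_32 takes all
+; arguments from the stack.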
+%if WIN64 ; Windows x64 ;================================================= + +DECLARE_REG 0, rcx +DECLARE_REG 1, rdx +DECLARE_REG 2, R8 +DECLARE_REG 3, R9 +DECLARE_REG 4, R10, 40 +DECLARE_REG 5, R11, 48 +DECLARE_REG 6, rax, 56 +DECLARE_REG 7, rdi, 64 +DECLARE_REG 8, rsi, 72 +DECLARE_REG 9, rbx, 80 +DECLARE_REG 10, rbp, 88 +DECLARE_REG 11, R12, 96 +DECLARE_REG 12, R13, 104 +DECLARE_REG 13, R14, 112 +DECLARE_REG 14, R15, 120 + +%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names... + %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + ASSERT regs_used <= 15 + PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 + %if mmsize == 8 + %assign xmm_regs_used 0 + %else + WIN64_SPILL_XMM %3 + %endif + LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 + DEFINE_ARGS %4 +%endmacro + +%macro WIN64_SPILL_XMM 1 + %assign xmm_regs_used %1 + ASSERT xmm_regs_used <= 16 + %if xmm_regs_used > 6 + SUB rsp, (xmm_regs_used-6)*16+16 + %assign %%i xmm_regs_used + %rep (xmm_regs_used-6) + %assign %%i %%i-1 + movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i + %endrep + %endif +%endmacro + +%macro WIN64_RESTORE_XMM_INTERNAL 1 + %if xmm_regs_used > 6 + %assign %%i xmm_regs_used + %rep (xmm_regs_used-6) + %assign %%i %%i-1 + movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)] + %endrep + add %1, (xmm_regs_used-6)*16+16 + %endif +%endmacro + +%macro WIN64_RESTORE_XMM 1 + WIN64_RESTORE_XMM_INTERNAL %1 + %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16 + %assign xmm_regs_used 0 +%endmacro + +%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 + +%macro RET 0 + WIN64_RESTORE_XMM_INTERNAL rsp + POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 +%if mmsize == 32 + vzeroupper +%endif + ret +%endmacro + +%elif ARCH_X86_64 ; *nix x64 ;============================================= + +DECLARE_REG 0, rdi +DECLARE_REG 1, rsi +DECLARE_REG 2, rdx +DECLARE_REG 3, rcx +DECLARE_REG 4, R8 +DECLARE_REG 5, R9 +DECLARE_REG 6, rax, 8 +DECLARE_REG 7, R10, 16 +DECLARE_REG 8, R11, 24 +DECLARE_REG 9, rbx, 32 +DECLARE_REG 10, rbp, 40 +DECLARE_REG 11, R12, 48 +DECLARE_REG 12, R13, 56 +DECLARE_REG 13, R14, 64 +DECLARE_REG 14, R15, 72 + +%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... + %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + ASSERT regs_used <= 15 + PUSH_IF_USED 9, 10, 11, 12, 13, 14 + LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 + DEFINE_ARGS %4 +%endmacro + +%define has_epilogue regs_used > 9 || mmsize == 32 + +%macro RET 0 + POP_IF_USED 14, 13, 12, 11, 10, 9 +%if mmsize == 32 + vzeroupper +%endif + ret +%endmacro + +%else ; X86_32 ;============================================================== + +DECLARE_REG 0, eax, 4 +DECLARE_REG 1, ecx, 8 +DECLARE_REG 2, edx, 12 +DECLARE_REG 3, ebx, 16 +DECLARE_REG 4, esi, 20 +DECLARE_REG 5, edi, 24 +DECLARE_REG 6, ebp, 28 +%define rsp esp + +%macro DECLARE_ARG 1-* + %rep %0 + %define r%1m [esp + stack_offset + 4*%1 + 4] + %define r%1mp dword r%1m + %rotate 1 + %endrep +%endmacro + +DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 + +%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... 
+ %assign num_args %1 + %assign regs_used %2 + %if regs_used > 7 + %assign regs_used 7 + %endif + ASSERT regs_used >= num_args + PUSH_IF_USED 3, 4, 5, 6 + LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 + DEFINE_ARGS %4 +%endmacro + +%define has_epilogue regs_used > 3 || mmsize == 32 + +%macro RET 0 + POP_IF_USED 6, 5, 4, 3 +%if mmsize == 32 + vzeroupper +%endif + ret +%endmacro + +%endif ;====================================================================== + +%if WIN64 == 0 +%macro WIN64_SPILL_XMM 1 +%endmacro +%macro WIN64_RESTORE_XMM 1 +%endmacro +%endif + +%macro REP_RET 0 + %if has_epilogue + RET + %else + rep ret + %endif +%endmacro + +%macro TAIL_CALL 2 ; callee, is_nonadjacent + %if has_epilogue + call %1 + RET + %elif %2 + jmp %1 + %endif +%endmacro + +;============================================================================= +; arch-independent part +;============================================================================= + +%assign function_align 16 + +; Begin a function. +; Applies any symbol mangling needed for C linkage, and sets up a define such that +; subsequent uses of the function name automatically refer to the mangled version. +; Appends cpuflags to the function name if cpuflags has been specified. +%macro cglobal 1-2+ ; name, [PROLOGUE args] +%if %0 == 1 + cglobal_internal %1 %+ SUFFIX +%else + cglobal_internal %1 %+ SUFFIX, %2 +%endif +%endmacro +%macro cglobal_internal 1-2+ + %ifndef cglobaled_%1 + %xdefine %1 mangle(%1) + %xdefine %1.skip_prologue %1 %+ .skip_prologue + CAT_XDEFINE cglobaled_, %1, 1 + %endif + %xdefine current_function %1 + %ifidn __OUTPUT_FORMAT__,elf + global %1:function hidden + %else + global %1 + %endif + align function_align + %1: + RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer + %assign stack_offset 0 + %if %0 > 1 + PROLOGUE %2 + %endif +%endmacro + +%macro cextern 1 + %xdefine %1 mangle(%1) + CAT_XDEFINE cglobaled_, %1, 1 + extern %1 +%endmacro + +; like cextern, but without the prefix +%macro cextern_naked 1 + %xdefine %1 mangle(%1) + CAT_XDEFINE cglobaled_, %1, 1 + extern %1 +%endmacro + +%macro const 2+ + %xdefine %1 mangle(%1) + global %1 + %1: %2 +%endmacro + +; This is needed for ELF, otherwise the GNU linker assumes the stack is +; executable by default. 
+%ifidn __OUTPUT_FORMAT__,elf
+SECTION .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf32
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+
+; cpuflags
+
+%assign cpuflags_MMX      (1<<0)
+%assign cpuflags_MMX2     (1<<1) | cpuflags_MMX
+%assign cpuflags_3dnow    (1<<2) | cpuflags_MMX
+%assign cpuflags_3dnow2   (1<<3) | cpuflags_3dnow
+%assign cpuflags_SSE      (1<<4) | cpuflags_MMX2
+%assign cpuflags_SSE2     (1<<5) | cpuflags_SSE
+%assign cpuflags_SSE2slow (1<<6) | cpuflags_SSE2
+%assign cpuflags_SSE3     (1<<7) | cpuflags_SSE2
+%assign cpuflags_SSSE3    (1<<8) | cpuflags_SSE3
+%assign cpuflags_SSE4     (1<<9) | cpuflags_SSSE3
+%assign cpuflags_SSE42    (1<<10)| cpuflags_SSE4
+%assign cpuflags_AVX      (1<<11)| cpuflags_SSE42
+%assign cpuflags_xop      (1<<12)| cpuflags_AVX
+%assign cpuflags_fma4     (1<<13)| cpuflags_AVX
+%assign cpuflags_AVX2     (1<<14)| cpuflags_AVX
+%assign cpuflags_fma3     (1<<15)| cpuflags_AVX
+
+%assign cpuflags_cache32  (1<<16)
+%assign cpuflags_cache64  (1<<17)
+%assign cpuflags_slowctz  (1<<18)
+%assign cpuflags_lzcnt    (1<<19)
+%assign cpuflags_misalign (1<<20)
+%assign cpuflags_aligned  (1<<21) ; not a cpu feature, but a function variant
+%assign cpuflags_atom     (1<<22)
+%assign cpuflags_bmi1     (1<<23)
+%assign cpuflags_bmi2     (1<<24)|cpuflags_bmi1
+%assign cpuflags_tbm      (1<<25)|cpuflags_bmi1
+
+%define    cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
+%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
+
+; Takes up to 2 cpuflags from the above list.
+; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
+; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
+%macro INIT_CPUFLAGS 0-2 + %if %0 >= 1 + %xdefine cpuname %1 + %assign cpuflags cpuflags_%1 + %if %0 >= 2 + %xdefine cpuname %1_%2 + %assign cpuflags cpuflags | cpuflags_%2 + %endif + %xdefine SUFFIX _ %+ cpuname + %if cpuflag(AVX) + %assign AVX_enabled 1 + %endif + %if mmsize == 16 && notcpuflag(SSE2) + %define mova movaps + %define movu movups + %define movnta movntps + %endif + %if cpuflag(aligned) + %define movu mova + %elifidn %1, SSE3 + %define movu lddqu + %endif + %else + %xdefine SUFFIX + %undef cpuname + %undef cpuflags + %endif +%endmacro + +; merge MMX and SSE* + +%macro CAT_XDEFINE 3 + %xdefine %1%2 %3 +%endmacro + +%macro CAT_UNDEF 2 + %undef %1%2 +%endmacro + +%macro INIT_MMX 0-1+ + %assign AVX_enabled 0 + %define RESET_MM_PERMUTATION INIT_MMX %1 + %define mmsize 8 + %define num_mmregs 8 + %define mova movq + %define movu movq + %define movh movd + %define movnta movntq + %assign %%i 0 + %rep 8 + CAT_XDEFINE m, %%i, mm %+ %%i + CAT_XDEFINE nmm, %%i, %%i + %assign %%i %%i+1 + %endrep + %rep 8 + CAT_UNDEF m, %%i + CAT_UNDEF nmm, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +%macro INIT_XMM 0-1+ + %assign AVX_enabled 0 + %define RESET_MM_PERMUTATION INIT_XMM %1 + %define mmsize 16 + %define num_mmregs 8 + %if ARCH_X86_64 + %define num_mmregs 16 + %endif + %define mova movdqa + %define movu movdqu + %define movh movq + %define movnta movntdq + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, xmm %+ %%i + CAT_XDEFINE nxmm, %%i, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +%macro INIT_YMM 0-1+ + %assign AVX_enabled 1 + %define RESET_MM_PERMUTATION INIT_YMM %1 + %define mmsize 32 + %define num_mmregs 8 + %if ARCH_X86_64 + %define num_mmregs 16 + %endif + %define mova vmovaps + %define movu vmovups + %undef movh + %define movnta vmovntps + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, ymm %+ %%i + CAT_XDEFINE nymm, %%i, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +INIT_XMM + +; I often want to use macros that permute their arguments. e.g. there's no +; efficient way to implement butterfly or transpose or dct without swapping some +; arguments. +; +; I would like to not have to manually keep track of the permutations: +; If I insert a permutation in the middle of a function, it should automatically +; change everything that follows. For more complex macros I may also have multiple +; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. +; +; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that +; permutes its arguments. It's equivalent to exchanging the contents of the +; registers, except that this way you exchange the register names instead, so it +; doesn't cost any cycles. + +%macro PERMUTE 2-* ; takes a list of pairs to swap +%rep %0/2 + %xdefine tmp%2 m%2 + %xdefine ntmp%2 nm%2 + %rotate 2 +%endrep +%rep %0/2 + %xdefine m%1 tmp%2 + %xdefine nm%1 ntmp%2 + %undef tmp%2 + %undef ntmp%2 + %rotate 2 +%endrep +%endmacro + +%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs) +%rep %0-1 +%ifdef m%1 + %xdefine tmp m%1 + %xdefine m%1 m%2 + %xdefine m%2 tmp + CAT_XDEFINE n, m%1, %1 + CAT_XDEFINE n, m%2, %2 +%else + ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here. + ; Be careful using this mode in nested macros though, as in some cases there may be + ; other copies of m# that have already been dereferenced and don't get updated correctly. 
+ %xdefine %%n1 n %+ %1 + %xdefine %%n2 n %+ %2 + %xdefine tmp m %+ %%n1 + CAT_XDEFINE m, %%n1, m %+ %%n2 + CAT_XDEFINE m, %%n2, tmp + CAT_XDEFINE n, m %+ %%n1, %%n1 + CAT_XDEFINE n, m %+ %%n2, %%n2 +%endif + %undef tmp + %rotate 1 +%endrep +%endmacro + +; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later +; calls to that function will automatically load the permutation, so values can +; be returned in mmregs. +%macro SAVE_MM_PERMUTATION 0-1 + %if %0 + %xdefine %%f %1_m + %else + %xdefine %%f current_function %+ _m + %endif + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE %%f, %%i, m %+ %%i + %assign %%i %%i+1 + %endrep +%endmacro + +%macro LOAD_MM_PERMUTATION 1 ; name to load from + %ifdef %1_m0 + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, %1_m %+ %%i + CAT_XDEFINE n, m %+ %%i, %%i + %assign %%i %%i+1 + %endrep + %endif +%endmacro + +; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't +%macro call 1 + call_internal %1, %1 %+ SUFFIX +%endmacro +%macro call_internal 2 + %xdefine %%i %1 + %ifndef cglobaled_%1 + %ifdef cglobaled_%2 + %xdefine %%i %2 + %endif + %endif + call %%i + LOAD_MM_PERMUTATION %%i +%endmacro + +; Substitutions that reduce instruction size but are functionally equivalent +%macro add 2 + %ifnum %2 + %if %2==128 + sub %1, -128 + %else + add %1, %2 + %endif + %else + add %1, %2 + %endif +%endmacro + +%macro sub 2 + %ifnum %2 + %if %2==128 + add %1, -128 + %else + sub %1, %2 + %endif + %else + sub %1, %2 + %endif +%endmacro + +;============================================================================= +; AVX abstraction layer +;============================================================================= + +%assign i 0 +%rep 16 + %if i < 8 + CAT_XDEFINE sizeofmm, i, 8 + %endif + CAT_XDEFINE sizeofxmm, i, 16 + CAT_XDEFINE sizeofymm, i, 32 +%assign i i+1 +%endrep +%undef i + +%macro CHECK_AVX_INSTR_EMU 3-* + %xdefine %%opcode %1 + %xdefine %%dst %2 + %rep %0-2 + %ifidn %%dst, %3 + %error non-AVX emulation of ``%%opcode'' is not supported + %endif + %rotate 1 + %endrep +%endmacro + +;%1 == instruction +;%2 == 1 if float, 0 if int +;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm) +;%4 == number of operands given +;%5+: operands +%macro RUN_AVX_INSTR 6-7+ + %ifid %6 + %define %%sizeofreg sizeof%6 + %elifid %5 + %define %%sizeofreg sizeof%5 + %else + %define %%sizeofreg mmsize + %endif + %if %%sizeofreg==32 + %if %4>=3 + v%1 %5, %6, %7 + %else + v%1 %5, %6 + %endif + %else + %if %%sizeofreg==8 + %define %%regmov movq + %elif %2 + %define %%regmov movaps + %else + %define %%regmov movdqa + %endif + + %if %4>=3+%3 + %ifnidn %5, %6 + %if AVX_enabled && %%sizeofreg==16 + v%1 %5, %6, %7 + %else + CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7 + %%regmov %5, %6 + %1 %5, %7 + %endif + %else + %1 %5, %7 + %endif + %elif %4>=3 + %1 %5, %6, %7 + %else + %1 %5, %6 + %endif + %endif +%endmacro + +; 3arg AVX ops with a memory arg can only have it in src2, +; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov). +; So, if the op is symmetric and the wrong one is memory, swap them. 
+%macro RUN_AVX_INSTR1 8 + %assign %%swap 0 + %if AVX_enabled + %ifnid %6 + %assign %%swap 1 + %endif + %elifnidn %5, %6 + %ifnid %7 + %assign %%swap 1 + %endif + %endif + %if %%swap && %3 == 0 && %8 == 1 + RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6 + %else + RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7 + %endif +%endmacro + +;%1 == instruction +;%2 == 1 if float, 0 if int +;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm) +;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not +%macro AVX_INSTR 4 + %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4 + %ifidn %3, fnord + RUN_AVX_INSTR %6, %7, %8, 2, %1, %2 + %elifidn %4, fnord + RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9 + %elifidn %5, fnord + RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4 + %else + RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5 + %endif + %endmacro +%endmacro + +AVX_INSTR addpd, 1, 0, 1 +AVX_INSTR addps, 1, 0, 1 +AVX_INSTR addsd, 1, 0, 1 +AVX_INSTR addss, 1, 0, 1 +AVX_INSTR addsubpd, 1, 0, 0 +AVX_INSTR addsubps, 1, 0, 0 +AVX_INSTR andpd, 1, 0, 1 +AVX_INSTR andps, 1, 0, 1 +AVX_INSTR andnpd, 1, 0, 0 +AVX_INSTR andnps, 1, 0, 0 +AVX_INSTR blendpd, 1, 0, 0 +AVX_INSTR blendps, 1, 0, 0 +AVX_INSTR blendvpd, 1, 0, 0 +AVX_INSTR blendvps, 1, 0, 0 +AVX_INSTR cmppd, 1, 0, 0 +AVX_INSTR cmpps, 1, 0, 0 +AVX_INSTR cmpsd, 1, 0, 0 +AVX_INSTR cmpss, 1, 0, 0 +AVX_INSTR cvtdq2ps, 1, 0, 0 +AVX_INSTR cvtps2dq, 1, 0, 0 +AVX_INSTR divpd, 1, 0, 0 +AVX_INSTR divps, 1, 0, 0 +AVX_INSTR divsd, 1, 0, 0 +AVX_INSTR divss, 1, 0, 0 +AVX_INSTR dppd, 1, 1, 0 +AVX_INSTR dpps, 1, 1, 0 +AVX_INSTR haddpd, 1, 0, 0 +AVX_INSTR haddps, 1, 0, 0 +AVX_INSTR hsubpd, 1, 0, 0 +AVX_INSTR hsubps, 1, 0, 0 +AVX_INSTR maxpd, 1, 0, 1 +AVX_INSTR maxps, 1, 0, 1 +AVX_INSTR maxsd, 1, 0, 1 +AVX_INSTR maxss, 1, 0, 1 +AVX_INSTR minpd, 1, 0, 1 +AVX_INSTR minps, 1, 0, 1 +AVX_INSTR minsd, 1, 0, 1 +AVX_INSTR minss, 1, 0, 1 +AVX_INSTR movhlps, 1, 0, 0 +AVX_INSTR movlhps, 1, 0, 0 +AVX_INSTR movsd, 1, 0, 0 +AVX_INSTR movss, 1, 0, 0 +AVX_INSTR mpsadbw, 0, 1, 0 +AVX_INSTR mulpd, 1, 0, 1 +AVX_INSTR mulps, 1, 0, 1 +AVX_INSTR mulsd, 1, 0, 1 +AVX_INSTR mulss, 1, 0, 1 +AVX_INSTR orpd, 1, 0, 1 +AVX_INSTR orps, 1, 0, 1 +AVX_INSTR pabsb, 0, 0, 0 +AVX_INSTR pabsw, 0, 0, 0 +AVX_INSTR pabsd, 0, 0, 0 +AVX_INSTR packsswb, 0, 0, 0 +AVX_INSTR packssdw, 0, 0, 0 +AVX_INSTR packuswb, 0, 0, 0 +AVX_INSTR packusdw, 0, 0, 0 +AVX_INSTR paddb, 0, 0, 1 +AVX_INSTR paddw, 0, 0, 1 +AVX_INSTR paddd, 0, 0, 1 +AVX_INSTR paddq, 0, 0, 1 +AVX_INSTR paddsb, 0, 0, 1 +AVX_INSTR paddsw, 0, 0, 1 +AVX_INSTR paddusb, 0, 0, 1 +AVX_INSTR paddusw, 0, 0, 1 +AVX_INSTR palignr, 0, 1, 0 +AVX_INSTR pand, 0, 0, 1 +AVX_INSTR pandn, 0, 0, 0 +AVX_INSTR pavgb, 0, 0, 1 +AVX_INSTR pavgw, 0, 0, 1 +AVX_INSTR pblendvb, 0, 0, 0 +AVX_INSTR pblendw, 0, 1, 0 +AVX_INSTR pcmpestri, 0, 0, 0 +AVX_INSTR pcmpestrm, 0, 0, 0 +AVX_INSTR pcmpistri, 0, 0, 0 +AVX_INSTR pcmpistrm, 0, 0, 0 +AVX_INSTR pcmpeqb, 0, 0, 1 +AVX_INSTR pcmpeqw, 0, 0, 1 +AVX_INSTR pcmpeqd, 0, 0, 1 +AVX_INSTR pcmpeqq, 0, 0, 1 +AVX_INSTR pcmpgtb, 0, 0, 0 +AVX_INSTR pcmpgtw, 0, 0, 0 +AVX_INSTR pcmpgtd, 0, 0, 0 +AVX_INSTR pcmpgtq, 0, 0, 0 +AVX_INSTR phaddw, 0, 0, 0 +AVX_INSTR phaddd, 0, 0, 0 +AVX_INSTR phaddsw, 0, 0, 0 +AVX_INSTR phsubw, 0, 0, 0 +AVX_INSTR phsubd, 0, 0, 0 +AVX_INSTR phsubsw, 0, 0, 0 +AVX_INSTR pmaddwd, 0, 0, 1 +AVX_INSTR pmaddubsw, 0, 0, 0 +AVX_INSTR pmaxsb, 0, 0, 1 +AVX_INSTR pmaxsw, 0, 0, 1 +AVX_INSTR pmaxsd, 0, 0, 1 +AVX_INSTR pmaxub, 0, 0, 1 +AVX_INSTR pmaxuw, 0, 0, 1 +AVX_INSTR pmaxud, 0, 0, 1 +AVX_INSTR pminsb, 0, 0, 1 +AVX_INSTR 
pminsw, 0, 0, 1 +AVX_INSTR pminsd, 0, 0, 1 +AVX_INSTR pminub, 0, 0, 1 +AVX_INSTR pminuw, 0, 0, 1 +AVX_INSTR pminud, 0, 0, 1 +AVX_INSTR pmovmskb, 0, 0, 0 +AVX_INSTR pmulhuw, 0, 0, 1 +AVX_INSTR pmulhrsw, 0, 0, 1 +AVX_INSTR pmulhw, 0, 0, 1 +AVX_INSTR pmullw, 0, 0, 1 +AVX_INSTR pmulld, 0, 0, 1 +AVX_INSTR pmuludq, 0, 0, 1 +AVX_INSTR pmuldq, 0, 0, 1 +AVX_INSTR por, 0, 0, 1 +AVX_INSTR psadbw, 0, 0, 1 +AVX_INSTR pshufb, 0, 0, 0 +AVX_INSTR pshufd, 0, 1, 0 +AVX_INSTR pshufhw, 0, 1, 0 +AVX_INSTR pshuflw, 0, 1, 0 +AVX_INSTR psignb, 0, 0, 0 +AVX_INSTR psignw, 0, 0, 0 +AVX_INSTR psignd, 0, 0, 0 +AVX_INSTR psllw, 0, 0, 0 +AVX_INSTR pslld, 0, 0, 0 +AVX_INSTR psllq, 0, 0, 0 +AVX_INSTR pslldq, 0, 0, 0 +AVX_INSTR psraw, 0, 0, 0 +AVX_INSTR psrad, 0, 0, 0 +AVX_INSTR psrlw, 0, 0, 0 +AVX_INSTR psrld, 0, 0, 0 +AVX_INSTR psrlq, 0, 0, 0 +AVX_INSTR psrldq, 0, 0, 0 +AVX_INSTR psubb, 0, 0, 0 +AVX_INSTR psubw, 0, 0, 0 +AVX_INSTR psubd, 0, 0, 0 +AVX_INSTR psubq, 0, 0, 0 +AVX_INSTR psubsb, 0, 0, 0 +AVX_INSTR psubsw, 0, 0, 0 +AVX_INSTR psubusb, 0, 0, 0 +AVX_INSTR psubusw, 0, 0, 0 +AVX_INSTR ptest, 0, 0, 0 +AVX_INSTR punpckhbw, 0, 0, 0 +AVX_INSTR punpckhwd, 0, 0, 0 +AVX_INSTR punpckhdq, 0, 0, 0 +AVX_INSTR punpckhqdq, 0, 0, 0 +AVX_INSTR punpcklbw, 0, 0, 0 +AVX_INSTR punpcklwd, 0, 0, 0 +AVX_INSTR punpckldq, 0, 0, 0 +AVX_INSTR punpcklqdq, 0, 0, 0 +AVX_INSTR pxor, 0, 0, 1 +AVX_INSTR shufps, 1, 1, 0 +AVX_INSTR subpd, 1, 0, 0 +AVX_INSTR subps, 1, 0, 0 +AVX_INSTR subsd, 1, 0, 0 +AVX_INSTR subss, 1, 0, 0 +AVX_INSTR unpckhpd, 1, 0, 0 +AVX_INSTR unpckhps, 1, 0, 0 +AVX_INSTR unpcklpd, 1, 0, 0 +AVX_INSTR unpcklps, 1, 0, 0 +AVX_INSTR xorpd, 1, 0, 1 +AVX_INSTR xorps, 1, 0, 1 + +; 3DNow instructions, for sharing code between AVX, SSE and 3DN +AVX_INSTR pfadd, 1, 0, 1 +AVX_INSTR pfsub, 1, 0, 0 +AVX_INSTR pfmul, 1, 0, 1 + +; base-4 constants for shuffles +%assign i 0 +%rep 256 + %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) + %if j < 10 + CAT_XDEFINE q000, j, i + %elif j < 100 + CAT_XDEFINE q00, j, i + %elif j < 1000 + CAT_XDEFINE q0, j, i + %else + CAT_XDEFINE q, j, i + %endif +%assign i i+1 +%endrep +%undef i +%undef j + +%macro FMA_INSTR 3 + %macro %1 4-7 %1, %2, %3 + %if cpuflag(xop) + v%5 %1, %2, %3, %4 + %else + %6 %1, %2, %3 + %7 %1, %4 + %endif + %endmacro +%endmacro + +FMA_INSTR pmacsdd, pmulld, paddd +FMA_INSTR pmacsww, pmullw, paddw +FMA_INSTR pmadcswd, pmaddwd, paddd + +; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf. +; This lets us use tzcnt without bumping the yasm version requirement yet. +%define tzcnt rep bsf diff --git a/third_party/x86inc/x86inc.asm b/third_party/x86inc/x86inc.asm index 213467662..99453a998 100644 --- a/third_party/x86inc/x86inc.asm +++ b/third_party/x86inc/x86inc.asm @@ -234,10 +234,10 @@ ALIGNMODE k7 %define r%1mp %2 %elif ARCH_X86_64 ; memory %define r%1m [rsp + stack_offset + %6] - %define r%1mp qword r %+ %1m + %define r%1mp qword r %+ %1 %+ m %else %define r%1m [esp + stack_offset + %6] - %define r%1mp dword r %+ %1m + %define r%1mp dword r %+ %1 %+ m %endif %define r%1 %2 %endmacro diff --git a/tools_common.c b/tools_common.c index 87fa317ce..66dbeee99 100644 --- a/tools_common.c +++ b/tools_common.c @@ -255,7 +255,7 @@ int vpx_img_read(vpx_image_t *img, FILE *file) { #endif for (y = 0; y < h; ++y) { - if (fread(buf, 1, w, file) != w) + if (fread(buf, 1, w, file) != (size_t)w) return 0; buf += stride; } diff --git a/usage.dox b/usage.dox index 92fd6b26e..237b8dc42 100644 --- a/usage.dox +++ b/usage.dox @@ -57,9 +57,6 @@ the vpx_codec_get_caps() method. 
Attempts to invoke features not supported by an algorithm will generally result in #VPX_CODEC_INCAPABLE. - Currently defined features available in both encoders and decoders include: - - \subpage usage_xma - \if decoder Currently defined decoder features include: - \ref usage_cb @@ -70,9 +67,7 @@ To initialize a codec instance, the address of the codec context and interface structures are passed to an initialization function. Depending on the \ref usage_features that the codec supports, the codec could be - initialized in different modes. Most notably, the application may choose to - use \ref usage_xma mode to gain fine grained control over how and where - memory is allocated for the codec. + initialized in different modes. To prevent cases of confusion where the ABI of the library changes, the ABI is versioned. The ABI version number must be passed at @@ -136,73 +131,3 @@ possible." */ - - -/*! \page usage_xma External Memory Allocation - Applications that wish to have fine grained control over how and where - decoders allocate memory \ref MAY make use of the eXternal Memory Allocation - (XMA) interface. Not all codecs support the XMA \ref usage_features. - - To use a decoder in XMA mode, the decoder \ref MUST be initialized with the - vpx_codec_xma_init_ver() function. The amount of memory a decoder needs to - allocate is heavily dependent on the size of the encoded video frames. The - size of the video must be known before requesting the decoder's memory map. - This stream information can be obtained with the vpx_codec_peek_stream_info() - function, which does not require a constructed decoder context. If the exact - stream is not known, a stream info structure can be created that reflects - the maximum size that the decoder instance is required to support. - - Once the decoder instance has been initialized and the stream information - determined, the application calls the vpx_codec_get_mem_map() iterator - repeatedly to get a list of the memory segments requested by the decoder. - The iterator value should be initialized to NULL to request the first - element, and the function will return #VPX_CODEC_LIST_END to signal the end of - the list. - - After each segment is identified, it must be passed to the codec through the - vpx_codec_set_mem_map() function. Segments \ref MUST be passed in the same - order as they are returned from vpx_codec_get_mem_map(), but there is no - requirement that vpx_codec_get_mem_map() must finish iterating before - vpx_codec_set_mem_map() is called. For instance, some applications may choose - to get a list of all requests, construct an optimal heap, and then set all - maps at once with one call. Other applications may set one map at a time, - allocating it immediately after it is returned from vpx_codec_get_mem_map(). - - After all segments have been set using vpx_codec_set_mem_map(), the codec may - be used as it would be in normal internal allocation mode. - - \section usage_xma_seg_id Segment Identifiers - Each requested segment is identified by an identifier unique to - that decoder type. Some of these identifiers are private, while others are - enumerated for application use. Identifiers not enumerated publicly are - subject to change. Identifiers are non-consecutive. - - \section usage_xma_seg_szalign Segment Size and Alignment - The sz (size) and align (alignment) parameters describe the required size - and alignment of the requested segment. Alignment will always be a power of - two. Applications \ref MUST honor the alignment requested. 
Failure to do so - could result in program crashes or may incur a speed penalty. - - \section usage_xma_seg_flags Segment Flags - The flags member of the segment structure indicates any requirements or - desires of the codec for the particular segment. The #VPX_CODEC_MEM_ZERO flag - indicates that the segment \ref MUST be zeroed by the application prior to - passing it to the application. The #VPX_CODEC_MEM_WRONLY flag indicates that - the segment will only be written into by the decoder, not read. If this flag - is not set, the application \ref MUST insure that the memory segment is - readable. On some platforms, framebuffer memory is writable but not - readable, for example. The #VPX_CODEC_MEM_FAST flag indicates that the segment - will be frequently accessed, and that it should be placed into fast memory, - if any is available. The application \ref MAY choose to place other segments - in fast memory as well, but the most critical segments will be identified by - this flag. - - \section usage_xma_seg_basedtor Segment Base Address and Destructor - For each requested memory segment, the application must determine the - address of a memory segment that meets the requirements of the codec. This - address is set in the base member of the #vpx_codec_mmap - structure. If the application requires processing when the segment is no - longer used by the codec (for instance to deallocate it or close an - associated file descriptor) the dtor and priv - members can be set. -*/ diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h index 7d9441d54..a46fbfbbd 100644 --- a/vp8/common/onyx.h +++ b/vp8/common/onyx.h @@ -108,8 +108,8 @@ extern "C" * For temporal denoiser: noise_sensitivity = 0 means off, * noise_sensitivity = 1 means temporal denoiser on for Y channel only, * noise_sensitivity = 2 means temporal denoiser on for all channels. - * noise_sensitivity = 3 will be used for aggressive mode in future. - * Temporal denoiser is enabled via the build option + * noise_sensitivity >= 3 means aggressive denoising mode. + * Temporal denoiser is enabled via the configuration option: * CONFIG_TEMPORAL_DENOISING. * For spatial denoiser: noise_sensitivity controls the amount of * pre-processing blur: noise_sensitivity = 0 means off. 
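
For context, the sketch below shows how an application would drive the setting documented above; it assumes the standard libvpx control API (vpx_codec_control with VP8E_SET_NOISE_SENSITIVITY from vpx/vp8cx.h) and an already-initialized VP8 encoder context, and is an editorial illustration rather than part of this patch:

    #include "vpx/vpx_encoder.h"
    #include "vpx/vp8cx.h"

    static vpx_codec_err_t enable_temporal_denoiser(vpx_codec_ctx_t *encoder) {
      /* 0 = off, 1 = denoise Y only, 2 = denoise Y and UV; values >= 3
       * select the aggressive mode described above. Effective only in
       * builds configured with CONFIG_TEMPORAL_DENOISING. */
      return vpx_codec_control(encoder, VP8E_SET_NOISE_SENSITIVITY, 2);
    }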
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index f5870797b..fd9afd2ac 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -220,7 +220,8 @@ $vp8_sixtap_predict8x4_media=vp8_sixtap_predict8x4_armv6;
 $vp8_sixtap_predict8x4_dspr2=vp8_sixtap_predict8x4_dspr2;
 
 add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_sixtap_predict4x4 mmx ssse3 media neon dspr2/;
+# Disable neon while investigating https://code.google.com/p/webm/issues/detail?id=817
+specialize qw/vp8_sixtap_predict4x4 mmx ssse3 media dspr2/;
 $vp8_sixtap_predict4x4_media=vp8_sixtap_predict4x4_armv6;
 $vp8_sixtap_predict4x4_dspr2=vp8_sixtap_predict4x4_dspr2;
 
diff --git a/vp8/encoder/arm/neon/denoising_neon.c b/vp8/encoder/arm/neon/denoising_neon.c
index b7bb40c27..08be76e43 100644
--- a/vp8/encoder/arm/neon/denoising_neon.c
+++ b/vp8/encoder/arm/neon/denoising_neon.c
@@ -279,8 +279,8 @@ int vp8_denoiser_filter_uv_neon(unsigned char *mc_running_avg,
     {
         const uint32x2_t _7654_3210 = vpaddl_u16(v_sum_block);
         const uint64x1_t _76543210 = vpaddl_u32(_7654_3210);
-        const unsigned int sum_block =
-            vget_lane_u32(vreinterpret_u32_u64(_76543210), 0);
+        const int sum_block =
+            vget_lane_s32(vreinterpret_s32_u64(_76543210), 0);
         if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) {
             return COPY_BLOCK;
         }
diff --git a/vp8/encoder/boolhuff.h b/vp8/encoder/boolhuff.h
index 611421575..7c012a829 100644
--- a/vp8/encoder/boolhuff.h
+++ b/vp8/encoder/boolhuff.h
@@ -35,10 +35,6 @@ typedef struct
     unsigned char *buffer;
     unsigned char *buffer_end;
     struct vpx_internal_error_info *error;
-
-    /* Variables used to track bit costs without outputing to the bitstream */
-    unsigned int  measure_cost;
-    unsigned long bit_counter;
 } BOOL_CODER;
 
 extern void vp8_start_encode(BOOL_CODER *bc, unsigned char *buffer, unsigned char *buffer_end);
diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c
index 0f0a36a61..75401fc2b 100644
--- a/vp8/encoder/denoising.c
+++ b/vp8/encoder/denoising.c
@@ -8,6 +8,8 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <limits.h>
+
 #include "denoising.h"
 
 #include "vp8/common/reconinter.h"
@@ -333,8 +335,36 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv,
     return FILTER_BLOCK;
 }
 
+void vp8_denoiser_set_parameters(VP8_DENOISER *denoiser, int mode) {
+  assert(mode > 0);  // Denoiser is allocated only if mode > 0.
+ if (mode == 1) { + denoiser->denoiser_mode = kDenoiserOnYOnly; + } else if (mode == 2) { + denoiser->denoiser_mode = kDenoiserOnYUV; + } else { + denoiser->denoiser_mode = kDenoiserOnYUVAggressive; + } + if (denoiser->denoiser_mode != kDenoiserOnYUVAggressive) { + denoiser->denoise_pars.scale_sse_thresh = 1; + denoiser->denoise_pars.scale_motion_thresh = 8; + denoiser->denoise_pars.scale_increase_filter = 0; + denoiser->denoise_pars.denoise_mv_bias = 95; + denoiser->denoise_pars.pickmode_mv_bias = 100; + denoiser->denoise_pars.qp_thresh = 0; + denoiser->denoise_pars.consec_zerolast = UINT_MAX; + } else { + denoiser->denoise_pars.scale_sse_thresh = 2; + denoiser->denoise_pars.scale_motion_thresh = 16; + denoiser->denoise_pars.scale_increase_filter = 1; + denoiser->denoise_pars.denoise_mv_bias = 60; + denoiser->denoise_pars.pickmode_mv_bias = 60; + denoiser->denoise_pars.qp_thresh = 100; + denoiser->denoise_pars.consec_zerolast = 10; + } +} + int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height, - int num_mb_rows, int num_mb_cols) + int num_mb_rows, int num_mb_cols, int mode) { int i; assert(denoiser); @@ -369,10 +399,11 @@ int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height, denoiser->denoise_state = vpx_calloc((num_mb_rows * num_mb_cols), 1); vpx_memset(denoiser->denoise_state, 0, (num_mb_rows * num_mb_cols)); - + vp8_denoiser_set_parameters(denoiser, mode); return 0; } + void vp8_denoiser_free(VP8_DENOISER *denoiser) { int i; @@ -396,11 +427,12 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser, loop_filter_info_n *lfi_n, int mb_row, int mb_col, - int block_index, - int uv_denoise) + int block_index) + { int mv_row; int mv_col; + unsigned int motion_threshold; unsigned int motion_magnitude2; unsigned int sse_thresh; int sse_diff_thresh = 0; @@ -411,8 +443,8 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser, MV_REFERENCE_FRAME zero_frame = x->best_zeromv_reference_frame; enum vp8_denoiser_decision decision = FILTER_BLOCK; - enum vp8_denoiser_decision decision_u = FILTER_BLOCK; - enum vp8_denoiser_decision decision_v = FILTER_BLOCK; + enum vp8_denoiser_decision decision_u = COPY_BLOCK; + enum vp8_denoiser_decision decision_v = COPY_BLOCK; if (zero_frame) { @@ -424,7 +456,7 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser, MB_MODE_INFO *mbmi = &filter_xd->mode_info_context->mbmi; int sse_diff = 0; // Bias on zero motion vector sse. 
- int zero_bias = 95; + const int zero_bias = denoiser->denoise_pars.denoise_mv_bias; zero_mv_sse = (unsigned int)((int64_t)zero_mv_sse * zero_bias / 100); sse_diff = zero_mv_sse - best_sse; @@ -502,14 +534,19 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser, mv_row = x->best_sse_mv.as_mv.row; mv_col = x->best_sse_mv.as_mv.col; motion_magnitude2 = mv_row * mv_row + mv_col * mv_col; - sse_thresh = SSE_THRESHOLD; - if (x->increase_denoising) sse_thresh = SSE_THRESHOLD_HIGH; + motion_threshold = denoiser->denoise_pars.scale_motion_thresh * + NOISE_MOTION_THRESHOLD; - if (best_sse > sse_thresh || motion_magnitude2 - > 8 * NOISE_MOTION_THRESHOLD) - { - decision = COPY_BLOCK; - } + if (motion_magnitude2 < + denoiser->denoise_pars.scale_increase_filter * NOISE_MOTION_THRESHOLD) + x->increase_denoising = 1; + + sse_thresh = denoiser->denoise_pars.scale_sse_thresh * SSE_THRESHOLD; + if (x->increase_denoising) + sse_thresh = denoiser->denoise_pars.scale_sse_thresh * SSE_THRESHOLD_HIGH; + + if (best_sse > sse_thresh || motion_magnitude2 > motion_threshold) + decision = COPY_BLOCK; if (decision == FILTER_BLOCK) { @@ -528,7 +565,7 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser, denoiser->denoise_state[block_index] = motion_magnitude2 > 0 ? kFilterNonZeroMV : kFilterZeroMV; // Only denoise UV for zero motion, and if y channel was denoised. - if (uv_denoise && + if (denoiser->denoiser_mode != kDenoiserOnYOnly && motion_magnitude2 == 0 && decision == FILTER_BLOCK) { unsigned char *mc_running_avg_u = @@ -565,7 +602,7 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser, denoiser->yv12_running_avg[INTRA_FRAME].y_stride); denoiser->denoise_state[block_index] = kNoFilter; } - if (uv_denoise) { + if (denoiser->denoiser_mode != kDenoiserOnYOnly) { if (decision_u == COPY_BLOCK) { vp8_copy_mem8x8( x->block[16].src + *x->block[16].base_src, x->block[16].src_stride, diff --git a/vp8/encoder/denoising.h b/vp8/encoder/denoising.h index a1f195b72..89832d3c2 100644 --- a/vp8/encoder/denoising.h +++ b/vp8/encoder/denoising.h @@ -39,16 +39,47 @@ enum vp8_denoiser_filter_state { kFilterNonZeroMV }; +enum vp8_denoiser_mode { + kDenoiserOff, + kDenoiserOnYOnly, + kDenoiserOnYUV, + kDenoiserOnYUVAggressive +}; + +typedef struct { + // Scale factor on sse threshold above which no denoising is done. + unsigned int scale_sse_thresh; + // Scale factor on motion magnitude threshold above which no + // denoising is done. + unsigned int scale_motion_thresh; + // Scale factor on motion magnitude below which we increase the strength of + // the temporal filter (in function vp8_denoiser_filter). + unsigned int scale_increase_filter; + // Scale factor to bias to ZEROMV for denoising. + unsigned int denoise_mv_bias; + // Scale factor to bias to ZEROMV for coding mode selection. + unsigned int pickmode_mv_bias; + // Quantizer threshold below which we use the segmentation map to switch off + // loop filter for blocks that have been coded as ZEROMV-LAST a certain number + // (consec_zerolast) of consecutive frames. Note that the delta-QP is set to + // 0 when segmentation map is used for shutting off loop filter. + unsigned int qp_thresh; + // Threshold for number of consecutive frames for blocks coded as ZEROMV-LAST. 
+  unsigned int consec_zerolast;
+} denoise_params;
+
 typedef struct vp8_denoiser
 {
     YV12_BUFFER_CONFIG yv12_running_avg[MAX_REF_FRAMES];
     YV12_BUFFER_CONFIG yv12_mc_running_avg;
     unsigned char* denoise_state;
     int num_mb_cols;
+    int denoiser_mode;
+    denoise_params denoise_pars;
 } VP8_DENOISER;
 
 int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
-                          int num_mb_rows, int num_mb_cols);
+                          int num_mb_rows, int num_mb_cols, int mode);
 
 void vp8_denoiser_free(VP8_DENOISER *denoiser);
 
@@ -61,8 +92,7 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
                              loop_filter_info_n *lfi_n,
                              int mb_row,
                              int mb_col,
-                             int block_index,
-                             int uv_denoise);
+                             int block_index);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index e6b0f9b64..aec6b9880 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -522,6 +522,19 @@ void encode_mb_row(VP8_COMP *cpi,
         }
 #endif
 
+        // Keep track of how many (consecutive) times a block is coded
+        // as ZEROMV_LASTREF, for base layer frames.
+        // Reset to 0 if it's coded as anything else.
+        if (cpi->current_layer == 0) {
+          if (xd->mode_info_context->mbmi.mode == ZEROMV &&
+              xd->mode_info_context->mbmi.ref_frame == LAST_FRAME) {
+            // Increment, check for wrap-around.
+            if (cpi->consec_zero_last[map_index+mb_col] < 255)
+              cpi->consec_zero_last[map_index+mb_col] += 1;
+          } else {
+            cpi->consec_zero_last[map_index+mb_col] = 0;
+          }
+        }
 
         /* Special case code for cyclic refresh
          * If cyclic update enabled then copy xd->mbmi.segment_id; (which
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index d4b17cef1..7b8b51f30 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -206,6 +206,21 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
                 }
 #endif
 
+                // Keep track of how many (consecutive) times a block
+                // is coded as ZEROMV_LASTREF, for base layer frames.
+                // Reset to 0 if it's coded as anything else.
+                if (cpi->current_layer == 0) {
+                  if (xd->mode_info_context->mbmi.mode == ZEROMV &&
+                      xd->mode_info_context->mbmi.ref_frame ==
+                      LAST_FRAME) {
+                    // Increment, check for wrap-around.
+                    if (cpi->consec_zero_last[map_index+mb_col] < 255)
+                      cpi->consec_zero_last[map_index+mb_col] +=
+                          1;
+                  } else {
+                    cpi->consec_zero_last[map_index+mb_col] = 0;
+                  }
+                }
 
                 /* Special case code for cyclic refresh
                  * If cyclic update enabled then copy
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 373dbebd9..7140f2f1b 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -613,6 +613,24 @@ static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment)
     while(block_count && i != cpi->cyclic_refresh_mode_index);
 
     cpi->cyclic_refresh_mode_index = i;
+
+#if CONFIG_TEMPORAL_DENOISING
+    if (cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive &&
+        Q < (int)cpi->denoiser.denoise_pars.qp_thresh) {
+      // Under aggressive denoising mode, use segmentation to turn off loop
+      // filter below some qp thresh. The loop filter is turned off for all
+      // blocks that have been encoded as ZEROMV LAST x frames in a row,
+      // where x is set by cpi->denoiser.denoise_pars.consec_zerolast.
+      // This is to avoid "dot" artifacts that can occur from repeated
+      // loop filtering on noisy input source.
+      cpi->cyclic_refresh_q = Q;
+      lf_adjustment = -MAX_LOOP_FILTER;
+      for (i = 0; i < mbs_in_frame; ++i) {
+        seg_map[i] = (cpi->consec_zero_last[i] >
+                      cpi->denoiser.denoise_pars.consec_zerolast) ? 1 : 0;
+      }
+    }
+#endif
 }
 
 /* Activate segmentation.
 */
@@ -1259,6 +1277,15 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi)
     vpx_free(cpi->tplist);
 
     CHECK_MEM_ERROR(cpi->tplist, vpx_malloc(sizeof(TOKENLIST) * cm->mb_rows));
+
+#if CONFIG_TEMPORAL_DENOISING
+    if (cpi->oxcf.noise_sensitivity > 0) {
+      vp8_denoiser_free(&cpi->denoiser);
+      vp8_denoiser_allocate(&cpi->denoiser, width, height,
+                            cm->mb_rows, cm->mb_cols,
+                            cpi->oxcf.noise_sensitivity);
+    }
+#endif
 }
 
 
@@ -1405,7 +1432,7 @@ static void update_layer_contexts (VP8_COMP *cpi)
         double prev_layer_framerate=0;
 
         assert(oxcf->number_of_layers <= VPX_TS_MAX_LAYERS);
-        for (i=0; i<oxcf->number_of_layers; i++)
+        for (i = 0; i < oxcf->number_of_layers && i < VPX_TS_MAX_LAYERS; ++i)
         {
             LAYER_CONTEXT *lc = &cpi->layer_context[i];
 
@@ -1752,7 +1779,8 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
             int width = (cpi->oxcf.Width + 15) & ~15;
             int height = (cpi->oxcf.Height + 15) & ~15;
             vp8_denoiser_allocate(&cpi->denoiser, width, height,
-                                  cpi->common.mb_rows, cpi->common.mb_cols);
+                                  cm->mb_rows, cm->mb_cols,
+                                  cpi->oxcf.noise_sensitivity);
         }
     }
 #endif
@@ -1879,6 +1907,13 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
      */
    cpi->cyclic_refresh_mode_enabled = cpi->oxcf.error_resilient_mode;
    cpi->cyclic_refresh_mode_max_mbs_perframe = (cpi->common.mb_rows * cpi->common.mb_cols) / 5;
+    if (cpi->oxcf.number_of_layers == 1) {
+      cpi->cyclic_refresh_mode_max_mbs_perframe =
+          (cpi->common.mb_rows * cpi->common.mb_cols) / 20;
+    } else if (cpi->oxcf.number_of_layers == 2) {
+      cpi->cyclic_refresh_mode_max_mbs_perframe =
+          (cpi->common.mb_rows * cpi->common.mb_cols) / 10;
+    }
     cpi->cyclic_refresh_mode_index = 0;
     cpi->cyclic_refresh_q = 32;
 
@@ -1889,6 +1924,9 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
     else
         cpi->cyclic_refresh_map = (signed char *) NULL;
 
+    CHECK_MEM_ERROR(cpi->consec_zero_last,
+                    vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
+
 #ifdef VP8_ENTROPY_STATS
     init_context_counters();
 #endif
@@ -2409,6 +2447,7 @@ void vp8_remove_compressor(VP8_COMP **ptr)
         vpx_free(cpi->mb.ss);
         vpx_free(cpi->tok);
         vpx_free(cpi->cyclic_refresh_map);
+        vpx_free(cpi->consec_zero_last);
 
         vp8_remove_common(&cpi->common);
         vpx_free(cpi);
@@ -3232,17 +3271,9 @@ static void update_reference_frames(VP8_COMP *cpi)
         if (cm->frame_type == KEY_FRAME)
         {
             int i;
-            vp8_yv12_copy_frame(
-                cpi->Source,
-                &cpi->denoiser.yv12_running_avg[LAST_FRAME]);
-
-            vp8_yv12_extend_frame_borders(
-                &cpi->denoiser.yv12_running_avg[LAST_FRAME]);
-
-            for (i = 2; i < MAX_REF_FRAMES - 1; i++)
-                vp8_yv12_copy_frame(
-                    &cpi->denoiser.yv12_running_avg[LAST_FRAME],
-                    &cpi->denoiser.yv12_running_avg[i]);
+            for (i = LAST_FRAME; i < MAX_REF_FRAMES; ++i)
+                vp8_yv12_copy_frame(cpi->Source,
+                                    &cpi->denoiser.yv12_running_avg[i]);
         }
         else /* For non key frames */
         {
@@ -3479,6 +3510,9 @@ static void encode_frame_to_data_rate
         {
             cpi->mb.rd_thresh_mult[i] = 128;
         }
+
+        // Reset the zero_last counter to 0 on key frame.
+        vpx_memset(cpi->consec_zero_last, 0, cm->mb_rows * cm->mb_cols);
     }
 
 #if 0
@@ -3900,6 +3934,7 @@ static void encode_frame_to_data_rate
 
 #endif
 
+
 #ifdef OUTPUT_YUV_SRC
     vp8_write_yuv_frame(yuv_file, cpi->Source);
 #endif
@@ -3995,6 +4030,8 @@ static void encode_frame_to_data_rate
             else
                 disable_segmentation(cpi);
         }
+        // Reset the consec_zero_last counter on key frame.
+        vpx_memset(cpi->consec_zero_last, 0, cm->mb_rows * cm->mb_cols);
 
         vp8_set_quantizer(cpi, Q);
     }
 
@@ -5037,7 +5074,8 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
             /* Update frame rates for each layer */
             assert(cpi->oxcf.number_of_layers <= VPX_TS_MAX_LAYERS);
-            for (i=0; i<cpi->oxcf.number_of_layers; i++)
+            for (i = 0; i < cpi->oxcf.number_of_layers &&
+                        i < VPX_TS_MAX_LAYERS; ++i)
             {
                 LAYER_CONTEXT *lc = &cpi->layer_context[i];
                 lc->framerate = cpi->ref_framerate /
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index df17dff34..7a8baca77 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -511,6 +511,8 @@ typedef struct VP8_COMP
     int cyclic_refresh_mode_index;
     int cyclic_refresh_q;
     signed char *cyclic_refresh_map;
+    // Count of how many (consecutive) times a macroblock uses ZEROMV_LAST.
+    unsigned char *consec_zero_last;
 
     // Frame counter for the temporal pattern. Counter is reset when the temporal
     // layers are changed dynamically (run-time change).
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index 86108b70a..d0ad7212d 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -40,7 +40,6 @@ extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES];
 
 extern int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4]);
 
-
 int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d,
                                 int_mv *bestmv, int_mv *ref_mv,
                                 int error_per_bit,
@@ -694,6 +693,13 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
      */
     calculate_zeromv_rd_adjustment(cpi, x, &rd_adjustment);
 
+#if CONFIG_TEMPORAL_DENOISING
+    if (cpi->oxcf.noise_sensitivity) {
+      rd_adjustment = (int)(rd_adjustment *
+          cpi->denoiser.denoise_pars.pickmode_mv_bias / 100);
+    }
+#endif
+
     /* if we encode a new mv this is important
      * find the best new motion vector
      */
@@ -1168,7 +1174,6 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
 #if CONFIG_TEMPORAL_DENOISING
     if (cpi->oxcf.noise_sensitivity)
     {
-        int uv_denoise = (cpi->oxcf.noise_sensitivity == 2) ? 1 : 0;
         int block_index = mb_row * cpi->common.mb_cols + mb_col;
         if (x->best_sse_inter_mode == DC_PRED)
         {
@@ -1183,8 +1188,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
             vp8_denoiser_denoise_mb(&cpi->denoiser, x, best_sse, zero_mv_sse,
                                     recon_yoffset, recon_uvoffset,
                                     &cpi->common.lf_info, mb_row, mb_col,
-                                    block_index, uv_denoise);
-
+                                    block_index);
 
             /* Reevaluate ZEROMV after denoising. */
             if (best_mbmode.ref_frame == INTRA_FRAME &&
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 98d60160c..2f6f5d07c 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -2511,7 +2511,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
 #if CONFIG_TEMPORAL_DENOISING
     if (cpi->oxcf.noise_sensitivity)
     {
-        int uv_denoise = (cpi->oxcf.noise_sensitivity == 2) ? 1 : 0;
         int block_index = mb_row * cpi->common.mb_cols + mb_col;
         if (x->best_sse_inter_mode == DC_PRED)
         {
@@ -2525,8 +2524,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
             vp8_denoiser_denoise_mb(&cpi->denoiser, x, best_sse, zero_mv_sse,
                                     recon_yoffset, recon_uvoffset,
                                     &cpi->common.lf_info, mb_row, mb_col,
-                                    block_index, uv_denoise);
-
+                                    block_index);
 
             /* Reevaluate ZEROMV after denoising.
*/ if (best_mode.mbmode.ref_frame == INTRA_FRAME && diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 76c1582fd..db27ba5fc 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -1318,9 +1318,7 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = "vp8.fpf" /* first pass filename */ #endif VPX_SS_DEFAULT_LAYERS, /* ss_number_layers */ -#ifdef CONFIG_SPATIAL_SVC {0}, -#endif {0}, /* ss_target_bitrate */ 1, /* ts_number_layers */ {0}, /* ts_target_bitrate */ @@ -1328,7 +1326,6 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = 0, /* ts_periodicity */ {0}, /* ts_layer_id */ }}, - { -1, {NOT_IMPLEMENTED}} }; @@ -1345,8 +1342,6 @@ CODEC_INTERFACE(vpx_codec_vp8_cx) = vp8e_init, /* vpx_codec_init_fn_t init; */ vp8e_destroy, /* vpx_codec_destroy_fn_t destroy; */ vp8e_ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */ - NOT_IMPLEMENTED, /* vpx_codec_get_mmap_fn_t get_mmap; */ - NOT_IMPLEMENTED, /* vpx_codec_set_mmap_fn_t set_mmap; */ { NOT_IMPLEMENTED, /* vpx_codec_peek_si_fn_t peek_si; */ NOT_IMPLEMENTED, /* vpx_codec_get_si_fn_t get_si; */ @@ -1354,6 +1349,7 @@ CODEC_INTERFACE(vpx_codec_vp8_cx) = NOT_IMPLEMENTED, /* vpx_codec_frame_get_fn_t frame_get; */ }, { + 1, /* 1 cfg map */ vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t peek_si; */ vp8e_encode, /* vpx_codec_encode_fn_t encode; */ vp8e_get_cxdata, /* vpx_codec_get_cx_data_fn_t frame_get; */ diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c index fb3c236ce..9a0cdb79a 100644 --- a/vp8/vp8_dx_iface.c +++ b/vp8/vp8_dx_iface.c @@ -60,6 +60,7 @@ struct vpx_codec_alg_priv vpx_decrypt_cb decrypt_cb; void *decrypt_state; vpx_image_t img; + int flushed; int img_setup; struct frame_buffers yv12_frame_buffers; void *user_priv; @@ -88,6 +89,7 @@ static void vp8_init_ctx(vpx_codec_ctx_t *ctx) ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si); ctx->priv->alg_priv->decrypt_cb = NULL; ctx->priv->alg_priv->decrypt_state = NULL; + ctx->priv->alg_priv->flushed = 0; ctx->priv->init_flags = ctx->init_flags; if (ctx->config.dec) @@ -328,6 +330,13 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, unsigned int resolution_change = 0; unsigned int w, h; + if (data == NULL && data_sz == 0) { + ctx->flushed = 1; + return VPX_CODEC_OK; + } + + /* Reset flushed when receiving a valid frame */ + ctx->flushed = 0; /* Update the input fragment data */ if(update_fragments(ctx, data, data_sz, &res) <= 0) @@ -737,8 +746,9 @@ static vpx_codec_err_t vp8_get_frame_corrupted(vpx_codec_alg_priv_t *ctx, if (corrupted && pbi) { - *corrupted = pbi->common.frame_to_show->corrupted; - + const YV12_BUFFER_CONFIG *const frame = pbi->common.frame_to_show; + if (frame == NULL) return VPX_CODEC_ERROR; + *corrupted = frame->corrupted; return VPX_CODEC_OK; } else @@ -794,8 +804,6 @@ CODEC_INTERFACE(vpx_codec_vp8_dx) = vp8_init, /* vpx_codec_init_fn_t init; */ vp8_destroy, /* vpx_codec_destroy_fn_t destroy; */ vp8_ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */ - NOT_IMPLEMENTED, /* vpx_codec_get_mmap_fn_t get_mmap; */ - NOT_IMPLEMENTED, /* vpx_codec_set_mmap_fn_t set_mmap; */ { vp8_peek_si, /* vpx_codec_peek_si_fn_t peek_si; */ vp8_get_si, /* vpx_codec_get_si_fn_t get_si; */ @@ -804,6 +812,7 @@ CODEC_INTERFACE(vpx_codec_vp8_dx) = NOT_IMPLEMENTED, }, { /* encoder functions */ + 0, NOT_IMPLEMENTED, NOT_IMPLEMENTED, NOT_IMPLEMENTED, diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c index 638d39247..a0d078d33 100644 --- a/vp9/common/vp9_alloccommon.c +++ b/vp9/common/vp9_alloccommon.c @@ -34,7 +34,7 @@ void 
vp9_set_mb_mi(VP9_COMMON *cm, int width, int height) { cm->mi_cols = aligned_width >> MI_SIZE_LOG2; cm->mi_rows = aligned_height >> MI_SIZE_LOG2; - cm->mi_stride = cm->mi_cols + MI_BLOCK_SIZE; + cm->mi_stride = calc_mi_size(cm->mi_cols); cm->mb_cols = (cm->mi_cols + 1) >> 1; cm->mb_rows = (cm->mi_rows + 1) >> 1; @@ -60,16 +60,18 @@ static int alloc_mi(VP9_COMMON *cm, int mi_size) { for (i = 0; i < 2; ++i) { cm->mip_array[i] = - (MODE_INFO *)vpx_calloc(mi_size, sizeof(*cm->mip)); + (MODE_INFO *)vpx_calloc(mi_size, sizeof(MODE_INFO)); if (cm->mip_array[i] == NULL) return 1; cm->mi_grid_base_array[i] = - (MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->mi_grid_base)); + (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO*)); if (cm->mi_grid_base_array[i] == NULL) return 1; } + cm->mi_alloc_size = mi_size; + // Init the index. cm->mi_idx = 0; cm->prev_mi_idx = 1; @@ -131,7 +133,8 @@ int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) { vp9_free_context_buffers(cm); vp9_set_mb_mi(cm, width, height); - if (alloc_mi(cm, cm->mi_stride * (cm->mi_rows + MI_BLOCK_SIZE))) goto fail; + if (alloc_mi(cm, cm->mi_stride * calc_mi_size(cm->mi_rows))) + goto fail; cm->last_frame_seg_map = (uint8_t *)vpx_calloc(cm->mi_rows * cm->mi_cols, 1); if (!cm->last_frame_seg_map) goto fail; diff --git a/vp9/common/vp9_blockd.c b/vp9/common/vp9_blockd.c index 2404cfcb7..dab8f9617 100644 --- a/vp9/common/vp9_blockd.c +++ b/vp9/common/vp9_blockd.c @@ -146,10 +146,4 @@ void vp9_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y) { xd->plane[i].subsampling_x = i ? ss_x : 0; xd->plane[i].subsampling_y = i ? ss_y : 0; } -#if CONFIG_ALPHA - // TODO(jkoleszar): Using the Y w/h for now - xd->plane[3].plane_type = PLANE_TYPE_Y; - xd->plane[3].subsampling_x = 0; - xd->plane[3].subsampling_y = 0; -#endif } diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index b9a04dae0..bc6f9ebc5 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -125,9 +125,9 @@ typedef struct { BLOCK_SIZE sb_type; PREDICTION_MODE mode; TX_SIZE tx_size; - uint8_t skip; - uint8_t segment_id; - uint8_t seg_id_predicted; // valid only when temporal_update is enabled + int8_t skip; + int8_t segment_id; + int8_t seg_id_predicted; // valid only when temporal_update is enabled // Only for INTRA blocks PREDICTION_MODE uv_mode; @@ -169,11 +169,7 @@ enum mv_precision { MV_PRECISION_Q4 }; -#if CONFIG_ALPHA -enum { MAX_MB_PLANE = 4 }; -#else enum { MAX_MB_PLANE = 3 }; -#endif struct buf_2d { uint8_t *buf; diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h index 8248cc532..feb90ce66 100644 --- a/vp9/common/vp9_common.h +++ b/vp9/common/vp9_common.h @@ -49,7 +49,7 @@ extern "C" { #define vp9_zero_array(dest, n) vpx_memset(dest, 0, n * sizeof(*dest)) static INLINE uint8_t clip_pixel(int val) { - return (val > 255) ? 255u : (val < 0) ? 0u : val; + return (val > 255) ? 255 : (val < 0) ? 
0 : val;
 }
 
 static INLINE int clamp(int value, int low, int high) {
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index eb0692048..2aadf0ee8 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -205,12 +205,13 @@ static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
       break;
     default:
       assert(0 && "Invalid transform size.");
+      break;
   }
 
   return combine_entropy_contexts(above_ec, left_ec);
 }
 
-static const INLINE scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size,
+static INLINE const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size,
                                          PLANE_TYPE type, int block_idx) {
   const MODE_INFO *const mi = xd->mi[0];
 
diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h
index 9036c54cf..8817fdbb9 100644
--- a/vp9/common/vp9_enums.h
+++ b/vp9/common/vp9_enums.h
@@ -26,11 +26,11 @@ extern "C" {
 #define MI_MASK (MI_BLOCK_SIZE - 1)
 
 // Bitstream profiles indicated by 2-3 bits in the uncompressed header.
-// 00: Profile 0. 8-bit color with 4:2:0 chroma sampling only.
-// 10: Profile 1. 8-bit color with 4:4:4, 4:2:2, or 4:4:0 chroma sampling.
-// 01: Profile 2. 10-bit and 12-bit color, with 4:2:0 chroma sampling.
-// 110: Profile 3. 10-bit and 12-bit color, with 4:2:2, 4:4:4, or 4:4:0
-// chroma sampling.
+// 00: Profile 0. 8-bit 4:2:0 only.
+// 10: Profile 1. 8-bit 4:4:4, 4:2:2, and 4:4:0.
+// 01: Profile 2. 10-bit and 12-bit color only, with 4:2:0 sampling.
+// 110: Profile 3. 10-bit and 12-bit color only, with 4:2:2/4:4:4/4:4:0
+//      sampling.
 // 111: Undefined profile.
 typedef enum BITSTREAM_PROFILE {
   PROFILE_0,
diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h
index eaa4a5210..cd0a962e4 100644
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -89,6 +89,22 @@ static const tran_high_t sinpi_4_9 = 15212;
 
 static INLINE tran_low_t dct_const_round_shift(tran_high_t input) {
   tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+#if CONFIG_VP9_HIGH
+  // For valid highbitdepth VP9 streams, intermediate stage coefficients will
+  // stay within the ranges:
+  // - 8 bit: signed 16 bit integer
+  // - 10 bit: signed 18 bit integer
+  // - 12 bit: signed 20 bit integer
+#elif CONFIG_COEFFICIENT_RANGE_CHECKING
+  // For valid VP9 input streams, intermediate stage coefficients should always
+  // stay within the range of a signed 16 bit integer. Coefficients can go out
+  // of this range for invalid/corrupt VP9 streams. However, strictly checking
+  // this range for every intermediate coefficient can be burdensome for a
+  // decoder, therefore the following assertion is only enabled when configured
+  // with --enable-coefficient-range-checking.
+  assert(INT16_MIN <= rv);
+  assert(rv <= INT16_MAX);
+#endif
   return (tran_low_t)rv;
 }
 
diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c
index 0fe58c5c8..ab64d3036 100644
--- a/vp9/common/vp9_mvref_common.c
+++ b/vp9/common/vp9_mvref_common.c
@@ -20,7 +20,7 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
                              int block, int mi_row, int mi_col) {
   const int *ref_sign_bias = cm->ref_frame_sign_bias;
   int i, refmv_count = 0;
-  const MODE_INFO *prev_mi = cm->coding_use_prev_mi && cm->prev_mi
+  const MODE_INFO *prev_mi = !cm->error_resilient_mode && cm->prev_mi
         ? cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col]
         : NULL;
   const MB_MODE_INFO *const prev_mbmi = prev_mi ?
&prev_mi->mbmi : NULL; diff --git a/vp9/common/vp9_mvref_common.h b/vp9/common/vp9_mvref_common.h index 7bce3fa37..a937b7823 100644 --- a/vp9/common/vp9_mvref_common.h +++ b/vp9/common/vp9_mvref_common.h @@ -125,7 +125,7 @@ static const int idx_n_column_to_subblock[4][2] = { // clamp_mv_ref #define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units -static void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) { +static INLINE void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) { clamp_mv(mv, xd->mb_to_left_edge - MV_BORDER, xd->mb_to_right_edge + MV_BORDER, xd->mb_to_top_edge - MV_BORDER, diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h index 342416190..b6b664a0b 100644 --- a/vp9/common/vp9_onyxc_int.h +++ b/vp9/common/vp9_onyxc_int.h @@ -68,9 +68,6 @@ typedef struct VP9Common { DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]); -#if CONFIG_ALPHA - DECLARE_ALIGNED(16, int16_t, a_dequant[QINDEX_RANGE][8]); -#endif COLOR_SPACE color_space; @@ -138,16 +135,13 @@ typedef struct VP9Common { int y_dc_delta_q; int uv_dc_delta_q; int uv_ac_delta_q; -#if CONFIG_ALPHA - int a_dc_delta_q; - int a_ac_delta_q; -#endif /* We allocate a MODE_INFO struct for each macroblock, together with an extra row on top and column on the left to simplify prediction. */ int mi_idx; int prev_mi_idx; + int mi_alloc_size; MODE_INFO *mip_array[2]; MODE_INFO **mi_grid_base_array[2]; @@ -199,11 +193,6 @@ typedef struct VP9Common { int error_resilient_mode; int frame_parallel_decoding_mode; - // Flag indicates if prev_mi can be used in coding: - // 0: encoder assumes decoder does not have prev_mi - // 1: encoder assumes decoder has and uses prev_mi - unsigned int coding_use_prev_mi; - int log2_tile_cols, log2_tile_rows; // Private data associated with the frame buffer callbacks. @@ -218,6 +207,15 @@ typedef struct VP9Common { ENTROPY_CONTEXT *above_context; } VP9_COMMON; +static INLINE YV12_BUFFER_CONFIG *get_ref_frame(VP9_COMMON *cm, int index) { + if (index < 0 || index >= REF_FRAMES) + return NULL; + if (cm->ref_frame_map[index] < 0) + return NULL; + assert(cm->ref_frame_map[index] < REF_FRAMES); + return &cm->frame_bufs[cm->ref_frame_map[index]].buf; +} + static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) { return &cm->frame_bufs[cm->new_fb_idx].buf; } @@ -282,6 +280,11 @@ static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col) { } } +static INLINE int calc_mi_size(int len) { + // len is in mi units. 
+ return len + MI_BLOCK_SIZE; +} + static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, int mi_row, int bh, int mi_col, int bw, @@ -349,7 +352,7 @@ static INLINE int partition_plane_context(const MACROBLOCKD *xd, #if CONFIG_VP9_HIGH static INLINE unsigned int bit_depth_to_bps(vpx_bit_depth_t bit_depth) { - int bps; + int bps = 8; switch (bit_depth) { case VPX_BITS_8: bps = 8; diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c index 47d6bd21a..ff848487f 100644 --- a/vp9/common/vp9_postproc.c +++ b/vp9/common/vp9_postproc.c @@ -467,19 +467,13 @@ void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, + 0.0065 + 0.5); int i; - const uint8_t *const srcs[4] = {src->y_buffer, src->u_buffer, src->v_buffer, - src->alpha_buffer}; - const int src_strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, - src->alpha_stride}; - const int src_widths[4] = {src->y_width, src->uv_width, src->uv_width, - src->alpha_width}; - const int src_heights[4] = {src->y_height, src->uv_height, src->uv_height, - src->alpha_height}; + const uint8_t *const srcs[3] = {src->y_buffer, src->u_buffer, src->v_buffer}; + const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride}; + const int src_widths[3] = {src->y_width, src->uv_width, src->uv_width}; + const int src_heights[3] = {src->y_height, src->uv_height, src->uv_height}; - uint8_t *const dsts[4] = {dst->y_buffer, dst->u_buffer, dst->v_buffer, - dst->alpha_buffer}; - const int dst_strides[4] = {dst->y_stride, dst->uv_stride, dst->uv_stride, - dst->alpha_stride}; + uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer}; + const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride}; for (i = 0; i < MAX_MB_PLANE; ++i) { #if CONFIG_VP9_HIGH @@ -508,19 +502,13 @@ void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, + 0.0065 + 0.5); int i; - const uint8_t *const srcs[4] = {src->y_buffer, src->u_buffer, src->v_buffer, - src->alpha_buffer}; - const int src_strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, - src->alpha_stride}; - const int src_widths[4] = {src->y_width, src->uv_width, src->uv_width, - src->alpha_width}; - const int src_heights[4] = {src->y_height, src->uv_height, src->uv_height, - src->alpha_height}; + const uint8_t *const srcs[3] = {src->y_buffer, src->u_buffer, src->v_buffer}; + const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride}; + const int src_widths[3] = {src->y_width, src->uv_width, src->uv_width}; + const int src_heights[3] = {src->y_height, src->uv_height, src->uv_height}; - uint8_t *const dsts[4] = {dst->y_buffer, dst->u_buffer, dst->v_buffer, - dst->alpha_buffer}; - const int dst_strides[4] = {dst->y_stride, dst->uv_stride, dst->uv_stride, - dst->alpha_stride}; + uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer}; + const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride}; for (i = 0; i < MAX_MB_PLANE; ++i) { const int src_stride = src_strides[i]; diff --git a/vp9/common/vp9_pred_common.c b/vp9/common/vp9_pred_common.c index bc9d6ef5e..014638466 100644 --- a/vp9/common/vp9_pred_common.c +++ b/vp9/common/vp9_pred_common.c @@ -353,9 +353,9 @@ int vp9_get_tx_size_context(const MACROBLOCKD *xd) { const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd)); const int has_above = above_mbmi != NULL; const int has_left = left_mbmi != NULL; - int above_ctx = (has_above && !above_mbmi->skip) ? 
above_mbmi->tx_size + int above_ctx = (has_above && !above_mbmi->skip) ? (int)above_mbmi->tx_size : max_tx_size; - int left_ctx = (has_left && !left_mbmi->skip) ? left_mbmi->tx_size + int left_ctx = (has_left && !left_mbmi->skip) ? (int)left_mbmi->tx_size : max_tx_size; if (!has_left) left_ctx = above_ctx; @@ -366,7 +366,7 @@ int vp9_get_tx_size_context(const MACROBLOCKD *xd) { return (above_ctx + left_ctx) > max_tx_size; } -int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids, +int vp9_get_segment_id(const VP9_COMMON *cm, const uint8_t *segment_ids, BLOCK_SIZE bsize, int mi_row, int mi_col) { const int mi_offset = mi_row * cm->mi_cols + mi_col; const int bw = num_8x8_blocks_wide_lookup[bsize]; diff --git a/vp9/common/vp9_pred_common.h b/vp9/common/vp9_pred_common.h index 1a7ba86e4..2c965068a 100644 --- a/vp9/common/vp9_pred_common.h +++ b/vp9/common/vp9_pred_common.h @@ -26,7 +26,7 @@ static INLINE const MODE_INFO *get_left_mi(const MACROBLOCKD *const xd) { return xd->left_available ? xd->mi[-1] : NULL; } -int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids, +int vp9_get_segment_id(const VP9_COMMON *cm, const uint8_t *segment_ids, BLOCK_SIZE bsize, int mi_row, int mi_col); static INLINE int vp9_get_pred_context_seg_id(const MACROBLOCKD *xd) { diff --git a/vp9/common/vp9_prob.h b/vp9/common/vp9_prob.h index f36148035..fa0e36da4 100644 --- a/vp9/common/vp9_prob.h +++ b/vp9/common/vp9_prob.h @@ -44,21 +44,12 @@ typedef int8_t vp9_tree_index; typedef const vp9_tree_index vp9_tree[]; static INLINE vp9_prob clip_prob(int p) { - return (p > 255) ? 255u : (p < 1) ? 1u : p; + return (p > 255) ? 255 : (p < 1) ? 1 : p; } -// int64 is not needed for normal frame level calculations. -// However when outputting entropy stats accumulated over many frames -// or even clips we can overflow int math. -#ifdef ENTROPY_STATS static INLINE vp9_prob get_prob(int num, int den) { return (den == 0) ? 128u : clip_prob(((int64_t)num * 256 + (den >> 1)) / den); } -#else -static INLINE vp9_prob get_prob(int num, int den) { - return (den == 0) ? 
128u : clip_prob((num * 256 + (den >> 1)) / den); -} -#endif static INLINE vp9_prob get_binary_prob(int n0, int n1) { return get_prob(n0, n0 + n1); diff --git a/vp9/common/vp9_quant_common.c b/vp9/common/vp9_quant_common.c index f418b5cab..b42e654f5 100644 --- a/vp9/common/vp9_quant_common.c +++ b/vp9/common/vp9_quant_common.c @@ -237,6 +237,7 @@ int16_t vp9_dc_quant(int qindex, int delta, vpx_bit_depth_t bit_depth) { return dc_qlookup_12[clamp(qindex + delta, 0, MAXQ)]; default: assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); + return -1; } #elif CONFIG_VP9_HIGH switch (bit_depth) { @@ -248,6 +249,7 @@ int16_t vp9_dc_quant(int qindex, int delta, vpx_bit_depth_t bit_depth) { return dc_qlookup[clamp(qindex + delta, 0, MAXQ)] << 4; default: assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); + return -1; } #else (void) bit_depth; @@ -266,6 +268,7 @@ int16_t vp9_ac_quant(int qindex, int delta, vpx_bit_depth_t bit_depth) { return ac_qlookup_12[clamp(qindex + delta, 0, MAXQ)]; default: assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); + return -1; } #elif CONFIG_VP9_HIGH switch (bit_depth) { @@ -277,6 +280,7 @@ int16_t vp9_ac_quant(int qindex, int delta, vpx_bit_depth_t bit_depth) { return ac_qlookup[clamp(qindex + delta, 0, MAXQ)] << 4; default: assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); + return -1; } #else (void) bit_depth; diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index a0c2070cb..05c2a1364 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -769,13 +769,13 @@ add_proto qw/unsigned int vp9_variance32x64/, "const uint8_t *src_ptr, int sourc specialize qw/vp9_variance32x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance32x32 avx2/, "$sse2_x86inc"; +specialize qw/vp9_variance32x32 avx2 neon/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_variance64x64 avx2/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance16x16 mmx avx2/, "$sse2_x86inc"; +specialize qw/vp9_variance16x16 mmx avx2 neon/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_variance16x8 mmx/, "$sse2_x86inc"; @@ -784,13 +784,13 @@ add_proto qw/unsigned int vp9_variance8x16/, "const uint8_t *src_ptr, int source specialize qw/vp9_variance8x16 mmx/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance8x8 mmx/, "$sse2_x86inc"; +specialize qw/vp9_variance8x8 mmx neon/, "$sse2_x86inc"; add_proto qw/void vp9_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; -specialize qw/vp9_get8x8var mmx/, "$sse2_x86inc"; +specialize qw/vp9_get8x8var mmx neon/, "$sse2_x86inc"; add_proto qw/void vp9_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; 
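/* [Editor's note] A minimal sketch of why the vp9_prob.h hunk above drops
 * the ENTROPY_STATS-only variant and always widens to int64_t: once counts
 * are accumulated over many frames, num * 256 no longer fits in 32 bits.
 * get_prob_sketch() restates get_prob() plus clip_prob() so the sketch is
 * self-contained; the counts below are hypothetical. */
#include <assert.h>
#include <stdint.h>

static uint8_t get_prob_sketch(int num, int den) {
  int64_t p;
  if (den == 0) return 128;
  p = ((int64_t)num * 256 + (den >> 1)) / den;  /* would overflow in int32 */
  return (uint8_t)(p > 255 ? 255 : (p < 1 ? 1 : p));  /* clip to [1, 255] */
}

int main(void) {
  /* 20M hits out of 30M trials: num * 256 = 5.12e9 > INT32_MAX. */
  assert(get_prob_sketch(20000000, 30000000) == 171);
  return 0;
}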
-specialize qw/vp9_get16x16var avx2/, "$sse2_x86inc"; +specialize qw/vp9_get16x16var avx2 neon/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_variance8x4/, "$sse2_x86inc"; @@ -802,10 +802,10 @@ add_proto qw/unsigned int vp9_variance4x4/, "const uint8_t *src_ptr, int source_ specialize qw/vp9_variance4x4 mmx/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance64x64/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; -specialize qw/vp9_sub_pixel_avg_variance64x64/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_avg_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_sub_pixel_variance32x64/, "$sse2_x86inc", "$ssse3_x86inc"; @@ -832,13 +832,13 @@ add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x32/, "const uint8_t *src_ specialize qw/vp9_sub_pixel_avg_variance16x32/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance32x32/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_variance32x32 avx2 neon/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; -specialize qw/vp9_sub_pixel_avg_variance32x32/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_avg_variance32x32 avx2/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_variance16x16 neon/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; specialize qw/vp9_sub_pixel_avg_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc"; @@ -856,7 +856,7 @@ add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x8/, "const uint8_t *src_p specialize qw/vp9_sub_pixel_avg_variance16x8/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance8x8/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_variance8x8 neon/, "$sse2_x86inc", "$ssse3_x86inc"; 
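/* [Editor's note] The specialize lines above only register optimized kernels;
 * runtime CPU detection selects one. This is a hedged, hand-written sketch of
 * that mechanism -- the real dispatch code is generated from
 * vp9_rtcd_defs.pl, and sad_fn/rtcd_init_sketch/vp9_sad8x8_ptr here are
 * illustrative names, not libvpx API. */
#include <stdint.h>
#include <stdlib.h>

typedef unsigned int (*sad_fn)(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride);

static unsigned int sad8x8_c(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < 8; ++r, src += src_stride, ref += ref_stride)
    for (c = 0; c < 8; ++c)
      sad += abs(src[c] - ref[c]);
  return sad;
}

/* Stand-in for the real NEON kernel added by this patch. */
static unsigned int sad8x8_neon_stub(const uint8_t *src, int src_stride,
                                     const uint8_t *ref, int ref_stride) {
  return sad8x8_c(src, src_stride, ref, ref_stride);
}

sad_fn vp9_sad8x8_ptr = sad8x8_c;  /* portable default */

void rtcd_init_sketch(int cpu_has_neon) {
  /* "specialize qw/vp9_sad8x8 mmx neon/" means the generated setup code may
     install the NEON kernel when CPU detection reports support. */
  if (cpu_has_neon) vp9_sad8x8_ptr = sad8x8_neon_stub;
}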
add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; specialize qw/vp9_sub_pixel_avg_variance8x8/, "$sse2_x86inc", "$ssse3_x86inc"; @@ -909,7 +909,7 @@ add_proto qw/unsigned int vp9_sad8x16/, "const uint8_t *src_ptr, int source_stri specialize qw/vp9_sad8x16 mmx/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vp9_sad8x8 mmx/, "$sse2_x86inc"; +specialize qw/vp9_sad8x8 mmx neon/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vp9_sad8x4/, "$sse2_x86inc"; @@ -1008,7 +1008,7 @@ add_proto qw/void vp9_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const specialize qw/vp9_sad4x4x8 sse4/; add_proto qw/void vp9_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; -specialize qw/vp9_sad64x64x4d sse2/; +specialize qw/vp9_sad64x64x4d sse2 avx2/; add_proto qw/void vp9_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; specialize qw/vp9_sad32x64x4d sse2/; @@ -1023,7 +1023,7 @@ add_proto qw/void vp9_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, co specialize qw/vp9_sad16x32x4d sse2/; add_proto qw/void vp9_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; -specialize qw/vp9_sad32x32x4d sse2/; +specialize qw/vp9_sad32x32x4d sse2 avx2/; add_proto qw/void vp9_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; specialize qw/vp9_sad16x16x4d sse2/; @@ -1064,7 +1064,7 @@ specialize qw/vp9_get_mb_ss mmx sse2/; # ENCODEMB INVOKE add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; -specialize qw/vp9_subtract_block/, "$sse2_x86inc"; +specialize qw/vp9_subtract_block neon/, "$sse2_x86inc"; if (vpx_config("CONFIG_VP9_HIGH") eq "yes") { # the transform coefficients are held in 32-bit @@ -1088,7 +1088,7 @@ if (vpx_config("CONFIG_VP9_HIGH") eq "yes") { specialize qw/vp9_block_error avx2/; add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vp9_quantize_fp/, "$ssse3_x86_64"; + specialize qw/vp9_quantize_fp neon/, "$ssse3_x86_64"; add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64"; @@ -1112,6 +1112,7 @@ if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") { } # fdct functions + if 
(vpx_config("CONFIG_VP9_HIGH") eq "yes") { add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; specialize qw/vp9_fht4x4/; @@ -1153,10 +1154,10 @@ if (vpx_config("CONFIG_VP9_HIGH") eq "yes") { specialize qw/vp9_fdct32x32_rd/; } else { add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/vp9_fht4x4 sse2 avx2/; + specialize qw/vp9_fht4x4 sse2/; add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/vp9_fht8x8 sse2 avx2/; + specialize qw/vp9_fht8x8 sse2/; add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; specialize qw/vp9_fht16x16 sse2/; @@ -1168,13 +1169,13 @@ if (vpx_config("CONFIG_VP9_HIGH") eq "yes") { specialize qw/vp9_fdct4x4_1 sse2/; add_proto qw/void vp9_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_fdct4x4 sse2 avx2/; + specialize qw/vp9_fdct4x4 sse2/; add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_fdct8x8_1 sse2/; + specialize qw/vp9_fdct8x8_1 sse2 neon/; add_proto qw/void vp9_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_fdct8x8 sse2 avx2/, "$ssse3_x86_64"; + specialize qw/vp9_fdct8x8 sse2 neon/, "$ssse3_x86_64"; add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vp9_fdct16x16_1 sse2/; diff --git a/vp9/common/vp9_seg_common.c b/vp9/common/vp9_seg_common.c index 345f2a1e6..e07fa1b37 100644 --- a/vp9/common/vp9_seg_common.c +++ b/vp9/common/vp9_seg_common.c @@ -52,11 +52,10 @@ int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id) { void vp9_set_segdata(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id, int seg_data) { - const int data_max = vp9_seg_feature_data_max(feature_id); - assert(seg_data <= data_max); + assert(seg_data <= vp9_seg_feature_data_max(feature_id)); if (seg_data < 0) { assert(seg_feature_data_signed[feature_id]); - assert(-seg_data <= data_max); + assert(-seg_data <= vp9_seg_feature_data_max(feature_id)); } seg->feature_data[segment_id][feature_id] = seg_data; diff --git a/vp9/common/x86/vp9_postproc_x86.h b/vp9/common/x86/vp9_postproc_x86.h deleted file mode 100644 index cab9d34f2..000000000 --- a/vp9/common/x86/vp9_postproc_x86.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef VP9_COMMON_X86_VP9_POSTPROC_X86_H_ -#define VP9_COMMON_X86_VP9_POSTPROC_X86_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -/* Note: - * - * This platform is commonly built for runtime CPU detection. 
If you modify - * any of the function mappings present in this file, be sure to also update - * them in the function pointer initialization code - */ - -#if HAVE_MMX -extern prototype_postproc_inplace(vp9_mbpost_proc_down_mmx); -extern prototype_postproc(vp9_post_proc_down_and_across_mmx); -extern prototype_postproc_addnoise(vp9_plane_add_noise_mmx); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp9_postproc_down -#define vp9_postproc_down vp9_mbpost_proc_down_mmx - -#undef vp9_postproc_downacross -#define vp9_postproc_downacross vp9_post_proc_down_and_across_mmx - -#undef vp9_postproc_addnoise -#define vp9_postproc_addnoise vp9_plane_add_noise_mmx - -#endif -#endif - - -#if HAVE_SSE2 -extern prototype_postproc_inplace(vp9_mbpost_proc_down_xmm); -extern prototype_postproc_inplace(vp9_mbpost_proc_across_ip_xmm); -extern prototype_postproc(vp9_post_proc_down_and_across_xmm); -extern prototype_postproc_addnoise(vp9_plane_add_noise_wmt); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp9_postproc_down -#define vp9_postproc_down vp9_mbpost_proc_down_xmm - -#undef vp9_postproc_across -#define vp9_postproc_across vp9_mbpost_proc_across_ip_xmm - -#undef vp9_postproc_downacross -#define vp9_postproc_downacross vp9_post_proc_down_and_across_xmm - -#undef vp9_postproc_addnoise -#define vp9_postproc_addnoise vp9_plane_add_noise_wmt - - -#endif -#endif - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // VP9_COMMON_X86_VP9_POSTPROC_X86_H_ diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index d0d5eadbe..ba5d6b5a5 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -194,8 +194,7 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block, int eob) { struct macroblockd_plane *const pd = &xd->plane[plane]; if (eob > 0) { - TX_TYPE tx_type; - const PLANE_TYPE plane_type = pd->plane_type; + TX_TYPE tx_type = DCT_DCT; tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); #if CONFIG_VP9_HIGH if (xd->cur_buf->flags&YV12_FLAG_HIGH) { @@ -203,6 +202,7 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block, tx_type = DCT_DCT; vp9_high_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bps); } else { + const PLANE_TYPE plane_type = pd->plane_type; switch (tx_size) { case TX_4X4: tx_type = get_tx_type_4x4(plane_type, xd, block); @@ -229,6 +229,7 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block, tx_type = DCT_DCT; vp9_iwht4x4_add(dqcoeff, dst, stride, eob); } else { + const PLANE_TYPE plane_type = pd->plane_type; switch (tx_size) { case TX_4X4: tx_type = get_tx_type_4x4(plane_type, xd, block); @@ -256,6 +257,7 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block, tx_type = DCT_DCT; vp9_iwht4x4_add(dqcoeff, dst, stride, eob); } else { + const PLANE_TYPE plane_type = pd->plane_type; switch (tx_size) { case TX_4X4: tx_type = get_tx_type_4x4(plane_type, xd, block); @@ -684,13 +686,20 @@ static void setup_display_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { } static void resize_context_buffers(VP9_COMMON *cm, int width, int height) { +#if CONFIG_SIZE_LIMIT + if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Width and height beyond allowed size."); +#endif if (cm->width != width || cm->height != height) { - // Change in frame size (assumption: color format does not change). 
- if (cm->width == 0 || cm->height == 0 || - width * height > cm->width * cm->height) { + const int new_rows = ALIGN_POWER_OF_TWO(height, + MI_SIZE_LOG2) >> MI_SIZE_LOG2; + const int new_cols = ALIGN_POWER_OF_TWO(width, + MI_SIZE_LOG2) >> MI_SIZE_LOG2; + if (calc_mi_size(new_rows) * calc_mi_size(new_cols) > cm->mi_alloc_size) { if (vp9_alloc_context_buffers(cm, width, height)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, - "Failed to allocate frame buffers"); + "Failed to allocate context buffers"); } else { vp9_set_mb_mi(cm, width, height); } @@ -724,6 +733,7 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { int width, height; int found = 0, i; + int has_valid_ref_frame = 0; for (i = 0; i < REFS_PER_FRAME; ++i) { if (vp9_rb_read_bit(rb)) { YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf; @@ -737,15 +747,21 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm, if (!found) vp9_read_frame_size(rb, &width, &height); - // Check that each of the frames that this frame references has valid - // dimensions. + if (width <= 0 || height <= 0) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Invalid frame size"); + + // Check to make sure at least one of the frames that this frame references + // has valid dimensions. for (i = 0; i < REFS_PER_FRAME; ++i) { RefBuffer *const ref_frame = &cm->frame_refs[i]; - if (!valid_ref_frame_size(ref_frame->buf->y_width, ref_frame->buf->y_height, - width, height)) - vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, - "Referenced frame has invalid size"); + has_valid_ref_frame |= valid_ref_frame_size(ref_frame->buf->y_crop_width, + ref_frame->buf->y_crop_height, + width, height); } + if (!has_valid_ref_frame) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Referenced frame has invalid size"); resize_context_buffers(cm, width, height); setup_display_size(cm, rb); @@ -1180,8 +1196,8 @@ static void read_bitdepth_colorspace_sampling( } } else { if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) { - // Note: If colorspace is SRGB then only 4:4:4 chroma sampling - // is supported. + // Note: if colorspace is SRGB then 4:4:4 chroma sampling is assumed. + // 4:2:2 or 4:4:0 chroma sampling is not allowed.
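/* [Editor's note] Worked example for the resize_context_buffers() change
 * above, assuming the usual VP9 constants MI_SIZE_LOG2 == 3 and
 * MI_BLOCK_SIZE == 8; the macro and calc_mi_size() are restated here so the
 * sketch is self-contained. */
#include <assert.h>

#define MI_SIZE_LOG2 3
#define MI_BLOCK_SIZE 8
#define ALIGN_POWER_OF_TWO(value, n) \
  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))

static int calc_mi_size(int len) {
  return len + MI_BLOCK_SIZE;  /* len is in mi units; add the border */
}

int main(void) {
  /* A 1920x1080 frame spans 240x135 8-pel mi units... */
  const int new_cols = ALIGN_POWER_OF_TWO(1920, MI_SIZE_LOG2) >> MI_SIZE_LOG2;
  const int new_rows = ALIGN_POWER_OF_TWO(1080, MI_SIZE_LOG2) >> MI_SIZE_LOG2;
  assert(new_cols == 240 && new_rows == 135);
  /* ...and reallocation is needed only when the bordered grid outgrows the
     existing mi_alloc_size. */
  assert(calc_mi_size(new_rows) * calc_mi_size(new_cols) == 143 * 248);
  return 0;
}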
cm->subsampling_y = cm->subsampling_x = 0; if (vp9_rb_read_bit(rb)) vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, @@ -1235,6 +1251,7 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, if (!vp9_read_sync_code(rb)) vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, "Invalid frame sync code"); + read_bitdepth_colorspace_sampling(cm, rb); pbi->refresh_frame_flags = (1 << REF_FRAMES) - 1; @@ -1264,6 +1281,7 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, cm->color_space = BT_601; cm->subsampling_y = cm->subsampling_x = 1; } + pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES); setup_frame_size(cm, rb); } else { @@ -1301,11 +1319,9 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, } } if (!cm->error_resilient_mode) { - cm->coding_use_prev_mi = 1; cm->refresh_frame_context = vp9_rb_read_bit(rb); cm->frame_parallel_decoding_mode = vp9_rb_read_bit(rb); } else { - cm->coding_use_prev_mi = 0; cm->refresh_frame_context = 0; cm->frame_parallel_decoding_mode = 1; } @@ -1481,7 +1497,7 @@ void vp9_decode_frame(VP9Decoder *pbi, init_macroblockd(cm, &pbi->mb); - if (cm->coding_use_prev_mi) + if (!cm->error_resilient_mode) set_prev_mi(cm); else cm->prev_mi = NULL; diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c index 1afaee1e3..32e80f93b 100644 --- a/vp9/decoder/vp9_decodemv.c +++ b/vp9/decoder/vp9_decodemv.c @@ -435,6 +435,11 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm, for (ref = 0; ref < 1 + is_compound; ++ref) { const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref]; + const int ref_idx = frame - LAST_FRAME; + if (cm->frame_refs[ref_idx].sf.x_scale_fp == REF_INVALID_SCALE || + cm->frame_refs[ref_idx].sf.y_scale_fp == REF_INVALID_SCALE ) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Reference frame has invalid dimensions"); vp9_find_mv_refs(cm, xd, tile, mi, frame, mbmi->ref_mvs[frame], mi_row, mi_col); } diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c index 0343c214c..1a4155825 100644 --- a/vp9/decoder/vp9_decoder.c +++ b/vp9/decoder/vp9_decoder.c @@ -123,8 +123,12 @@ vpx_codec_err_t vp9_copy_reference_dec(VP9Decoder *pbi, * later commit that adds VP9-specific controls for this functionality. */ if (ref_frame_flag == VP9_LAST_FLAG) { - const YV12_BUFFER_CONFIG *const cfg = - &cm->frame_bufs[cm->ref_frame_map[0]].buf; + const YV12_BUFFER_CONFIG *const cfg = get_ref_frame(cm, 0); + if (cfg == NULL) { + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "No 'last' reference frame"); + return VPX_CODEC_ERROR; + } if (!equal_dimensions(cfg, sd)) vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Incorrect buffer dimensions"); @@ -181,17 +185,6 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm, return cm->error.error_code; } - -int vp9_get_reference_dec(VP9Decoder *pbi, int index, YV12_BUFFER_CONFIG **fb) { - VP9_COMMON *cm = &pbi->common; - - if (index < 0 || index >= REF_FRAMES) - return -1; - - *fb = &cm->frame_bufs[cm->ref_frame_map[index]].buf; - return 0; -} - /* If any buffer updating is signaled it should be done here. 
*/ static void swap_frame_buffers(VP9Decoder *pbi) { int ref_index = 0, mask; diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h index 8e16e1cac..223b66fc7 100644 --- a/vp9/decoder/vp9_decoder.h +++ b/vp9/decoder/vp9_decoder.h @@ -74,9 +74,6 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm, VP9_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd); -int vp9_get_reference_dec(struct VP9Decoder *pbi, - int index, YV12_BUFFER_CONFIG **fb); - struct VP9Decoder *vp9_decoder_create(); void vp9_decoder_remove(struct VP9Decoder *pbi); diff --git a/vp9/decoder/vp9_reader.h b/vp9/decoder/vp9_reader.h index 32e200e2b..2d9eccfbf 100644 --- a/vp9/decoder/vp9_reader.h +++ b/vp9/decoder/vp9_reader.h @@ -52,7 +52,7 @@ int vp9_reader_has_error(vp9_reader *r); const uint8_t *vp9_reader_find_end(vp9_reader *r); -static int vp9_read(vp9_reader *r, int prob) { +static INLINE int vp9_read(vp9_reader *r, int prob) { unsigned int bit = 0; BD_VALUE value; BD_VALUE bigsplit; @@ -89,11 +89,11 @@ static int vp9_read(vp9_reader *r, int prob) { return bit; } -static int vp9_read_bit(vp9_reader *r) { +static INLINE int vp9_read_bit(vp9_reader *r) { return vp9_read(r, 128); // vp9_prob_half } -static int vp9_read_literal(vp9_reader *r, int bits) { +static INLINE int vp9_read_literal(vp9_reader *r, int bits) { int literal = 0, bit; for (bit = bits - 1; bit >= 0; bit--) @@ -102,8 +102,8 @@ static int vp9_read_literal(vp9_reader *r, int bits) { return literal; } -static int vp9_read_tree(vp9_reader *r, const vp9_tree_index *tree, - const vp9_prob *probs) { +static INLINE int vp9_read_tree(vp9_reader *r, const vp9_tree_index *tree, + const vp9_prob *probs) { vp9_tree_index i = 0; while ((i = tree[i + vp9_read(r, probs[i >> 1])]) > 0) diff --git a/vp9/encoder/arm/neon/vp9_dct_neon.c b/vp9/encoder/arm/neon/vp9_dct_neon.c new file mode 100644 index 000000000..6c66f5d5b --- /dev/null +++ b/vp9/encoder/arm/neon/vp9_dct_neon.c @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> +#include "./vp9_rtcd.h" +#include "./vpx_config.h" + +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_idct.h" + +void vp9_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) { + int r; + int16x8_t sum = vld1q_s16(&input[0]); + for (r = 1; r < 8; ++r) { + const int16x8_t input_00 = vld1q_s16(&input[r * stride]); + sum = vaddq_s16(sum, input_00); + } + { + const int32x4_t a = vpaddlq_s16(sum); + const int64x2_t b = vpaddlq_s32(a); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0); + output[1] = 0; + } +} + +void vp9_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) { + int i; + // stage 1 + int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2); + int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2); + int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2); + int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2); + int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2); + int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2); + int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2); + int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2); + for (i = 0; i < 2; ++i) { + int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7; + const int16x8_t v_s0 = vaddq_s16(input_0, input_7); + const int16x8_t v_s1 = vaddq_s16(input_1, input_6); + const int16x8_t v_s2 = vaddq_s16(input_2, input_5); + const int16x8_t v_s3 = vaddq_s16(input_3, input_4); + const int16x8_t v_s4 = vsubq_s16(input_3, input_4); + const int16x8_t v_s5 = vsubq_s16(input_2, input_5); + const int16x8_t v_s6 = vsubq_s16(input_1, input_6); + const int16x8_t v_s7 = vsubq_s16(input_0, input_7); + // fdct4(step, step); + int16x8_t v_x0 = vaddq_s16(v_s0, v_s3); + int16x8_t v_x1 = vaddq_s16(v_s1, v_s2); + int16x8_t v_x2 = vsubq_s16(v_s1, v_s2); + int16x8_t v_x3 = vsubq_s16(v_s0, v_s3); + // fdct4(step, step); + int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); + int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); + int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); + int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); + int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64); + int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64); + int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64); + int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64); + v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64); + v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64); + v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64); + v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64); + v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64); + v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64); + v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64); + v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); + const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); + const
int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); + const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); + out_0 = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43 + out_2 = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63 + out_4 = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47 + out_6 = vcombine_s16(f, h); // 24 25 26 27 64 65 66 67 + } + // Stage 2 + v_x0 = vsubq_s16(v_s6, v_s5); + v_x1 = vaddq_s16(v_s6, v_s5); + v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64); + v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64); + v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64); + v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x8_t ab = vcombine_s16(a, b); + const int16x8_t cd = vcombine_s16(c, d); + // Stage 3 + v_x0 = vaddq_s16(v_s4, ab); + v_x1 = vsubq_s16(v_s4, ab); + v_x2 = vsubq_s16(v_s7, cd); + v_x3 = vaddq_s16(v_s7, cd); + } + // Stage 4 + v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64); + v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64); + v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64); + v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64); + v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64); + v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64); + v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64); + v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64); + v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64); + v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64); + v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64); + v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64); + v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64); + v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64); + v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64); + v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); + const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); + const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); + const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); + out_1 = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53 + out_3 = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73 + out_5 = vcombine_s16(b, d); // 14 15 16 17 54 55 56 57 + out_7 = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77 + } + // transpose 8x8 + { + // 00 01 02 03 40 41 42 43 + // 10 11 12 13 50 51 52 53 + // 20 21 22 23 60 61 62 63 + // 30 31 32 33 70 71 72 73 + // 04 05 06 07 44 45 46 47 + // 14 15 16 17 54 55 56 57 + // 24 25 26 27 64 65 66 67 + // 34 35 36 37 74 75 76 77 + const int32x4x2_t r02_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_0), + vreinterpretq_s32_s16(out_2)); + const int32x4x2_t r13_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_1), + vreinterpretq_s32_s16(out_3)); + const int32x4x2_t r46_s32 = 
vtrnq_s32(vreinterpretq_s32_s16(out_4), + vreinterpretq_s32_s16(out_6)); + const int32x4x2_t r57_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_5), + vreinterpretq_s32_s16(out_7)); + const int16x8x2_t r01_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]), + vreinterpretq_s16_s32(r13_s32.val[0])); + const int16x8x2_t r23_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]), + vreinterpretq_s16_s32(r13_s32.val[1])); + const int16x8x2_t r45_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]), + vreinterpretq_s16_s32(r57_s32.val[0])); + const int16x8x2_t r67_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]), + vreinterpretq_s16_s32(r57_s32.val[1])); + input_0 = r01_s16.val[0]; + input_1 = r01_s16.val[1]; + input_2 = r23_s16.val[0]; + input_3 = r23_s16.val[1]; + input_4 = r45_s16.val[0]; + input_5 = r45_s16.val[1]; + input_6 = r67_s16.val[0]; + input_7 = r67_s16.val[1]; + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 + } + } // for + { + // from vp9_dct_sse2.c + // Post-condition (division by two) + // division of two 16 bits signed numbers using shifts + // n / 2 = (n - (n >> 15)) >> 1 + const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15); + const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15); + const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15); + const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15); + const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15); + const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15); + const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15); + const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15); + input_0 = vhsubq_s16(input_0, sign_in0); + input_1 = vhsubq_s16(input_1, sign_in1); + input_2 = vhsubq_s16(input_2, sign_in2); + input_3 = vhsubq_s16(input_3, sign_in3); + input_4 = vhsubq_s16(input_4, sign_in4); + input_5 = vhsubq_s16(input_5, sign_in5); + input_6 = vhsubq_s16(input_6, sign_in6); + input_7 = vhsubq_s16(input_7, sign_in7); + // store results + vst1q_s16(&final_output[0 * 8], input_0); + vst1q_s16(&final_output[1 * 8], input_1); + vst1q_s16(&final_output[2 * 8], input_2); + vst1q_s16(&final_output[3 * 8], input_3); + vst1q_s16(&final_output[4 * 8], input_4); + vst1q_s16(&final_output[5 * 8], input_5); + vst1q_s16(&final_output[6 * 8], input_6); + vst1q_s16(&final_output[7 * 8], input_7); + } +} + diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c new file mode 100644 index 000000000..2d5ec79b3 --- /dev/null +++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#include <math.h> + +#include "vpx_mem/vpx_mem.h" + +#include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_seg_common.h" + +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_quantize.h" +#include "vp9/encoder/vp9_rd.h" + +void vp9_quantize_fp_neon(const int16_t *coeff_ptr, intptr_t count, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, + int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, + int zbin_oq_value, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int i; + // TODO(jingning) Decide the need of these arguments after the + // quantization process is completed. + (void)zbin_ptr; + (void)quant_shift_ptr; + (void)zbin_oq_value; + (void)scan; + + if (!skip_block) { + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + + const int16x8_t v_zero = vdupq_n_s16(0); + const int16x8_t v_one = vdupq_n_s16(1); + int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1); + int16x8_t v_round = vmovq_n_s16(round_ptr[1]); + int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]); + int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]); + // adjust for dc + v_round = vsetq_lane_s16(round_ptr[0], v_round, 0); + v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0); + v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0); + + for (i = 0; i < count; i += 8) { + const int16x8_t v_iscan = vld1q_s16(&iscan[i]); + const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[i]); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_abs_coeff = vabsq_s16(v_coeff); + const int16x8_t v_tmp = vqaddq_s16(v_abs_coeff, v_round); + const int32x4_t v_tmp_lo = vmull_s16(vget_low_s16(v_tmp), + vget_low_s16(v_quant)); + const int32x4_t v_tmp_hi = vmull_s16(vget_high_s16(v_tmp), + vget_high_s16(v_quant)); + const int16x8_t v_tmp2 = vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), + vshrn_n_s32(v_tmp_hi, 16)); + const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); + const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); + const int16x8_t v_nz_iscan = + vandq_s16(vmvnq_s16(vreinterpretq_s16_u16(v_nz_mask)), v_iscan_plus1); + const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); + const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); + const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); + + v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); + + vst1q_s16(&qcoeff_ptr[i], v_qcoeff); + vst1q_s16(&dqcoeff_ptr[i], v_dqcoeff); + v_round = vmovq_n_s16(round_ptr[1]); + v_quant = vmovq_n_s16(quant_ptr[1]); + v_dequant = vmovq_n_s16(dequant_ptr[1]); + } + { + const int16x4_t v_eobmax_3210 = + vmax_s16(vget_low_s16(v_eobmax_76543210), + vget_high_s16(v_eobmax_76543210)); + const int64x1_t v_eobmax_xx32 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32); + const int16x4_t v_eobmax_tmp = + vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32)); + const int64x1_t v_eobmax_xxx3 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); + const int16x4_t v_eobmax_final = + vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); + + *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0); + } + } else { + vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t)); + vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t)); + *eob_ptr = 0; + } +} diff --git a/vp9/encoder/arm/neon/vp9_sad_neon.c b/vp9/encoder/arm/neon/vp9_sad_neon.c index fe40b5452..c4cd85680 --- a/vp9/encoder/arm/neon/vp9_sad_neon.c
+++ b/vp9/encoder/arm/neon/vp9_sad_neon.c @@ -26,9 +26,8 @@ static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, vreinterpret_u32_u64(vget_high_u64(b))); return vget_lane_u32(c, 0); } -static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_lo, - const uint16x8_t vec_hi) { - const uint32x4_t a = vpaddlq_u16(vaddq_u16(vec_lo, vec_hi)); +static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) { + const uint32x4_t a = vpaddlq_u16(vec_16x8); const uint64x2_t b = vpaddlq_u32(a); const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), vreinterpret_u32_u64(vget_high_u64(b))); @@ -93,7 +92,7 @@ unsigned int vp9_sad32x32_neon(const uint8_t *src, int src_stride, vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16), vget_high_u8(vec_ref_16)); } - return horizontal_add_16x8(vec_accum_lo, vec_accum_hi); + return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); } unsigned int vp9_sad16x16_neon(const uint8_t *src, int src_stride, @@ -112,5 +111,20 @@ unsigned int vp9_sad16x16_neon(const uint8_t *src, int src_stride, vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref)); } - return horizontal_add_16x8(vec_accum_lo, vec_accum_hi); + return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); +} + +unsigned int vp9_sad8x8_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + int i; + uint16x8_t vec_accum = vdupq_n_u16(0); + + for (i = 0; i < 8; ++i) { + const uint8x8_t vec_src = vld1_u8(src); + const uint8x8_t vec_ref = vld1_u8(ref); + src += src_stride; + ref += ref_stride; + vec_accum = vabal_u8(vec_accum, vec_src, vec_ref); + } + return horizontal_add_16x8(vec_accum); } diff --git a/vp9/encoder/arm/neon/vp9_subtract_neon.c b/vp9/encoder/arm/neon/vp9_subtract_neon.c new file mode 100644 index 000000000..b4bf567db --- /dev/null +++ b/vp9/encoder/arm/neon/vp9_subtract_neon.c @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> +#include "./vp9_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" + +void vp9_subtract_block_neon(int rows, int cols, + int16_t *diff, ptrdiff_t diff_stride, + const uint8_t *src, ptrdiff_t src_stride, + const uint8_t *pred, ptrdiff_t pred_stride) { + int r, c; + + if (cols > 16) { + for (r = 0; r < rows; ++r) { + for (c = 0; c < cols; c += 32) { + const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]); + const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]); + const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]); + const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]); + const uint16x8_t v_diff_lo_00 = vsubl_u8(vget_low_u8(v_src_00), + vget_low_u8(v_pred_00)); + const uint16x8_t v_diff_hi_00 = vsubl_u8(vget_high_u8(v_src_00), + vget_high_u8(v_pred_00)); + const uint16x8_t v_diff_lo_16 = vsubl_u8(vget_low_u8(v_src_16), + vget_low_u8(v_pred_16)); + const uint16x8_t v_diff_hi_16 = vsubl_u8(vget_high_u8(v_src_16), + vget_high_u8(v_pred_16)); + vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00)); + vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00)); + vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16)); + vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16)); + } + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } + } else if (cols > 8) { + for (r = 0; r < rows; ++r) { + const uint8x16_t v_src = vld1q_u8(&src[0]); + const uint8x16_t v_pred = vld1q_u8(&pred[0]); + const uint16x8_t v_diff_lo = vsubl_u8(vget_low_u8(v_src), + vget_low_u8(v_pred)); + const uint16x8_t v_diff_hi = vsubl_u8(vget_high_u8(v_src), + vget_high_u8(v_pred)); + vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo)); + vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi)); + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } + } else if (cols > 4) { + for (r = 0; r < rows; ++r) { + const uint8x8_t v_src = vld1_u8(&src[0]); + const uint8x8_t v_pred = vld1_u8(&pred[0]); + const uint16x8_t v_diff = vsubl_u8(v_src, v_pred); + vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff)); + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } + } else { + for (r = 0; r < rows; ++r) { + for (c = 0; c < cols; ++c) + diff[c] = src[c] - pred[c]; + + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } + } +} diff --git a/vp9/encoder/arm/neon/vp9_variance_neon.c b/vp9/encoder/arm/neon/vp9_variance_neon.c new file mode 100644 index 000000000..816fbda1f --- /dev/null +++ b/vp9/encoder/arm/neon/vp9_variance_neon.c @@ -0,0 +1,227 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree.
+ */ + +#include <arm_neon.h> +#include "./vp9_rtcd.h" + +#include "vpx_ports/mem.h" +#include "vpx/vpx_integer.h" + +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_filter.h" + +#include "vp9/encoder/vp9_variance.h" + +enum { kWidth8 = 8 }; +enum { kHeight8 = 8 }; +enum { kHeight8PlusOne = 9 }; +enum { kWidth16 = 16 }; +enum { kHeight16 = 16 }; +enum { kHeight16PlusOne = 17 }; +enum { kWidth32 = 32 }; +enum { kHeight32 = 32 }; +enum { kHeight32PlusOne = 33 }; +enum { kPixelStepOne = 1 }; +enum { kAlign16 = 16 }; + +static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) { + const int32x4_t a = vpaddlq_s16(v_16x8); + const int64x2_t b = vpaddlq_s32(a); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +} + +static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { + const int64x2_t b = vpaddlq_s32(v_32x4); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +} + +static void variance_neon_w8(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + int w, int h, unsigned int *sse, int *sum) { + int i, j; + int16x8_t v_sum = vdupq_n_s16(0); + int32x4_t v_sse_lo = vdupq_n_s32(0); + int32x4_t v_sse_hi = vdupq_n_s32(0); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + const uint8x8_t v_a = vld1_u8(&a[j]); + const uint8x8_t v_b = vld1_u8(&b[j]); + const uint16x8_t v_diff = vsubl_u8(v_a, v_b); + const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff); + v_sum = vaddq_s16(v_sum, sv_diff); + v_sse_lo = vmlal_s16(v_sse_lo, + vget_low_s16(sv_diff), + vget_low_s16(sv_diff)); + v_sse_hi = vmlal_s16(v_sse_hi, + vget_high_s16(sv_diff), + vget_high_s16(sv_diff)); + } + a += a_stride; + b += b_stride; + } + + *sum = horizontal_add_s16x8(v_sum); + *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); +} + +void vp9_get8x8var_neon(const uint8_t *src_ptr, int source_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth8, + kHeight8, sse, sum); +} + +unsigned int vp9_variance8x8_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, kWidth8, kHeight8, sse, &sum); + return *sse - (((int64_t)sum * sum) / (kWidth8 * kHeight8)); +} + +void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth16, + kHeight16, sse, sum); +} + +unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, kWidth16, kHeight16, sse, &sum); + return *sse - (((int64_t)sum * sum) / (kWidth16 * kHeight16)); +} + +static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, + uint8_t *output_ptr, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + unsigned int output_width, + const int16_t *vp9_filter) { + const uint8x8_t f0 = vmov_n_u8((uint8_t)vp9_filter[0]); + const uint8x8_t f1 = vmov_n_u8((uint8_t)vp9_filter[1]); + unsigned int i; + for (i = 0; i < output_height; ++i) { + const uint8x8_t src_0 = vld1_u8(&src_ptr[0]); + const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]); + const uint16x8_t a =
vmull_u8(src_0, f0); + const uint16x8_t b = vmlal_u8(a, src_1, f1); + const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS); + vst1_u8(&output_ptr[0], out); + // Next row... + src_ptr += src_pixels_per_line; + output_ptr += output_width; + } +} + +static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, + uint8_t *output_ptr, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + unsigned int output_width, + const int16_t *vp9_filter) { + const uint8x8_t f0 = vmov_n_u8((uint8_t)vp9_filter[0]); + const uint8x8_t f1 = vmov_n_u8((uint8_t)vp9_filter[1]); + unsigned int i, j; + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; j += 16) { + const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]); + const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]); + const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0); + const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1); + const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS); + const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0); + const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1); + const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS); + vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi)); + } + // Next row... + src_ptr += src_pixels_per_line; + output_ptr += output_width; + } +} + +unsigned int vp9_sub_pixel_variance8x8_neon(const uint8_t *src, + int src_stride, + int xoffset, + int yoffset, + const uint8_t *dst, + int dst_stride, + unsigned int *sse) { + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight8 * kWidth8); + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight8PlusOne * kWidth8); + + var_filter_block2d_bil_w8(src, fdata3, src_stride, kPixelStepOne, + kHeight8PlusOne, kWidth8, + BILINEAR_FILTERS_2TAP(xoffset)); + var_filter_block2d_bil_w8(fdata3, temp2, kWidth8, kWidth8, kHeight8, + kWidth8, BILINEAR_FILTERS_2TAP(yoffset)); + return vp9_variance8x8_neon(temp2, kWidth8, dst, dst_stride, sse); +} + +unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src, + int src_stride, + int xoffset, + int yoffset, + const uint8_t *dst, + int dst_stride, + unsigned int *sse) { + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight16 * kWidth16); + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight16PlusOne * kWidth16); + + var_filter_block2d_bil_w16(src, fdata3, src_stride, kPixelStepOne, + kHeight16PlusOne, kWidth16, + BILINEAR_FILTERS_2TAP(xoffset)); + var_filter_block2d_bil_w16(fdata3, temp2, kWidth16, kWidth16, kHeight16, + kWidth16, BILINEAR_FILTERS_2TAP(yoffset)); + return vp9_variance16x16_neon(temp2, kWidth16, dst, dst_stride, sse); +} + +void vp9_get32x32var_neon(const uint8_t *src_ptr, int source_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth32, + kHeight32, sse, sum); +} + +unsigned int vp9_variance32x32_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, kWidth32, kHeight32, sse, &sum); + return *sse - (((int64_t)sum * sum) / (kWidth32 * kHeight32)); +} + +unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src, + int src_stride, + int xoffset, + int yoffset, + const uint8_t *dst, + int dst_stride, + unsigned int *sse) { + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight32 * kWidth32); + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight32PlusOne * kWidth32); + + var_filter_block2d_bil_w16(src, fdata3, src_stride, 
kPixelStepOne, + kHeight32PlusOne, kWidth32, + BILINEAR_FILTERS_2TAP(xoffset)); + var_filter_block2d_bil_w16(fdata3, temp2, kWidth32, kWidth32, kHeight32, + kWidth32, BILINEAR_FILTERS_2TAP(yoffset)); + return vp9_variance32x32_neon(temp2, kWidth32, dst, dst_stride, sse); +} diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index bfcbee7b0..d9654d2b5 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -80,13 +80,13 @@ static void prob_diff_update(const vp9_tree_index *tree, vp9_cond_prob_diff_update(w, &probs[i], branch_ct[i]); } -static void write_selected_tx_size(const VP9_COMP *cpi, +static void write_selected_tx_size(const VP9_COMMON *cm, + const MACROBLOCKD *xd, TX_SIZE tx_size, BLOCK_SIZE bsize, vp9_writer *w) { const TX_SIZE max_tx_size = max_txsize_lookup[bsize]; - const MACROBLOCKD *const xd = &cpi->mb.e_mbd; const vp9_prob *const tx_probs = get_tx_probs2(max_tx_size, xd, - &cpi->common.fc.tx_probs); + &cm->fc.tx_probs); vp9_write(w, tx_size != TX_4X4, tx_probs[0]); if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) { vp9_write(w, tx_size != TX_8X8, tx_probs[1]); @@ -95,14 +95,13 @@ static void write_selected_tx_size(const VP9_COMP *cpi, } } -static int write_skip(const VP9_COMP *cpi, int segment_id, const MODE_INFO *mi, - vp9_writer *w) { - const MACROBLOCKD *const xd = &cpi->mb.e_mbd; - if (vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) { +static int write_skip(const VP9_COMMON *cm, const MACROBLOCKD *xd, + int segment_id, const MODE_INFO *mi, vp9_writer *w) { + if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { return 1; } else { const int skip = mi->mbmi.skip; - vp9_write(w, skip, vp9_get_skip_prob(&cpi->common, xd)); + vp9_write(w, skip, vp9_get_skip_prob(cm, xd)); return skip; } } @@ -123,7 +122,7 @@ static void update_switchable_interp_probs(VP9_COMMON *cm, vp9_writer *w) { } static void pack_mb_tokens(vp9_writer *w, - TOKENEXTRA **tp, const TOKENEXTRA *stop, + TOKENEXTRA **tp, const TOKENEXTRA *const stop, vpx_bit_depth_t bit_depth) { TOKENEXTRA *p = *tp; @@ -198,9 +197,8 @@ static void write_segment_id(vp9_writer *w, const struct segmentation *seg, } // This function encodes the reference frame -static void write_ref_frames(const VP9_COMP *cpi, vp9_writer *w) { - const VP9_COMMON *const cm = &cpi->common; - const MACROBLOCKD *const xd = &cpi->mb.e_mbd; +static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *xd, + vp9_writer *w) { const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; const int is_compound = has_second_ref(mbmi); const int segment_id = mbmi->segment_id; @@ -262,7 +260,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, } } - skip = write_skip(cpi, segment_id, mi, w); + skip = write_skip(cm, xd, segment_id, mi, w); if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) vp9_write(w, is_inter, vp9_get_intra_inter_prob(cm, xd)); @@ -270,7 +268,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT && !(is_inter && (skip || vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)))) { - write_selected_tx_size(cpi, mbmi->tx_size, bsize, w); + write_selected_tx_size(cm, xd, mbmi->tx_size, bsize, w); } if (!is_inter) { @@ -291,7 +289,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, } else { const int mode_ctx = mbmi->mode_context[mbmi->ref_frame[0]]; const vp9_prob *const inter_probs = cm->fc.inter_mode_probs[mode_ctx]; - write_ref_frames(cpi, w); + 
write_ref_frames(cm, xd, w); // If segment skip is not enabled code the mode. if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) { @@ -339,10 +337,8 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, } } -static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8, - vp9_writer *w) { - const VP9_COMMON *const cm = &cpi->common; - const MACROBLOCKD *const xd = &cpi->mb.e_mbd; +static void write_mb_modes_kf(const VP9_COMMON *cm, const MACROBLOCKD *xd, + MODE_INFO **mi_8x8, vp9_writer *w) { const struct segmentation *const seg = &cm->seg; const MODE_INFO *const mi = mi_8x8[0]; const MODE_INFO *const above_mi = mi_8x8[-xd->mi_stride]; @@ -353,10 +349,10 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8, if (seg->update_map) write_segment_id(w, seg, mbmi->segment_id); - write_skip(cpi, mbmi->segment_id, mi, w); + write_skip(cm, xd, mbmi->segment_id, mi, w); if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT) - write_selected_tx_size(cpi, mbmi->tx_size, bsize, w); + write_selected_tx_size(cm, xd, mbmi->tx_size, bsize, w); if (bsize >= BLOCK_8X8) { write_intra_mode(w, mbmi->mode, get_y_mode_probs(mi, above_mi, left_mi, 0)); @@ -378,9 +374,10 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8, } static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile, - vp9_writer *w, TOKENEXTRA **tok, TOKENEXTRA *tok_end, + vp9_writer *w, TOKENEXTRA **tok, + const TOKENEXTRA *const tok_end, int mi_row, int mi_col) { - VP9_COMMON *const cm = &cpi->common; + const VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->mb.e_mbd; MODE_INFO *m; @@ -392,7 +389,7 @@ static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile, mi_col, num_8x8_blocks_wide_lookup[m->mbmi.sb_type], cm->mi_rows, cm->mi_cols); if (frame_is_intra_only(cm)) { - write_mb_modes_kf(cpi, xd->mi, w); + write_mb_modes_kf(cm, xd, xd->mi, w); } else { pack_inter_mode_mvs(cpi, m, w); } @@ -401,7 +398,8 @@ static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile, pack_mb_tokens(w, tok, tok_end, cm->bit_depth); } -static void write_partition(VP9_COMMON *cm, MACROBLOCKD *xd, +static void write_partition(const VP9_COMMON *const cm, + const MACROBLOCKD *const xd, int hbs, int mi_row, int mi_col, PARTITION_TYPE p, BLOCK_SIZE bsize, vp9_writer *w) { const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize); @@ -423,21 +421,23 @@ static void write_partition(VP9_COMMON *cm, MACROBLOCKD *xd, } static void write_modes_sb(VP9_COMP *cpi, - const TileInfo *const tile, - vp9_writer *w, TOKENEXTRA **tok, TOKENEXTRA *tok_end, + const TileInfo *const tile, vp9_writer *w, + TOKENEXTRA **tok, const TOKENEXTRA *const tok_end, int mi_row, int mi_col, BLOCK_SIZE bsize) { - VP9_COMMON *const cm = &cpi->common; + const VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->mb.e_mbd; const int bsl = b_width_log2(bsize); const int bs = (1 << bsl) / 4; PARTITION_TYPE partition; BLOCK_SIZE subsize; - MODE_INFO *m = cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]; + const MODE_INFO *m = NULL; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + m = cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]; + partition = partition_lookup[bsl][m->mbmi.sb_type]; write_partition(cm, xd, bs, mi_row, mi_col, partition, bsize, w); subsize = get_subsize(bsize, partition); @@ -479,8 +479,8 @@ static void write_modes_sb(VP9_COMP *cpi, } static void write_modes(VP9_COMP *cpi, - const TileInfo *const tile, - vp9_writer *w, TOKENEXTRA 
**tok, TOKENEXTRA *tok_end) { + const TileInfo *const tile, vp9_writer *w, + TOKENEXTRA **tok, const TOKENEXTRA *const tok_end) { int mi_row, mi_col; for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; @@ -530,7 +530,7 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi, int i, j, k, l, t; switch (cpi->sf.use_fast_coef_updates) { case TWO_LOOP: { - /* dry run to see if there is any udpate at all needed */ + /* dry run to see if there is any update at all needed */ int savings = 0; int update[2] = {0, 0}; for (i = 0; i < PLANE_TYPES; ++i) { @@ -741,7 +741,7 @@ static void write_delta_q(struct vp9_write_bit_buffer *wb, int delta_q) { } } -static void encode_quantization(VP9_COMMON *cm, +static void encode_quantization(const VP9_COMMON *const cm, struct vp9_write_bit_buffer *wb) { vp9_wb_write_literal(wb, cm->base_qindex, QINDEX_BITS); write_delta_q(wb, cm->y_dc_delta_q); @@ -749,12 +749,11 @@ static void encode_quantization(VP9_COMMON *cm, write_delta_q(wb, cm->uv_ac_delta_q); } - -static void encode_segmentation(VP9_COMP *cpi, +static void encode_segmentation(VP9_COMMON *cm, MACROBLOCKD *xd, struct vp9_write_bit_buffer *wb) { int i, j; - struct segmentation *seg = &cpi->common.seg; + const struct segmentation *seg = &cm->seg; vp9_wb_write_bit(wb, seg->enabled); if (!seg->enabled) @@ -764,7 +763,7 @@ static void encode_segmentation(VP9_COMP *cpi, vp9_wb_write_bit(wb, seg->update_map); if (seg->update_map) { // Select the coding strategy (temporal or spatial) - vp9_choose_segmap_coding_method(cpi); + vp9_choose_segmap_coding_method(cm, xd); // Write out probabilities used to decode unpredicted macro-block segments for (i = 0; i < SEG_TREE_PROBS; i++) { const int prob = seg->tree_probs[i]; @@ -812,7 +811,6 @@ static void encode_segmentation(VP9_COMP *cpi, } } - static void encode_txfm_probs(VP9_COMMON *cm, vp9_writer *w) { // Mode vp9_write_literal(w, MIN(cm->tx_mode, ALLOW_32X32), 2); @@ -881,7 +879,8 @@ static void fix_interp_filter(VP9_COMMON *cm) { } } -static void write_tile_info(VP9_COMMON *cm, struct vp9_write_bit_buffer *wb) { +static void write_tile_info(const VP9_COMMON *const cm, + struct vp9_write_bit_buffer *wb) { int min_log2_tile_cols, max_log2_tile_cols, ones; vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); @@ -900,28 +899,29 @@ static void write_tile_info(VP9_COMMON *cm, struct vp9_write_bit_buffer *wb) { } static int get_refresh_mask(VP9_COMP *cpi) { - if (!cpi->multi_arf_allowed && cpi->refresh_golden_frame && - cpi->rc.is_src_frame_alt_ref && !cpi->use_svc) { - // Preserve the previously existing golden frame and update the frame in - // the alt ref slot instead. This is highly specific to the use of - // alt-ref as a forward reference, and this needs to be generalized as - // other uses are implemented (like RTC/temporal scaling) - // - // gld_fb_idx and alt_fb_idx need to be swapped for future frames, but - // that happens in vp9_encoder.c:update_reference_frames() so that it can - // be done outside of the recode loop. 
- return (cpi->refresh_last_frame << cpi->lst_fb_idx) | - (cpi->refresh_golden_frame << cpi->alt_fb_idx); - } else { - int arf_idx = cpi->alt_fb_idx; - if ((cpi->pass == 2) && cpi->multi_arf_allowed) { - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - arf_idx = gf_group->arf_update_idx[gf_group->index]; - } - return (cpi->refresh_last_frame << cpi->lst_fb_idx) | - (cpi->refresh_golden_frame << cpi->gld_fb_idx) | - (cpi->refresh_alt_ref_frame << arf_idx); + if (vp9_preserve_existing_gf(cpi)) { + // We have decided to preserve the previously existing golden frame as our + // new ARF frame. However, in the short term we leave it in the GF slot and, + // if we're updating the GF with the current decoded frame, we save it + // instead to the ARF slot. + // Later, in the function vp9_encoder.c:vp9_update_reference_frames() we + // will swap gld_fb_idx and alt_fb_idx to achieve our objective. We do it + // there so that it can be done outside of the recode loop. + // Note: This is highly specific to the use of ARF as a forward reference, + // and this needs to be generalized as other uses are implemented + // (like RTC/temporal scalability). + return (cpi->refresh_last_frame << cpi->lst_fb_idx) | + (cpi->refresh_golden_frame << cpi->alt_fb_idx); + } else { + int arf_idx = cpi->alt_fb_idx; + if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + arf_idx = gf_group->arf_update_idx[gf_group->index]; } + return (cpi->refresh_last_frame << cpi->lst_fb_idx) | + (cpi->refresh_golden_frame << cpi->gld_fb_idx) | + (cpi->refresh_alt_ref_frame << arf_idx); + } } static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { @@ -1101,10 +1101,12 @@ static void write_uncompressed_header(VP9_COMP *cpi, if (cm->intra_only) { write_sync_code(wb); + // Note for profile 0, 420 8bpp is assumed. if (cm->profile > PROFILE_0) { write_bitdepth_colorspace_sampling(cm, wb); } + vp9_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES); write_frame_size(cm, wb); } else { @@ -1134,7 +1136,7 @@ static void write_uncompressed_header(VP9_COMP *cpi, encode_loopfilter(&cm->lf, wb); encode_quantization(cm, wb); - encode_segmentation(cpi, wb); + encode_segmentation(cm, &cpi->mb.e_mbd, wb); write_tile_info(cm, wb); } @@ -1226,11 +1228,9 @@ void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size) { saved_wb = wb; vp9_wb_write_literal(&wb, 0, 16); // don't know in advance first part. 
size - uncompressed_hdr_size = vp9_rb_bytes_written(&wb); + uncompressed_hdr_size = vp9_wb_bytes_written(&wb); data += uncompressed_hdr_size; - vp9_compute_update_table(); - vp9_clear_system_state(); first_part_size = write_compressed_header(cpi, data); @@ -1242,4 +1242,3 @@ void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size) { *size = data - dest; } - diff --git a/vp9/encoder/vp9_bitstream.h b/vp9/encoder/vp9_bitstream.h index ddfd0ed4f..8e82d1c97 100644 --- a/vp9/encoder/vp9_bitstream.h +++ b/vp9/encoder/vp9_bitstream.h @@ -16,11 +16,21 @@ extern "C" { #endif -struct VP9_COMP; +#include "vp9/encoder/vp9_encoder.h" void vp9_entropy_mode_init(); -void vp9_pack_bitstream(struct VP9_COMP *cpi, uint8_t *dest, size_t *size); +void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size); + +static INLINE int vp9_preserve_existing_gf(VP9_COMP *cpi) { + return !cpi->multi_arf_allowed && cpi->refresh_golden_frame && + cpi->rc.is_src_frame_alt_ref && + (!cpi->use_svc || // Add spatial svc base layer case here + (is_spatial_svc(cpi) && + cpi->svc.spatial_layer_id == 0 && + cpi->svc.layer_context[0].gold_ref_idx >= 0 && + cpi->oxcf.ss_play_alternate[0])); +} #ifdef __cplusplus } // extern "C" diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 1e5dc1a4c..a640e30c9 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -42,6 +42,7 @@ struct macroblock_plane { int16_t *zbin; int16_t *round; + int64_t quant_thred[2]; // Zbin Over Quant value int16_t zbin_extra; }; @@ -67,7 +68,7 @@ struct macroblock { int sadperbit4; int rddiv; int rdmult; - unsigned int mb_energy; + int mb_energy; int mv_best_ref_index[MAX_REF_FRAMES]; unsigned int max_mv_context[MAX_REF_FRAMES]; @@ -116,7 +117,9 @@ struct macroblock { int quant_fp; // skip forward transform and quantization - int skip_txfm; + int skip_txfm[MAX_MB_PLANE]; + + int64_t bsse[MAX_MB_PLANE]; // Used to store sub partition's choices.
MV pred_mv[MAX_REF_FRAMES]; diff --git a/vp9/encoder/vp9_context_tree.h b/vp9/encoder/vp9_context_tree.h index d6d5efd96..50c3fb706 100644 --- a/vp9/encoder/vp9_context_tree.h +++ b/vp9/encoder/vp9_context_tree.h @@ -34,7 +34,7 @@ typedef struct { int is_coded; int num_4x4_blk; int skip; - int skip_txfm; + int skip_txfm[MAX_MB_PLANE]; int best_mode_index; int hybrid_pred_diff; int comp_pred_diff; @@ -42,7 +42,7 @@ typedef struct { int64_t tx_rd_diff[TX_MODES]; int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]; -#if CONFIG_DENOISING +#if CONFIG_VP9_TEMPORAL_DENOISING unsigned int newmv_sse; unsigned int zeromv_sse; PREDICTION_MODE best_sse_inter_mode; diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c index ca84a1b39..90ea9cc25 100644 --- a/vp9/encoder/vp9_denoiser.c +++ b/vp9/encoder/vp9_denoiser.c @@ -345,10 +345,9 @@ void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser, int refresh_last_frame) { if (frame_type == KEY_FRAME) { int i; - copy_frame(denoiser->running_avg_y[LAST_FRAME], src); - for (i = 2; i < MAX_REF_FRAMES - 1; i++) { - copy_frame(denoiser->running_avg_y[i], - denoiser->running_avg_y[LAST_FRAME]); + // Start at 1 so as not to overwrite the INTRA_FRAME + for (i = 1; i < MAX_REF_FRAMES; ++i) { + copy_frame(denoiser->running_avg_y[i], src); } } else { /* For non key frames */ if (refresh_alt_ref_frame) { diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 49b24b407..a609b6dc6 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -247,11 +247,9 @@ static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile, } } -static void duplicate_mode_info_in_sb(VP9_COMMON * const cm, - MACROBLOCKD *const xd, - int mi_row, - int mi_col, - BLOCK_SIZE bsize) { +static void duplicate_mode_info_in_sb(VP9_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, + BLOCK_SIZE bsize) { const int block_width = num_8x8_blocks_wide_lookup[bsize]; const int block_height = num_8x8_blocks_high_lookup[bsize]; int i, j; @@ -319,6 +317,8 @@ typedef enum { static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) { int i; + node->part_variances = NULL; + vpx_memset(node->split, 0, sizeof(node->split)); switch (bsize) { case BLOCK_64X64: { v64x64 *vt = (v64x64 *) data; @@ -350,6 +350,7 @@ static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) { } default: { assert(0); + break; } } } @@ -612,10 +613,9 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, } // Else for cyclic refresh mode update the segment map, set the segment id // and then update the quantizer. - else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { vp9_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi, mi_row, mi_col, bsize, 1); - vp9_init_plane_quantizers(cpi, x); } } @@ -712,10 +712,8 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col) { - uint8_t *const buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, - src->alpha_buffer}; - const int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, - src->alpha_stride}; + uint8_t *const buffers[3] = {src->y_buffer, src->u_buffer, src->v_buffer }; + const int strides[3] = {src->y_stride, src->uv_stride, src->uv_stride }; int i; // Set current frame pointer. 
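The vp9_setup_src_planes() hunk above shrinks the plane tables from four entries to three, dropping the unused alpha plane so only Y, U and V remain, with the two chroma planes sharing a single stride. A minimal self-contained sketch of that wiring, assuming a hypothetical, simplified plane_src struct in place of the real struct macroblock_plane:

#include <stdint.h>

// Hypothetical, simplified stand-ins for YV12_BUFFER_CONFIG and the
// per-plane source struct; the real encoder types carry many more fields.
struct yv12 {
  uint8_t *y_buffer, *u_buffer, *v_buffer;
  int y_stride, uv_stride;
};
struct plane_src {
  uint8_t *buf;
  int stride;
};

// Wire up the three source planes; U and V share the one chroma stride.
static void setup_src_planes(struct plane_src planes[3],
                             const struct yv12 *src) {
  uint8_t *const buffers[3] = {src->y_buffer, src->u_buffer, src->v_buffer};
  const int strides[3] = {src->y_stride, src->uv_stride, src->uv_stride};
  int i;
  for (i = 0; i < 3; ++i) {
    planes[i].buf = buffers[i];
    planes[i].stride = strides[i];
  }
}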
@@ -882,9 +880,7 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, } } -static void update_stats(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; - const MACROBLOCK *const x = &cpi->mb; +static void update_stats(VP9_COMMON *cm, const MACROBLOCK *x) { const MACROBLOCKD *const xd = &x->e_mbd; const MODE_INFO *const mi = xd->mi[0]; const MB_MODE_INFO *const mbmi = &mi->mbmi; @@ -996,7 +992,7 @@ static void encode_b(VP9_COMP *cpi, const TileInfo *const tile, encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize, ctx); if (output_enabled) { - update_stats(cpi); + update_stats(&cpi->common, &cpi->mb); (*tp)->token = EOSB_TOKEN; (*tp)++; @@ -1069,6 +1065,7 @@ static void encode_sb(VP9_COMP *cpi, const TileInfo *const tile, break; default: assert("Invalid partition type."); + break; } if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) @@ -1405,19 +1402,17 @@ static void update_state_rt(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, } x->skip = ctx->skip; - x->skip_txfm = mbmi->segment_id ? 0 : ctx->skip_txfm; + x->skip_txfm[0] = mbmi->segment_id ? 0 : ctx->skip_txfm[0]; } static void encode_b_rt(VP9_COMP *cpi, const TileInfo *const tile, TOKENEXTRA **tp, int mi_row, int mi_col, int output_enabled, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) { - - set_offsets(cpi, tile, mi_row, mi_col, bsize); update_state_rt(cpi, ctx, mi_row, mi_col, bsize); -#if CONFIG_DENOISING +#if CONFIG_VP9_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0 && output_enabled) { vp9_denoiser_denoise(&cpi->denoiser, &cpi->mb, mi_row, mi_col, MAX(BLOCK_8X8, bsize), ctx); @@ -1425,7 +1420,7 @@ static void encode_b_rt(VP9_COMP *cpi, const TileInfo *const tile, #endif encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize, ctx); - update_stats(cpi); + update_stats(&cpi->common, &cpi->mb); (*tp)->token = EOSB_TOKEN; (*tp)++; @@ -1448,7 +1443,6 @@ static void encode_sb_rt(VP9_COMP *cpi, const TileInfo *const tile, return; if (bsize >= BLOCK_8X8) { - MACROBLOCKD *const xd = &cpi->mb.e_mbd; const int idx_str = xd->mi_stride * mi_row + mi_col; MODE_INFO ** mi_8x8 = cm->mi_grid_visible + idx_str; ctx = partition_plane_context(xd, mi_row, mi_col, bsize); @@ -1496,6 +1490,7 @@ static void encode_sb_rt(VP9_COMP *cpi, const TileInfo *const tile, break; default: assert("Invalid partition type."); + break; } if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) @@ -1547,7 +1542,7 @@ static void rd_use_partition(VP9_COMP *cpi, pc_tree->partitioning = partition; save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); - if (bsize == BLOCK_16X16) { + if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode) { set_offsets(cpi, tile, mi_row, mi_col, bsize); x->mb_energy = vp9_block_energy(cpi, x, bsize); } @@ -1674,6 +1669,7 @@ static void rd_use_partition(VP9_COMP *cpi, break; default: assert(0); + break; } pl = partition_plane_context(xd, mi_row, mi_col, bsize); @@ -1809,10 +1805,9 @@ static const BLOCK_SIZE max_partition_size[BLOCK_SIZES] = { // // The min and max are assumed to have been initialized prior to calling this // function so repeat calls can accumulate a min and max of more than one sb64. 
-static void get_sb_partition_size_range(VP9_COMP *cpi, MODE_INFO ** mi_8x8, - BLOCK_SIZE * min_block_size, - BLOCK_SIZE * max_block_size ) { - MACROBLOCKD *const xd = &cpi->mb.e_mbd; +static void get_sb_partition_size_range(MACROBLOCKD *xd, MODE_INFO **mi_8x8, + BLOCK_SIZE *min_block_size, + BLOCK_SIZE *max_block_size ) { int sb_width_in_blocks = MI_BLOCK_SIZE; int sb_height_in_blocks = MI_BLOCK_SIZE; int i, j; @@ -1867,17 +1862,17 @@ static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile, if (cm->frame_type != KEY_FRAME) { MODE_INFO **const prev_mi = &cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col]; - get_sb_partition_size_range(cpi, prev_mi, &min_size, &max_size); + get_sb_partition_size_range(xd, prev_mi, &min_size, &max_size); } // Find the min and max partition sizes used in the left SB64 if (left_in_image) { MODE_INFO **left_sb64_mi = &mi[-MI_BLOCK_SIZE]; - get_sb_partition_size_range(cpi, left_sb64_mi, &min_size, &max_size); + get_sb_partition_size_range(xd, left_sb64_mi, &min_size, &max_size); } // Find the min and max partition sizes used in the above SB64. if (above_in_image) { MODE_INFO **above_sb64_mi = &mi[-xd->mi_stride * MI_BLOCK_SIZE]; - get_sb_partition_size_range(cpi, above_sb64_mi, &min_size, &max_size); + get_sb_partition_size_range(xd, above_sb64_mi, &min_size, &max_size); } // adjust observed min and max if (cpi->sf.auto_min_max_partition_size == RELAXED_NEIGHBORING_MIN_MAX) { @@ -1920,7 +1915,7 @@ static void auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile, BLOCK_SIZE max_size = BLOCK_8X8; int bsl = mi_width_log2(BLOCK_64X64); const int search_range_ctrl = (((mi_row + mi_col) >> bsl) + - get_chessboard_index(cm)) % 2; + get_chessboard_index(cm->current_video_frame)) & 0x1; // Trap case where we do not have a prediction. if (search_range_ctrl && (left_in_image || above_in_image || cm->frame_type != KEY_FRAME)) { @@ -1964,6 +1959,60 @@ static void auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile, *max_block_size = max_size; } +// TODO(jingning) refactor functions setting partition search range +static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, BLOCK_SIZE bsize, + BLOCK_SIZE *min_bs, BLOCK_SIZE *max_bs) { + int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int mi_height = num_8x8_blocks_high_lookup[bsize]; + int idx, idy; + + MODE_INFO *mi; + MODE_INFO **prev_mi = + &cm->prev_mi_grid_visible[mi_row * cm->mi_stride + mi_col]; + BLOCK_SIZE bs, min_size, max_size; + + min_size = BLOCK_64X64; + max_size = BLOCK_4X4; + + if (prev_mi) { + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + mi = prev_mi[idy * cm->mi_stride + idx]; + bs = mi ? mi->mbmi.sb_type : bsize; + min_size = MIN(min_size, bs); + max_size = MAX(max_size, bs); + } + } + } + + if (xd->left_available) { + for (idy = 0; idy < mi_height; ++idy) { + mi = xd->mi[idy * cm->mi_stride - 1]; + bs = mi ? mi->mbmi.sb_type : bsize; + min_size = MIN(min_size, bs); + max_size = MAX(max_size, bs); + } + } + + if (xd->up_available) { + for (idx = 0; idx < mi_width; ++idx) { + mi = xd->mi[idx - cm->mi_stride]; + bs = mi ? 
mi->mbmi.sb_type : bsize; + min_size = MIN(min_size, bs); + max_size = MAX(max_size, bs); + } + } + + if (min_size == max_size) { + min_size = min_partition_size[min_size]; + max_size = max_partition_size[max_size]; + } + + *min_bs = min_size; + *max_bs = max_size; +} + static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { vpx_memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv)); } @@ -1972,13 +2021,58 @@ static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { vpx_memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv)); } +#if CONFIG_FP_MB_STATS +const int num_16x16_blocks_wide_lookup[BLOCK_SIZES] = + {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 4, 4}; +const int num_16x16_blocks_high_lookup[BLOCK_SIZES] = + {1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 4, 2, 4}; +const int qindex_skip_threshold_lookup[BLOCK_SIZES] = + {0, 10, 10, 30, 40, 40, 60, 80, 80, 90, 100, 100, 120}; +const int qindex_split_threshold_lookup[BLOCK_SIZES] = + {0, 3, 3, 7, 15, 15, 30, 40, 40, 60, 80, 80, 120}; +const int complexity_16x16_blocks_threshold[BLOCK_SIZES] = + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 6}; + +typedef enum { + MV_ZERO = 0, + MV_LEFT = 1, + MV_UP = 2, + MV_RIGHT = 3, + MV_DOWN = 4, + MV_INVALID +} MOTION_DIRECTION; + +static INLINE MOTION_DIRECTION get_motion_direction_fp(uint8_t fp_byte) { + if (fp_byte & FPMB_MOTION_ZERO_MASK) { + return MV_ZERO; + } else if (fp_byte & FPMB_MOTION_LEFT_MASK) { + return MV_LEFT; + } else if (fp_byte & FPMB_MOTION_RIGHT_MASK) { + return MV_RIGHT; + } else if (fp_byte & FPMB_MOTION_UP_MASK) { + return MV_UP; + } else { + return MV_DOWN; + } +} + +static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv, + MOTION_DIRECTION that_mv) { + if (this_mv == that_mv) { + return 0; + } else { + return abs(this_mv - that_mv) == 2 ? 2 : 1; + } +} +#endif + // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are // unlikely to be selected depending on previous rate-distortion optimization // results, for encoding speed-up. 
static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, int *rate, - int64_t *dist, int do_recon, int64_t best_rd, + int64_t *dist, int64_t best_rd, PC_TREE *pc_tree) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; @@ -1995,12 +2089,21 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, int64_t sum_rd = 0; int do_split = bsize >= BLOCK_8X8; int do_rect = 1; + // Override skipping rectangular partition operations for edge blocks const int force_horz_split = (mi_row + mi_step >= cm->mi_rows); const int force_vert_split = (mi_col + mi_step >= cm->mi_cols); const int xss = x->e_mbd.plane[1].subsampling_x; const int yss = x->e_mbd.plane[1].subsampling_y; + BLOCK_SIZE min_size = cpi->sf.min_partition_size; + BLOCK_SIZE max_size = cpi->sf.max_partition_size; + +#if CONFIG_FP_MB_STATS + unsigned int src_diff_var = UINT_MAX; + int none_complexity = 0; +#endif + int partition_none_allowed = !force_horz_split && !force_vert_split; int partition_horz_allowed = !force_vert_split && yss <= xss && bsize >= BLOCK_8X8; @@ -2011,22 +2114,28 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, assert(num_8x8_blocks_wide_lookup[bsize] == num_8x8_blocks_high_lookup[bsize]); - if (bsize == BLOCK_16X16) { - set_offsets(cpi, tile, mi_row, mi_col, bsize); + set_offsets(cpi, tile, mi_row, mi_col, bsize); + + if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode) x->mb_energy = vp9_block_energy(cpi, x, bsize); + + if (cpi->sf.cb_partition_search && bsize == BLOCK_16X16) { + int cb_partition_search_ctrl = ((pc_tree->index == 0 || pc_tree->index == 3) + + get_chessboard_index(cm->current_video_frame)) & 0x1; + + if (cb_partition_search_ctrl && bsize > min_size && bsize < max_size) + set_partition_range(cm, xd, mi_row, mi_col, bsize, &min_size, &max_size); } + // Determine partition types in search according to the speed features. // The threshold set here has to be of square block size. 
if (cpi->sf.auto_min_max_partition_size) { - partition_none_allowed &= (bsize <= cpi->sf.max_partition_size && - bsize >= cpi->sf.min_partition_size); - partition_horz_allowed &= ((bsize <= cpi->sf.max_partition_size && - bsize > cpi->sf.min_partition_size) || + partition_none_allowed &= (bsize <= max_size && bsize >= min_size); + partition_horz_allowed &= ((bsize <= max_size && bsize > min_size) || force_horz_split); - partition_vert_allowed &= ((bsize <= cpi->sf.max_partition_size && - bsize > cpi->sf.min_partition_size) || + partition_vert_allowed &= ((bsize <= max_size && bsize > min_size) || force_vert_split); - do_split &= bsize > cpi->sf.min_partition_size; + do_split &= bsize > min_size; } if (cpi->sf.use_square_partition_only) { partition_horz_allowed &= force_horz_split; @@ -2056,6 +2165,65 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, } } +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + set_offsets(cpi, tile, mi_row, mi_col, bsize); + src_diff_var = get_sby_perpixel_diff_variance(cpi, &cpi->mb.plane[0].src, + mi_row, mi_col, bsize); + } +#endif + +#if CONFIG_FP_MB_STATS + // Decide whether we shall split directly and skip searching NONE by using + // the first pass block statistics + if (cpi->use_fp_mb_stats && bsize >= BLOCK_32X32 && do_split && + partition_none_allowed && src_diff_var > 4 && + cm->base_qindex < qindex_split_threshold_lookup[bsize]) { + int mb_row = mi_row >> 1; + int mb_col = mi_col >> 1; + int mb_row_end = + MIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows); + int mb_col_end = + MIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols); + int r, c; + + // Compute a complexity measure, basically measuring the inconsistency of + // motion vectors obtained from the first pass in the current block + for (r = mb_row; r < mb_row_end; r++) { + for (c = mb_col; c < mb_col_end; c++) { + const int mb_index = r * cm->mb_cols + c; + + MOTION_DIRECTION this_mv; + MOTION_DIRECTION right_mv; + MOTION_DIRECTION bottom_mv; + + this_mv = + get_motion_direction_fp(cpi->twopass.this_frame_mb_stats[mb_index]); + + // to its right + if (c != mb_col_end - 1) { + right_mv = get_motion_direction_fp( + cpi->twopass.this_frame_mb_stats[mb_index + 1]); + none_complexity += get_motion_inconsistency(this_mv, right_mv); + } + + // to its bottom + if (r != mb_row_end - 1) { + bottom_mv = get_motion_direction_fp( + cpi->twopass.this_frame_mb_stats[mb_index + cm->mb_cols]); + none_complexity += get_motion_inconsistency(this_mv, bottom_mv); + } + + // do not count its left and top neighbors to avoid double counting + } + } + + if (none_complexity > complexity_16x16_blocks_threshold[bsize]) { + partition_none_allowed = 0; + } + } +#endif + // PARTITION_NONE if (partition_none_allowed) { rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rate, &this_dist, bsize, @@ -2066,6 +2234,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, this_rate += cpi->partition_cost[pl][PARTITION_NONE]; } sum_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_dist); + if (sum_rd < best_rd) { int64_t stop_thresh = 4096; int64_t stop_thresh_rd; @@ -2087,6 +2256,52 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, do_split = 0; do_rect = 0; } + +#if CONFIG_FP_MB_STATS + // Check if the first pass statistics of every 16x16 block show zero + // motion and the corresponding first pass residue is small enough. + // If that is the case, check the difference variance between the + // current frame and the last frame.
If the variance is small enough, + // stop further splitting in RD optimization + if (cpi->use_fp_mb_stats && do_split != 0 && + cm->base_qindex > qindex_skip_threshold_lookup[bsize]) { + int mb_row = mi_row >> 1; + int mb_col = mi_col >> 1; + int mb_row_end = + MIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows); + int mb_col_end = + MIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols); + int r, c; + + int skip = 1; + for (r = mb_row; r < mb_row_end; r++) { + for (c = mb_col; c < mb_col_end; c++) { + const int mb_index = r * cm->mb_cols + c; + if (!(cpi->twopass.this_frame_mb_stats[mb_index] & + FPMB_MOTION_ZERO_MASK) || + !(cpi->twopass.this_frame_mb_stats[mb_index] & + FPMB_ERROR_SMALL_MASK)) { + skip = 0; + break; + } + } + if (skip == 0) { + break; + } + } + if (skip) { + if (src_diff_var == UINT_MAX) { + set_offsets(cpi, tile, mi_row, mi_col, bsize); + src_diff_var = get_sby_perpixel_diff_variance( + cpi, &cpi->mb.plane[0].src, mi_row, mi_col, bsize); + } + if (src_diff_var < 8) { + do_split = 0; + do_rect = 0; + } + } + } +#endif } } restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); @@ -2109,17 +2324,10 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, ctx->mic.mbmi.interp_filter; rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize, pc_tree->leaf_split[0], best_rd, 0); - if (sum_rate == INT_MAX) { + if (sum_rate == INT_MAX) sum_rd = INT64_MAX; - } else { + else sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); - if (sum_rd < best_rd) { - update_state(cpi, pc_tree->leaf_split[0], mi_row, mi_col, subsize, 0); - encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize, - pc_tree->leaf_split[0]); - update_partition_context(xd, mi_row, mi_col, subsize, bsize); - } - } } else { for (i = 0; i < 4 && sum_rd < best_rd; ++i) { const int x_idx = (i & 1) * mi_step; @@ -2131,8 +2339,9 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx); + pc_tree->split[i]->index = i; rd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx, - subsize, &this_rate, &this_dist, i != 3, + subsize, &this_rate, &this_dist, best_rd - sum_rd, pc_tree->split[i]); if (this_rate == INT_MAX) { @@ -2149,6 +2358,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, pl = partition_plane_context(xd, mi_row, mi_col, bsize); sum_rate += cpi->partition_cost[pl][PARTITION_SPLIT]; sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); + if (sum_rd < best_rd) { best_rate = sum_rate; best_dist = sum_dist; @@ -2261,6 +2471,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, } restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); } + // TODO(jbb): This code added so that we avoid static analysis // warning related to the fact that best_rd isn't used after this // point. 
This code should be refactored so that the duplicate @@ -2269,7 +2480,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, *rate = best_rate; *dist = best_dist; - if (best_rate < INT_MAX && best_dist < INT64_MAX && do_recon) { + if (best_rate < INT_MAX && best_dist < INT64_MAX && pc_tree->index != 3) { int output_enabled = (bsize == BLOCK_64X64); // Check the projected output rate for this SB against its target @@ -2326,6 +2537,7 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, } vp9_zero(cpi->mb.pred_mv); + cpi->pc_root->index = 0; if ((sf->partition_search_type == SEARCH_PARTITION && sf->use_lastframe_partitioning) || @@ -2357,11 +2569,11 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, } else { GF_GROUP * gf_grp = &cpi->twopass.gf_group; int last_was_mid_sequence_overlay = 0; - if ((cpi->pass == 2) && (gf_grp->index)) { + if ((cpi->oxcf.pass == 2) && (gf_grp->index)) { if (gf_grp->update_type[gf_grp->index - 1] == OVERLAY_UPDATE) last_was_mid_sequence_overlay = 1; } - if ((cm->current_video_frame + if ((cpi->rc.frames_since_key % sf->last_partitioning_redo_frequency) == 0 || last_was_mid_sequence_overlay || cm->prev_mi == 0 @@ -2379,7 +2591,7 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, &sf->max_partition_size); } rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64, - &dummy_rate, &dummy_dist, 1, INT64_MAX, + &dummy_rate, &dummy_dist, INT64_MAX, cpi->pc_root); } else { if (sf->constrain_copy_partition && @@ -2401,7 +2613,7 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, &sf->max_partition_size); } rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64, - &dummy_rate, &dummy_dist, 1, INT64_MAX, cpi->pc_root); } } } @@ -2456,41 +2668,23 @@ static MV_REFERENCE_FRAME get_frame_type(const VP9_COMP *cpi) { else if (cpi->rc.is_src_frame_alt_ref && cpi->refresh_golden_frame) return ALTREF_FRAME; else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) - return LAST_FRAME; - else return GOLDEN_FRAME; + else + return LAST_FRAME; } static TX_MODE select_tx_mode(const VP9_COMP *cpi) { - if (cpi->mb.e_mbd.lossless) { + if (cpi->mb.e_mbd.lossless) return ONLY_4X4; - } else if (cpi->common.current_video_frame == 0) { + if (cpi->common.frame_type == KEY_FRAME) return TX_MODE_SELECT; - } else { - if (cpi->sf.tx_size_search_method == USE_LARGESTALL) { - return ALLOW_32X32; - } else if (cpi->sf.tx_size_search_method == USE_FULL_RD) { - const RD_OPT *const rd_opt = &cpi->rd; - const MV_REFERENCE_FRAME frame_type = get_frame_type(cpi); - return rd_opt->tx_select_threshes[frame_type][ALLOW_32X32] > - rd_opt->tx_select_threshes[frame_type][TX_MODE_SELECT] ? - ALLOW_32X32 : TX_MODE_SELECT; - } else if (cpi->sf.tx_size_search_method == USE_TX_8X8) { - return TX_MODE_SELECT; - } else { - unsigned int total = 0; - int i; - for (i = 0; i < TX_SIZES; ++i) - total += cpi->tx_stepdown_count[i]; - - if (total) { - const double fraction = (double)cpi->tx_stepdown_count[0] / total; - return fraction > 0.90 ?
ALLOW_32X32 : TX_MODE_SELECT; - } else { - return cpi->common.tx_mode; - } - } - } + if (cpi->sf.tx_size_search_method == USE_LARGESTALL) + return ALLOW_32X32; + else if (cpi->sf.tx_size_search_method == USE_FULL_RD || + cpi->sf.tx_size_search_method == USE_TX_8X8) + return TX_MODE_SELECT; + else + return cpi->common.tx_mode; } static void nonrd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, @@ -2579,6 +2773,8 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, int mi_col, BLOCK_SIZE bsize, int *rate, int64_t *dist, int do_recon, int64_t best_rd, PC_TREE *pc_tree) { + const SPEED_FEATURES *const sf = &cpi->sf; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; @@ -2610,18 +2806,18 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, // Determine partition types in search according to the speed features. // The threshold set here has to be of square block size. - if (cpi->sf.auto_min_max_partition_size) { - partition_none_allowed &= (bsize <= cpi->sf.max_partition_size && - bsize >= cpi->sf.min_partition_size); - partition_horz_allowed &= ((bsize <= cpi->sf.max_partition_size && - bsize > cpi->sf.min_partition_size) || + if (sf->auto_min_max_partition_size) { + partition_none_allowed &= (bsize <= sf->max_partition_size && + bsize >= sf->min_partition_size); + partition_horz_allowed &= ((bsize <= sf->max_partition_size && + bsize > sf->min_partition_size) || force_horz_split); - partition_vert_allowed &= ((bsize <= cpi->sf.max_partition_size && - bsize > cpi->sf.min_partition_size) || + partition_vert_allowed &= ((bsize <= sf->max_partition_size && + bsize > sf->min_partition_size) || force_vert_split); - do_split &= bsize > cpi->sf.min_partition_size; + do_split &= bsize > sf->min_partition_size; } - if (cpi->sf.use_square_partition_only) { + if (sf->use_square_partition_only) { partition_horz_allowed &= force_horz_split; partition_vert_allowed &= force_vert_split; } @@ -2631,7 +2827,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rate, &this_dist, bsize, ctx); ctx->mic.mbmi = xd->mi[0]->mbmi; - ctx->skip_txfm = x->skip_txfm; + ctx->skip_txfm[0] = x->skip_txfm[0]; ctx->skip = x->skip; if (this_rate != INT_MAX) { @@ -2700,7 +2896,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, } else { // skip rectangular partition test when larger block size // gives better rd cost - if (cpi->sf.less_rectangular_check) + if (sf->less_rectangular_check) do_rect &= !partition_none_allowed; } } @@ -2708,7 +2904,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, // PARTITION_HORZ if (partition_horz_allowed && do_rect) { subsize = get_subsize(bsize, PARTITION_HORZ); - if (cpi->sf.adaptive_motion_search) + if (sf->adaptive_motion_search) load_pred_mv(x, ctx); nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rate, &this_dist, subsize, &pc_tree->horizontal[0]); pc_tree->horizontal[0].mic.mbmi = xd->mi[0]->mbmi; - pc_tree->horizontal[0].skip_txfm = x->skip_txfm; + pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0]; pc_tree->horizontal[0].skip = x->skip; sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); @@ -2728,7 +2924,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, &pc_tree->horizontal[1]); 
pc_tree->horizontal[1].mic.mbmi = xd->mi[0]->mbmi; - pc_tree->horizontal[1].skip_txfm = x->skip_txfm; + pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0]; pc_tree->horizontal[1].skip = x->skip; if (this_rate == INT_MAX) { @@ -2753,14 +2949,14 @@ if (partition_vert_allowed && do_rect) { subsize = get_subsize(bsize, PARTITION_VERT); - if (cpi->sf.adaptive_motion_search) + if (sf->adaptive_motion_search) load_pred_mv(x, ctx); nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rate, &this_dist, subsize, &pc_tree->vertical[0]); pc_tree->vertical[0].mic.mbmi = xd->mi[0]->mbmi; - pc_tree->vertical[0].skip_txfm = x->skip_txfm; + pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0]; pc_tree->vertical[0].skip = x->skip; sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); if (sum_rd < best_rd && mi_col + ms < cm->mi_cols) { @@ -2769,7 +2965,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, &this_rate, &this_dist, subsize, &pc_tree->vertical[1]); pc_tree->vertical[1].mic.mbmi = xd->mi[0]->mbmi; - pc_tree->vertical[1].skip_txfm = x->skip_txfm; + pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0]; pc_tree->vertical[1].skip = x->skip; if (this_rate == INT_MAX) { sum_rd = INT64_MAX; @@ -2810,12 +3006,12 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, // Check the projected output rate for this SB against its target // and if necessary apply a Q delta using segmentation to get // closer to the target. - if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) { + if ((oxcf->aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) { vp9_select_in_frame_q_segment(cpi, mi_row, mi_col, output_enabled, best_rate); } - if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) + if (oxcf->aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh, best_rate, best_dist); @@ -2860,20 +3056,20 @@ static void nonrd_use_partition(VP9_COMP *cpi, nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, subsize, &pc_tree->none); pc_tree->none.mic.mbmi = xd->mi[0]->mbmi; - pc_tree->none.skip_txfm = x->skip_txfm; + pc_tree->none.skip_txfm[0] = x->skip_txfm[0]; pc_tree->none.skip = x->skip; break; case PARTITION_VERT: nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, subsize, &pc_tree->vertical[0]); pc_tree->vertical[0].mic.mbmi = xd->mi[0]->mbmi; - pc_tree->vertical[0].skip_txfm = x->skip_txfm; + pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0]; pc_tree->vertical[0].skip = x->skip; if (mi_col + hbs < cm->mi_cols) { nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col + hbs, &rate, &dist, subsize, &pc_tree->vertical[1]); pc_tree->vertical[1].mic.mbmi = xd->mi[0]->mbmi; - pc_tree->vertical[1].skip_txfm = x->skip_txfm; + pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0]; pc_tree->vertical[1].skip = x->skip; if (rate != INT_MAX && dist != INT64_MAX && *totrate != INT_MAX && *totdist != INT64_MAX) { @@ -2886,13 +3082,13 @@ static void nonrd_use_partition(VP9_COMP *cpi, nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, subsize, &pc_tree->horizontal[0]); pc_tree->horizontal[0].mic.mbmi = xd->mi[0]->mbmi; - pc_tree->horizontal[0].skip_txfm = x->skip_txfm; + pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0]; pc_tree->horizontal[0].skip = x->skip; if (mi_row + hbs < cm->mi_rows) { nonrd_pick_sb_modes(cpi, tile, mi_row + hbs, mi_col, &rate, &dist, subsize, &pc_tree->horizontal[1]); pc_tree->horizontal[1].mic.mbmi = 
xd->mi[0]->mbmi; - pc_tree->horizontal[1].skip_txfm = x->skip_txfm; + pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0]; pc_tree->horizontal[1].skip = x->skip; if (rate != INT_MAX && dist != INT64_MAX && *totrate != INT_MAX && *totdist != INT64_MAX) { @@ -2933,6 +3129,7 @@ static void nonrd_use_partition(VP9_COMP *cpi, break; default: assert("Invalid partition type."); + break; } if (bsize == BLOCK_64X64 && output_enabled) { @@ -2945,9 +3142,10 @@ static void nonrd_use_partition(VP9_COMP *cpi, static void encode_nonrd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, int mi_row, TOKENEXTRA **tp) { - VP9_COMMON *cm = &cpi->common; - MACROBLOCK *x = &cpi->mb; - MACROBLOCKD *xd = &x->e_mbd; + SPEED_FEATURES *const sf = &cpi->sf; + VP9_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->mb; + MACROBLOCKD *const xd = &x->e_mbd; int mi_col; // Initialize the left context for the new SB row @@ -2957,7 +3155,6 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, // Code each SB in the row for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; mi_col += MI_BLOCK_SIZE) { - MACROBLOCK *x = &cpi->mb; int dummy_rate = 0; int64_t dummy_dist = 0; const int idx_str = cm->mi_stride * mi_row + mi_col; @@ -2970,7 +3167,7 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, vp9_zero(x->pred_mv); // Set the partition type of the 64X64 block - switch (cpi->sf.partition_search_type) { + switch (sf->partition_search_type) { case VAR_BASED_PARTITION: choose_partitioning(cpi, tile, mi_row, mi_col); nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64, @@ -2983,20 +3180,20 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, break; case VAR_BASED_FIXED_PARTITION: case FIXED_PARTITION: - bsize = cpi->sf.partition_search_type == FIXED_PARTITION ? - cpi->sf.always_this_block_size : + bsize = sf->partition_search_type == FIXED_PARTITION ? 
+ sf->always_this_block_size : get_nonrd_var_based_fixed_partition(cpi, mi_row, mi_col); set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col, bsize); nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64, 1, &dummy_rate, &dummy_dist, cpi->pc_root); break; case REFERENCE_PARTITION: - if (cpi->sf.partition_check || + if (sf->partition_check || !is_background(cpi, tile, mi_row, mi_col)) { set_modeinfo_offsets(cm, xd, mi_row, mi_col); auto_partition_range(cpi, tile, mi_row, mi_col, - &cpi->sf.min_partition_size, - &cpi->sf.max_partition_size); + &sf->min_partition_size, + &sf->max_partition_size); nonrd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rate, &dummy_dist, 1, INT64_MAX, cpi->pc_root); @@ -3009,14 +3206,15 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, break; default: assert(0); + break; } } } // end RTC play code static int set_var_thresh_from_histogram(VP9_COMP *cpi) { - SPEED_FEATURES *const sf = &cpi->sf; - VP9_COMMON *const cm = &cpi->common; + const SPEED_FEATURES *const sf = &cpi->sf; + const VP9_COMMON *const cm = &cpi->common; const uint8_t *src = cpi->Source->y_buffer; const uint8_t *last_src = cpi->Last_Source->y_buffer; @@ -3199,30 +3397,28 @@ static void encode_frame_internal(VP9_COMP *cpi) { vp9_zero(rd_opt->tx_select_diff); vp9_zero(rd_opt->tx_select_threshes); - cpi->mb.e_mbd.lossless = cm->base_qindex == 0 && - cm->y_dc_delta_q == 0 && - cm->uv_dc_delta_q == 0 && - cm->uv_ac_delta_q == 0; + xd->lossless = cm->base_qindex == 0 && + cm->y_dc_delta_q == 0 && + cm->uv_dc_delta_q == 0 && + cm->uv_ac_delta_q == 0; cm->tx_mode = select_tx_mode(cpi); #if CONFIG_VP9_HIGH if (cm->use_high) - cpi->mb.fwd_txm4x4 = cpi->mb.e_mbd.lossless ? vp9_fwht4x4 : vp9_fdct4x4; + x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vp9_fdct4x4; else - cpi->mb.fwd_txm4x4 = cpi->mb.e_mbd.lossless ? vp9_high_fwht4x4 : - vp9_high_fdct4x4; - cpi->mb.itxm_add = cpi->mb.e_mbd.lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; - cpi->mb.high_itxm_add = cpi->mb.e_mbd.lossless ? vp9_high_iwht4x4_add : - vp9_high_idct4x4_add; + x->fwd_txm4x4 = xd->lossless ? vp9_high_fwht4x4 : vp9_high_fdct4x4; + x->itxm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; + x->high_itxm_add = xd->lossless ? vp9_high_iwht4x4_add : vp9_high_idct4x4_add; #else - cpi->mb.fwd_txm4x4 = cpi->mb.e_mbd.lossless ? vp9_fwht4x4 : vp9_fdct4x4; + x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vp9_fdct4x4; #endif - cpi->mb.itxm_add = cpi->mb.e_mbd.lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; + x->itxm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; - if (cpi->mb.e_mbd.lossless) { - cpi->mb.optimize = 0; - cpi->common.lf.filter_level = 0; + if (xd->lossless) { + x->optimize = 0; + cm->lf.filter_level = 0; cpi->zbin_mode_boost_enabled = 0; } @@ -3234,7 +3430,7 @@ static void encode_frame_internal(VP9_COMP *cpi) { set_prev_mi(cm); x->quant_fp = cpi->sf.use_quant_fp; - x->skip_txfm = 0; + vp9_zero(x->skip_txfm); if (sf->use_nonrd_pick_mode) { // Initialize internal buffer pointers for rtc coding, where non-RD // mode decision is used and hence no buffer pointer swap needed. 
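The encode_frame_internal() changes above hinge on a single lossless test: the stream is lossless exactly when the base quantizer index and all three delta-q values are zero, and in that case the reversible 4x4 Walsh-Hadamard transform is selected in place of the DCT. A small sketch of that selection under stated assumptions; the *_stub functions are hypothetical stand-ins for vp9_fwht4x4 and vp9_fdct4x4, whose real signatures differ:

#include <stdint.h>

typedef void (*txfm4x4_fn)(const int16_t *input, int16_t *output, int stride);

// Hypothetical stand-ins for vp9_fwht4x4 (reversible Walsh-Hadamard) and
// vp9_fdct4x4 (lossy DCT); only the selection logic is of interest here.
static void fwht4x4_stub(const int16_t *in, int16_t *out, int stride) {
  (void)in; (void)out; (void)stride;
}
static void fdct4x4_stub(const int16_t *in, int16_t *out, int stride) {
  (void)in; (void)out; (void)stride;
}

// VP9 is lossless iff the base quantizer index and every delta-q are zero.
static int is_lossless(int base_qindex, int y_dc_delta_q,
                       int uv_dc_delta_q, int uv_ac_delta_q) {
  return base_qindex == 0 && y_dc_delta_q == 0 &&
         uv_dc_delta_q == 0 && uv_ac_delta_q == 0;
}

// Lossless coding requires the reversible WHT; otherwise use the DCT.
static txfm4x4_fn select_fwd_txfm4x4(int lossless) {
  return lossless ? fwht4x4_stub : fdct4x4_stub;
}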
@@ -3474,7 +3670,8 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, MODE_INFO **mi_8x8 = xd->mi; MODE_INFO *mi = mi_8x8[0]; MB_MODE_INFO *mbmi = &mi->mbmi; - unsigned int segment_id = mbmi->segment_id; + const int seg_skip = vp9_segfeature_active(&cm->seg, mbmi->segment_id, + SEG_LVL_SKIP); const int mis = cm->mi_stride; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; const int mi_height = num_8x8_blocks_high_lookup[bsize]; @@ -3484,6 +3681,9 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, cpi->oxcf.aq_mode != CYCLIC_REFRESH_AQ && cpi->sf.allow_skip_recode; + if (!x->skip_recode && !cpi->sf.use_nonrd_pick_mode) + vpx_memset(x->skip_txfm, 0, sizeof(x->skip_txfm)); + x->skip_optimize = ctx->is_coded; ctx->is_coded = 1; x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct; @@ -3518,7 +3718,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, vp9_setup_pre_planes(xd, ref, cfg, mi_row, mi_col, &xd->block_refs[ref]->sf); } - if (!cpi->sf.reuse_inter_pred_sby) + if (!cpi->sf.reuse_inter_pred_sby || seg_skip) vp9_build_inter_predictors_sby(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8)); vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8)); @@ -3529,8 +3729,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8)); } else { mbmi->skip = 1; - if (output_enabled && - !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) + if (output_enabled && !seg_skip) cm->counts.skip[vp9_get_skip_context(xd)][1]++; reset_skip_context(xd, MAX(bsize, BLOCK_8X8)); } @@ -3539,9 +3738,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, if (output_enabled) { if (cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_8X8 && - !(is_inter_block(mbmi) && - (mbmi->skip || - vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)))) { + !(is_inter_block(mbmi) && (mbmi->skip || seg_skip))) { ++get_tx_counts(max_txsize_lookup[bsize], vp9_get_tx_size_context(xd), &cm->counts.tx)[mbmi->tx_size]; } else { diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 5bfe60c45..f50a0dbb8 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -32,7 +32,7 @@ struct optimize_ctx { struct encode_b_args { MACROBLOCK *x; struct optimize_ctx *ctx; - unsigned char *skip; + int8_t *skip; }; void vp9_subtract_block_c(int rows, int cols, @@ -465,6 +465,7 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, break; default: assert(0); + break; } } @@ -543,6 +544,7 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, break; default: assert(0); + break; } } @@ -630,6 +632,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, break; default: assert(0); + break; } } @@ -658,22 +661,22 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize, return; } - if (x->skip_txfm == 0) { - // full forward transform and quantization - if (!x->skip_recode) { + if (!x->skip_recode) { + if (x->skip_txfm[plane] == 0) { + // full forward transform and quantization if (x->quant_fp) vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size); else vp9_xform_quant(x, plane, block, plane_bsize, tx_size); + } else if (x->skip_txfm[plane] == 2) { + // fast path forward transform and quantization + vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size); + } else { + // skip forward transform + p->eobs[block] = 0; + *a = *l = 0; + 
return; } - } else if (x->skip_txfm == 2) { - // fast path forward transform and quantization - vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size); - } else { - // skip forward transform - p->eobs[block] = 0; - *a = *l = 0; - return; } if (x->optimize && (!x->skip_recode || !x->skip_optimize)) { @@ -735,6 +738,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize, break; default: assert(0 && "Invalid transform size"); + break; } } @@ -1022,6 +1026,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, break; default: assert(0); + break; } if (*eob) *(args->skip) = 0; @@ -1029,7 +1034,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, void vp9_encode_block_intra(MACROBLOCK *x, int plane, int block, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, - unsigned char *skip) { + int8_t *skip) { struct encode_b_args arg = {x, NULL, skip}; encode_block_intra(plane, block, plane_bsize, tx_size, &arg); } diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h index 0b8c3d2b0..199971865 100644 --- a/vp9/encoder/vp9_encodemb.h +++ b/vp9/encoder/vp9_encodemb.h @@ -33,7 +33,7 @@ void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane); void vp9_encode_block_intra(MACROBLOCK *x, int plane, int block, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, - unsigned char *skip); + int8_t *skip); void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane); diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 1922e3cee..4a019928c 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -65,9 +65,6 @@ void vp9_coef_tree_initialize(); #ifdef OUTPUT_YUV_DENOISED FILE *yuv_denoised_file = NULL; #endif -#ifdef OUTPUT_YUV_SRC -FILE *yuv_file; -#endif #ifdef OUTPUT_YUV_REC FILE *yuv_rec_file; #endif @@ -135,7 +132,8 @@ static void setup_frame(VP9_COMP *cpi) { } if (cm->frame_type == KEY_FRAME) { - cpi->refresh_golden_frame = 1; + if (!is_spatial_svc(cpi)) + cpi->refresh_golden_frame = 1; cpi->refresh_alt_ref_frame = 1; } else { cm->fc = cm->frame_contexts[cm->frame_context_idx]; @@ -497,7 +495,7 @@ static void update_frame_size(VP9_COMP *cpi) { vp9_init_context_buffers(cm); init_macroblockd(cm, xd); - if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) { + if (is_spatial_svc(cpi)) { if (vp9_realloc_frame_buffer(&cpi->alt_ref_buffer, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, @@ -511,7 +509,7 @@ static void update_frame_size(VP9_COMP *cpi) { } void vp9_new_framerate(VP9_COMP *cpi, double framerate) { - cpi->oxcf.framerate = framerate < 0.1 ? 30 : framerate; + cpi->framerate = framerate < 0.1 ? 
30 : framerate; vp9_rc_update_framerate(cpi); } @@ -544,6 +542,7 @@ static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) { VP9_COMMON *const cm = &cpi->common; cpi->oxcf = *oxcf; + cpi->framerate = oxcf->init_framerate; cm->profile = oxcf->profile; cm->bit_depth = oxcf->bit_depth; @@ -579,23 +578,6 @@ static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) { set_tile_limits(cpi); } -static int get_pass(MODE mode) { - switch (mode) { - case REALTIME: - case ONE_PASS_GOOD: - case ONE_PASS_BEST: - return 0; - - case TWO_PASS_FIRST: - return 1; - - case TWO_PASS_SECOND_GOOD: - case TWO_PASS_SECOND_BEST: - return 2; - } - return -1; -} - void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; @@ -607,7 +589,6 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { assert(cm->bit_depth <= VPX_BITS_12); cpi->oxcf = *oxcf; - cpi->pass = get_pass(cpi->oxcf.mode); #if CONFIG_VP9_HIGH if (cpi->oxcf.use_high) { cpi->mb.e_mbd.bps = bit_depth_to_bps(cm->bit_depth); @@ -660,7 +641,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { rc->buffer_level = MIN(rc->buffer_level, rc->maximum_buffer_size); // Set up frame rate and related parameters rate control values. - vp9_new_framerate(cpi, cpi->oxcf.framerate); + vp9_new_framerate(cpi, cpi->framerate); // Set absolute upper and lower quality limits rc->worst_quality = cpi->oxcf.worst_allowed_q; @@ -682,7 +663,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) || - (cpi->svc.number_spatial_layers > 1 && cpi->pass == 2)) { + (cpi->svc.number_spatial_layers > 1 && cpi->oxcf.pass == 2)) { vp9_update_layer_context_change_config(cpi, (int)cpi->oxcf.target_bandwidth); } @@ -705,7 +686,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { high_set_var_fns(cpi); #endif -#if CONFIG_DENOISING +#if CONFIG_VP9_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0) { vp9_denoiser_alloc(&(cpi->denoiser), cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, @@ -1316,7 +1297,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) { cpi->use_svc = 0; init_config(cpi, oxcf); - vp9_rc_init(&cpi->oxcf, cpi->pass, &cpi->rc); + vp9_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc); cm->current_video_frame = 0; @@ -1368,7 +1349,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) { // pending further tuning and testing. The code is left in place here // as a place holder in regard to the required paths. 
cpi->multi_arf_last_grp_enabled = 0; - if (cpi->pass == 2) { + if (oxcf->pass == 2) { if (cpi->use_svc) { cpi->multi_arf_allowed = 0; cpi->multi_arf_enabled = 0; @@ -1435,14 +1416,11 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) { cpi->mb.nmvsadcost_hp[1] = &cpi->mb.nmvsadcosts_hp[1][MV_MAX]; cal_nmvsadcosts_hp(cpi->mb.nmvsadcost_hp); -#if CONFIG_DENOISING +#if CONFIG_VP9_TEMPORAL_DENOISING #ifdef OUTPUT_YUV_DENOISED yuv_denoised_file = fopen("denoised.yuv", "ab"); #endif #endif -#ifdef OUTPUT_YUV_SRC - yuv_file = fopen("bd.yuv", "ab"); -#endif #ifdef OUTPUT_YUV_REC yuv_rec_file = fopen("rec.yuv", "wb"); #endif @@ -1456,9 +1434,9 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) { cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; - if (cpi->pass == 1) { + if (oxcf->pass == 1) { vp9_init_first_pass(cpi); - } else if (cpi->pass == 2) { + } else if (oxcf->pass == 2) { const size_t packet_sz = sizeof(FIRSTPASS_STATS); const int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz); @@ -1638,7 +1616,7 @@ void vp9_remove_compressor(VP9_COMP *cpi) { vp9_clear_system_state(); // printf("\n8x8-4x4:%d-%d\n", cpi->t8x8_count, cpi->t4x4_count); - if (cpi->pass != 1) { + if (cpi->oxcf.pass != 1) { FILE *f = fopen("opsnr.stt", "a"); double time_encoded = (cpi->last_end_time_stamp_seen - cpi->first_time_stamp_ever) / 10000000.000; @@ -1694,7 +1672,7 @@ void vp9_remove_compressor(VP9_COMP *cpi) { #endif } -#if CONFIG_DENOISING +#if CONFIG_VP9_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0) { vp9_denoiser_free(&(cpi->denoiser)); } @@ -1718,14 +1696,11 @@ void vp9_remove_compressor(VP9_COMP *cpi) { vp9_remove_common(&cpi->common); vpx_free(cpi); -#if CONFIG_DENOISING +#if CONFIG_VP9_TEMPORAL_DENOISING #ifdef OUTPUT_YUV_DENOISED fclose(yuv_denoised_file); #endif #endif -#ifdef OUTPUT_YUV_SRC - fclose(yuv_file); -#endif #ifdef OUTPUT_YUV_REC fclose(yuv_rec_file); #endif @@ -1990,16 +1965,6 @@ int vp9_copy_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, } } -int vp9_get_reference_enc(VP9_COMP *cpi, int index, YV12_BUFFER_CONFIG **fb) { - VP9_COMMON *cm = &cpi->common; - - if (index < 0 || index >= REF_FRAMES) - return -1; - - *fb = &cm->frame_bufs[cm->ref_frame_map[index]].buf; - return 0; -} - int vp9_set_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd) { YV12_BUFFER_CONFIG *cfg = get_vp9_ref_frame_buffer(cpi, ref_frame_flag); @@ -2017,36 +1982,7 @@ int vp9_update_entropy(VP9_COMP * cpi, int update) { return 0; } - -#if defined(OUTPUT_YUV_SRC) -void vp9_write_yuv_frame(YV12_BUFFER_CONFIG *s, FILE *f) { - uint8_t *src = s->y_buffer; - int h = s->y_height; - - do { - fwrite(src, s->y_width, 1, f); - src += s->y_stride; - } while (--h); - - src = s->u_buffer; - h = s->uv_height; - - do { - fwrite(src, s->uv_width, 1, f); - src += s->uv_stride; - } while (--h); - - src = s->v_buffer; - h = s->uv_height; - - do { - fwrite(src, s->uv_width, 1, f); - src += s->uv_stride; - } while (--h); -} -#endif - -#if CONFIG_DENOISING +#if CONFIG_VP9_TEMPORAL_DENOISING #if defined(OUTPUT_YUV_DENOISED) // The denoiser buffer is allocated as a YUV 440 buffer. This function writes it // as YUV 420. 
We simply use the top-left pixels of the UV buffers, since we do @@ -2136,17 +2072,6 @@ void vp9_write_yuv_rec_frame(VP9_COMMON *cm) { src += s->uv_stride; } while (--h); -#if CONFIG_ALPHA - if (s->alpha_buffer) { - src = s->alpha_buffer; - h = s->alpha_crop_height; - do { - fwrite(src, s->alpha_crop_width, 1, yuv_rec_file); - src += s->alpha_stride; - } while (--h); - } -#endif - fflush(yuv_rec_file); } #endif @@ -2162,22 +2087,18 @@ static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, #endif // TODO(dkovalev): replace YV12_BUFFER_CONFIG with vpx_image_t int i; - const uint8_t *const srcs[4] = {src->y_buffer, src->u_buffer, src->v_buffer, - src->alpha_buffer}; - const int src_strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, - src->alpha_stride}; - const int src_widths[4] = {src->y_crop_width, src->uv_crop_width, - src->uv_crop_width, src->y_crop_width}; - const int src_heights[4] = {src->y_crop_height, src->uv_crop_height, - src->uv_crop_height, src->y_crop_height}; - uint8_t *const dsts[4] = {dst->y_buffer, dst->u_buffer, dst->v_buffer, - dst->alpha_buffer}; - const int dst_strides[4] = {dst->y_stride, dst->uv_stride, dst->uv_stride, - dst->alpha_stride}; - const int dst_widths[4] = {dst->y_crop_width, dst->uv_crop_width, - dst->uv_crop_width, dst->y_crop_width}; - const int dst_heights[4] = {dst->y_crop_height, dst->uv_crop_height, - dst->uv_crop_height, dst->y_crop_height}; + const uint8_t *const srcs[3] = {src->y_buffer, src->u_buffer, src->v_buffer}; + const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride}; + const int src_widths[3] = {src->y_crop_width, src->uv_crop_width, + src->uv_crop_width }; + const int src_heights[3] = {src->y_crop_height, src->uv_crop_height, + src->uv_crop_height}; + uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer}; + const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride}; + const int dst_widths[3] = {dst->y_crop_width, dst->uv_crop_width, + dst->uv_crop_width}; + const int dst_heights[3] = {dst->y_crop_height, dst->uv_crop_height, + dst->uv_crop_height}; for (i = 0; i < MAX_MB_PLANE; ++i) { #if CONFIG_VP9_HIGH @@ -2210,14 +2131,10 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, const int src_h = src->y_crop_height; const int dst_w = dst->y_crop_width; const int dst_h = dst->y_crop_height; - const uint8_t *const srcs[4] = {src->y_buffer, src->u_buffer, src->v_buffer, - src->alpha_buffer}; - const int src_strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, - src->alpha_stride}; - uint8_t *const dsts[4] = {dst->y_buffer, dst->u_buffer, dst->v_buffer, - dst->alpha_buffer}; - const int dst_strides[4] = {dst->y_stride, dst->uv_stride, dst->uv_stride, - dst->alpha_stride}; + const uint8_t *const srcs[3] = {src->y_buffer, src->u_buffer, src->v_buffer}; + const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride}; + uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer}; + const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride}; const InterpKernel *const kernel = vp9_get_interp_kernel(EIGHTTAP); int x, y, i; @@ -2338,17 +2255,15 @@ void vp9_update_reference_frames(VP9_COMP *cpi) { &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx); ref_cnt_fb(cm->frame_bufs, &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx); - } else if (!cpi->multi_arf_allowed && cpi->refresh_golden_frame && - cpi->rc.is_src_frame_alt_ref && !cpi->use_svc) { - /* Preserve the previously existing golden frame and 
update the frame in - * the alt ref slot instead. This is highly specific to the current use of - * alt-ref as a forward reference, and this needs to be generalized as - * other uses are implemented (like RTC/temporal scaling) - * - * The update to the buffer in the alt ref slot was signaled in - * vp9_pack_bitstream(), now swap the buffer pointers so that it's treated - * as the golden frame next time. - */ + } else if (vp9_preserve_existing_gf(cpi)) { + // We have decided to preserve the previously existing golden frame as our + // new ARF frame. However, in the short term in function + // vp9_bitstream.c::get_refresh_mask() we left it in the GF slot and, if + // we're updating the GF with the current decoded frame, we save it to the + // ARF slot instead. + // We now have to update the ARF with the current frame and swap gld_fb_idx + // and alt_fb_idx so that, overall, we've stored the old GF in the new ARF + // slot and, if we're updating the GF, the current frame becomes the new GF. int tmp; ref_cnt_fb(cm->frame_bufs, @@ -2357,10 +2272,15 @@ void vp9_update_reference_frames(VP9_COMP *cpi) { tmp = cpi->alt_fb_idx; cpi->alt_fb_idx = cpi->gld_fb_idx; cpi->gld_fb_idx = tmp; + + if (is_spatial_svc(cpi)) { + cpi->svc.layer_context[0].gold_ref_idx = cpi->gld_fb_idx; + cpi->svc.layer_context[0].alt_ref_idx = cpi->alt_fb_idx; + } } else { /* For non key/golden frames */ if (cpi->refresh_alt_ref_frame) { int arf_idx = cpi->alt_fb_idx; - if ((cpi->pass == 2) && cpi->multi_arf_allowed) { + if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) { const GF_GROUP *const gf_group = &cpi->twopass.gf_group; arf_idx = gf_group->arf_update_idx[gf_group->index]; } @@ -2379,7 +2299,7 @@ void vp9_update_reference_frames(VP9_COMP *cpi) { ref_cnt_fb(cm->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx); } -#if CONFIG_DENOISING +#if CONFIG_VP9_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0) { vp9_denoiser_update_frame_info(&cpi->denoiser, *cpi->Source, @@ -2510,8 +2430,8 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) { cpi->rc.total_actual_bits, cm->base_qindex, vp9_convert_qindex_to_q(cm->base_qindex), (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0, + vp9_convert_qindex_to_q(cpi->twopass.active_worst_quality), cpi->rc.avg_q, - vp9_convert_qindex_to_q(cpi->rc.ni_av_qi), vp9_convert_qindex_to_q(cpi->oxcf.cq_level), cpi->refresh_last_frame, cpi->refresh_golden_frame, cpi->refresh_alt_ref_frame, cm->frame_type, cpi->rc.gfu_boost, @@ -2796,7 +2716,8 @@ static void get_ref_frame_flags(VP9_COMP *cpi) { if (cpi->gold_is_last) cpi->ref_frame_flags &= ~VP9_GOLD_FLAG; - if (cpi->rc.frames_till_gf_update_due == INT_MAX) + if (cpi->rc.frames_till_gf_update_due == INT_MAX && + !is_spatial_svc(cpi)) cpi->ref_frame_flags &= ~VP9_GOLD_FLAG; if (cpi->alt_is_last) @@ -2847,9 +2768,7 @@ static void configure_skippable_frame(VP9_COMP *cpi) { // according to the variance SVC *const svc = &cpi->svc; - const int is_spatial_svc = (svc->number_spatial_layers > 1) && - (svc->number_temporal_layers == 1); - TWO_PASS *const twopass = is_spatial_svc ? + TWO_PASS *const twopass = is_spatial_svc(cpi) ? 
&svc->layer_context[svc->spatial_layer_id].twopass : &cpi->twopass; @@ -2867,7 +2786,7 @@ static void set_arf_sign_bias(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; int arf_sign_bias; - if ((cpi->pass == 2) && cpi->multi_arf_allowed) { + if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) { const GF_GROUP *const gf_group = &cpi->twopass.gf_group; arf_sign_bias = cpi->rc.source_alt_ref_active && (!cpi->refresh_alt_ref_frame || @@ -2955,9 +2874,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, (cpi->oxcf.frame_parallel_decoding_mode != 0); // By default, encoder assumes decoder can use prev_mi. - cm->coding_use_prev_mi = 1; if (cm->error_resilient_mode) { - cm->coding_use_prev_mi = 0; cm->frame_parallel_decoding_mode = 1; cm->reset_frame_context = 0; cm->refresh_frame_context = 0; @@ -2971,19 +2888,19 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // static regions if indicated. // Only allowed in second pass of two pass (as requires lagged coding) // and if the relevant speed feature flag is set. - if (cpi->pass == 2 && cpi->sf.static_segmentation) + if (cpi->oxcf.pass == 2 && cpi->sf.static_segmentation) configure_static_seg_features(cpi); // Check if the current frame is skippable for the partition search in the // second pass according to the first pass stats - if (cpi->pass == 2 && - (!cpi->use_svc || cpi->svc.number_temporal_layers == 1)) { + if (cpi->oxcf.pass == 2 && + (!cpi->use_svc || is_spatial_svc(cpi))) { configure_skippable_frame(cpi); } // For 1 pass CBR, check if we are dropping this frame. // Never drop on key frame. - if (cpi->pass == 0 && + if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && cm->frame_type != KEY_FRAME) { if (vp9_rc_drop_frame(cpi)) { @@ -3020,21 +2937,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, } #endif -#ifdef OUTPUT_YUV_SRC - vp9_write_yuv_frame(cpi->Source, yuv_file); -#endif - set_speed_features(cpi); -#if CONFIG_DENOISING -#ifdef OUTPUT_YUV_DENOISED - if (cpi->oxcf.noise_sensitivity > 0) { - vp9_write_yuv_frame_420(&cpi->denoiser.running_avg_y[INTRA_FRAME], - yuv_denoised_file); - } -#endif -#endif - // Decide q and q bounds. q = vp9_rc_pick_q_and_bounds(cpi, &bottom_index, &top_index); @@ -3050,6 +2954,16 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, encode_with_recode_loop(cpi, size, dest, q, bottom_index, top_index); } +#if CONFIG_VP9_TEMPORAL_DENOISING +#ifdef OUTPUT_YUV_DENOISED + if (cpi->oxcf.noise_sensitivity > 0) { + vp9_write_yuv_frame_420(&cpi->denoiser.running_avg_y[INTRA_FRAME], + yuv_denoised_file); + } +#endif +#endif + + // Special case code to reduce pulsing when key frames are forced at a // fixed interval. Note the reconstruction error if it is the frame before // the force key frame @@ -3246,8 +3160,6 @@ int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags, #if CONFIG_VP9_HIGH const int use_high = sd->flags & YV12_FLAG_HIGH; #endif - const int is_spatial_svc = cpi->use_svc && - (cpi->svc.number_temporal_layers == 1); #if CONFIG_VP9_HIGH check_initial_width(cpi, subsampling_x, subsampling_y, use_high); @@ -3257,8 +3169,8 @@ int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags, vpx_usec_timer_start(&timer); -#ifdef CONFIG_SPATIAL_SVC - if (is_spatial_svc) +#if CONFIG_SPATIAL_SVC + if (is_spatial_svc(cpi)) res = vp9_svc_lookahead_push(cpi, cpi->lookahead, sd, time_stamp, end_time, frame_flags); else @@ -3327,7 +3239,7 @@ void adjust_frame_rate(VP9_COMP *cpi) { // over the whole interval seen. 
const double interval = MIN((double)(cpi->source->ts_end - cpi->first_time_stamp_ever), 10000000.0); - double avg_duration = 10000000.0 / cpi->oxcf.framerate; + double avg_duration = 10000000.0 / cpi->framerate; avg_duration *= (interval - avg_duration + this_duration); avg_duration /= interval; @@ -3344,7 +3256,7 @@ static int get_arf_src_index(VP9_COMP *cpi) { RATE_CONTROL *const rc = &cpi->rc; int arf_src_index = 0; if (is_altref_enabled(cpi)) { - if (cpi->pass == 2) { + if (cpi->oxcf.pass == 2) { const GF_GROUP *const gf_group = &cpi->twopass.gf_group; if (gf_group->update_type[gf_group->index] == ARF_UPDATE) { arf_src_index = gf_group->arf_src_offset[gf_group->index]; @@ -3359,7 +3271,7 @@ static int get_arf_src_index(VP9_COMP *cpi) { static void check_src_altref(VP9_COMP *cpi) { RATE_CONTROL *const rc = &cpi->rc; - if (cpi->pass == 2) { + if (cpi->oxcf.pass == 2) { const GF_GROUP *const gf_group = &cpi->twopass.gf_group; rc->is_src_frame_alt_ref = (gf_group->update_type[gf_group->index] == OVERLAY_UPDATE); @@ -3388,15 +3300,14 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, YV12_BUFFER_CONFIG *force_src_buffer = NULL; MV_REFERENCE_FRAME ref_frame; int arf_src_index; - const int is_spatial_svc = cpi->use_svc && - (cpi->svc.number_temporal_layers == 1) && - (cpi->svc.number_spatial_layers > 1); if (!cpi) return -1; - if (is_spatial_svc && cpi->pass == 2) { + if (is_spatial_svc(cpi) && cpi->oxcf.pass == 2) { +#if CONFIG_SPATIAL_SVC vp9_svc_lookahead_peek(cpi, cpi->lookahead, 0, 1); +#endif vp9_restore_layer_context(cpi); } @@ -3419,8 +3330,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, if (arf_src_index) { assert(arf_src_index <= rc->frames_to_key); -#ifdef CONFIG_SPATIAL_SVC - if (is_spatial_svc) +#if CONFIG_SPATIAL_SVC + if (is_spatial_svc(cpi)) cpi->source = vp9_svc_lookahead_peek(cpi, cpi->lookahead, arf_src_index, 0); else @@ -3429,8 +3340,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, if (cpi->source != NULL) { cpi->alt_ref_source = cpi->source; -#ifdef CONFIG_SPATIAL_SVC - if (is_spatial_svc && cpi->svc.spatial_layer_id > 0) { +#if CONFIG_SPATIAL_SVC + if (is_spatial_svc(cpi) && cpi->svc.spatial_layer_id > 0) { int i; // Reference a hidden frame from a lower layer for (i = cpi->svc.spatial_layer_id - 1; i >= 0; --i) { @@ -3464,8 +3375,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, if (!cpi->source) { // Get last frame source. if (cm->current_video_frame > 0) { -#ifdef CONFIG_SPATIAL_SVC - if (is_spatial_svc) +#if CONFIG_SPATIAL_SVC + if (is_spatial_svc(cpi)) cpi->last_source = vp9_svc_lookahead_peek(cpi, cpi->lookahead, -1, 0); else #endif @@ -3475,8 +3386,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } // Read in the source frame. 
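The avg_duration update in adjust_frame_rate() above is easier to see in incremental form. A sketch in the encoder's 10 MHz timestamp units (updated_avg_duration is a hypothetical helper for illustration):

// avg *= (interval - avg + this_duration); avg /= interval;
// is algebraically the same as nudging avg toward this_duration with
// weight avg / interval:
static double updated_avg_duration(double avg, double this_duration,
                                   double interval) {
  return avg + avg * (this_duration - avg) / interval;
}
// Example: at 30 fps, avg = 10000000 / 30 ~ 333333 ticks. One 400000-tick
// frame over a full 10000000-tick window moves avg to ~335556 ticks, so the
// reported framerate eases from 30.0 to ~29.8 fps rather than jumping.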
-#ifdef CONFIG_SPATIAL_SVC - if (is_spatial_svc) +#if CONFIG_SPATIAL_SVC + if (is_spatial_svc(cpi)) cpi->source = vp9_svc_lookahead_pop(cpi, cpi->lookahead, flush); else #endif @@ -3507,7 +3418,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } else { *size = 0; - if (flush && cpi->pass == 1 && !cpi->twopass.first_pass_done) { + if (flush && cpi->oxcf.pass == 1 && !cpi->twopass.first_pass_done) { vp9_end_first_pass(cpi); /* get last stats packet */ cpi->twopass.first_pass_done = 1; } @@ -3519,6 +3430,9 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cpi->last_end_time_stamp_seen = cpi->source->ts_start; } + // Clear down mmx registers + vp9_clear_system_state(); + // adjust frame rates based on timestamps given if (cm->show_frame) { adjust_frame_rate(cpi); @@ -3533,9 +3447,6 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, // start with a 0 size frame *size = 0; - // Clear down mmx registers - vp9_clear_system_state(); - /* find a free buffer for the new frame, releasing the reference previously * held. */ @@ -3545,7 +3456,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, if (!cpi->use_svc && cpi->multi_arf_allowed) { if (cm->frame_type == KEY_FRAME) { init_buffer_indices(cpi); - } else if (cpi->pass == 2) { + } else if (cpi->oxcf.pass == 2) { const GF_GROUP *const gf_group = &cpi->twopass.gf_group; cpi->alt_fb_idx = gf_group->arf_ref_idx[gf_group->index]; } @@ -3553,7 +3464,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cpi->frame_flags = *frame_flags; - if (cpi->pass == 2 && + if (cpi->oxcf.pass == 2 && cm->current_video_frame == 0 && cpi->oxcf.allow_spatial_resampling && cpi->oxcf.rc_mode == VPX_VBR) { @@ -3601,8 +3512,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, vp9_vaq_init(); } - if (cpi->pass == 1 && - (!cpi->use_svc || cpi->svc.number_temporal_layers == 1)) { + if (cpi->oxcf.pass == 1 && + (!cpi->use_svc || is_spatial_svc(cpi))) { const int lossless = is_lossless_requested(&cpi->oxcf); #if CONFIG_VP9_HIGH if (cpi->oxcf.use_high) @@ -3616,8 +3527,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, #endif cpi->mb.itxm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; vp9_first_pass(cpi); - } else if (cpi->pass == 2 && - (!cpi->use_svc || cpi->svc.number_temporal_layers == 1)) { + } else if (cpi->oxcf.pass == 2 && + (!cpi->use_svc || is_spatial_svc(cpi))) { Pass2Encode(cpi, size, dest, frame_flags); } else if (cpi->use_svc) { SvcEncode(cpi, size, dest, frame_flags); @@ -3641,19 +3552,19 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, // Save layer specific state. 
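For orientation, the save below pairs with the vp9_restore_layer_context() call made on entry to vp9_get_compressed_data() earlier in this patch; both are gated on the same condition. A condensed sketch of the round trip (encode_one_layered_frame is illustrative, not a real function):

static void encode_one_layered_frame(VP9_COMP *cpi) {
  const int layered =
      (cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) ||
      (cpi->svc.number_spatial_layers > 1 && cpi->oxcf.pass == 2);
  if (layered) vp9_restore_layer_context(cpi);  // load this layer's RC state
  /* ... encode the frame ... */
  if (layered) vp9_save_layer_context(cpi);     // write the state back
}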
if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) || - (cpi->svc.number_spatial_layers > 1 && cpi->pass == 2)) { + (cpi->svc.number_spatial_layers > 1 && cpi->oxcf.pass == 2)) { vp9_save_layer_context(cpi); } vpx_usec_timer_mark(&cmptimer); cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer); - if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame) + if (cpi->b_calculate_psnr && cpi->oxcf.pass != 1 && cm->show_frame) generate_psnr_packet(cpi); #if CONFIG_INTERNAL_STATS - if (cpi->pass != 1) { + if (cpi->oxcf.pass != 1) { cpi->bytes += (int)(*size); if (cm->show_frame) { @@ -3920,6 +3831,7 @@ int vp9_high_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, return (int) sse; default: assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); + return -1; } } #endif diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 8f6d1e427..95c07d344 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -37,7 +37,7 @@ #include "vp9/encoder/vp9_svc_layercontext.h" #include "vp9/encoder/vp9_tokenize.h" #include "vp9/encoder/vp9_variance.h" -#if CONFIG_DENOISING +#if CONFIG_VP9_TEMPORAL_DENOISING #include "vp9/encoder/vp9_denoiser.h" #endif @@ -135,7 +135,7 @@ typedef struct VP9EncoderConfig { int width; // width of data passed to the compressor int height; // height of data passed to the compressor unsigned int in_bit_depth; // input bit depth - double framerate; // set to passed in framerate + double init_framerate; // set to passed in framerate int64_t target_bandwidth; // bandwidth to be used in kilobits per second int noise_sensitivity; // pre processing blur: recommendation 0 @@ -144,6 +144,7 @@ typedef struct VP9EncoderConfig { unsigned int rc_max_intra_bitrate_pct; MODE mode; + int pass; // Key Framing Operations int auto_key; // autodetect cut scenes and set the keyframes @@ -235,6 +236,7 @@ typedef struct VP9EncoderConfig { vp8e_tuning tuning; /* Flag to say whether we are using 16bit frame buffers */ int use_high; + vp9e_tune_content content; } VP9EncoderConfig; static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) { @@ -307,6 +309,7 @@ typedef struct VP9_COMP { int64_t first_time_stamp_ever; RATE_CONTROL rc; + double framerate; vp9_coeff_count coef_counts[TX_SIZES][PLANE_TYPES]; @@ -315,9 +318,6 @@ typedef struct VP9_COMP { MBGRAPH_FRAME_STATS mbgraph_stats[MAX_LAG_BUFFERS]; int mbgraph_n_frames; // number of frames filled in the above int static_mb_pct; // % forced skip mbs by segmentation - - int pass; - int ref_frame_flags; SPEED_FEATURES sf; @@ -433,7 +433,7 @@ typedef struct VP9_COMP { int multi_arf_enabled; int multi_arf_last_grp_enabled; -#if CONFIG_DENOISING +#if CONFIG_VP9_TEMPORAL_DENOISING VP9_DENOISER denoiser; #endif } VP9_COMP; @@ -465,9 +465,6 @@ void vp9_update_reference(VP9_COMP *cpi, int ref_frame_flags); int vp9_copy_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd); -int vp9_get_reference_enc(VP9_COMP *cpi, int index, - YV12_BUFFER_CONFIG **fb); - int vp9_set_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd); @@ -512,9 +509,8 @@ static INLINE int frame_is_boosted(const VP9_COMP *cpi) { } static INLINE int get_token_alloc(int mb_rows, int mb_cols) { - // TODO(JBB): make this work for alpha channel and double check we can't - // exceed this token count if we have a 32x32 transform crossing a boundary - // at a multiple of 16. 
+ // TODO(JBB): double check we can't exceed this token count if we have a + // 32x32 transform crossing a boundary at a multiple of 16. // mb_rows, cols are in units of 16 pixels. We assume 3 planes all at full // resolution. We assume up to 1 token per pixel, and then allow // a head room of 4. @@ -544,10 +540,16 @@ YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm, void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags); +static INLINE int is_spatial_svc(const struct VP9_COMP *const cpi) { + return cpi->use_svc && + cpi->svc.number_temporal_layers == 1 && + cpi->svc.number_spatial_layers > 1; +} + static INLINE int is_altref_enabled(const VP9_COMP *const cpi) { return cpi->oxcf.mode != REALTIME && cpi->oxcf.lag_in_frames > 0 && (cpi->oxcf.play_alternate && - (!(cpi->use_svc && cpi->svc.number_temporal_layers == 1) || + (!is_spatial_svc(cpi) || cpi->oxcf.ss_play_alternate[cpi->svc.spatial_layer_id])); } @@ -560,8 +562,8 @@ static INLINE void set_ref_ptrs(VP9_COMMON *cm, MACROBLOCKD *xd, : 0]; } -static INLINE int get_chessboard_index(const VP9_COMMON *cm) { - return cm->current_video_frame % 2; +static INLINE int get_chessboard_index(const int frame_index) { + return frame_index & 0x1; } #ifdef __cplusplus diff --git a/vp9/encoder/vp9_extend.c b/vp9/encoder/vp9_extend.c index 55d16e6bd..e1ab83b90 100644 --- a/vp9/encoder/vp9_extend.c +++ b/vp9/encoder/vp9_extend.c @@ -121,18 +121,6 @@ void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, const int eb_uv = eb_y >> uv_height_subsampling; const int er_uv = er_y >> uv_width_subsampling; -#if CONFIG_ALPHA - const int et_a = dst->border >> (dst->alpha_height != dst->y_height); - const int el_a = dst->border >> (dst->alpha_width != dst->y_width); - const int eb_a = et_a + dst->alpha_height - src->alpha_height; - const int er_a = el_a + dst->alpha_width - src->alpha_width; - - copy_and_extend_plane(src->alpha_buffer, src->alpha_stride, - dst->alpha_buffer, dst->alpha_stride, - src->alpha_width, src->alpha_height, - et_a, el_a, eb_a, er_a); -#endif - #if CONFIG_VP9_HIGH if (src->flags & YV12_FLAG_HIGH) { copy_and_extend_plane_high(src->y_buffer, src->y_stride, @@ -152,6 +140,7 @@ void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, return; } #endif + copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, src->y_width, src->y_height, diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 9bd83d214..e367f285f 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -22,7 +22,6 @@ #include "vp9/common/vp9_quant_common.h" #include "vp9/common/vp9_reconinter.h" // vp9_setup_dst_planes() #include "vp9/common/vp9_systemdependent.h" - #include "vp9/encoder/vp9_aq_variance.h" #include "vp9/encoder/vp9_block.h" #include "vp9/encoder/vp9_encodeframe.h" @@ -56,7 +55,6 @@ #define MIN_KF_BOOST 300 #define MIN_GF_INTERVAL 4 -#define LONG_TERM_VBR_CORRECTION static void swap_yv12(YV12_BUFFER_CONFIG *a, YV12_BUFFER_CONFIG *b) { YV12_BUFFER_CONFIG temp = *a; @@ -225,26 +223,6 @@ static void subtract_stats(FIRSTPASS_STATS *section, section->duration -= frame->duration; } -static void avg_stats(FIRSTPASS_STATS *section) { - if (section->count < 1.0) - return; - - section->intra_error /= section->count; - section->coded_error /= section->count; - section->sr_coded_error /= section->count; - section->pcnt_inter /= section->count; - section->pcnt_second_ref /= section->count; - section->pcnt_neutral /= section->count; - section->pcnt_motion /= section->count; 
- section->MVr /= section->count; - section->mvr_abs /= section->count; - section->MVc /= section->count; - section->mvc_abs /= section->count; - section->MVrv /= section->count; - section->MVcv /= section->count; - section->mv_in_out_count /= section->count; - section->duration /= section->count; -} // Calculate a modified Error used in distributing bits between easier and // harder frames. @@ -278,7 +256,7 @@ void vp9_init_first_pass(VP9_COMP *cpi) { } void vp9_end_first_pass(VP9_COMP *cpi) { - if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) { + if (is_spatial_svc(cpi)) { int i; for (i = 0; i < cpi->svc.number_spatial_layers; ++i) { output_stats(&cpi->svc.layer_context[i].twopass.total_stats, @@ -526,7 +504,7 @@ void vp9_first_pass(VP9_COMP *cpi) { set_first_pass_params(cpi); vp9_set_quantizer(cm, find_fp_qindex(cm->bit_depth)); - if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) { + if (is_spatial_svc(cpi)) { MV_REFERENCE_FRAME ref_frame = LAST_FRAME; const YV12_BUFFER_CONFIG *scaled_ref_buf = NULL; twopass = &cpi->svc.layer_context[cpi->svc.spatial_layer_id].twopass; @@ -613,6 +591,9 @@ void vp9_first_pass(VP9_COMP *cpi) { const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); double error_weight = 1.0; const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col); +#if CONFIG_FP_MB_STATS + const int mb_index = mb_row * cm->mb_cols + mb_col; +#endif vp9_clear_system_state(); @@ -676,7 +657,8 @@ void vp9_first_pass(VP9_COMP *cpi) { #if CONFIG_FP_MB_STATS if (cpi->use_fp_mb_stats) { - // TODO(pengchong): store some related block statistics here + // initialization + cpi->twopass.frame_mb_stats_buf[mb_index] = 0; } #endif @@ -727,8 +709,7 @@ void vp9_first_pass(VP9_COMP *cpi) { &unscaled_last_source_buf_2d); #endif // TODO(pengchong): Replace the hard-coded threshold - if (raw_motion_error > 25 || - (cpi->use_svc && cpi->svc.number_temporal_layers == 1)) { + if (raw_motion_error > 25 || is_spatial_svc(cpi)) { // Test last reference frame using the previous best mv as the // starting point (best reference) for the search. first_pass_motion_search(cpi, x, &best_ref_mv.as_mv, &mv.as_mv, @@ -805,6 +786,20 @@ void vp9_first_pass(VP9_COMP *cpi) { // Start by assuming that intra mode is best. best_ref_mv.as_int = 0; +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + // intra prediction statistics + cpi->twopass.frame_mb_stats_buf[mb_index] = 0; + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK; + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK; + if (this_error > FPMB_ERROR_LARGE_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK; + } else if (this_error < FPMB_ERROR_SMALL_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_SMALL_MASK; + } + } +#endif + if (motion_error <= this_error) { // Keep a count of cases where the inter and intra were very close // and very low.
This helps with scene cut detection for example in @@ -835,13 +830,50 @@ void vp9_first_pass(VP9_COMP *cpi) { #if CONFIG_FP_MB_STATS if (cpi->use_fp_mb_stats) { - // TODO(pengchong): save some related block statistics here + // inter prediction statistics + cpi->twopass.frame_mb_stats_buf[mb_index] = 0; + cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK; + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK; + if (this_error > FPMB_ERROR_LARGE_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_ERROR_LARGE_MASK; + } else if (this_error < FPMB_ERROR_SMALL_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_ERROR_SMALL_MASK; + } } #endif if (mv.as_int) { ++mvcount; +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + cpi->twopass.frame_mb_stats_buf[mb_index] &= + ~FPMB_MOTION_ZERO_MASK; + // check estimated motion direction + if (mv.as_mv.col > 0 && mv.as_mv.col >= abs(mv.as_mv.row)) { + // right direction + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_MOTION_RIGHT_MASK; + } else if (mv.as_mv.row < 0 && + abs(mv.as_mv.row) >= abs(mv.as_mv.col)) { + // up direction + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_MOTION_UP_MASK; + } else if (mv.as_mv.col < 0 && + abs(mv.as_mv.col) >= abs(mv.as_mv.row)) { + // left direction + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_MOTION_LEFT_MASK; + } else { + // down direction + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_MOTION_DOWN_MASK; + } + } +#endif + // Non-zero vector, was it different from the last non zero vector? if (mv.as_int != lastmv_as_int) ++new_mv_count; @@ -968,7 +1000,7 @@ void vp9_first_pass(VP9_COMP *cpi) { vp9_extend_frame_borders(new_yv12); - if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) { + if (is_spatial_svc(cpi)) { vp9_update_reference_frames(cpi); } else { // Swap frame pointers so last frame refers to the frame we just compressed. @@ -1035,13 +1067,11 @@ static int get_twopass_worst_quality(const VP9_COMP *cpi, const double err_per_mb = section_err / num_mbs; const double speed_term = 1.0 + 0.04 * oxcf->speed; const int target_norm_bits_per_mb = ((uint64_t)section_target_bandwidth << - BPER_MB_NORMBITS) / num_mbs; + BPER_MB_NORMBITS) / num_mbs; int q; int is_svc_upper_layer = 0; - if (cpi->use_svc && cpi->svc.number_temporal_layers == 1 && - cpi->svc.spatial_layer_id > 0) { + if (is_spatial_svc(cpi) && cpi->svc.spatial_layer_id > 0) is_svc_upper_layer = 1; - } // Try and pick a max Q that will be high enough to encode the // content at the given rate. @@ -1157,6 +1187,19 @@ static double get_prediction_decay_rate(const VP9_COMMON *cm, return MIN(second_ref_decay, next_frame->pcnt_inter); } +// This function gives an estimate of the proportion of the frame that is +// effectively static (zero motion), used when monitoring for static sections. +static double get_zero_motion_factor(const VP9_COMMON *cm, + const FIRSTPASS_STATS *frame) { + const double sr_ratio = frame->coded_error / + DOUBLE_DIVIDE_CHECK(frame->sr_coded_error); + const double zero_motion_pct = frame->pcnt_inter - + frame->pcnt_motion; + + return MIN(sr_ratio, zero_motion_pct); +} + + // Function to test for a condition where a complex transition is followed // by a static section. For example in slide shows where there is a fade // between slides. This is to help with more optimal kf and gf positioning. @@ -1681,11 +1724,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { decay_accumulator = decay_accumulator * loop_decay_rate; // Monitor for static sections.
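To make the new get_zero_motion_factor() concrete, here is a worked call with plausible (purely illustrative) first-pass stats:

FIRSTPASS_STATS s = { 0 };
s.pcnt_inter = 0.95;        // 95% of MBs coded inter
s.pcnt_motion = 0.05;       // only 5% carry a non-zero motion vector
s.coded_error = 1000.0;
s.sr_coded_error = 1250.0;  // second-reference coded error
// zero_motion_pct = 0.95 - 0.05 = 0.90; sr_ratio = 1000 / 1250 = 0.80
// get_zero_motion_factor(cm, &s) == MIN(0.80, 0.90) == 0.80
// The accumulator below stays high only while BOTH signals say "static".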
- if ((next_frame.pcnt_inter - next_frame.pcnt_motion) < - zero_motion_accumulator) { - zero_motion_accumulator = next_frame.pcnt_inter - - next_frame.pcnt_motion; - } + zero_motion_accumulator = + MIN(zero_motion_accumulator, + get_zero_motion_factor(&cpi->common, &next_frame)); // Break clause to detect very still sections after motion. For example, // a static image after a fade or other transition. @@ -2056,11 +2097,9 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { break; // Monitor for static sections. - if ((next_frame.pcnt_inter - next_frame.pcnt_motion) < - zero_motion_accumulator) { - zero_motion_accumulator = (next_frame.pcnt_inter - - next_frame.pcnt_motion); - } + zero_motion_accumulator = + MIN(zero_motion_accumulator, + get_zero_motion_factor(&cpi->common, &next_frame)); // For the first few frames collect data to decide kf boost. if (i <= (rc->max_gf_interval * 2)) { @@ -2175,9 +2214,11 @@ void configure_buffer_updates(VP9_COMP *cpi) { break; default: assert(0); + break; } - if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) { - cpi->refresh_golden_frame = 0; + if (is_spatial_svc(cpi)) { + if (cpi->svc.layer_context[cpi->svc.spatial_layer_id].gold_ref_idx < 0) + cpi->refresh_golden_frame = 0; if (cpi->alt_ref_source == NULL) cpi->refresh_alt_ref_frame = 0; } @@ -2194,9 +2235,8 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { int target_rate; LAYER_CONTEXT *lc = NULL; - const int is_spatial_svc = (cpi->use_svc && - cpi->svc.number_temporal_layers == 1); - if (is_spatial_svc) { + + if (is_spatial_svc(cpi)) { lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id]; frames_left = (int)(twopass->total_stats.count - lc->current_video_frame_in_layer); @@ -2216,15 +2256,15 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { target_rate = twopass->gf_group.bit_allocation[twopass->gf_group.index]; target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate); rc->base_frame_target = target_rate; -#ifdef LONG_TERM_VBR_CORRECTION + // Correction to rate target based on prior over or under shoot. if (cpi->oxcf.rc_mode == VPX_VBR) vbr_rate_correction(&target_rate, rc->vbr_bits_off_target); -#endif + vp9_rc_set_frame_target(cpi, target_rate); cm->frame_type = INTER_FRAME; - if (is_spatial_svc) { + if (is_spatial_svc(cpi)) { if (cpi->svc.spatial_layer_id == 0) { lc->is_key_frame = 0; } else { @@ -2240,7 +2280,7 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { vp9_clear_system_state(); - if (is_spatial_svc && twopass->kf_intra_err_min == 0) { + if (is_spatial_svc(cpi) && twopass->kf_intra_err_min == 0) { twopass->kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs; twopass->gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs; } @@ -2248,7 +2288,8 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { if (cpi->oxcf.rc_mode == VPX_Q) { twopass->active_worst_quality = cpi->oxcf.cq_level; } else if (cm->current_video_frame == 0 || - (is_spatial_svc && lc->current_video_frame_in_layer == 0)) { + (is_spatial_svc(cpi) && + lc->current_video_frame_in_layer == 0)) { // Special case code for first frame. 
const int section_target_bandwidth = (int)(twopass->bits_left / frames_left); @@ -2274,9 +2315,12 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { cm->frame_type = INTER_FRAME; } - if (is_spatial_svc) { + if (is_spatial_svc(cpi)) { if (cpi->svc.spatial_layer_id == 0) { lc->is_key_frame = (cm->frame_type == KEY_FRAME); + if (lc->is_key_frame) + cpi->ref_frame_flags &= + (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG); } else { cm->frame_type = INTER_FRAME; lc->is_key_frame = cpi->svc.layer_context[0].is_key_frame; @@ -2302,7 +2346,7 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { } rc->frames_till_gf_update_due = rc->baseline_gf_interval; - if (!is_spatial_svc) + if (!is_spatial_svc(cpi)) cpi->refresh_golden_frame = 1; } @@ -2315,11 +2359,11 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate); rc->base_frame_target = target_rate; -#ifdef LONG_TERM_VBR_CORRECTION + // Correction to rate target based on prior over or under shoot. if (cpi->oxcf.rc_mode == VPX_VBR) vbr_rate_correction(&target_rate, rc->vbr_bits_off_target); -#endif + vp9_rc_set_frame_target(cpi, target_rate); // Update the total stats remaining structure. @@ -2329,45 +2373,19 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { void vp9_twopass_postencode_update(VP9_COMP *cpi) { TWO_PASS *const twopass = &cpi->twopass; RATE_CONTROL *const rc = &cpi->rc; -#ifdef LONG_TERM_VBR_CORRECTION - // In this experimental mode, the VBR correction is done exclusively through - // rc->vbr_bits_off_target. Based on the sign of this value, a limited % - // adjustment is made to the target rate of subsequent frames, to try and - // push it back towards 0. This mode is less likely to suffer from - // extreme behaviour at the end of a clip or group of frames. + + // VBR correction is done through rc->vbr_bits_off_target. Based on the + // sign of this value, a limited % adjustment is made to the target rate + // of subsequent frames, to try and push it back towards 0. This method + // is designed to prevent extreme behaviour at the end of a clip + // or group of frames. const int bits_used = rc->base_frame_target; rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size; -#else - // In this mode, VBR correction is acheived by altering bits_left, - // kf_group_bits & gf_group_bits to reflect any deviation from the target - // rate in this frame. This alters the allocation of bits to the - // remaning frames in the group / clip. - // - // This method can give rise to unstable behaviour near the end of a clip - // or kf/gf group of frames where any accumulated error is corrected over an - // ever decreasing number of frames. Hence we change the balance of target - // vs. actual bitrate gradually as we progress towards the end of the - // sequence in order to mitigate this effect. - const double progress = - (double)(twopass->stats_in - twopass->stats_in_start) / - (twopass->stats_in_end - twopass->stats_in_start); - const int bits_used = (int)(progress * rc->this_frame_target + - (1.0 - progress) * rc->projected_frame_size); -#endif twopass->bits_left = MAX(twopass->bits_left - bits_used, 0); -#ifdef LONG_TERM_VBR_CORRECTION if (cpi->common.frame_type != KEY_FRAME && !vp9_is_upper_layer_key_frame(cpi)) { -#else - if (cpi->common.frame_type == KEY_FRAME || - vp9_is_upper_layer_key_frame(cpi)) { - // For key frames kf_group_bits already had the target bits subtracted out. - // So now update to the correct value based on the actual bits used. 
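The surviving VBR path is a simple feedback loop on rc->vbr_bits_off_target: the post-encode accumulation shown above, plus a bounded pre-encode correction applied through vbr_rate_correction(). A sketch of the idea (the /16 bleed rate is illustrative only, not the actual limit used):

// after encoding: accumulate signed error vs. the per-frame budget
rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size;
// before the next frame: bleed a limited share of that error back into the
// target, so correction stays gradual even at the end of a clip
target_rate += (int)(rc->vbr_bits_off_target / 16);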
- twopass->kf_group_bits += rc->this_frame_target - bits_used; - } else { -#endif twopass->kf_group_bits -= bits_used; } twopass->kf_group_bits = MAX(twopass->kf_group_bits, 0); diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index 33a795f26..bf8c9fd96 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -19,6 +19,20 @@ extern "C" { #endif #if CONFIG_FP_MB_STATS + +#define FPMB_DCINTRA_MASK 0x01 + +#define FPMB_MOTION_ZERO_MASK 0x02 +#define FPMB_MOTION_LEFT_MASK 0x04 +#define FPMB_MOTION_RIGHT_MASK 0x08 +#define FPMB_MOTION_UP_MASK 0x10 +#define FPMB_MOTION_DOWN_MASK 0x20 + +#define FPMB_ERROR_SMALL_MASK 0x40 +#define FPMB_ERROR_LARGE_MASK 0x80 +#define FPMB_ERROR_SMALL_TH 2000 +#define FPMB_ERROR_LARGE_TH 48000 + typedef struct { uint8_t *mb_stats_start; uint8_t *mb_stats_end; diff --git a/vp9/encoder/vp9_lookahead.h b/vp9/encoder/vp9_lookahead.h index bbe509e54..b8234412a 100644 --- a/vp9/encoder/vp9_lookahead.h +++ b/vp9/encoder/vp9_lookahead.h @@ -14,7 +14,7 @@ #include "vpx_scale/yv12config.h" #include "vpx/vpx_integer.h" -#ifdef CONFIG_SPATIAL_SVC +#if CONFIG_SPATIAL_SVC #include "vpx/vp8cx.h" #include "vpx/vpx_encoder.h" #endif @@ -31,7 +31,7 @@ struct lookahead_entry { int64_t ts_end; unsigned int flags; -#ifdef CONFIG_SPATIAL_SVC +#if CONFIG_SPATIAL_SVC vpx_svc_parameters_t svc_params[VPX_SS_MAX_LAYERS]; #endif }; diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c index b1eea18b4..a6bcd533b 100644 --- a/vp9/encoder/vp9_picklpf.c +++ b/vp9/encoder/vp9_picklpf.c @@ -24,7 +24,7 @@ #include "vp9/encoder/vp9_quantize.h" static int get_max_filter_level(const VP9_COMP *cpi) { - if (cpi->pass == 2) { + if (cpi->oxcf.pass == 2) { return cpi->twopass.section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4 : MAX_LOOP_FILTER; } else { @@ -89,7 +89,7 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, // Bias against raising loop filter in favor of lowering it. int bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; - if ((cpi->pass == 2) && (cpi->twopass.section_intra_rating < 20)) + if ((cpi->oxcf.pass == 2) && (cpi->twopass.section_intra_rating < 20)) bias = (bias * cpi->twopass.section_intra_rating) / 20; // yx, bias less for large block size @@ -174,6 +174,7 @@ void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, default: assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 " "or VPX_BITS_12"); + return; } #else filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18); diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 74e0d2b58..f4d156587 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -212,11 +212,11 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize, *sse_y = sse; if (sse < dc_quant * dc_quant >> 6) - x->skip_txfm = 1; + x->skip_txfm[0] = 1; else if (var < ac_quant * ac_quant >> 6) - x->skip_txfm = 2; + x->skip_txfm[0] = 2; else - x->skip_txfm = 0; + x->skip_txfm[0] = 0; if (cpi->common.tx_mode == TX_MODE_SELECT) { if (sse > (var << 2)) @@ -431,7 +431,8 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, INTERP_FILTER filter_ref = cm->interp_filter; int bsl = mi_width_log2(bsize); const int pred_filter_search = cm->interp_filter == SWITCHABLE ? 
- (((mi_row + mi_col) >> bsl) + get_chessboard_index(cm)) % 2 : 0; + (((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_video_frame)) & 0x1 : 0; int const_motion[MAX_REF_FRAMES] = { 0 }; int bh = num_4x4_blocks_high_lookup[bsize] << 2; int bw = num_4x4_blocks_wide_lookup[bsize] << 2; @@ -449,6 +450,10 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, PRED_BUFFER *this_mode_pred = NULL; int i; + // CTX is used by the temporal denoiser which is currently being developed. + // TODO(jbb): when the temporal denoiser is finished and in the default build + // remove the following line. + (void) ctx; if (cpi->sf.reuse_inter_pred_sby) { for (i = 0; i < 3; i++) { #if CONFIG_VP9_HIGH @@ -500,7 +505,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf); - if (cm->coding_use_prev_mi) + if (!cm->error_resilient_mode) vp9_find_mv_refs(cm, xd, tile, xd->mi[0], ref_frame, candidates, mi_row, mi_col); else @@ -600,7 +605,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (cost < best_cost) { best_filter = filter; best_cost = cost; - skip_txfm = x->skip_txfm; + skip_txfm = x->skip_txfm[0]; if (cpi->sf.reuse_inter_pred_sby) { if (this_mode_pred != current_pred) { @@ -626,7 +631,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, dist = pf_dist[mbmi->interp_filter]; var_y = pf_var[mbmi->interp_filter]; sse_y = pf_sse[mbmi->interp_filter]; - x->skip_txfm = skip_txfm; + x->skip_txfm[0] = skip_txfm; } else { mbmi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP: filter_ref; vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); @@ -649,7 +654,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } } -#if CONFIG_DENOISING +#if CONFIG_VP9_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0) { vp9_denoiser_update_frame_stats(&cpi->denoiser, mbmi, sse_y, this_mode, ctx); @@ -664,7 +669,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, best_pred_filter = mbmi->interp_filter; best_tx_size = mbmi->tx_size; best_ref_frame = ref_frame; - skip_txfm = x->skip_txfm; + skip_txfm = x->skip_txfm[0]; if (cpi->sf.reuse_inter_pred_sby) { if (best_pred != NULL) @@ -715,7 +720,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, mbmi->ref_frame[0] = best_ref_frame; mbmi->mv[0].as_int = frame_mv[best_mode][best_ref_frame].as_int; xd->mi[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int; - x->skip_txfm = skip_txfm; + x->skip_txfm[0] = skip_txfm; // Perform intra prediction search, if the best SAD is above a certain // threshold.
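The rewritten pred_filter_search condition earlier in this hunk implements a chessboard schedule: the full filter search runs on alternating block diagonals, and the new frame-index argument flips the pattern every frame, so all positions are covered over any two consecutive frames. A self-contained sketch of the pattern:

static INLINE int chessboard_on(int mi_row, int mi_col, int bsl,
                                int frame_index) {
  // (mi_row + mi_col) >> bsl alternates along block diagonals; adding the
  // frame parity inverts the pattern on every other frame.
  return (((mi_row + mi_col) >> bsl) + (frame_index & 0x1)) & 0x1;
}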
@@ -724,7 +729,6 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int i, j; const int width = num_4x4_blocks_wide_lookup[bsize]; const int height = num_4x4_blocks_high_lookup[bsize]; - const BLOCK_SIZE bsize_tx = txsize_to_bsize[mbmi->tx_size]; int rate2 = 0; int64_t dist2 = 0; @@ -734,28 +738,36 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TX_SIZE tmp_tx_size = MIN(max_txsize_lookup[bsize], tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); + const BLOCK_SIZE bsize_tx = txsize_to_bsize[tmp_tx_size]; const int step = 1 << tmp_tx_size; - for (this_mode = DC_PRED; this_mode <= DC_PRED; ++this_mode) { - if (cpi->sf.reuse_inter_pred_sby) { - pd->dst.buf = tmp[0].data; - pd->dst.stride = bw; - } + if (cpi->sf.reuse_inter_pred_sby) { + pd->dst.buf = tmp[0].data; + pd->dst.stride = bw; + } + for (this_mode = DC_PRED; this_mode <= DC_PRED; ++this_mode) { + uint8_t *const src_buf_base = p->src.buf; + uint8_t *const dst_buf_base = pd->dst.buf; for (j = 0; j < height; j += step) { for (i = 0; i < width; i += step) { + p->src.buf = &src_buf_base[4 * (j * src_stride + i)]; + pd->dst.buf = &dst_buf_base[4 * (j * dst_stride + i)]; + // Use source buffer as an approximation for the fully reconstructed + // buffer vp9_predict_intra_block(xd, block_idx, b_width_log2(bsize), tmp_tx_size, this_mode, - &p->src.buf[4 * (j * dst_stride + i)], - src_stride, - &pd->dst.buf[4 * (j * dst_stride + i)], - dst_stride, i, j, 0); + p->src.buf, src_stride, + pd->dst.buf, dst_stride, + i, j, 0); model_rd_for_sb_y(cpi, bsize_tx, x, xd, &rate, &dist, &var_y, &sse_y); rate2 += rate; dist2 += dist; ++block_idx; } } + p->src.buf = src_buf_base; + pd->dst.buf = dst_buf_base; rate = rate2; dist = dist2; @@ -777,7 +789,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, mbmi->uv_mode = this_mode; mbmi->mv[0].as_int = INVALID_MV; } else { - x->skip_txfm = skip_txfm; + x->skip_txfm[0] = skip_txfm; } } } @@ -798,6 +810,7 @@ int vp9_get_intra_cost_penalty(int qindex, int qdelta, return ROUND_POWER_OF_TWO(5 * q, 2); default: assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); + return -1; } #else return 20 * q; diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index 56eac75aa..633abb929 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -25,15 +25,14 @@ void vp9_quantize_dc(const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr) { - int eob = -1; + const int rc = 0; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp, eob = -1; if (!skip_block) { - const int rc = 0; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - - int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); tmp = (tmp * quant) >> 16; qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr; @@ -71,15 +70,15 @@ void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr) { - int eob = -1; + const int rc = 0; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int 
abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp, eob = -1; if (!skip_block) { - const int rc = 0; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); tmp = (tmp * quant) >> 15; qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2; @@ -558,6 +557,7 @@ static int get_qzbin_factor(int q, vpx_bit_depth_t bit_depth) { return q == 0 ? 64 : (quant < 2368 ? 84 : 80); default: assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); + return -1; } #else return q == 0 ? 64 : (quant < 148 ? 84 : 80); @@ -599,16 +599,6 @@ void vp9_init_quantizer(VP9_COMP *cpi) { quants->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7); quants->uv_round[q][i] = (qrounding_factor * quant) >> 7; cm->uv_dequant[q][i] = quant; - -#if CONFIG_ALPHA - // alpha - quant = i == 0 ? vp9_dc_quant(q, cm->a_dc_delta_q, cm->bit_depth) - : vp9_ac_quant(q, cm->a_ac_delta_q, cm->bit_depth); - invert_quant(&quants->a_quant[q][i], &quants->a_quant_shift[q][i], quant); - quants->a_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7); - quants->a_round[q][i] = (qrounding_factor * quant) >> 7; - cm->a_dequant[q][i] = quant; -#endif } for (i = 2; i < 8; i++) { @@ -627,14 +617,6 @@ void vp9_init_quantizer(VP9_COMP *cpi) { quants->uv_zbin[q][i] = quants->uv_zbin[q][1]; quants->uv_round[q][i] = quants->uv_round[q][1]; cm->uv_dequant[q][i] = cm->uv_dequant[q][1]; - -#if CONFIG_ALPHA - quants->a_quant[q][i] = quants->a_quant[q][1]; - quants->a_quant_shift[q][i] = quants->a_quant_shift[q][1]; - quants->a_zbin[q][i] = quants->a_zbin[q][1]; - quants->a_round[q][i] = quants->a_round[q][1]; - cm->a_dequant[q][i] = cm->a_dequant[q][1]; -#endif } } } @@ -657,6 +639,10 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) { x->plane[0].quant_shift = quants->y_quant_shift[qindex]; x->plane[0].zbin = quants->y_zbin[qindex]; x->plane[0].round = quants->y_round[qindex]; + x->plane[0].quant_thred[0] = cm->y_dequant[qindex][0] * + cm->y_dequant[qindex][0]; + x->plane[0].quant_thred[1] = cm->y_dequant[qindex][1] * + cm->y_dequant[qindex][1]; x->plane[0].zbin_extra = (int16_t)((cm->y_dequant[qindex][1] * zbin) >> 7); xd->plane[0].dequant = cm->y_dequant[qindex]; @@ -668,19 +654,14 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) { x->plane[i].quant_shift = quants->uv_quant_shift[qindex]; x->plane[i].zbin = quants->uv_zbin[qindex]; x->plane[i].round = quants->uv_round[qindex]; + x->plane[i].quant_thred[0] = cm->y_dequant[qindex][0] * + cm->y_dequant[qindex][0]; + x->plane[i].quant_thred[1] = cm->y_dequant[qindex][1] * + cm->y_dequant[qindex][1]; x->plane[i].zbin_extra = (int16_t)((cm->uv_dequant[qindex][1] * zbin) >> 7); xd->plane[i].dequant = cm->uv_dequant[qindex]; } -#if CONFIG_ALPHA - x->plane[3].quant = quants->a_quant[qindex]; - x->plane[3].quant_shift = quants->a_quant_shift[qindex]; - x->plane[3].zbin = quants->a_zbin[qindex]; - x->plane[3].round = quants->a_round[qindex]; - x->plane[3].zbin_extra = (int16_t)((cm->a_dequant[qindex][1] * zbin) >> 7); - xd->plane[3].dequant = cm->a_dequant[qindex]; -#endif - x->skip_block = vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP); x->q_index = qindex; diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h index fa1734729..56f731787 100644 --- 
a/vp9/encoder/vp9_quantize.h +++ b/vp9/encoder/vp9_quantize.h @@ -35,13 +35,6 @@ typedef struct { DECLARE_ALIGNED(16, int16_t, uv_quant_shift[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, uv_zbin[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]); - -#if CONFIG_ALPHA - DECLARE_ALIGNED(16, int16_t, a_quant[QINDEX_RANGE][8]); - DECLARE_ALIGNED(16, int16_t, a_quant_shift[QINDEX_RANGE][8]); - DECLARE_ALIGNED(16, int16_t, a_zbin[QINDEX_RANGE][8]); - DECLARE_ALIGNED(16, int16_t, a_round[QINDEX_RANGE][8]); -#endif } QUANTS; void vp9_quantize_dc(const tran_low_t *coeff_ptr, int skip_block, diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 1bc8ca84a..b100da0cc 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -60,6 +60,7 @@ default: \ assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10" \ " or VPX_BITS_12"); \ + name = NULL; \ } \ } while (0) #else @@ -162,6 +163,7 @@ double vp9_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth) { return vp9_ac_quant(qindex, 0, bit_depth) / 64.0; default: assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); + return -1.0; } #else return vp9_ac_quant(qindex, 0, bit_depth) / 4.0; @@ -174,7 +176,7 @@ int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex, int enumerator = frame_type == KEY_FRAME ? 3300000 : 2250000; // q based adjustment to baseline enumerator enumerator += (int)(enumerator * q) >> 12; - return (int)(0.5 + (enumerator * correction_factor / q)); + return (int)(enumerator * correction_factor / q); } static int estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs, @@ -349,7 +351,7 @@ static double get_rate_correction_factor(const VP9_COMP *cpi) { if (cpi->common.frame_type == KEY_FRAME) { return rc->rate_correction_factors[KF_STD]; - } else if (cpi->pass == 2) { + } else if (cpi->oxcf.pass == 2) { RATE_FACTOR_LEVEL rf_lvl = cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index]; return rc->rate_correction_factors[rf_lvl]; @@ -368,7 +370,7 @@ static void set_rate_correction_factor(VP9_COMP *cpi, double factor) { if (cpi->common.frame_type == KEY_FRAME) { rc->rate_correction_factors[KF_STD] = factor; - } else if (cpi->pass == 2) { + } else if (cpi->oxcf.pass == 2) { RATE_FACTOR_LEVEL rf_lvl = cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index]; rc->rate_correction_factors[rf_lvl] = factor; @@ -1027,7 +1029,7 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int vp9_rc_pick_q_and_bounds(const VP9_COMP *cpi, int *bottom_index, int *top_index) { int q; - if (cpi->pass == 0) { + if (cpi->oxcf.pass == 0) { if (cpi->oxcf.rc_mode == VPX_CBR) q = rc_pick_q_and_bounds_one_pass_cbr(cpi, bottom_index, top_index); else @@ -1095,7 +1097,7 @@ static void update_golden_frame_stats(VP9_COMP *cpi) { // this frame refreshes means next frames don't unless specified by user rc->frames_since_golden = 0; - if (cpi->pass == 2) { + if (cpi->oxcf.pass == 2) { if (!rc->source_alt_ref_pending && cpi->twopass.gf_group.rf_level[0] == GF_ARF_STD) rc->source_alt_ref_active = 0; @@ -1312,7 +1314,7 @@ static int calc_iframe_target_size_one_pass_cbr(const VP9_COMP *cpi) { ? INT_MAX : (int)(rc->starting_buffer_level / 2); } else { int kf_boost = 32; - double framerate = oxcf->framerate; + double framerate = cpi->framerate; if (svc->number_temporal_layers > 1 && oxcf->rc_mode == VPX_CBR) { // Use the layer framerate for temporal layers CBR mode. 
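The key-frame boost path just above now reads the live cpi->framerate, and the same substitution appears in vp9_rc_update_framerate() below. A quick sanity check with illustrative numbers showing why this matters for the per-frame budget:

const int64_t target_bandwidth = 1500000;  // bits per second
double framerate = 30.0;                   // tracked in cpi->framerate
int avg_frame_bandwidth = (int)(target_bandwidth / framerate);  // 50000 bits
framerate = 25.0;                          // timestamps later imply ~25 fps
avg_frame_bandwidth = (int)(target_bandwidth / framerate);      // 60000 bits
// The budget follows the timestamp-derived rate, not the init_framerate
// snapshot kept in the config.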
@@ -1340,26 +1342,31 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) { cm->frame_type = KEY_FRAME; rc->source_alt_ref_active = 0; - if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) { + if (is_spatial_svc(cpi)) { cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame = 1; + cpi->ref_frame_flags &= + (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG); } - if (cpi->pass == 0 && cpi->oxcf.rc_mode == VPX_CBR) { + if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR) { target = calc_iframe_target_size_one_pass_cbr(cpi); } } else { cm->frame_type = INTER_FRAME; - if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) { + if (is_spatial_svc(cpi)) { LAYER_CONTEXT *lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id]; if (cpi->svc.spatial_layer_id == 0) { lc->is_key_frame = 0; } else { lc->is_key_frame = cpi->svc.layer_context[0].is_key_frame; + if (lc->is_key_frame) + cpi->ref_frame_flags &= (~VP9_LAST_FLAG); } + cpi->ref_frame_flags &= (~VP9_ALT_FLAG); } - if (cpi->pass == 0 && cpi->oxcf.rc_mode == VPX_CBR) { + if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR) { target = calc_pframe_target_size_one_pass_cbr(cpi); } } @@ -1466,7 +1473,7 @@ void vp9_rc_update_framerate(VP9_COMP *cpi) { RATE_CONTROL *const rc = &cpi->rc; int vbr_max_bits; - rc->avg_frame_bandwidth = (int)(oxcf->target_bandwidth / oxcf->framerate); + rc->avg_frame_bandwidth = (int)(oxcf->target_bandwidth / cpi->framerate); rc->min_frame_bandwidth = (int)(rc->avg_frame_bandwidth * oxcf->two_pass_vbrmin_section / 100); diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c index 275e7c2fb..ce7ec9637 100644 --- a/vp9/encoder/vp9_rd.c +++ b/vp9/encoder/vp9_rd.c @@ -154,7 +154,7 @@ int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) { assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); } #endif - if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) { + if (cpi->oxcf.pass == 2 && (cpi->common.frame_type != KEY_FRAME)) { const GF_GROUP *const gf_group = &cpi->twopass.gf_group; const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index]; const int boost_index = MIN(15, (cpi->rc.gfu_boost / 100)); @@ -180,6 +180,7 @@ static int compute_rd_thresh_factor(int qindex, vpx_bit_depth_t bit_depth) { break; default: assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); + return -1; } #else (void) bit_depth; @@ -289,8 +290,6 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) { } } -static const int MAX_XSQ_Q10 = 245727; - static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) { // NOTE: The tables below must be of the same size. @@ -380,10 +379,10 @@ void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n, *dist = 0; } else { int d_q10, r_q10; + static const uint32_t MAX_XSQ_Q10 = 245727; const uint64_t xsq_q10_64 = ((((uint64_t)qstep * qstep * n) << 10) + (var >> 1)) / var; - const int xsq_q10 = xsq_q10_64 > MAX_XSQ_Q10 ? 
- MAX_XSQ_Q10 : (int)xsq_q10_64; + const int xsq_q10 = (int)MIN(xsq_q10_64, MAX_XSQ_Q10); model_rd_norm(xsq_q10, &r_q10, &d_q10); *rate = (n * r_q10 + 2) >> 2; *dist = (var * (int64_t)d_q10 + 512) >> 10; @@ -426,6 +425,7 @@ void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, break; default: assert(0 && "Invalid transform size."); + break; } } @@ -501,10 +501,6 @@ void vp9_setup_pred_block(const MACROBLOCKD *xd, dst[1].buf = src->u_buffer; dst[2].buf = src->v_buffer; dst[1].stride = dst[2].stride = src->uv_stride; -#if CONFIG_ALPHA - dst[3].buf = src->alpha_buffer; - dst[3].stride = src->alpha_stride; -#endif for (i = 0; i < MAX_MB_PLANE; ++i) { setup_pred_plane(dst + i, dst[i].buf, dst[i].stride, mi_row, mi_col, diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index d3b32a15b..cd8e41e73 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -173,15 +173,27 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, int64_t dist_sum = 0; const int ref = xd->mi[0]->mbmi.ref_frame[0]; unsigned int sse; + const int shift = 8; for (i = 0; i < MAX_MB_PLANE; ++i) { struct macroblock_plane *const p = &x->plane[i]; struct macroblockd_plane *const pd = &xd->plane[i]; const BLOCK_SIZE bs = get_plane_block_size(bsize, pd); - (void) cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, - pd->dst.buf, pd->dst.stride, &sse); + const unsigned int var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride, + &sse); + if (!x->select_tx_size) { + if (sse < p->quant_thred[0] >> shift) + x->skip_txfm[i] = 1; + else if (var < p->quant_thred[1] >> shift) + x->skip_txfm[i] = 2; + else + x->skip_txfm[i] = 0; + } + + x->bsse[i] = sse; if (i == 0) x->pred_sse[ref] = sse; @@ -433,19 +445,51 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, if (args->skip) return; - if (!is_inter_block(mbmi)) + if (!is_inter_block(mbmi)) { vp9_encode_block_intra(x, plane, block, plane_bsize, tx_size, &mbmi->skip); - else - vp9_xform_quant(x, plane, block, plane_bsize, tx_size); #if CONFIG_VP9_HIGH - if (xd->cur_buf->flags & YV12_FLAG_HIGH) { - dist_block(plane, block, tx_size, args, xd->bps); - } else { - dist_block(plane, block, tx_size, args, 8); - } + if (xd->cur_buf->flags & YV12_FLAG_HIGH) { + dist_block(plane, block, tx_size, args, xd->bps); + } else { + dist_block(plane, block, tx_size, args, 8); + } #else - dist_block(plane, block, tx_size, args); + dist_block(plane, block, tx_size, args); #endif + } else { + if (x->skip_txfm[plane] == 0) { + // full forward transform and quantization + vp9_xform_quant(x, plane, block, plane_bsize, tx_size); +#if CONFIG_VP9_HIGH + if (xd->cur_buf->flags & YV12_FLAG_HIGH) { + dist_block(plane, block, tx_size, args, xd->bps); + } else { + dist_block(plane, block, tx_size, args, 8); + } +#else + dist_block(plane, block, tx_size, args); +#endif + } else if (x->skip_txfm[plane] == 2) { + // compute DC coefficient + tran_low_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block); + tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block); + vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size); + args->sse = x->bsse[plane] << 4; + args->dist = args->sse; + if (!x->plane[plane].eobs[block]) + args->dist = args->sse - ((coeff[0] * coeff[0] - + (coeff[0] - dqcoeff[0]) * (coeff[0] - dqcoeff[0])) >> 2); + } else { + // skip forward transform + x->plane[plane].eobs[block] = 0; + args->sse = x->bsse[plane] << 4; + args->dist = args->sse; + } + } + + + + rate_block(plane, block, plane_bsize, 
tx_size, args); rd1 = RDCOST(x->rdmult, x->rddiv, args->rate, args->dist); rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse); @@ -517,22 +561,26 @@ static void choose_largest_tx_size(VP9_COMP *cpi, MACROBLOCK *x, mbmi->tx_size = MIN(max_tx_size, largest_tx_size); txfm_rd_in_plane(x, rate, distortion, skip, - &sse[mbmi->tx_size], ref_best_rd, 0, bs, + sse, ref_best_rd, 0, bs, mbmi->tx_size, cpi->sf.use_fast_coef_costing); cpi->tx_stepdown_count[0]++; } static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, - int (*r)[2], int *rate, - int64_t *d, int64_t *distortion, - int *s, int *skip, + int *rate, + int64_t *distortion, + int *skip, + int64_t *psse, int64_t tx_cache[TX_MODES], + int64_t ref_best_rd, BLOCK_SIZE bs) { const TX_SIZE max_tx_size = max_txsize_lookup[bs]; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; vp9_prob skip_prob = vp9_get_skip_prob(cm, xd); + int r[TX_SIZES][2], s[TX_SIZES]; + int64_t d[TX_SIZES], sse[TX_SIZES]; int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX}, {INT64_MAX, INT64_MAX}, {INT64_MAX, INT64_MAX}, @@ -549,6 +597,9 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, s1 = vp9_cost_bit(skip_prob, 1); for (n = TX_4X4; n <= max_tx_size; n++) { + txfm_rd_in_plane(x, &r[n][0], &d[n], &s[n], + &sse[n], ref_best_rd, 0, bs, n, + cpi->sf.use_fast_coef_costing); r[n][1] = r[n][0]; if (r[n][0] < INT_MAX) { for (m = 0; m <= n - (n == max_tx_size); m++) { @@ -579,6 +630,7 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, *distortion = d[mbmi->tx_size]; *rate = r[mbmi->tx_size][cm->tx_mode == TX_MODE_SELECT]; *skip = s[mbmi->tx_size]; + *psse = sse[mbmi->tx_size]; tx_cache[ONLY_4X4] = rd[TX_4X4][0]; tx_cache[ALLOW_8X8] = rd[TX_8X8][0]; @@ -605,65 +657,39 @@ static void inter_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *psse, BLOCK_SIZE bs, int64_t txfm_cache[TX_MODES], int64_t ref_best_rd) { - int r[TX_SIZES][2], s[TX_SIZES]; - int64_t d[TX_SIZES], sse[TX_SIZES]; MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const TX_SIZE max_tx_size = max_txsize_lookup[bs]; - TX_SIZE tx_size; - assert(bs == mbmi->sb_type); + assert(bs == xd->mi[0]->mbmi.sb_type); vp9_subtract_plane(x, bs, 0); if (cpi->sf.tx_size_search_method == USE_LARGESTALL || xd->lossless) { vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t)); - choose_largest_tx_size(cpi, x, rate, distortion, skip, sse, ref_best_rd, + choose_largest_tx_size(cpi, x, rate, distortion, skip, psse, ref_best_rd, bs); - if (psse) - *psse = sse[mbmi->tx_size]; - return; + } else { + choose_tx_size_from_rd(cpi, x, rate, distortion, skip, psse, + txfm_cache, ref_best_rd, bs); } - - for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size) - txfm_rd_in_plane(x, &r[tx_size][0], &d[tx_size], &s[tx_size], - &sse[tx_size], ref_best_rd, 0, bs, tx_size, - cpi->sf.use_fast_coef_costing); - choose_tx_size_from_rd(cpi, x, r, rate, d, distortion, s, - skip, txfm_cache, bs); - - if (psse) - *psse = sse[mbmi->tx_size]; } static void intra_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *distortion, int *skip, - int64_t *psse, BLOCK_SIZE bs, + BLOCK_SIZE bs, int64_t txfm_cache[TX_MODES], int64_t ref_best_rd) { - int64_t sse[TX_SIZES]; MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + int64_t sse; - assert(bs == mbmi->sb_type); + assert(bs == xd->mi[0]->mbmi.sb_type); if (cpi->sf.tx_size_search_method != USE_FULL_RD || xd->lossless) { 
vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t)); - choose_largest_tx_size(cpi, x, rate, distortion, skip, sse, ref_best_rd, + choose_largest_tx_size(cpi, x, rate, distortion, skip, &sse, ref_best_rd, bs); } else { - int r[TX_SIZES][2], s[TX_SIZES]; - int64_t d[TX_SIZES]; - TX_SIZE tx_size; - for (tx_size = TX_4X4; tx_size <= max_txsize_lookup[bs]; ++tx_size) - txfm_rd_in_plane(x, &r[tx_size][0], &d[tx_size], - &s[tx_size], &sse[tx_size], - ref_best_rd, 0, bs, tx_size, - cpi->sf.use_fast_coef_costing); - choose_tx_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache, - bs); + choose_tx_size_from_rd(cpi, x, rate, distortion, skip, &sse, + txfm_cache, ref_best_rd, bs); } - if (psse) - *psse = sse[mbmi->tx_size]; } @@ -1023,7 +1049,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, mic->mbmi.mode = mode; intra_super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, - &s, NULL, bsize, local_tx_cache, best_rd); + &s, bsize, local_tx_cache, best_rd); if (this_rate_tokenonly == INT_MAX) continue; @@ -2275,6 +2301,100 @@ static INLINE void restore_dst_buf(MACROBLOCKD *xd, } } +static void rd_encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int *rate2, + int64_t *distortion, int64_t *distortion_uv, + int *disable_skip) { + VP9_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + const BLOCK_SIZE y_size = get_plane_block_size(bsize, &xd->plane[0]); + const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]); + unsigned int var, sse; + // Skipping threshold for ac. + unsigned int thresh_ac; + // Skipping threshold for dc + unsigned int thresh_dc; + + var = cpi->fn_ptr[y_size].vf(x->plane[0].src.buf, x->plane[0].src.stride, + xd->plane[0].dst.buf, + xd->plane[0].dst.stride, &sse); + + if (x->encode_breakout > 0) { + // Set a maximum for threshold to avoid big PSNR loss in low bitrate + // case. Use extreme low threshold for static frames to limit skipping. + const unsigned int max_thresh = (cpi->allow_encode_breakout == + ENCODE_BREAKOUT_LIMITED) ? 128 : 36000; + // The encode_breakout input + const unsigned int min_thresh = + MIN(((unsigned int)x->encode_breakout << 4), max_thresh); + + // Calculate threshold according to dequant value. + thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9; +#if CONFIG_VP9_HIGH + if (xd->cur_buf->flags & YV12_FLAG_HIGH) { + const int shift = 2 * xd->bps - 16; + if (shift > 0) + thresh_ac = ROUND_POWER_OF_TWO(thresh_ac, shift); + } +#endif + thresh_ac = clamp(thresh_ac, min_thresh, max_thresh); + + // Adjust threshold according to partition size. 
+ thresh_ac >>= 8 - (b_width_log2(bsize) + + b_height_log2(bsize)); + thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6); +#if CONFIG_VP9_HIGH + if (xd->cur_buf->flags & YV12_FLAG_HIGH) { + const int shift = 2 * xd->bps - 16; + if (shift > 0) + thresh_dc = ROUND_POWER_OF_TWO(thresh_dc, shift); + } +#endif + } else { + thresh_ac = 0; + thresh_dc = 0; + } + + // Y skipping condition checking + if (sse < thresh_ac || sse == 0) { + // dc skipping checking + if ((sse - var) < thresh_dc || sse == var) { + unsigned int sse_u, sse_v; + unsigned int var_u, var_v; + + var_u = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf, + x->plane[1].src.stride, + xd->plane[1].dst.buf, + xd->plane[1].dst.stride, &sse_u); + + // U skipping condition checking + if ((sse_u * 4 < thresh_ac || sse_u == 0) && + (sse_u - var_u < thresh_dc || sse_u == var_u)) { + var_v = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf, + x->plane[2].src.stride, + xd->plane[2].dst.buf, + xd->plane[2].dst.stride, &sse_v); + + // V skipping condition checking + if ((sse_v * 4 < thresh_ac || sse_v == 0) && + (sse_v - var_v < thresh_dc || sse_v == var_v)) { + x->skip = 1; + + // The cost of skip bit needs to be added. + *rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1); + + // Scaling factor for SSE from spatial domain to frequency domain + // is 16. Adjust distortion accordingly. + *distortion_uv = (sse_u + sse_v) << 4; + *distortion = (sse << 4) + *distortion_uv; + + *disable_skip = 1; + } + } + } + } +} + static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int64_t txfm_cache[], @@ -2282,8 +2402,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int *skippable, int *rate_y, int64_t *distortion_y, int *rate_uv, int64_t *distortion_uv, - int *mode_excluded, int *disable_skip, - INTERP_FILTER *best_filter, + int *disable_skip, int_mv (*mode_mv)[MAX_REF_FRAMES], int mi_row, int mi_col, int_mv single_newmv[MAX_REF_FRAMES], @@ -2294,7 +2413,6 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; const int is_comp_pred = has_second_ref(mbmi); - const int num_refs = is_comp_pred ? 2 : 1; const int this_mode = mbmi->mode; int_mv *frame_mv = mode_mv[this_mode]; int i; @@ -2316,6 +2434,15 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, uint8_t *orig_dst[MAX_MB_PLANE]; int orig_dst_stride[MAX_MB_PLANE]; int rs = 0; + INTERP_FILTER best_filter = SWITCHABLE; + int skip_txfm[MAX_MB_PLANE] = {0}; + int64_t bsse[MAX_MB_PLANE] = {0}; + + int bsl = mi_width_log2_lookup[bsize]; + int pred_filter_search = cpi->sf.cb_pred_filter_search ? 
+ (((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_video_frame)) & 0x1 : 0; + #if CONFIG_VP9_HIGH if (xd->cur_buf->flags & YV12_FLAG_HIGH) { tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf16); @@ -2324,6 +2451,17 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } #endif + if (pred_filter_search) { + INTERP_FILTER af = SWITCHABLE, lf = SWITCHABLE; + if (xd->up_available) + af = xd->mi[-xd->mi_stride]->mbmi.interp_filter; + if (xd->left_available) + lf = xd->mi[-1]->mbmi.interp_filter; + + if ((this_mode != NEWMV) || (af == lf)) + best_filter = af; + } + if (is_comp_pred) { if (frame_mv[refs[0]].as_int == INVALID_MV || frame_mv[refs[1]].as_int == INVALID_MV) @@ -2362,7 +2500,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } } - for (i = 0; i < num_refs; ++i) { + for (i = 0; i < is_comp_pred + 1; ++i) { cur_mv[i] = frame_mv[refs[i]]; // Clip "next_nearest" so that it does not extend to far out of image if (this_mode != NEWMV) @@ -2389,10 +2527,6 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, * if the first is known */ *rate2 += cost_mv_ref(cpi, this_mode, mbmi->mode_context[refs[0]]); - if (!(*mode_excluded)) - *mode_excluded = is_comp_pred ? cm->reference_mode == SINGLE_REFERENCE - : cm->reference_mode == COMPOUND_REFERENCE; - pred_exists = 0; // Are all MVs integer pel for Y and UV intpel_mv = !mv_has_subpel(&mbmi->mv[0].as_mv); @@ -2406,10 +2540,9 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, rd_opt->filter_cache[i] = INT64_MAX; if (cm->interp_filter != BILINEAR) { - *best_filter = EIGHTTAP; if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) { - *best_filter = EIGHTTAP; - } else { + best_filter = EIGHTTAP; + } else if (best_filter == SWITCHABLE) { int newbest; int tmp_rate_sum = 0; int64_t tmp_dist_sum = 0; @@ -2471,9 +2604,11 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (newbest) { best_rd = rd; - *best_filter = mbmi->interp_filter; + best_filter = mbmi->interp_filter; if (cm->interp_filter == SWITCHABLE && i && !intpel_mv) best_needs_copy = !best_needs_copy; + vpx_memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm)); + vpx_memcpy(bsse, x->bsse, sizeof(bsse)); } if ((cm->interp_filter == SWITCHABLE && newbest) || @@ -2487,7 +2622,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } // Set the appropriate filter mbmi->interp_filter = cm->interp_filter != SWITCHABLE ? - cm->interp_filter : *best_filter; + cm->interp_filter : best_filter; rs = cm->interp_filter == SWITCHABLE ? vp9_get_switchable_rate(cpi) : 0; if (pred_exists) { @@ -2521,97 +2656,14 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, *rate2 += vp9_get_switchable_rate(cpi); if (!is_comp_pred) { - if (cpi->allow_encode_breakout) { - const BLOCK_SIZE y_size = get_plane_block_size(bsize, &xd->plane[0]); - const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]); - unsigned int var, sse; - // Skipping threshold for ac. - unsigned int thresh_ac; - // Skipping threshold for dc - unsigned int thresh_dc; - - var = cpi->fn_ptr[y_size].vf(x->plane[0].src.buf, x->plane[0].src.stride, - xd->plane[0].dst.buf, - xd->plane[0].dst.stride, &sse); - - if (x->encode_breakout > 0) { - // Set a maximum for threshold to avoid big PSNR loss in low bitrate - // case. Use extreme low threshold for static frames to limit skipping. - const unsigned int max_thresh = (cpi->allow_encode_breakout == - ENCODE_BREAKOUT_LIMITED) ? 
128 : 36000; - // The encode_breakout input - const unsigned int min_thresh = - MIN(((unsigned int)x->encode_breakout << 4), max_thresh); - - // Calculate threshold according to dequant value. - thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9; -#if CONFIG_VP9_HIGH - if (xd->cur_buf->flags & YV12_FLAG_HIGH) { - const int shift = 2 * xd->bps - 16; - if (shift > 0) - thresh_ac = ROUND_POWER_OF_TWO(thresh_ac, shift); - } -#endif - thresh_ac = clamp(thresh_ac, min_thresh, max_thresh); - - // Adjust threshold according to partition size. - thresh_ac >>= 8 - (b_width_log2(bsize) + - b_height_log2(bsize)); - thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6); -#if CONFIG_VP9_HIGH - if (xd->cur_buf->flags & YV12_FLAG_HIGH) { - const int shift = 2 * xd->bps - 16; - if (shift > 0) - thresh_dc = ROUND_POWER_OF_TWO(thresh_dc, shift); - } -#endif - } else { - thresh_ac = 0; - thresh_dc = 0; - } - - // Y skipping condition checking - if (sse < thresh_ac || sse == 0) { - // dc skipping checking - if ((sse - var) < thresh_dc || sse == var) { - unsigned int sse_u, sse_v; - unsigned int var_u, var_v; - - var_u = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf, - x->plane[1].src.stride, - xd->plane[1].dst.buf, - xd->plane[1].dst.stride, &sse_u); - - // U skipping condition checking - if ((sse_u * 4 < thresh_ac || sse_u == 0) && - (sse_u - var_u < thresh_dc || sse_u == var_u)) { - var_v = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf, - x->plane[2].src.stride, - xd->plane[2].dst.buf, - xd->plane[2].dst.stride, &sse_v); - - // V skipping condition checking - if ((sse_v * 4 < thresh_ac || sse_v == 0) && - (sse_v - var_v < thresh_dc || sse_v == var_v)) { - x->skip = 1; - - // The cost of skip bit needs to be added. - *rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1); - - // Scaling factor for SSE from spatial domain to frequency domain - // is 16. Adjust distortion accordingly. 
- *distortion_uv = (sse_u + sse_v) << 4; - *distortion = (sse << 4) + *distortion_uv; - - *disable_skip = 1; - this_rd = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion); - } - } - } - } - } + if (cpi->allow_encode_breakout) + rd_encode_breakout_test(cpi, x, bsize, rate2, distortion, distortion_uv, + disable_skip); } + vpx_memcpy(x->skip_txfm, skip_txfm, sizeof(skip_txfm)); + vpx_memcpy(x->bsse, bsse, sizeof(bsse)); + if (!x->skip) { int skippable_y, skippable_uv; int64_t sseuv = INT64_MAX; @@ -2776,7 +2828,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int64_t best_inter_rd = INT64_MAX; PREDICTION_MODE best_intra_mode = DC_PRED; MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME; - INTERP_FILTER tmp_best_filter = SWITCHABLE; int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES]; int64_t dist_uv[TX_SIZES]; int skip_uv[TX_SIZES]; @@ -2818,9 +2869,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { x->pred_mv_sad[ref_frame] = INT_MAX; if (cpi->ref_frame_flags & flag_list[ref_frame]) { - setup_buffer_inter(cpi, x, tile, - ref_frame, bsize, mi_row, mi_col, - frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb); + setup_buffer_inter(cpi, x, tile, ref_frame, bsize, mi_row, mi_col, + frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb); } frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; frame_mv[ZEROMV][ref_frame].as_int = 0; @@ -2926,6 +2976,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, case NONE: case MAX_REF_FRAMES: assert(0 && "Invalid Reference frame"); + break; } } if (mode_skip_mask & (1 << mode_index)) @@ -2942,6 +2993,55 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, continue; second_ref_frame = vp9_mode_order[mode_index].ref_frame[1]; + if (cpi->sf.motion_field_mode_search) { + const int mi_width = MIN(num_8x8_blocks_wide_lookup[bsize], + tile->mi_col_end - mi_col); + const int mi_height = MIN(num_8x8_blocks_high_lookup[bsize], + tile->mi_row_end - mi_row); + const int bsl = mi_width_log2(bsize); + int cb_partition_search_ctrl = (((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_video_frame)) & 0x1; + MB_MODE_INFO *ref_mbmi; + int const_motion = 1; + int skip_ref_frame = !cb_partition_search_ctrl; + MV_REFERENCE_FRAME rf = NONE; + int_mv ref_mv; + ref_mv.as_int = INVALID_MV; + + if ((mi_row - 1) >= tile->mi_row_start) { + ref_mv = xd->mi[-xd->mi_stride]->mbmi.mv[0]; + rf = xd->mi[-xd->mi_stride]->mbmi.ref_frame[0]; + for (i = 0; i < mi_width; ++i) { + ref_mbmi = &xd->mi[-xd->mi_stride + i]->mbmi; + const_motion &= (ref_mv.as_int == ref_mbmi->mv[0].as_int) && + (ref_frame == ref_mbmi->ref_frame[0]); + skip_ref_frame &= (rf == ref_mbmi->ref_frame[0]); + } + } + + if ((mi_col - 1) >= tile->mi_col_start) { + if (ref_mv.as_int == INVALID_MV) + ref_mv = xd->mi[-1]->mbmi.mv[0]; + if (rf == NONE) + rf = xd->mi[-1]->mbmi.ref_frame[0]; + for (i = 0; i < mi_height; ++i) { + ref_mbmi = &xd->mi[i * xd->mi_stride - 1]->mbmi; + const_motion &= (ref_mv.as_int == ref_mbmi->mv[0].as_int) && + (ref_frame == ref_mbmi->ref_frame[0]); + skip_ref_frame &= (rf == ref_mbmi->ref_frame[0]); + } + } + + if (skip_ref_frame && this_mode != NEARESTMV && this_mode != NEWMV) + if (rf > INTRA_FRAME) + if (ref_frame != rf) + continue; + + if (const_motion) + if (this_mode == NEARMV || this_mode == ZEROMV) + continue; + } + comp_pred = second_ref_frame > INTRA_FRAME; if (comp_pred) { if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) && @@ -3012,7 +3112,7 @@ 
int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (ref_frame == INTRA_FRAME) { TX_SIZE uv_tx; - intra_super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL, + intra_super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, bsize, tx_cache, best_rd); if (rate_y == INT_MAX) @@ -3041,8 +3141,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, &rate2, &distortion2, &skippable, &rate_y, &distortion_y, &rate_uv, &distortion_uv, - &mode_excluded, &disable_skip, - &tmp_best_filter, frame_mv, + &disable_skip, frame_mv, mi_row, mi_col, single_newmv, &total_sse, best_rd); if (this_rd == INT64_MAX) @@ -3322,7 +3421,6 @@ int64_t vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, MACROBLOCK *x, RD_OPT *const rd_opt = &cpi->rd; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const struct segmentation *const seg = &cm->seg; unsigned char segment_id = mbmi->segment_id; const int comp_pred = 0; int i; @@ -3348,7 +3446,7 @@ int64_t vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, MACROBLOCK *x, *returnrate = INT_MAX; - assert(vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)); + assert(vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)); mbmi->mode = ZEROMV; mbmi->uv_mode = DC_PRED; @@ -3532,6 +3630,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, case NONE: case MAX_REF_FRAMES: assert(0 && "Invalid Reference frame"); + break; } } if (mode_skip_mask & (1 << ref_index)) @@ -3576,13 +3675,10 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, vp9_is_scaled(&cm->frame_refs[second_ref_frame - 1].sf)) continue; - if (comp_pred) { - mode_excluded = mode_excluded ? mode_excluded - : cm->reference_mode == SINGLE_REFERENCE; - } else if (ref_frame != INTRA_FRAME) { - mode_excluded = mode_excluded ? mode_excluded - : cm->reference_mode == COMPOUND_REFERENCE; - } + if (comp_pred) + mode_excluded = cm->reference_mode == SINGLE_REFERENCE; + else if (ref_frame != INTRA_FRAME) + mode_excluded = cm->reference_mode == COMPOUND_REFERENCE; // If the segment reference frame feature is enabled.... // then do nothing if the current ref frame is not allowed.. 
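/*
 * [Editorial sketch -- not part of the patch.] The new cb_pred_filter_search
 * and motion_field_mode_search paths above both gate work with the same
 * chessboard test: (((mi_row + mi_col) >> bsl) + chessboard_index) & 0x1.
 * The standalone program below, which assumes get_chessboard_index() simply
 * alternates with the frame counter (a simplification; the helper's real
 * definition is not shown in this patch), illustrates the resulting pattern:
 * adjacent blocks toggle between "run the full search" and "reuse/skip", and
 * the phase flips every frame so no block position is starved of a full
 * search for long.
 */
#include <stdio.h>

static int chessboard_index(int frame) {  /* hypothetical stand-in */
  return frame & 0x1;
}

/* Returns 1 when this block should run the expensive search this frame. */
static int full_search(int mi_row, int mi_col, int bsl, int frame) {
  return (((mi_row + mi_col) >> bsl) + chessboard_index(frame)) & 0x1;
}

int main(void) {
  int frame, r, c;
  for (frame = 0; frame < 2; ++frame) {
    printf("frame %d (bsl = 0):\n", frame);
    for (r = 0; r < 4; ++r) {
      for (c = 0; c < 4; ++c)
        printf("%d ", full_search(r, c, 0, frame));
      printf("\n");
    }
  }
  return 0;
}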
diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c index 897ae0129..d5676c3d1 100644 --- a/vp9/encoder/vp9_segmentation.c +++ b/vp9/encoder/vp9_segmentation.c @@ -110,14 +110,12 @@ static int cost_segmap(int *segcounts, vp9_prob *probs) { return cost; } -static void count_segs(VP9_COMP *cpi, const TileInfo *const tile, - MODE_INFO **mi, +static void count_segs(const VP9_COMMON *cm, MACROBLOCKD *xd, + const TileInfo *tile, MODE_INFO **mi, int *no_pred_segcounts, int (*temporal_predictor_count)[2], int *t_unpred_seg_counts, int bw, int bh, int mi_row, int mi_col) { - VP9_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &cpi->mb.e_mbd; int segment_id; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) @@ -151,14 +149,13 @@ static void count_segs(VP9_COMP *cpi, const TileInfo *const tile, } } -static void count_segs_sb(VP9_COMP *cpi, const TileInfo *const tile, - MODE_INFO **mi, +static void count_segs_sb(const VP9_COMMON *cm, MACROBLOCKD *xd, + const TileInfo *tile, MODE_INFO **mi, int *no_pred_segcounts, int (*temporal_predictor_count)[2], int *t_unpred_seg_counts, int mi_row, int mi_col, BLOCK_SIZE bsize) { - const VP9_COMMON *const cm = &cpi->common; const int mis = cm->mi_stride; int bw, bh; const int bs = num_8x8_blocks_wide_lookup[bsize], hbs = bs / 2; @@ -170,18 +167,18 @@ static void count_segs_sb(VP9_COMP *cpi, const TileInfo *const tile, bh = num_8x8_blocks_high_lookup[mi[0]->mbmi.sb_type]; if (bw == bs && bh == bs) { - count_segs(cpi, tile, mi, no_pred_segcounts, temporal_predictor_count, + count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, bs, bs, mi_row, mi_col); } else if (bw == bs && bh < bs) { - count_segs(cpi, tile, mi, no_pred_segcounts, temporal_predictor_count, + count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, bs, hbs, mi_row, mi_col); - count_segs(cpi, tile, mi + hbs * mis, no_pred_segcounts, + count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, bs, hbs, mi_row + hbs, mi_col); } else if (bw < bs && bh == bs) { - count_segs(cpi, tile, mi, no_pred_segcounts, temporal_predictor_count, + count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, hbs, bs, mi_row, mi_col); - count_segs(cpi, tile, mi + hbs, + count_segs(cm, xd, tile, mi + hbs, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, hbs, bs, mi_row, mi_col + hbs); } else { @@ -194,7 +191,7 @@ static void count_segs_sb(VP9_COMP *cpi, const TileInfo *const tile, const int mi_dc = hbs * (n & 1); const int mi_dr = hbs * (n >> 1); - count_segs_sb(cpi, tile, &mi[mi_dr * mis + mi_dc], + count_segs_sb(cm, xd, tile, &mi[mi_dr * mis + mi_dc], no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, mi_row + mi_dr, mi_col + mi_dc, subsize); @@ -202,8 +199,7 @@ static void count_segs_sb(VP9_COMP *cpi, const TileInfo *const tile, } } -void vp9_choose_segmap_coding_method(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; +void vp9_choose_segmap_coding_method(VP9_COMMON *cm, MACROBLOCKD *xd) { struct segmentation *seg = &cm->seg; int no_pred_cost; @@ -237,7 +233,7 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) { MODE_INFO **mi = mi_ptr; for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end; mi_col += 8, mi += 8) - count_segs_sb(cpi, &tile, mi, no_pred_segcounts, + count_segs_sb(cm, xd, &tile, mi, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, mi_row, mi_col, 
BLOCK_64X64); } diff --git a/vp9/encoder/vp9_segmentation.h b/vp9/encoder/vp9_segmentation.h index 50dd562c8..8c6944ad1 100644 --- a/vp9/encoder/vp9_segmentation.h +++ b/vp9/encoder/vp9_segmentation.h @@ -42,7 +42,7 @@ void vp9_clear_segdata(struct segmentation *seg, void vp9_set_segment_data(struct segmentation *seg, signed char *feature_data, unsigned char abs_delta); -void vp9_choose_segmap_coding_method(VP9_COMP *cpi); +void vp9_choose_segmap_coding_method(VP9_COMMON *cm, MACROBLOCKD *xd); void vp9_reset_segment_features(struct segmentation *seg); diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index e3951d532..99f336f2a 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -88,13 +88,13 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, sf->last_partitioning_redo_frequency = 3; sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT; + sf->adaptive_pred_interp_filter = 0; } else { sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY; sf->last_partitioning_redo_frequency = 2; sf->lf_motion_threshold = NO_MOTION_THRESHOLD; } - sf->adaptive_pred_interp_filter = 0; sf->reference_masking = 1; sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | @@ -110,18 +110,23 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, if (speed >= 3) { sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL; - if (MIN(cm->width, cm->height) >= 720) + if (MIN(cm->width, cm->height) >= 720) { sf->disable_split_mask = DISABLE_ALL_SPLIT; - else + } else { + sf->max_intra_bsize = BLOCK_32X32; sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT; - + } + sf->adaptive_pred_interp_filter = 0; + sf->cb_partition_search = frame_is_boosted(cpi) ? 0 : 1; + sf->cb_pred_filter_search = 1; + sf->motion_field_mode_search = frame_is_boosted(cpi) ? 0 : 1; sf->lf_motion_threshold = LOW_MOTION_THRESHOLD; sf->last_partitioning_redo_frequency = 3; sf->recode_loop = ALLOW_RECODE_KFMAXBW; sf->adaptive_rd_thresh = 3; sf->mode_skip_start = 6; - sf->use_fast_coef_updates = ONE_LOOP_REDUCED; - sf->use_fast_coef_costing = 1; + sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; + sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC; } if (speed >= 4) { @@ -134,6 +139,8 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, sf->disable_filter_search_var_thresh = 200; sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL; sf->use_lp32x32fdct = 1; + sf->use_fast_coef_updates = ONE_LOOP_REDUCED; + sf->use_fast_coef_costing = 1; } if (speed >= 5) { @@ -155,7 +162,7 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, } static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, - int speed) { + int speed, vp9e_tune_content content) { VP9_COMMON *const cm = &cpi->common; const int frames_since_key = cm->frame_type == KEY_FRAME ? 
0 : cpi->rc.frames_since_key; @@ -176,6 +183,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, sf->disable_split_mask = DISABLE_COMPOUND_SPLIT; sf->use_rd_breakout = 1; + sf->adaptive_motion_search = 1; sf->adaptive_pred_interp_filter = 1; sf->mv.auto_mv_step_size = 1; @@ -270,13 +278,19 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, } if (speed >= 6) { + if (content == VP9E_CONTENT_SCREEN) { + int i; + // Allow fancy modes at all sizes since SOURCE_VAR_BASED_PARTITION is used + for (i = 0; i < BLOCK_SIZES; ++i) + sf->inter_mode_mask[i] = INTER_ALL; + } + // Adaptively switch between SOURCE_VAR_BASED_PARTITION and FIXED_PARTITION. sf->partition_search_type = SOURCE_VAR_BASED_PARTITION; sf->search_type_check_frequency = 50; sf->tx_size_search_method = (cm->frame_type == KEY_FRAME) ? USE_LARGESTALL : USE_TX_8X8; - sf->max_intra_bsize = BLOCK_8X8; // This feature is only enabled when partition search is disabled. sf->reuse_inter_pred_sby = 1; @@ -330,6 +344,9 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->use_lp32x32fdct = 0; sf->adaptive_motion_search = 0; sf->adaptive_pred_interp_filter = 0; + sf->cb_pred_filter_search = 0; + sf->cb_partition_search = 0; + sf->motion_field_mode_search = 0; sf->use_quant_fp = 0; sf->reference_masking = 0; sf->partition_search_type = SEARCH_PARTITION; @@ -385,17 +402,17 @@ void vp9_set_speed_features(VP9_COMP *cpi) { set_good_speed_feature(cpi, cm, sf, oxcf->speed); break; case REALTIME: - set_rt_speed_feature(cpi, sf, oxcf->speed); + set_rt_speed_feature(cpi, sf, oxcf->speed, oxcf->content); break; } // Slow quant, dct and trellis not worthwhile for first pass // so make sure they are always turned off. - if (cpi->pass == 1) + if (oxcf->pass == 1) sf->optimize_coefficients = 0; // No recode for 1 pass. - if (cpi->pass == 0) { + if (oxcf->pass == 0) { sf->recode_loop = DISALLOW_RECODE; sf->optimize_coefficients = 0; } @@ -404,7 +421,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) { cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree; } - cpi->mb.optimize = sf->optimize_coefficients == 1 && cpi->pass != 1; + cpi->mb.optimize = sf->optimize_coefficients == 1 && oxcf->pass != 1; if (sf->disable_split_mask == DISABLE_ALL_SPLIT) sf->adaptive_pred_interp_filter = 0; diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index bdbbe5888..243139d7b 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -283,6 +283,13 @@ typedef struct SPEED_FEATURES { // was selected, and 2 means we use 8 tap if no 8x8 filter mode was selected. 
int adaptive_pred_interp_filter; + // Chessboard pattern prediction filter type search + int cb_pred_filter_search; + + int cb_partition_search; + + int motion_field_mode_search; + // Fast quantization process path int use_quant_fp; diff --git a/vp9/encoder/vp9_subexp.c b/vp9/encoder/vp9_subexp.c index 9796d6476..2f1c72992 100644 --- a/vp9/encoder/vp9_subexp.c +++ b/vp9/encoder/vp9_subexp.c @@ -16,7 +16,24 @@ #define vp9_cost_upd256 ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd))) -static int update_bits[255]; +static const int update_bits[255] = { + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 0, +}; static int recenter_nonneg(int v, int m) { if (v > (m << 1)) @@ -61,18 +78,6 @@ static int remap_prob(int v, int m) { return i; } -static int count_term_subexp(int word) { - if (word < 16) - return 5; - if (word < 32) - return 6; - if (word < 64) - return 8; - if (word < 129) - return 10; - return 11; -} - static int prob_diff_update_cost(vp9_prob newp, vp9_prob oldp) { int delp = remap_prob(newp, oldp); return update_bits[delp] * 256; @@ -111,12 +116,6 @@ void vp9_write_prob_diff_update(vp9_writer *w, vp9_prob newp, vp9_prob oldp) { encode_term_subexp(w, delp); } -void vp9_compute_update_table() { - int i; - for (i = 0; i < 254; i++) - update_bits[i] = count_term_subexp(i); -} - int vp9_prob_diff_update_savings_search(const unsigned int *ct, vp9_prob oldp, vp9_prob *bestp, vp9_prob upd) { diff --git a/vp9/encoder/vp9_subexp.h b/vp9/encoder/vp9_subexp.h index 8e9c0c62a..8e02a1d0d 100644 --- a/vp9/encoder/vp9_subexp.h +++ b/vp9/encoder/vp9_subexp.h @@ -16,9 +16,6 @@ extern "C" { #endif -void vp9_compute_update_table(); - - void vp9_write_prob_diff_update(vp9_writer *w, vp9_prob newp, vp9_prob oldp); diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index 1eb450928..52f6cda53 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -35,6 +35,7 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { RATE_CONTROL *const lrc = &lc->rc; int i; lc->current_video_frame_in_layer = 0; + lc->layer_size = 0; lrc->ni_av_qi = oxcf->worst_allowed_q; lrc->total_actual_bits = 0; lrc->total_target_vs_actual = 0; @@ -48,7 +49,6 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { for (i = 0; i < RATE_FACTOR_LEVELS; ++i) { lrc->rate_correction_factors[i] = 1.0; } - lc->layer_size = 0; if (svc->number_temporal_layers > 1) { lc->target_bandwidth = oxcf->ts_target_bitrate[layer]; @@ -66,12 +66,17 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { lc->alt_ref_idx = alt_ref_idx++; else lc->alt_ref_idx = -1; + lc->gold_ref_idx = 
-1; } lrc->buffer_level = vp9_rescale((int)(oxcf->starting_buffer_level_ms), lc->target_bandwidth, 1000); lrc->bits_off_target = lrc->buffer_level; } + + // Still have extra buffer for base layer golden frame + if (svc->number_spatial_layers > 1 && alt_ref_idx < REF_FRAMES) + svc->layer_context[0].gold_ref_idx = alt_ref_idx; } // Update the layer context from a change_config() call. @@ -111,9 +116,9 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, lrc->buffer_level = MIN(lrc->buffer_level, lrc->maximum_buffer_size); // Update framerate-related quantities. if (svc->number_temporal_layers > 1) { - lc->framerate = oxcf->framerate / oxcf->ts_rate_decimator[layer]; + lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[layer]; } else { - lc->framerate = oxcf->framerate; + lc->framerate = cpi->framerate; } lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); lrc->max_frame_bandwidth = rc->max_frame_bandwidth; @@ -136,7 +141,7 @@ void vp9_update_temporal_layer_framerate(VP9_COMP *const cpi) { RATE_CONTROL *const lrc = &lc->rc; const int layer = svc->temporal_layer_id; - lc->framerate = oxcf->framerate / oxcf->ts_rate_decimator[layer]; + lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[layer]; lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); lrc->max_frame_bandwidth = cpi->rc.max_frame_bandwidth; // Update the average layer frame size (non-cumulative per-frame-bw). @@ -144,7 +149,7 @@ void vp9_update_temporal_layer_framerate(VP9_COMP *const cpi) { lc->avg_frame_size = lrc->avg_frame_bandwidth; } else { const double prev_layer_framerate = - oxcf->framerate / oxcf->ts_rate_decimator[layer - 1]; + cpi->framerate / oxcf->ts_rate_decimator[layer - 1]; const int prev_layer_target_bandwidth = oxcf->ts_target_bitrate[layer - 1]; lc->avg_frame_size = (int)((lc->target_bandwidth - prev_layer_target_bandwidth) / @@ -217,12 +222,12 @@ void vp9_inc_frame_in_layer(SVC *svc) { } int vp9_is_upper_layer_key_frame(const VP9_COMP *const cpi) { - return cpi->use_svc && - cpi->svc.number_temporal_layers == 1 && + return is_spatial_svc(cpi) && cpi->svc.spatial_layer_id > 0 && cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame; } +#if CONFIG_SPATIAL_SVC int vp9_svc_lookahead_push(const VP9_COMP *const cpi, struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src, int64_t ts_start, int64_t ts_end, unsigned int flags) { @@ -265,21 +270,25 @@ static int copy_svc_params(VP9_COMP *const cpi, struct lookahead_entry *buf) { layer_param = &buf->svc_params[layer_id]; cpi->svc.spatial_layer_id = layer_param->spatial_layer; cpi->svc.temporal_layer_id = layer_param->temporal_layer; + cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG; + + lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id]; cpi->lst_fb_idx = cpi->svc.spatial_layer_id; if (cpi->svc.spatial_layer_id < 1) - cpi->gld_fb_idx = cpi->lst_fb_idx; + cpi->gld_fb_idx = lc->gold_ref_idx >= 0 ? 
+ lc->gold_ref_idx : cpi->lst_fb_idx; else cpi->gld_fb_idx = cpi->svc.spatial_layer_id - 1; - lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id]; - if (lc->current_video_frame_in_layer == 0) { - if (cpi->svc.spatial_layer_id >= 2) + if (cpi->svc.spatial_layer_id >= 2) { cpi->alt_fb_idx = cpi->svc.spatial_layer_id - 2; - else + } else { cpi->alt_fb_idx = cpi->lst_fb_idx; + cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_ALT_FLAG); + } } else { if (cpi->oxcf.ss_play_alternate[cpi->svc.spatial_layer_id]) { cpi->alt_fb_idx = lc->alt_ref_idx; @@ -352,3 +361,4 @@ struct lookahead_entry *vp9_svc_lookahead_pop(VP9_COMP *const cpi, return buf; } +#endif diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h index 7b533e467..801449b6f 100644 --- a/vp9/encoder/vp9_svc_layercontext.h +++ b/vp9/encoder/vp9_svc_layercontext.h @@ -31,6 +31,7 @@ typedef struct { vpx_svc_parameters_t svc_params_received; struct lookahead_entry *alt_ref_source; int alt_ref_idx; + int gold_ref_idx; int has_alt_frame; size_t layer_size; } LAYER_CONTEXT; diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c index 7607307a4..cfe5a39d6 100644 --- a/vp9/encoder/vp9_temporal_filter.c +++ b/vp9/encoder/vp9_temporal_filter.c @@ -634,7 +634,7 @@ static void adjust_arnr_filter(VP9_COMP *cpi, } // Adjustments for second level arf in multi arf case. - if (cpi->pass == 2 && cpi->multi_arf_allowed) { + if (cpi->oxcf.pass == 2 && cpi->multi_arf_allowed) { const GF_GROUP *const gf_group = &cpi->twopass.gf_group; if (gf_group->rf_level[gf_group->index] != GF_ARF_STD) { cpi->active_arnr_strength >>= 1; @@ -671,7 +671,7 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { } // Setup scaling factors. Scaling on each of the arnr frames is not supported - if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) { + if (is_spatial_svc(cpi)) { // In spatial svc the scaling factors might be less then 1/2. So we will use // non-normative scaling. int frame_used = 0; diff --git a/vp9/encoder/vp9_write_bit_buffer.c b/vp9/encoder/vp9_write_bit_buffer.c index 962d0ca56..6d55e84e8 100644 --- a/vp9/encoder/vp9_write_bit_buffer.c +++ b/vp9/encoder/vp9_write_bit_buffer.c @@ -8,9 +8,10 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include <limits.h> #include "vp9/encoder/vp9_write_bit_buffer.h" -size_t vp9_rb_bytes_written(struct vp9_write_bit_buffer *wb) { +size_t vp9_wb_bytes_written(const struct vp9_write_bit_buffer *wb) { return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0); } diff --git a/vp9/encoder/vp9_write_bit_buffer.h b/vp9/encoder/vp9_write_bit_buffer.h index 073608d7f..59f9bbe30 100644 --- a/vp9/encoder/vp9_write_bit_buffer.h +++ b/vp9/encoder/vp9_write_bit_buffer.h @@ -11,8 +11,6 @@ #ifndef VP9_ENCODER_VP9_WRITE_BIT_BUFFER_H_ #define VP9_ENCODER_VP9_WRITE_BIT_BUFFER_H_ -#include <limits.h> - #include "vpx/vpx_integer.h" #ifdef __cplusplus @@ -24,7 +22,7 @@ struct vp9_write_bit_buffer { size_t bit_offset; }; -size_t vp9_rb_bytes_written(struct vp9_write_bit_buffer *wb); +size_t vp9_wb_bytes_written(const struct vp9_write_bit_buffer *wb); void vp9_wb_write_bit(struct vp9_write_bit_buffer *wb, int bit); diff --git a/vp9/encoder/vp9_writer.c b/vp9/encoder/vp9_writer.c index 8398fc07a..ff461f218 100644 --- a/vp9/encoder/vp9_writer.c +++ b/vp9/encoder/vp9_writer.c @@ -15,7 +15,6 @@ void vp9_start_encode(vp9_writer *br, uint8_t *source) { br->lowvalue = 0; br->range = 255; - br->value = 0; br->count = -24; br->buffer = source; br->pos = 0; diff --git a/vp9/encoder/vp9_writer.h b/vp9/encoder/vp9_writer.h index 7f4fa1ef2..9d161f95c 100644 --- a/vp9/encoder/vp9_writer.h +++ b/vp9/encoder/vp9_writer.h @@ -22,20 +22,15 @@ extern "C" { typedef struct { unsigned int lowvalue; unsigned int range; - unsigned int value; int count; unsigned int pos; uint8_t *buffer; - - // Variables used to track bit costs without outputing to the bitstream - unsigned int measure_cost; - uint64_t bit_counter; } vp9_writer; void vp9_start_encode(vp9_writer *bc, uint8_t *buffer); void vp9_stop_encode(vp9_writer *bc); -static void vp9_write(vp9_writer *br, int bit, int probability) { +static INLINE void vp9_write(vp9_writer *br, int bit, int probability) { unsigned int split; int count = br->count; unsigned int range = br->range; @@ -83,11 +78,11 @@ static void vp9_write(vp9_writer *br, int bit, int probability) { br->range = range; } -static void vp9_write_bit(vp9_writer *w, int bit) { +static INLINE void vp9_write_bit(vp9_writer *w, int bit) { vp9_write(w, bit, 128); // vp9_prob_half } -static void vp9_write_literal(vp9_writer *w, int data, int bits) { +static INLINE void vp9_write_literal(vp9_writer *w, int data, int bits) { int bit; for (bit = bits - 1; bit >= 0; bit--) diff --git a/vp9/encoder/x86/vp9_dct_avx2.c b/vp9/encoder/x86/vp9_dct_avx2.c index b5269ed03..3a19f5274 100644 --- a/vp9/encoder/x86/vp9_dct_avx2.c +++ b/vp9/encoder/x86/vp9_dct_avx2.c @@ -12,2572 +12,6 @@ #include "vp9/common/vp9_idct.h" // for cospi constants #include "vpx_ports/mem.h" -void vp9_fdct4x4_avx2(const int16_t *input, int16_t *output, int stride) { - // The 2D transform is done with two passes which are actually pretty - // similar. In the first one, we transform the columns and transpose - // the results. In the second one, we transform the rows. To achieve that, - // as the first pass results are transposed, we transpose the columns (that - // is the transposed rows) and transpose the results (so that it goes back - // in normal/row positions). - int pass; - // Constants - // When we use them, in one case, they are all the same. In all others - // it's a pair of them that we need to repeat four times. This is done - // by constructing the 32 bit constant corresponding to that pair. 
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); - const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); - const __m128i kOne = _mm_set1_epi16(1); - __m128i in0, in1, in2, in3; - // Load inputs. - { - in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); - in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); - in2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); - in3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); - // x = x << 4 - in0 = _mm_slli_epi16(in0, 4); - in1 = _mm_slli_epi16(in1, 4); - in2 = _mm_slli_epi16(in2, 4); - in3 = _mm_slli_epi16(in3, 4); - // if (i == 0 && input[0]) input[0] += 1; - { - // The mask will only contain whether the first value is zero, all - // other comparison will fail as something shifted by 4 (above << 4) - // can never be equal to one. To increment in the non-zero case, we - // add the mask and one for the first element: - // - if zero, mask = -1, v = v - 1 + 1 = v - // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1 - __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a); - in0 = _mm_add_epi16(in0, mask); - in0 = _mm_add_epi16(in0, k__nonzero_bias_b); - } - } - // Do the two transform/transpose passes - for (pass = 0; pass < 2; ++pass) { - // Transform 1/2: Add/subtract - const __m128i r0 = _mm_add_epi16(in0, in3); - const __m128i r1 = _mm_add_epi16(in1, in2); - const __m128i r2 = _mm_sub_epi16(in1, in2); - const __m128i r3 = _mm_sub_epi16(in0, in3); - // Transform 1/2: Interleave to do the multiply by constants which gets us - // into 32 bits. - const __m128i t0 = _mm_unpacklo_epi16(r0, r1); - const __m128i t2 = _mm_unpacklo_epi16(r2, r3); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); - const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); - const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); - const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); - const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - // Combine and transpose - const __m128i res0 = _mm_packs_epi32(w0, w2); - const __m128i res1 = _mm_packs_epi32(w4, w6); - // 00 01 02 03 20 21 22 23 - // 10 11 12 13 30 31 32 33 - const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); - const __m128i tr0_1 = _mm_unpackhi_epi16(res0, res1); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - in2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - // 00 10 20 30 01 11 21 31 in0 contains 0 followed by 1 - // 02 12 22 32 03 13 23 33 in2 contains 2 followed by 3 - if (0 == pass) { - // Extract values in the high part for second pass as transform code - // only uses the first four values. 
- in1 = _mm_unpackhi_epi64(in0, in0); - in3 = _mm_unpackhi_epi64(in2, in2); - } else { - // Post-condition output and store it (v + 1) >> 2, taking advantage - // of the fact 1/3 are stored just after 0/2. - __m128i out01 = _mm_add_epi16(in0, kOne); - __m128i out23 = _mm_add_epi16(in2, kOne); - out01 = _mm_srai_epi16(out01, 2); - out23 = _mm_srai_epi16(out23, 2); - _mm_storeu_si128((__m128i *)(output + 0 * 4), out01); - _mm_storeu_si128((__m128i *)(output + 2 * 4), out23); - } - } -} - -static INLINE void load_buffer_4x4_avx2(const int16_t *input, __m128i *in, - int stride) { - const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); - const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); - __m128i mask; - - in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); - in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); - in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); - in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); - - in[0] = _mm_slli_epi16(in[0], 4); - in[1] = _mm_slli_epi16(in[1], 4); - in[2] = _mm_slli_epi16(in[2], 4); - in[3] = _mm_slli_epi16(in[3], 4); - - mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a); - in[0] = _mm_add_epi16(in[0], mask); - in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b); -} - -static INLINE void write_buffer_4x4_avx2(int16_t *output, __m128i *res) { - const __m128i kOne = _mm_set1_epi16(1); - __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]); - __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]); - __m128i out01 = _mm_add_epi16(in01, kOne); - __m128i out23 = _mm_add_epi16(in23, kOne); - out01 = _mm_srai_epi16(out01, 2); - out23 = _mm_srai_epi16(out23, 2); - _mm_store_si128((__m128i *)(output + 0 * 8), out01); - _mm_store_si128((__m128i *)(output + 1 * 8), out23); -} - -static INLINE void transpose_4x4_avx2(__m128i *res) { - // Combine and transpose - // 00 01 02 03 20 21 22 23 - // 10 11 12 13 30 31 32 33 - const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); - const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); - - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1); - res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1); - - // 00 10 20 30 01 11 21 31 - // 02 12 22 32 03 13 23 33 - // only use the first 4 16-bit integers - res[1] = _mm_unpackhi_epi64(res[0], res[0]); - res[3] = _mm_unpackhi_epi64(res[2], res[2]); -} - -void fdct4_avx2(__m128i *in) { - const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - - __m128i u[4], v[4]; - u[0]=_mm_unpacklo_epi16(in[0], in[1]); - u[1]=_mm_unpacklo_epi16(in[3], in[2]); - - v[0] = _mm_add_epi16(u[0], u[1]); - v[1] = _mm_sub_epi16(u[0], u[1]); - - u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0 - u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2 - u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24); // 1 - u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08); // 3 - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = 
_mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - - in[0] = _mm_packs_epi32(u[0], u[1]); - in[1] = _mm_packs_epi32(u[2], u[3]); - transpose_4x4_avx2(in); -} - -void fadst4_avx2(__m128i *in) { - const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9); - const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9); - const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9); - const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9); - const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9); - const __m128i kZero = _mm_set1_epi16(0); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i u[8], v[8]; - __m128i in7 = _mm_add_epi16(in[0], in[1]); - - u[0] = _mm_unpacklo_epi16(in[0], in[1]); - u[1] = _mm_unpacklo_epi16(in[2], in[3]); - u[2] = _mm_unpacklo_epi16(in7, kZero); - u[3] = _mm_unpacklo_epi16(in[2], kZero); - u[4] = _mm_unpacklo_epi16(in[3], kZero); - - v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2 - v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5 - v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x1 - v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3 - v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6 - v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4 - v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03); - - u[0] = _mm_add_epi32(v[0], v[1]); - u[1] = _mm_sub_epi32(v[2], v[6]); - u[2] = _mm_add_epi32(v[3], v[4]); - u[3] = _mm_sub_epi32(u[2], u[0]); - u[4] = _mm_slli_epi32(v[5], 2); - u[5] = _mm_sub_epi32(u[4], v[5]); - u[6] = _mm_add_epi32(u[3], u[5]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - - in[0] = _mm_packs_epi32(u[0], u[2]); - in[1] = _mm_packs_epi32(u[1], u[3]); - transpose_4x4_avx2(in); -} - -void vp9_fht4x4_avx2(const int16_t *input, int16_t *output, - int stride, int tx_type) { - __m128i in[4]; - - switch (tx_type) { - case DCT_DCT: - vp9_fdct4x4_avx2(input, output, stride); - break; - case ADST_DCT: - load_buffer_4x4_avx2(input, in, stride); - fadst4_avx2(in); - fdct4_avx2(in); - write_buffer_4x4_avx2(output, in); - break; - case DCT_ADST: - load_buffer_4x4_avx2(input, in, stride); - fdct4_avx2(in); - fadst4_avx2(in); - write_buffer_4x4_avx2(output, in); - break; - case ADST_ADST: - load_buffer_4x4_avx2(input, in, stride); - fadst4_avx2(in); - fadst4_avx2(in); - write_buffer_4x4_avx2(output, in); - break; - default: - assert(0); - break; - } -} - -void vp9_fdct8x8_avx2(const int16_t *input, int16_t *output, int stride) { - int pass; - // Constants - // When we use them, in one case, they are all the same. In all others - // it's a pair of them that we need to repeat four times. This is done - // by constructing the 32 bit constant corresponding to that pair. 
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  // Load input
-  __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
-  __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
-  __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
-  __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
-  __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
-  __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
-  __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
-  __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
-  // Pre-condition input (shift by two)
-  in0 = _mm_slli_epi16(in0, 2);
-  in1 = _mm_slli_epi16(in1, 2);
-  in2 = _mm_slli_epi16(in2, 2);
-  in3 = _mm_slli_epi16(in3, 2);
-  in4 = _mm_slli_epi16(in4, 2);
-  in5 = _mm_slli_epi16(in5, 2);
-  in6 = _mm_slli_epi16(in6, 2);
-  in7 = _mm_slli_epi16(in7, 2);
-
-  // We do two passes, first the columns, then the rows. The results of the
-  // first pass are transposed so that the same column code can be reused. The
-  // results of the second pass are also transposed so that the rows (processed
-  // as columns) are put back in row positions.
-  for (pass = 0; pass < 2; pass++) {
-    // To store results of each pass before the transpose.
-    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
-    // Add/subtract
-    const __m128i q0 = _mm_add_epi16(in0, in7);
-    const __m128i q1 = _mm_add_epi16(in1, in6);
-    const __m128i q2 = _mm_add_epi16(in2, in5);
-    const __m128i q3 = _mm_add_epi16(in3, in4);
-    const __m128i q4 = _mm_sub_epi16(in3, in4);
-    const __m128i q5 = _mm_sub_epi16(in2, in5);
-    const __m128i q6 = _mm_sub_epi16(in1, in6);
-    const __m128i q7 = _mm_sub_epi16(in0, in7);
-    // Work on first four results
-    {
-      // Add/subtract
-      const __m128i r0 = _mm_add_epi16(q0, q3);
-      const __m128i r1 = _mm_add_epi16(q1, q2);
-      const __m128i r2 = _mm_sub_epi16(q1, q2);
-      const __m128i r3 = _mm_sub_epi16(q0, q3);
-      // Interleave to do the multiply by constants which gets us into 32bits
-      const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
-      const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
-      const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
-      const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
-      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
-      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
-      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
-      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
-      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
-      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
-      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
-      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
-      // dct_const_round_shift
-      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
-      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
-      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-      // Combine
-      res0 = _mm_packs_epi32(w0, w1);
-      res4 = _mm_packs_epi32(w2, w3);
-      res2 = _mm_packs_epi32(w4, w5);
-      res6 = _mm_packs_epi32(w6, w7);
-    }
-    // Work on next four results
-    {
-      // Interleave to do the multiply by constants which gets us into 32bits
-      const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
-      const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
-      const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
-      const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
-      const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
-      const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
-      // dct_const_round_shift
-      const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
-      const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
-      const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
-      const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
-      const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
-      const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
-      const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
-      const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
-      // Combine
-      const __m128i r0 = _mm_packs_epi32(s0, s1);
-      const __m128i r1 = _mm_packs_epi32(s2, s3);
-      // Add/subtract
-      const __m128i x0 = _mm_add_epi16(q4, r0);
-      const __m128i x1 = _mm_sub_epi16(q4, r0);
-      const __m128i x2 = _mm_sub_epi16(q7, r1);
-      const __m128i x3 = _mm_add_epi16(q7, r1);
-      // Interleave to do the multiply by constants which gets us into 32bits
-      const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
-      const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
-      const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
-      const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
-      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
-      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
-      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
-      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
-      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
-      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
-      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
-      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
-      // dct_const_round_shift
-      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
-      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
-      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-      // Combine
-      res1 = _mm_packs_epi32(w0, w1);
-      res7 = _mm_packs_epi32(w2, w3);
-      res5 = _mm_packs_epi32(w4, w5);
-      res3 = _mm_packs_epi32(w6, w7);
-    }
-    // Transpose the 8x8.
-    {
-      // 00 01 02 03 04 05 06 07
-      // 10 11 12 13 14 15 16 17
-      // 20 21 22 23 24 25 26 27
-      // 30 31 32 33 34 35 36 37
-      // 40 41 42 43 44 45 46 47
-      // 50 51 52 53 54 55 56 57
-      // 60 61 62 63 64 65 66 67
-      // 70 71 72 73 74 75 76 77
-      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
-      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
-      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
-      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
-      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
-      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
-      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
-      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
-      // 00 10 01 11 02 12 03 13
-      // 20 30 21 31 22 32 23 33
-      // 04 14 05 15 06 16 07 17
-      // 24 34 25 35 26 36 27 37
-      // 40 50 41 51 42 52 43 53
-      // 60 70 61 71 62 72 63 73
-      // 54 54 55 55 56 56 57 57
-      // 64 74 65 75 66 76 67 77
-      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
-      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-      // 00 10 20 30 01 11 21 31
-      // 40 50 60 70 41 51 61 71
-      // 02 12 22 32 03 13 23 33
-      // 42 52 62 72 43 53 63 73
-      // 04 14 24 34 05 15 21 36
-      // 44 54 64 74 45 55 61 76
-      // 06 16 26 36 07 17 27 37
-      // 46 56 66 76 47 57 67 77
-      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
-      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
-      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
-      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
-      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
-      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
-      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
-      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
-      // 00 10 20 30 40 50 60 70
-      // 01 11 21 31 41 51 61 71
-      // 02 12 22 32 42 52 62 72
-      // 03 13 23 33 43 53 63 73
-      // 04 14 24 34 44 54 64 74
-      // 05 15 25 35 45 55 65 75
-      // 06 16 26 36 46 56 66 76
-      // 07 17 27 37 47 57 67 77
-    }
-  }
-  // Post-condition output and store it
-  {
-    // Post-condition (division by two)
-    // division of two 16 bits signed numbers using shifts
-    // n / 2 = (n - (n >> 15)) >> 1
-    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
-    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
-    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
-    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
-    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
-    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
-    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
-    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
-    in0 = _mm_sub_epi16(in0, sign_in0);
-    in1 = _mm_sub_epi16(in1, sign_in1);
-    in2 = _mm_sub_epi16(in2, sign_in2);
-    in3 = _mm_sub_epi16(in3, sign_in3);
-    in4 = _mm_sub_epi16(in4, sign_in4);
-    in5 = _mm_sub_epi16(in5, sign_in5);
-    in6 = _mm_sub_epi16(in6, sign_in6);
-    in7 = _mm_sub_epi16(in7, sign_in7);
-    in0 = _mm_srai_epi16(in0, 1);
-    in1 = _mm_srai_epi16(in1, 1);
-    in2 = _mm_srai_epi16(in2, 1);
-    in3 = _mm_srai_epi16(in3, 1);
-    in4 = _mm_srai_epi16(in4, 1);
-    in5 = _mm_srai_epi16(in5, 1);
-    in6 = _mm_srai_epi16(in6, 1);
-    in7 = _mm_srai_epi16(in7, 1);
-    // store results
-    _mm_store_si128((__m128i *)(output + 0 * 8), in0);
-    _mm_store_si128((__m128i *)(output + 1 * 8), in1);
-    _mm_store_si128((__m128i *)(output + 2 * 8), in2);
-    _mm_store_si128((__m128i *)(output + 3 * 8), in3);
-    _mm_store_si128((__m128i *)(output + 4 * 8), in4);
-    _mm_store_si128((__m128i *)(output + 5 * 8), in5);
-    _mm_store_si128((__m128i *)(output + 6 * 8), in6);
-    _mm_store_si128((__m128i *)(output + 7 * 8), in7);
-  }
-}
-
-// load 8x8 array
-static INLINE void load_buffer_8x8_avx2(const int16_t *input, __m128i *in,
-                                        int stride) {
-  in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
-  in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
-  in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
-  in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
-  in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
-  in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
-  in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
-  in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
-
-  in[0] = _mm_slli_epi16(in[0], 2);
-  in[1] = _mm_slli_epi16(in[1], 2);
-  in[2] = _mm_slli_epi16(in[2], 2);
-  in[3] = _mm_slli_epi16(in[3], 2);
-  in[4] = _mm_slli_epi16(in[4], 2);
-  in[5] = _mm_slli_epi16(in[5], 2);
-  in[6] = _mm_slli_epi16(in[6], 2);
-  in[7] = _mm_slli_epi16(in[7], 2);
-}
-
-// right shift and rounding
-static INLINE void right_shift_8x8_avx2(__m128i *res, int const bit) {
-  const __m128i kOne = _mm_set1_epi16(1);
-  const int bit_m02 = bit - 2;
-  __m128i sign0 = _mm_srai_epi16(res[0], 15);
-  __m128i sign1 = _mm_srai_epi16(res[1], 15);
-  __m128i sign2 = _mm_srai_epi16(res[2], 15);
-  __m128i sign3 = _mm_srai_epi16(res[3], 15);
-  __m128i sign4 = _mm_srai_epi16(res[4], 15);
-  __m128i sign5 = _mm_srai_epi16(res[5], 15);
-  __m128i sign6 = _mm_srai_epi16(res[6], 15);
-  __m128i sign7 = _mm_srai_epi16(res[7], 15);
-
-  if (bit_m02 >= 0) {
-    __m128i k_const_rounding = _mm_slli_epi16(kOne, bit_m02);
-    res[0] = _mm_add_epi16(res[0], k_const_rounding);
-    res[1] = _mm_add_epi16(res[1], k_const_rounding);
-    res[2] = _mm_add_epi16(res[2], k_const_rounding);
-    res[3] = _mm_add_epi16(res[3], k_const_rounding);
-    res[4] = _mm_add_epi16(res[4], k_const_rounding);
-    res[5] = _mm_add_epi16(res[5], k_const_rounding);
-    res[6] = _mm_add_epi16(res[6], k_const_rounding);
-    res[7] = _mm_add_epi16(res[7], k_const_rounding);
-  }
-
-  res[0] = _mm_sub_epi16(res[0], sign0);
-  res[1] = _mm_sub_epi16(res[1], sign1);
-  res[2] = _mm_sub_epi16(res[2], sign2);
-  res[3] = _mm_sub_epi16(res[3], sign3);
-  res[4] = _mm_sub_epi16(res[4], sign4);
-  res[5] = _mm_sub_epi16(res[5], sign5);
-  res[6] = _mm_sub_epi16(res[6], sign6);
-  res[7] = _mm_sub_epi16(res[7], sign7);
-
-  res[0] = _mm_srai_epi16(res[0], bit);
-  res[1] = _mm_srai_epi16(res[1], bit);
-  res[2] = _mm_srai_epi16(res[2], bit);
-  res[3] = _mm_srai_epi16(res[3], bit);
-  res[4] = _mm_srai_epi16(res[4], bit);
-  res[5] = _mm_srai_epi16(res[5], bit);
-  res[6] = _mm_srai_epi16(res[6], bit);
-  res[7] = _mm_srai_epi16(res[7], bit);
-}
-
-// write 8x8 array
-static INLINE void write_buffer_8x8_avx2(int16_t *output, __m128i *res,
-                                         int stride) {
-  _mm_store_si128((__m128i *)(output + 0 * stride), res[0]);
-  _mm_store_si128((__m128i *)(output + 1 * stride), res[1]);
-  _mm_store_si128((__m128i *)(output + 2 * stride), res[2]);
-  _mm_store_si128((__m128i *)(output + 3 * stride), res[3]);
-  _mm_store_si128((__m128i *)(output + 4 * stride), res[4]);
-  _mm_store_si128((__m128i *)(output + 5 * stride), res[5]);
-  _mm_store_si128((__m128i *)(output + 6 * stride), res[6]);
-  _mm_store_si128((__m128i *)(output + 7 * stride), res[7]);
-}
-
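A note on the sign trick that both the post-conditioning above and right_shift_8x8_avx2() rely on: _mm_srai_epi16 rounds toward minus infinity, so the code subtracts the sign bit (-1 for negative lanes, 0 otherwise) before shifting, which makes the shift round toward zero like C signed division. A minimal scalar model (my illustration, not part of the removed file):

    #include <assert.h>
    #include <stdint.h>

    /* Scalar model of right_shift_8x8_avx2: optional rounding offset,
     * then subtract the sign before the arithmetic shift. */
    static int16_t round_shift_scalar(int16_t n, int bit) {
      int32_t r = n;
      if (bit - 2 >= 0) r += 1 << (bit - 2); /* k_const_rounding */
      r -= n >> 15;                          /* -1 if n < 0, else 0 */
      return (int16_t)(r >> bit);
    }

    int main(void) {
      assert(round_shift_scalar(-7, 1) == -7 / 2); /* -3, not -4 */
      assert(round_shift_scalar(7, 1) == 7 / 2);   /*  3 */
      return 0;
    }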
-// perform in-place transpose
-static INLINE void array_transpose_8x8_avx2(__m128i *in, __m128i *res) {
-  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
-  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
-  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
-  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
-  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
-  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
-  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
-  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
-  // 00 10 01 11 02 12 03 13
-  // 20 30 21 31 22 32 23 33
-  // 04 14 05 15 06 16 07 17
-  // 24 34 25 35 26 36 27 37
-  // 40 50 41 51 42 52 43 53
-  // 60 70 61 71 62 72 63 73
-  // 44 54 45 55 46 56 47 57
-  // 64 74 65 75 66 76 67 77
-  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
-  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-  // 00 10 20 30 01 11 21 31
-  // 40 50 60 70 41 51 61 71
-  // 02 12 22 32 03 13 23 33
-  // 42 52 62 72 43 53 63 73
-  // 04 14 24 34 05 15 25 35
-  // 44 54 64 74 45 55 65 75
-  // 06 16 26 36 07 17 27 37
-  // 46 56 66 76 47 57 67 77
-  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
-  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
-  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
-  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
-  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
-  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
-  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
-  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
-  // 00 10 20 30 40 50 60 70
-  // 01 11 21 31 41 51 61 71
-  // 02 12 22 32 42 52 62 72
-  // 03 13 23 33 43 53 63 73
-  // 04 14 24 34 44 54 64 74
-  // 05 15 25 35 45 55 65 75
-  // 06 16 26 36 46 56 66 76
-  // 07 17 27 37 47 57 67 77
-}
-
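array_transpose_8x8_avx2() is the standard staged unpack transpose: interleave 16-bit lanes, then 32-bit pairs, then 64-bit halves. The same pattern is easier to see on a 4x4 matrix of 32-bit elements, where only two stages are needed; a self-contained sketch (illustrative only, not from the removed file):

    #include <emmintrin.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Two-stage unpack transpose of a 4x4 int32 matrix, one row per
     * register; the 8x8 16-bit version above adds one more stage. */
    static void transpose_4x4_epi32(__m128i r[4]) {
      const __m128i a0 = _mm_unpacklo_epi32(r[0], r[1]); /* 00 10 01 11 */
      const __m128i a1 = _mm_unpacklo_epi32(r[2], r[3]); /* 20 30 21 31 */
      const __m128i a2 = _mm_unpackhi_epi32(r[0], r[1]); /* 02 12 03 13 */
      const __m128i a3 = _mm_unpackhi_epi32(r[2], r[3]); /* 22 32 23 33 */
      r[0] = _mm_unpacklo_epi64(a0, a1);                 /* 00 10 20 30 */
      r[1] = _mm_unpackhi_epi64(a0, a1);                 /* 01 11 21 31 */
      r[2] = _mm_unpacklo_epi64(a2, a3);                 /* 02 12 22 32 */
      r[3] = _mm_unpackhi_epi64(a2, a3);                 /* 03 13 23 33 */
    }

    int main(void) {
      int32_t m[16];
      __m128i r[4];
      int i;
      for (i = 0; i < 16; ++i) m[i] = i;
      for (i = 0; i < 4; ++i) r[i] = _mm_loadu_si128((__m128i *)(m + 4 * i));
      transpose_4x4_epi32(r);
      for (i = 0; i < 4; ++i) _mm_storeu_si128((__m128i *)(m + 4 * i), r[i]);
      printf("%d %d %d %d\n", m[0], m[1], m[2], m[3]); /* 0 4 8 12 */
      return 0;
    }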
-void fdct8_avx2(__m128i *in) {
-  // constants
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
-  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
-  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
-
-  // stage 1
-  s0 = _mm_add_epi16(in[0], in[7]);
-  s1 = _mm_add_epi16(in[1], in[6]);
-  s2 = _mm_add_epi16(in[2], in[5]);
-  s3 = _mm_add_epi16(in[3], in[4]);
-  s4 = _mm_sub_epi16(in[3], in[4]);
-  s5 = _mm_sub_epi16(in[2], in[5]);
-  s6 = _mm_sub_epi16(in[1], in[6]);
-  s7 = _mm_sub_epi16(in[0], in[7]);
-
-  u0 = _mm_add_epi16(s0, s3);
-  u1 = _mm_add_epi16(s1, s2);
-  u2 = _mm_sub_epi16(s1, s2);
-  u3 = _mm_sub_epi16(s0, s3);
-  // interleave and perform butterfly multiplication/addition
-  v0 = _mm_unpacklo_epi16(u0, u1);
-  v1 = _mm_unpackhi_epi16(u0, u1);
-  v2 = _mm_unpacklo_epi16(u2, u3);
-  v3 = _mm_unpackhi_epi16(u2, u3);
-
-  u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
-  u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
-  u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
-  u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
-  u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
-  u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
-  u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
-  u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);
-
-  // shift and rounding
-  v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-  v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-  v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-  v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-  v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-  v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
-  v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
-  v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-
-  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-
-  in[0] = _mm_packs_epi32(u0, u1);
-  in[2] = _mm_packs_epi32(u4, u5);
-  in[4] = _mm_packs_epi32(u2, u3);
-  in[6] = _mm_packs_epi32(u6, u7);
-
-  // stage 2
-  // interleave and perform butterfly multiplication/addition
-  u0 = _mm_unpacklo_epi16(s6, s5);
-  u1 = _mm_unpackhi_epi16(s6, s5);
-  v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
-  v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
-  v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
-  v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);
-
-  // shift and rounding
-  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
-  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
-  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
-  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
-
-  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
-  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
-  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
-  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
-
-  u0 = _mm_packs_epi32(v0, v1);
-  u1 = _mm_packs_epi32(v2, v3);
-
-  // stage 3
-  s0 = _mm_add_epi16(s4, u0);
-  s1 = _mm_sub_epi16(s4, u0);
-  s2 = _mm_sub_epi16(s7, u1);
-  s3 = _mm_add_epi16(s7, u1);
-
-  // stage 4
-  u0 = _mm_unpacklo_epi16(s0, s3);
-  u1 = _mm_unpackhi_epi16(s0, s3);
-  u2 = _mm_unpacklo_epi16(s1, s2);
-  u3 = _mm_unpackhi_epi16(s1, s2);
-
-  v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
-  v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
-  v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
-  v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
-  v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
-  v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
-  v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
-  v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);
-
-  // shift and rounding
-  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
-  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
-  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
-  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
-  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
-  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
-  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
-  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
-
-  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
-  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
-  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
-  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
-  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
-  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
-  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
-  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
-
-  in[1] = _mm_packs_epi32(v0, v1);
-  in[3] = _mm_packs_epi32(v4, v5);
-  in[5] = _mm_packs_epi32(v2, v3);
-  in[7] = _mm_packs_epi32(v6, v7);
-
-  // transpose
-  array_transpose_8x8_avx2(in, in);
-}
-
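Every rotation in fdct8_avx2() follows one recipe: unpack two 16-bit rows so (x, y) pairs sit side by side, multiply by a pair_set_epi16(c0, c1) constant so _mm_madd_epi16 yields x*c0 + y*c1 per 32-bit lane, then round back down to 16 bits. A scalar model of one output lane (my sketch; the value 14 for DCT_CONST_BITS is an assumption, matching its usual definition in libvpx):

    #include <stdint.h>

    #define DCT_CONST_BITS 14
    #define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

    /* One lane of the unpack + _mm_madd_epi16 + dct_const_round_shift
     * pattern: b = round((x * c0 + y * c1) / 2^14). */
    static int16_t butterfly_lane(int16_t x, int16_t y,
                                  int16_t c0, int16_t c1) {
      const int32_t b = x * c0 + y * c1; /* what _mm_madd_epi16 computes */
      return (int16_t)((b + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
    }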
-void fadst8_avx2(__m128i *in) {
-  // Constants
-  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
-  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
-  const __m128i k__const_0 = _mm_set1_epi16(0);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
-  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
-  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
-  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
-  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-
-  // properly aligned for butterfly input
-  in0 = in[7];
-  in1 = in[0];
-  in2 = in[5];
-  in3 = in[2];
-  in4 = in[3];
-  in5 = in[4];
-  in6 = in[1];
-  in7 = in[6];
-
-  // column transformation
-  // stage 1
-  // interleave and multiply/add into 32-bit integer
-  s0 = _mm_unpacklo_epi16(in0, in1);
-  s1 = _mm_unpackhi_epi16(in0, in1);
-  s2 = _mm_unpacklo_epi16(in2, in3);
-  s3 = _mm_unpackhi_epi16(in2, in3);
-  s4 = _mm_unpacklo_epi16(in4, in5);
-  s5 = _mm_unpackhi_epi16(in4, in5);
-  s6 = _mm_unpacklo_epi16(in6, in7);
-  s7 = _mm_unpackhi_epi16(in6, in7);
-
-  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
-  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
-  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
-  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
-  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
-  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
-  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
-  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
-  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
-  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
-  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
-  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
-  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
-  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
-  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
-  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
-
-  // addition
-  w0 = _mm_add_epi32(u0, u8);
-  w1 = _mm_add_epi32(u1, u9);
-  w2 = _mm_add_epi32(u2, u10);
-  w3 = _mm_add_epi32(u3, u11);
-  w4 = _mm_add_epi32(u4, u12);
-  w5 = _mm_add_epi32(u5, u13);
-  w6 = _mm_add_epi32(u6, u14);
-  w7 = _mm_add_epi32(u7, u15);
-  w8 = _mm_sub_epi32(u0, u8);
-  w9 = _mm_sub_epi32(u1, u9);
-  w10 = _mm_sub_epi32(u2, u10);
-  w11 = _mm_sub_epi32(u3, u11);
-  w12 = _mm_sub_epi32(u4, u12);
-  w13 = _mm_sub_epi32(u5, u13);
-  w14 = _mm_sub_epi32(u6, u14);
-  w15 = _mm_sub_epi32(u7, u15);
-
-  // shift and rounding
-  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
-  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
-  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
-  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
-  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
-  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
-  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
-  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
-  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
-  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
-  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
-  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
-  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
-  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
-  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
-  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
-
-  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
-  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
-  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
-  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
-  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
-  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
-  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
-  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
-
-  // back to 16-bit and pack 8 integers into __m128i
-  in[0] = _mm_packs_epi32(u0, u1);
-  in[1] = _mm_packs_epi32(u2, u3);
-  in[2] = _mm_packs_epi32(u4, u5);
-  in[3] = _mm_packs_epi32(u6, u7);
-  in[4] = _mm_packs_epi32(u8, u9);
-  in[5] = _mm_packs_epi32(u10, u11);
-  in[6] = _mm_packs_epi32(u12, u13);
-  in[7] = _mm_packs_epi32(u14, u15);
-
-  // stage 2
-  s0 = _mm_add_epi16(in[0], in[2]);
-  s1 = _mm_add_epi16(in[1], in[3]);
-  s2 = _mm_sub_epi16(in[0], in[2]);
-  s3 = _mm_sub_epi16(in[1], in[3]);
-  u0 = _mm_unpacklo_epi16(in[4], in[5]);
-  u1 = _mm_unpackhi_epi16(in[4], in[5]);
-  u2 = _mm_unpacklo_epi16(in[6], in[7]);
-  u3 = _mm_unpackhi_epi16(in[6], in[7]);
-
-  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
-  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
-  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
-  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
-  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
-  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
-  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
-  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
-
-  w0 = _mm_add_epi32(v0, v4);
-  w1 = _mm_add_epi32(v1, v5);
-  w2 = _mm_add_epi32(v2, v6);
-  w3 = _mm_add_epi32(v3, v7);
-  w4 = _mm_sub_epi32(v0, v4);
-  w5 = _mm_sub_epi32(v1, v5);
-  w6 = _mm_sub_epi32(v2, v6);
-  w7 = _mm_sub_epi32(v3, v7);
-
-  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
-  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
-  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
-  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
-  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
-  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
-  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
-  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
-
-  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-
-  // back to 16-bit intergers
-  s4 = _mm_packs_epi32(u0, u1);
-  s5 = _mm_packs_epi32(u2, u3);
-  s6 = _mm_packs_epi32(u4, u5);
-  s7 = _mm_packs_epi32(u6, u7);
-
-  // stage 3
-  u0 = _mm_unpacklo_epi16(s2, s3);
-  u1 = _mm_unpackhi_epi16(s2, s3);
-  u2 = _mm_unpacklo_epi16(s6, s7);
-  u3 = _mm_unpackhi_epi16(s6, s7);
-
-  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
-  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
-  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
-  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
-  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
-  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
-  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
-  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
-
-  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
-  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
-  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
-  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
-  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
-  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
-  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
-  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
-
-  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
-  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
-  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
-  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
-  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
-  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
-  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
-  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
-
-  s2 = _mm_packs_epi32(v0, v1);
-  s3 = _mm_packs_epi32(v2, v3);
-  s6 = _mm_packs_epi32(v4, v5);
-  s7 = _mm_packs_epi32(v6, v7);
-
-  // FIXME(jingning): do subtract using bit inversion?
-  in[0] = s0;
-  in[1] = _mm_sub_epi16(k__const_0, s4);
-  in[2] = s6;
-  in[3] = _mm_sub_epi16(k__const_0, s2);
-  in[4] = s3;
-  in[5] = _mm_sub_epi16(k__const_0, s7);
-  in[6] = s5;
-  in[7] = _mm_sub_epi16(k__const_0, s1);
-
-  // transpose
-  array_transpose_8x8_avx2(in, in);
-}
-
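On the FIXME near the end of fadst8_avx2() above: two's-complement negation is bitwise inversion plus one, so the _mm_sub_epi16(k__const_0, x) calls could in principle be traded for an XOR with all-ones followed by an add. A sketch of the equivalence (illustrative only, not a change this diff makes):

    #include <emmintrin.h>

    /* 0 - x == (~x) + 1 for each 16-bit lane; same result, one more
     * constant register live. */
    static __m128i negate_epi16(__m128i x) {
      const __m128i all_ones = _mm_set1_epi16(-1);
      const __m128i one = _mm_set1_epi16(1);
      return _mm_add_epi16(_mm_xor_si128(x, all_ones), one);
    }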
-void vp9_fht8x8_avx2(const int16_t *input, int16_t *output,
-                     int stride, int tx_type) {
-  __m128i in[8];
-
-  switch (tx_type) {
-    case DCT_DCT:
-      vp9_fdct8x8_avx2(input, output, stride);
-      break;
-    case ADST_DCT:
-      load_buffer_8x8_avx2(input, in, stride);
-      fadst8_avx2(in);
-      fdct8_avx2(in);
-      right_shift_8x8_avx2(in, 1);
-      write_buffer_8x8_avx2(output, in, 8);
-      break;
-    case DCT_ADST:
-      load_buffer_8x8_avx2(input, in, stride);
-      fdct8_avx2(in);
-      fadst8_avx2(in);
-      right_shift_8x8_avx2(in, 1);
-      write_buffer_8x8_avx2(output, in, 8);
-      break;
-    case ADST_ADST:
-      load_buffer_8x8_avx2(input, in, stride);
-      fadst8_avx2(in);
-      fadst8_avx2(in);
-      right_shift_8x8_avx2(in, 1);
-      write_buffer_8x8_avx2(output, in, 8);
-      break;
-    default:
-      assert(0);
-      break;
-  }
-}
-
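vp9_fht8x8_avx2() just composes the two 1-D routines according to tx_type, with DCT_DCT short-circuiting to the standalone 8x8 DCT instead of the load/transform/shift/write pipeline. A caller sketch (buffer names hypothetical; ADST_DCT and friends are the tx_type enum values used in the switch above):

    /* Hypothetical caller: forward-transform one 8x8 block of residuals,
     * mixing ADST and DCT per dimension as selected by tx_type. */
    static void fht8x8_example(const int16_t *src_diff, int16_t *coeff) {
      vp9_fht8x8_avx2(src_diff, coeff, 8 /* stride */, ADST_DCT);
    }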
-void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) {
-  // The 2D transform is done with two passes which are actually pretty
-  // similar. In the first one, we transform the columns and transpose
-  // the results. In the second one, we transform the rows. To achieve that,
-  // as the first pass results are transposed, we transpose the columns (that
-  // is the transposed rows) and transpose the results (so that it goes back
-  // in normal/row positions).
-  int pass;
-  // We need an intermediate buffer between passes.
-  DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);
-  const int16_t *in = input;
-  int16_t *out = intermediate;
-  // Constants
-  // When we use them, in one case, they are all the same. In all others
-  // it's a pair of them that we need to repeat four times. This is done
-  // by constructing the 32 bit constant corresponding to that pair.
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
-  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
-  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
-  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
-  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
-  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
-  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
-  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i kOne = _mm_set1_epi16(1);
-  // Do the two transform/transpose passes
-  for (pass = 0; pass < 2; ++pass) {
-    // We process eight columns (transposed rows in second pass) at a time.
-    int column_start;
-    for (column_start = 0; column_start < 16; column_start += 8) {
-      __m128i in00, in01, in02, in03, in04, in05, in06, in07;
-      __m128i in08, in09, in10, in11, in12, in13, in14, in15;
-      __m128i input0, input1, input2, input3, input4, input5, input6, input7;
-      __m128i step1_0, step1_1, step1_2, step1_3;
-      __m128i step1_4, step1_5, step1_6, step1_7;
-      __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
-      __m128i step3_0, step3_1, step3_2, step3_3;
-      __m128i step3_4, step3_5, step3_6, step3_7;
-      __m128i res00, res01, res02, res03, res04, res05, res06, res07;
-      __m128i res08, res09, res10, res11, res12, res13, res14, res15;
-      // Load and pre-condition input.
-      if (0 == pass) {
-        in00 = _mm_load_si128((const __m128i *)(in + 0 * stride));
-        in01 = _mm_load_si128((const __m128i *)(in + 1 * stride));
-        in02 = _mm_load_si128((const __m128i *)(in + 2 * stride));
-        in03 = _mm_load_si128((const __m128i *)(in + 3 * stride));
-        in04 = _mm_load_si128((const __m128i *)(in + 4 * stride));
-        in05 = _mm_load_si128((const __m128i *)(in + 5 * stride));
-        in06 = _mm_load_si128((const __m128i *)(in + 6 * stride));
-        in07 = _mm_load_si128((const __m128i *)(in + 7 * stride));
-        in08 = _mm_load_si128((const __m128i *)(in + 8 * stride));
-        in09 = _mm_load_si128((const __m128i *)(in + 9 * stride));
-        in10 = _mm_load_si128((const __m128i *)(in + 10 * stride));
-        in11 = _mm_load_si128((const __m128i *)(in + 11 * stride));
-        in12 = _mm_load_si128((const __m128i *)(in + 12 * stride));
-        in13 = _mm_load_si128((const __m128i *)(in + 13 * stride));
-        in14 = _mm_load_si128((const __m128i *)(in + 14 * stride));
-        in15 = _mm_load_si128((const __m128i *)(in + 15 * stride));
-        // x = x << 2
-        in00 = _mm_slli_epi16(in00, 2);
-        in01 = _mm_slli_epi16(in01, 2);
-        in02 = _mm_slli_epi16(in02, 2);
-        in03 = _mm_slli_epi16(in03, 2);
-        in04 = _mm_slli_epi16(in04, 2);
-        in05 = _mm_slli_epi16(in05, 2);
-        in06 = _mm_slli_epi16(in06, 2);
-        in07 = _mm_slli_epi16(in07, 2);
-        in08 = _mm_slli_epi16(in08, 2);
-        in09 = _mm_slli_epi16(in09, 2);
-        in10 = _mm_slli_epi16(in10, 2);
-        in11 = _mm_slli_epi16(in11, 2);
-        in12 = _mm_slli_epi16(in12, 2);
-        in13 = _mm_slli_epi16(in13, 2);
-        in14 = _mm_slli_epi16(in14, 2);
-        in15 = _mm_slli_epi16(in15, 2);
-      } else {
-        in00 = _mm_load_si128((const __m128i *)(in + 0 * 16));
-        in01 = _mm_load_si128((const __m128i *)(in + 1 * 16));
-        in02 = _mm_load_si128((const __m128i *)(in + 2 * 16));
-        in03 = _mm_load_si128((const __m128i *)(in + 3 * 16));
-        in04 = _mm_load_si128((const __m128i *)(in + 4 * 16));
-        in05 = _mm_load_si128((const __m128i *)(in + 5 * 16));
-        in06 = _mm_load_si128((const __m128i *)(in + 6 * 16));
-        in07 = _mm_load_si128((const __m128i *)(in + 7 * 16));
-        in08 = _mm_load_si128((const __m128i *)(in + 8 * 16));
-        in09 = _mm_load_si128((const __m128i *)(in + 9 * 16));
-        in10 = _mm_load_si128((const __m128i *)(in + 10 * 16));
-        in11 = _mm_load_si128((const __m128i *)(in + 11 * 16));
-        in12 = _mm_load_si128((const __m128i *)(in + 12 * 16));
-        in13 = _mm_load_si128((const __m128i *)(in + 13 * 16));
-        in14 = _mm_load_si128((const __m128i *)(in + 14 * 16));
-        in15 = _mm_load_si128((const __m128i *)(in + 15 * 16));
-        // x = (x + 1) >> 2
-        in00 = _mm_add_epi16(in00, kOne);
-        in01 = _mm_add_epi16(in01, kOne);
-        in02 = _mm_add_epi16(in02, kOne);
-        in03 = _mm_add_epi16(in03, kOne);
-        in04 = _mm_add_epi16(in04, kOne);
-        in05 = _mm_add_epi16(in05, kOne);
-        in06 = _mm_add_epi16(in06, kOne);
-        in07 = _mm_add_epi16(in07, kOne);
-        in08 = _mm_add_epi16(in08, kOne);
-        in09 = _mm_add_epi16(in09, kOne);
-        in10 = _mm_add_epi16(in10, kOne);
-        in11 = _mm_add_epi16(in11, kOne);
-        in12 = _mm_add_epi16(in12, kOne);
-        in13 = _mm_add_epi16(in13, kOne);
-        in14 = _mm_add_epi16(in14, kOne);
-        in15 = _mm_add_epi16(in15, kOne);
-        in00 = _mm_srai_epi16(in00, 2);
-        in01 = _mm_srai_epi16(in01, 2);
-        in02 = _mm_srai_epi16(in02, 2);
-        in03 = _mm_srai_epi16(in03, 2);
-        in04 = _mm_srai_epi16(in04, 2);
-        in05 = _mm_srai_epi16(in05, 2);
-        in06 = _mm_srai_epi16(in06, 2);
-        in07 = _mm_srai_epi16(in07, 2);
-        in08 = _mm_srai_epi16(in08, 2);
-        in09 = _mm_srai_epi16(in09, 2);
-        in10 = _mm_srai_epi16(in10, 2);
-        in11 = _mm_srai_epi16(in11, 2);
-        in12 = _mm_srai_epi16(in12, 2);
-        in13 = _mm_srai_epi16(in13, 2);
-        in14 = _mm_srai_epi16(in14, 2);
-        in15 = _mm_srai_epi16(in15, 2);
-      }
-      in += 8;
-      // Calculate input for the first 8 results.
-      {
-        input0 = _mm_add_epi16(in00, in15);
-        input1 = _mm_add_epi16(in01, in14);
-        input2 = _mm_add_epi16(in02, in13);
-        input3 = _mm_add_epi16(in03, in12);
-        input4 = _mm_add_epi16(in04, in11);
-        input5 = _mm_add_epi16(in05, in10);
-        input6 = _mm_add_epi16(in06, in09);
-        input7 = _mm_add_epi16(in07, in08);
-      }
-      // Calculate input for the next 8 results.
-      {
-        step1_0 = _mm_sub_epi16(in07, in08);
-        step1_1 = _mm_sub_epi16(in06, in09);
-        step1_2 = _mm_sub_epi16(in05, in10);
-        step1_3 = _mm_sub_epi16(in04, in11);
-        step1_4 = _mm_sub_epi16(in03, in12);
-        step1_5 = _mm_sub_epi16(in02, in13);
-        step1_6 = _mm_sub_epi16(in01, in14);
-        step1_7 = _mm_sub_epi16(in00, in15);
-      }
-      // Work on the first eight values; fdct8(input, even_results);
-      {
-        // Add/subtract
-        const __m128i q0 = _mm_add_epi16(input0, input7);
-        const __m128i q1 = _mm_add_epi16(input1, input6);
-        const __m128i q2 = _mm_add_epi16(input2, input5);
-        const __m128i q3 = _mm_add_epi16(input3, input4);
-        const __m128i q4 = _mm_sub_epi16(input3, input4);
-        const __m128i q5 = _mm_sub_epi16(input2, input5);
-        const __m128i q6 = _mm_sub_epi16(input1, input6);
-        const __m128i q7 = _mm_sub_epi16(input0, input7);
-        // Work on first four results
-        {
-          // Add/subtract
-          const __m128i r0 = _mm_add_epi16(q0, q3);
-          const __m128i r1 = _mm_add_epi16(q1, q2);
-          const __m128i r2 = _mm_sub_epi16(q1, q2);
-          const __m128i r3 = _mm_sub_epi16(q0, q3);
-          // Interleave to do the multiply by constants which gets us
-          // into 32 bits.
-          const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
-          const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
-          const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
-          const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
-          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
-          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
-          const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
-          const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
-          const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
-          const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
-          const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
-          const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
-          // dct_const_round_shift
-          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-          const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-          const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
-          const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
-          const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-          const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-          const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-          const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-          const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-          // Combine
-          res00 = _mm_packs_epi32(w0, w1);
-          res08 = _mm_packs_epi32(w2, w3);
-          res04 = _mm_packs_epi32(w4, w5);
-          res12 = _mm_packs_epi32(w6, w7);
-        }
-        // Work on next four results
-        {
-          // Interleave to do the multiply by constants which gets us
-          // into 32 bits.
-          const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
-          const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
-          const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
-          const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
-          const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
-          const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
-          // dct_const_round_shift
-          const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
-          const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
-          const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
-          const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
-          const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
-          const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
-          const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
-          const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
-          // Combine
-          const __m128i r0 = _mm_packs_epi32(s0, s1);
-          const __m128i r1 = _mm_packs_epi32(s2, s3);
-          // Add/subtract
-          const __m128i x0 = _mm_add_epi16(q4, r0);
-          const __m128i x1 = _mm_sub_epi16(q4, r0);
-          const __m128i x2 = _mm_sub_epi16(q7, r1);
-          const __m128i x3 = _mm_add_epi16(q7, r1);
-          // Interleave to do the multiply by constants which gets us
-          // into 32 bits.
-          const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
-          const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
-          const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
-          const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
-          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
-          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
-          const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
-          const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
-          const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
-          const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
-          const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
-          const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
-          // dct_const_round_shift
-          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-          const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-          const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
-          const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
-          const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-          const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-          const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-          const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-          const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-          // Combine
-          res02 = _mm_packs_epi32(w0, w1);
-          res14 = _mm_packs_epi32(w2, w3);
-          res10 = _mm_packs_epi32(w4, w5);
-          res06 = _mm_packs_epi32(w6, w7);
-        }
-      }
-      // Work on the next eight values; step1 -> odd_results
-      {
-        // step 2
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
-          const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
-          const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
-          const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
-          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16);
-          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16);
-          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_m16);
-          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16);
-          // dct_const_round_shift
-          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-          // Combine
-          step2_2 = _mm_packs_epi32(w0, w1);
-          step2_3 = _mm_packs_epi32(w2, w3);
-        }
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
-          const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
-          const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
-          const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
-          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
-          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
-          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16);
-          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16);
-          // dct_const_round_shift
-          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-          // Combine
-          step2_5 = _mm_packs_epi32(w0, w1);
-          step2_4 = _mm_packs_epi32(w2, w3);
-        }
-        // step 3
-        {
-          step3_0 = _mm_add_epi16(step1_0, step2_3);
-          step3_1 = _mm_add_epi16(step1_1, step2_2);
-          step3_2 = _mm_sub_epi16(step1_1, step2_2);
-          step3_3 = _mm_sub_epi16(step1_0, step2_3);
-          step3_4 = _mm_sub_epi16(step1_7, step2_4);
-          step3_5 = _mm_sub_epi16(step1_6, step2_5);
-          step3_6 = _mm_add_epi16(step1_6, step2_5);
-          step3_7 = _mm_add_epi16(step1_7, step2_4);
-        }
-        // step 4
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
-          const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
-          const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
-          const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
-          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24);
-          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24);
-          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08);
-          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08);
-          // dct_const_round_shift
-          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-          // Combine
-          step2_1 = _mm_packs_epi32(w0, w1);
-          step2_2 = _mm_packs_epi32(w2, w3);
-        }
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
-          const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
-          const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
-          const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
-          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08);
-          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08);
-          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24);
-          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24);
-          // dct_const_round_shift
-          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-          // Combine
-          step2_6 = _mm_packs_epi32(w0, w1);
-          step2_5 = _mm_packs_epi32(w2, w3);
-        }
-        // step 5
-        {
-          step1_0 = _mm_add_epi16(step3_0, step2_1);
-          step1_1 = _mm_sub_epi16(step3_0, step2_1);
-          step1_2 = _mm_sub_epi16(step3_3, step2_2);
-          step1_3 = _mm_add_epi16(step3_3, step2_2);
-          step1_4 = _mm_add_epi16(step3_4, step2_5);
-          step1_5 = _mm_sub_epi16(step3_4, step2_5);
-          step1_6 = _mm_sub_epi16(step3_7, step2_6);
-          step1_7 = _mm_add_epi16(step3_7, step2_6);
-        }
-        // step 6
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
-          const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
-          const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
-          const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
-          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02);
-          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02);
-          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18);
-          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18);
-          // dct_const_round_shift
-          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-          // Combine
-          res01 = _mm_packs_epi32(w0, w1);
-          res09 = _mm_packs_epi32(w2, w3);
-        }
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
-          const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
-          const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
-          const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
-          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p22_p10);
-          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10);
-          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26);
-          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26);
-          // dct_const_round_shift
-          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-          // Combine
-          res05 = _mm_packs_epi32(w0, w1);
-          res13 = _mm_packs_epi32(w2, w3);
-        }
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
-          const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
-          const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
-          const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
-          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m10_p22);
-          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22);
-          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m26_p06);
-          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06);
-          // dct_const_round_shift
-          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-          // Combine
-          res11 = _mm_packs_epi32(w0, w1);
-          res03 = _mm_packs_epi32(w2, w3);
-        }
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
-          const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
-          const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
-          const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
-          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30);
-          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30);
-          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14);
-          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14);
-          // dct_const_round_shift
-          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-          // Combine
-          res15 = _mm_packs_epi32(w0, w1);
-          res07 = _mm_packs_epi32(w2, w3);
-        }
-      }
-      // Transpose the results, do it as two 8x8 transposes.
-      {
-        // 00 01 02 03 04 05 06 07
-        // 10 11 12 13 14 15 16 17
-        // 20 21 22 23 24 25 26 27
-        // 30 31 32 33 34 35 36 37
-        // 40 41 42 43 44 45 46 47
-        // 50 51 52 53 54 55 56 57
-        // 60 61 62 63 64 65 66 67
-        // 70 71 72 73 74 75 76 77
-        const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01);
-        const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03);
-        const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01);
-        const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03);
-        const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05);
-        const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07);
-        const __m128i tr0_6 = _mm_unpackhi_epi16(res04, res05);
-        const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07);
-        // 00 10 01 11 02 12 03 13
-        // 20 30 21 31 22 32 23 33
-        // 04 14 05 15 06 16 07 17
-        // 24 34 25 35 26 36 27 37
-        // 40 50 41 51 42 52 43 53
-        // 60 70 61 71 62 72 63 73
-        // 54 54 55 55 56 56 57 57
-        // 64 74 65 75 66 76 67 77
-        const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-        const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-        const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-        const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-        const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-        const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
-        const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-        const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-        // 00 10 20 30 01 11 21 31
-        // 40 50 60 70 41 51 61 71
-        // 02 12 22 32 03 13 23 33
-        // 42 52 62 72 43 53 63 73
-        // 04 14 24 34 05 15 21 36
-        // 44 54 64 74 45 55 61 76
-        // 06 16 26 36 07 17 27 37
-        // 46 56 66 76 47 57 67 77
-        const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
-        const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
-        const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
-        const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
-        const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
-        const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
-        const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
-        const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
-        // 00 10 20 30 40 50 60 70
-        // 01 11 21 31 41 51 61 71
-        // 02 12 22 32 42 52 62 72
-        // 03 13 23 33 43 53 63 73
-        // 04 14 24 34 44 54 64 74
-        // 05 15 25 35 45 55 65 75
-        // 06 16 26 36 46 56 66 76
-        // 07 17 27 37 47 57 67 77
-        _mm_storeu_si128((__m128i *)(out + 0 * 16), tr2_0);
-        _mm_storeu_si128((__m128i *)(out + 1 * 16), tr2_1);
-        _mm_storeu_si128((__m128i *)(out + 2 * 16), tr2_2);
-        _mm_storeu_si128((__m128i *)(out + 3 * 16), tr2_3);
-        _mm_storeu_si128((__m128i *)(out + 4 * 16), tr2_4);
-        _mm_storeu_si128((__m128i *)(out + 5 * 16), tr2_5);
-        _mm_storeu_si128((__m128i *)(out + 6 * 16), tr2_6);
-        _mm_storeu_si128((__m128i *)(out + 7 * 16), tr2_7);
-      }
-      {
-        // 00 01 02 03 04 05 06 07
-        // 10 11 12 13 14 15 16 17
-        // 20 21 22 23 24 25 26 27
-        // 30 31 32 33 34 35 36 37
-        // 40 41 42 43 44 45 46 47
-        // 50 51 52 53 54 55 56 57
-        // 60 61 62 63 64 65 66 67
-        // 70 71 72 73 74 75 76 77
-        const __m128i tr0_0 = _mm_unpacklo_epi16(res08, res09);
-        const __m128i tr0_1 = _mm_unpacklo_epi16(res10, res11);
-        const __m128i tr0_2 = _mm_unpackhi_epi16(res08, res09);
-        const __m128i tr0_3 = _mm_unpackhi_epi16(res10, res11);
-        const __m128i tr0_4 = _mm_unpacklo_epi16(res12, res13);
-        const __m128i tr0_5 = _mm_unpacklo_epi16(res14, res15);
-        const __m128i tr0_6 = _mm_unpackhi_epi16(res12, res13);
-        const __m128i tr0_7 = _mm_unpackhi_epi16(res14, res15);
-        // 00 10 01 11 02 12 03 13
-        // 20 30 21 31 22 32 23 33
-        // 04 14 05 15 06 16 07 17
-        // 24 34 25 35 26 36 27 37
-        // 40 50 41 51 42 52 43 53
-        // 60 70 61 71 62 72 63 73
-        // 54 54 55 55 56 56 57 57
-        // 64 74 65 75 66 76 67 77
-        const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-        const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-        const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-        const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-        const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-        const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
-        const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-        const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-        // 00 10 20 30 01 11 21 31
-        // 40 50 60 70 41 51 61 71
-        // 02 12 22 32 03 13 23 33
-        // 42 52 62 72 43 53 63 73
-        // 04 14 24 34 05 15 21 36
-        // 44 54 64 74 45 55 61 76
-        // 06 16 26 36 07 17 27 37
-        // 46 56 66 76 47 57 67 77
-        const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
-        const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
-        const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
-        const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
-        const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
-        const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
-        const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
-        const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
-        // 00 10 20 30 40 50 60 70
-        // 01 11 21 31 41 51 61 71
-        // 02 12 22 32 42 52 62 72
-        // 03 13 23 33 43 53 63 73
-        // 04 14 24 34 44 54 64 74
-        // 05 15 25 35 45 55 65 75
-        // 06 16 26 36 46 56 66 76
-        // 07 17 27 37 47 57 67 77
-        // Store results
-        _mm_store_si128((__m128i *)(out + 8 + 0 * 16), tr2_0);
-        _mm_store_si128((__m128i *)(out + 8 + 1 * 16), tr2_1);
-        _mm_store_si128((__m128i *)(out + 8 + 2 * 16), tr2_2);
-        _mm_store_si128((__m128i *)(out + 8 + 3 * 16), tr2_3);
-        _mm_store_si128((__m128i *)(out + 8 + 4 * 16), tr2_4);
-        _mm_store_si128((__m128i *)(out + 8 + 5 * 16), tr2_5);
-        _mm_store_si128((__m128i *)(out + 8 + 6 * 16), tr2_6);
-        _mm_store_si128((__m128i *)(out + 8 + 7 * 16), tr2_7);
-      }
-      out += 8*16;
-    }
-    // Setup in/out for next pass.
-    in = intermediate;
-    out = output;
-  }
-}
-
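Worth noting how the two passes of vp9_fdct16x16_avx2() keep precision: pass 0 pre-scales samples by four (x << 2) before the column transform, and pass 1 removes that scaling while re-loading the intermediate with (x + 1) >> 2. A scalar model of the conditioning (my sketch, not from the diff):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      int16_t x = 37;
      int16_t pre = (int16_t)(x << 2);          /* pass-0 pre-condition */
      int16_t back = (int16_t)((pre + 1) >> 2); /* pass-1 load rounding */
      /* The +1 makes the shift round to nearest for the values the
       * column transform actually produces, not just exact multiples
       * of four. */
      printf("%d -> %d -> %d\n", x, pre, back);
      return 0;
    }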
-static INLINE void load_buffer_16x16_avx2(const int16_t* input, __m128i *in0,
-                                          __m128i *in1, int stride) {
-  // load first 8 columns
-  load_buffer_8x8_avx2(input, in0, stride);
-  load_buffer_8x8_avx2(input + 8 * stride, in0 + 8, stride);
-
-  input += 8;
-  // load second 8 columns
-  load_buffer_8x8_avx2(input, in1, stride);
-  load_buffer_8x8_avx2(input + 8 * stride, in1 + 8, stride);
-}
-
-static INLINE void write_buffer_16x16_avx2(int16_t *output, __m128i *in0,
-                                           __m128i *in1, int stride) {
-  // write first 8 columns
-  write_buffer_8x8_avx2(output, in0, stride);
-  write_buffer_8x8_avx2(output + 8 * stride, in0 + 8, stride);
-  // write second 8 columns
-  output += 8;
-  write_buffer_8x8_avx2(output, in1, stride);
-  write_buffer_8x8_avx2(output + 8 * stride, in1 + 8, stride);
-}
-
-static INLINE void array_transpose_16x16_avx2(__m128i *res0, __m128i *res1) {
-  __m128i tbuf[8];
-  array_transpose_8x8_avx2(res0, res0);
-  array_transpose_8x8_avx2(res1, tbuf);
-  array_transpose_8x8_avx2(res0 + 8, res1);
-  array_transpose_8x8_avx2(res1 + 8, res1 + 8);
-
-  res0[8] = tbuf[0];
-  res0[9] = tbuf[1];
-  res0[10] = tbuf[2];
-  res0[11] = tbuf[3];
-  res0[12] = tbuf[4];
-  res0[13] = tbuf[5];
-  res0[14] = tbuf[6];
-  res0[15] = tbuf[7];
-}
-
-static INLINE void right_shift_16x16_avx2(__m128i *res0, __m128i *res1) {
-  // perform rounding operations
-  right_shift_8x8_avx2(res0, 2);
-  right_shift_8x8_avx2(res0 + 8, 2);
-  right_shift_8x8_avx2(res1, 2);
-  right_shift_8x8_avx2(res1 + 8, 2);
-}
-
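array_transpose_16x16_avx2() builds the 16x16 transpose from four 8x8 block transposes: each quadrant is transposed and the two off-diagonal quadrants swap places, with tbuf[] holding one quadrant during the swap. The same structure in plain C (scalar analogue, illustrative only):

    #include <stdint.h>

    /* Transpose one 8x8 block of a 16x16 matrix: block at (r, c) in src
     * lands, transposed, at (dr, dc) in dst. */
    static void transpose8_block(const int16_t src[16][16], int r, int c,
                                 int16_t dst[16][16], int dr, int dc) {
      int i, j;
      for (i = 0; i < 8; ++i)
        for (j = 0; j < 8; ++j)
          dst[dr + j][dc + i] = src[r + i][c + j];
    }

    /* M = [[A B],[C D]] transposes to [[A' C'],[B' D']]: diagonal blocks
     * transpose in place, off-diagonal blocks transpose and swap. */
    static void transpose16(const int16_t src[16][16], int16_t dst[16][16]) {
      transpose8_block(src, 0, 0, dst, 0, 0); /* A -> upper-left  */
      transpose8_block(src, 0, 8, dst, 8, 0); /* B -> lower-left  */
      transpose8_block(src, 8, 0, dst, 0, 8); /* C -> upper-right */
      transpose8_block(src, 8, 8, dst, 8, 8); /* D -> lower-right */
    }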
in[10]); - s[3] = _mm_sub_epi16(in[4], in[11]); - s[4] = _mm_sub_epi16(in[3], in[12]); - s[5] = _mm_sub_epi16(in[2], in[13]); - s[6] = _mm_sub_epi16(in[1], in[14]); - s[7] = _mm_sub_epi16(in[0], in[15]); - - p[0] = _mm_add_epi16(i[0], i[7]); - p[1] = _mm_add_epi16(i[1], i[6]); - p[2] = _mm_add_epi16(i[2], i[5]); - p[3] = _mm_add_epi16(i[3], i[4]); - p[4] = _mm_sub_epi16(i[3], i[4]); - p[5] = _mm_sub_epi16(i[2], i[5]); - p[6] = _mm_sub_epi16(i[1], i[6]); - p[7] = _mm_sub_epi16(i[0], i[7]); - - u[0] = _mm_add_epi16(p[0], p[3]); - u[1] = _mm_add_epi16(p[1], p[2]); - u[2] = _mm_sub_epi16(p[1], p[2]); - u[3] = _mm_sub_epi16(p[0], p[3]); - - v[0] = _mm_unpacklo_epi16(u[0], u[1]); - v[1] = _mm_unpackhi_epi16(u[0], u[1]); - v[2] = _mm_unpacklo_epi16(u[2], u[3]); - v[3] = _mm_unpackhi_epi16(u[2], u[3]); - - u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); - u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16); - u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16); - u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16); - u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08); - u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08); - u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24); - u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - - in[0] = _mm_packs_epi32(u[0], u[1]); - in[4] = _mm_packs_epi32(u[4], u[5]); - in[8] = _mm_packs_epi32(u[2], u[3]); - in[12] = _mm_packs_epi32(u[6], u[7]); - - u[0] = _mm_unpacklo_epi16(p[5], p[6]); - u[1] = _mm_unpackhi_epi16(p[5], p[6]); - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - - u[0] = _mm_packs_epi32(v[0], v[1]); - u[1] = _mm_packs_epi32(v[2], v[3]); - - t[0] = _mm_add_epi16(p[4], u[0]); - t[1] = _mm_sub_epi16(p[4], u[0]); - t[2] = _mm_sub_epi16(p[7], u[1]); - t[3] = _mm_add_epi16(p[7], u[1]); - - u[0] = _mm_unpacklo_epi16(t[0], t[3]); - u[1] = _mm_unpackhi_epi16(t[0], t[3]); - u[2] = _mm_unpacklo_epi16(t[1], t[2]); - u[3] = _mm_unpackhi_epi16(t[1], t[2]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04); - v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04); - v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20); - v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20); - v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12); - v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12); - v[6] = 
_mm_madd_epi16(u[0], k__cospi_m04_p28); - v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - in[2] = _mm_packs_epi32(v[0], v[1]); - in[6] = _mm_packs_epi32(v[4], v[5]); - in[10] = _mm_packs_epi32(v[2], v[3]); - in[14] = _mm_packs_epi32(v[6], v[7]); - - // stage 2 - u[0] = _mm_unpacklo_epi16(s[2], s[5]); - u[1] = _mm_unpackhi_epi16(s[2], s[5]); - u[2] = _mm_unpacklo_epi16(s[3], s[4]); - u[3] = _mm_unpackhi_epi16(s[3], s[4]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); - v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16); - v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16); - v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); - v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); - v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - t[2] = _mm_packs_epi32(v[0], v[1]); - t[3] = _mm_packs_epi32(v[2], v[3]); - t[4] = _mm_packs_epi32(v[4], v[5]); - t[5] = _mm_packs_epi32(v[6], v[7]); - - // stage 3 - p[0] = _mm_add_epi16(s[0], t[3]); - p[1] = _mm_add_epi16(s[1], t[2]); - p[2] = _mm_sub_epi16(s[1], t[2]); - p[3] = _mm_sub_epi16(s[0], t[3]); - p[4] = _mm_sub_epi16(s[7], t[4]); - p[5] = _mm_sub_epi16(s[6], t[5]); - p[6] = _mm_add_epi16(s[6], t[5]); - p[7] = _mm_add_epi16(s[7], t[4]); - - // stage 4 - u[0] = _mm_unpacklo_epi16(p[1], p[6]); - u[1] = _mm_unpackhi_epi16(p[1], p[6]); - u[2] = _mm_unpacklo_epi16(p[2], p[5]); - u[3] = _mm_unpackhi_epi16(p[2], p[5]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24); - v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24); - v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08); - v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08); - v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24); - v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24); - v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08); - v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], 
k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - t[1] = _mm_packs_epi32(v[0], v[1]); - t[2] = _mm_packs_epi32(v[2], v[3]); - t[5] = _mm_packs_epi32(v[4], v[5]); - t[6] = _mm_packs_epi32(v[6], v[7]); - - // stage 5 - s[0] = _mm_add_epi16(p[0], t[1]); - s[1] = _mm_sub_epi16(p[0], t[1]); - s[2] = _mm_sub_epi16(p[3], t[2]); - s[3] = _mm_add_epi16(p[3], t[2]); - s[4] = _mm_add_epi16(p[4], t[5]); - s[5] = _mm_sub_epi16(p[4], t[5]); - s[6] = _mm_sub_epi16(p[7], t[6]); - s[7] = _mm_add_epi16(p[7], t[6]); - - // stage 6 - u[0] = _mm_unpacklo_epi16(s[0], s[7]); - u[1] = _mm_unpackhi_epi16(s[0], s[7]); - u[2] = _mm_unpacklo_epi16(s[1], s[6]); - u[3] = _mm_unpackhi_epi16(s[1], s[6]); - u[4] = _mm_unpacklo_epi16(s[2], s[5]); - u[5] = _mm_unpackhi_epi16(s[2], s[5]); - u[6] = _mm_unpacklo_epi16(s[3], s[4]); - u[7] = _mm_unpackhi_epi16(s[3], s[4]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02); - v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02); - v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18); - v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18); - v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10); - v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10); - v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26); - v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26); - v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06); - v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06); - v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22); - v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22); - v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14); - v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14); - v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30); - v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - 
v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - in[1] = _mm_packs_epi32(v[0], v[1]); - in[9] = _mm_packs_epi32(v[2], v[3]); - in[5] = _mm_packs_epi32(v[4], v[5]); - in[13] = _mm_packs_epi32(v[6], v[7]); - in[3] = _mm_packs_epi32(v[8], v[9]); - in[11] = _mm_packs_epi32(v[10], v[11]); - in[7] = _mm_packs_epi32(v[12], v[13]); - in[15] = _mm_packs_epi32(v[14], v[15]); -} - -void fadst16_8col_avx2(__m128i *in) { - // perform 16x16 1-D ADST for 8 columns - __m128i s[16], x[16], u[32], v[32]; - const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); - const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); - const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); - const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); - const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); - const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); - const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64); - const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); - const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); - const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); - const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); - const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i kZero = _mm_set1_epi16(0); - - u[0] = _mm_unpacklo_epi16(in[15], in[0]); - u[1] = _mm_unpackhi_epi16(in[15], in[0]); - u[2] = _mm_unpacklo_epi16(in[13], in[2]); - u[3] = _mm_unpackhi_epi16(in[13], in[2]); - u[4] = _mm_unpacklo_epi16(in[11], in[4]); - u[5] = _mm_unpackhi_epi16(in[11], in[4]); - u[6] = _mm_unpacklo_epi16(in[9], in[6]); - u[7] = _mm_unpackhi_epi16(in[9], in[6]); - u[8] = _mm_unpacklo_epi16(in[7], in[8]); 
- u[9] = _mm_unpackhi_epi16(in[7], in[8]); - u[10] = _mm_unpacklo_epi16(in[5], in[10]); - u[11] = _mm_unpackhi_epi16(in[5], in[10]); - u[12] = _mm_unpacklo_epi16(in[3], in[12]); - u[13] = _mm_unpackhi_epi16(in[3], in[12]); - u[14] = _mm_unpacklo_epi16(in[1], in[14]); - u[15] = _mm_unpackhi_epi16(in[1], in[14]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); - v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31); - v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); - v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); - v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); - v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); - v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); - v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); - v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); - v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); - v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); - v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); - v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); - v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); - v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); - v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); - v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15); - v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); - v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); - v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); - v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); - v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); - v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); - v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); - v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); - v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); - v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); - v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); - v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); - v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); - v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); - v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); - - u[0] = _mm_add_epi32(v[0], v[16]); - u[1] = _mm_add_epi32(v[1], v[17]); - u[2] = _mm_add_epi32(v[2], v[18]); - u[3] = _mm_add_epi32(v[3], v[19]); - u[4] = _mm_add_epi32(v[4], v[20]); - u[5] = _mm_add_epi32(v[5], v[21]); - u[6] = _mm_add_epi32(v[6], v[22]); - u[7] = _mm_add_epi32(v[7], v[23]); - u[8] = _mm_add_epi32(v[8], v[24]); - u[9] = _mm_add_epi32(v[9], v[25]); - u[10] = _mm_add_epi32(v[10], v[26]); - u[11] = _mm_add_epi32(v[11], v[27]); - u[12] = _mm_add_epi32(v[12], v[28]); - u[13] = _mm_add_epi32(v[13], v[29]); - u[14] = _mm_add_epi32(v[14], v[30]); - u[15] = _mm_add_epi32(v[15], v[31]); - u[16] = _mm_sub_epi32(v[0], v[16]); - u[17] = _mm_sub_epi32(v[1], v[17]); - u[18] = _mm_sub_epi32(v[2], v[18]); - u[19] = _mm_sub_epi32(v[3], v[19]); - u[20] = _mm_sub_epi32(v[4], v[20]); - u[21] = _mm_sub_epi32(v[5], v[21]); - u[22] = _mm_sub_epi32(v[6], v[22]); - u[23] = _mm_sub_epi32(v[7], v[23]); - u[24] = _mm_sub_epi32(v[8], v[24]); - u[25] = _mm_sub_epi32(v[9], v[25]); - u[26] = _mm_sub_epi32(v[10], v[26]); - u[27] = _mm_sub_epi32(v[11], v[27]); - u[28] = _mm_sub_epi32(v[12], v[28]); - u[29] = _mm_sub_epi32(v[13], v[29]); - u[30] = _mm_sub_epi32(v[14], v[30]); - u[31] = _mm_sub_epi32(v[15], v[31]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], 
k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); - v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); - v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); - v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); - v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); - v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); - v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); - v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); - v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); - v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); - v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); - v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); - v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); - v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); - v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); - v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); - u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); - u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); - u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); - u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); - u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); - u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); - u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); - u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); - u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); - u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); - u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); - u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); - u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); - u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); - u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); - - s[0] = _mm_packs_epi32(u[0], u[1]); - s[1] = _mm_packs_epi32(u[2], u[3]); - s[2] = _mm_packs_epi32(u[4], u[5]); - s[3] = _mm_packs_epi32(u[6], u[7]); - s[4] = _mm_packs_epi32(u[8], u[9]); - s[5] = _mm_packs_epi32(u[10], u[11]); - s[6] = _mm_packs_epi32(u[12], u[13]); - s[7] = _mm_packs_epi32(u[14], u[15]); - s[8] = _mm_packs_epi32(u[16], u[17]); - s[9] = _mm_packs_epi32(u[18], u[19]); - s[10] = _mm_packs_epi32(u[20], u[21]); - s[11] = _mm_packs_epi32(u[22], u[23]); - s[12] = _mm_packs_epi32(u[24], u[25]); - s[13] = _mm_packs_epi32(u[26], u[27]); - s[14] = _mm_packs_epi32(u[28], u[29]); - s[15] = _mm_packs_epi32(u[30], u[31]); - - // stage 2 - u[0] = 
_mm_unpacklo_epi16(s[8], s[9]); - u[1] = _mm_unpackhi_epi16(s[8], s[9]); - u[2] = _mm_unpacklo_epi16(s[10], s[11]); - u[3] = _mm_unpackhi_epi16(s[10], s[11]); - u[4] = _mm_unpacklo_epi16(s[12], s[13]); - u[5] = _mm_unpackhi_epi16(s[12], s[13]); - u[6] = _mm_unpacklo_epi16(s[14], s[15]); - u[7] = _mm_unpackhi_epi16(s[14], s[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); - v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); - v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); - v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); - v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); - v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); - v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20); - v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20); - v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04); - v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); - v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); - v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); - v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); - v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); - v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); - v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); - - u[0] = _mm_add_epi32(v[0], v[8]); - u[1] = _mm_add_epi32(v[1], v[9]); - u[2] = _mm_add_epi32(v[2], v[10]); - u[3] = _mm_add_epi32(v[3], v[11]); - u[4] = _mm_add_epi32(v[4], v[12]); - u[5] = _mm_add_epi32(v[5], v[13]); - u[6] = _mm_add_epi32(v[6], v[14]); - u[7] = _mm_add_epi32(v[7], v[15]); - u[8] = _mm_sub_epi32(v[0], v[8]); - u[9] = _mm_sub_epi32(v[1], v[9]); - u[10] = _mm_sub_epi32(v[2], v[10]); - u[11] = _mm_sub_epi32(v[3], v[11]); - u[12] = _mm_sub_epi32(v[4], v[12]); - u[13] = _mm_sub_epi32(v[5], v[13]); - u[14] = _mm_sub_epi32(v[6], v[14]); - u[15] = _mm_sub_epi32(v[7], v[15]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - - x[0] = _mm_add_epi16(s[0], s[4]); - x[1] = _mm_add_epi16(s[1], s[5]); - x[2] = _mm_add_epi16(s[2], s[6]); - x[3] = _mm_add_epi16(s[3], s[7]); - x[4] = _mm_sub_epi16(s[0], s[4]); - x[5] = 
_mm_sub_epi16(s[1], s[5]); - x[6] = _mm_sub_epi16(s[2], s[6]); - x[7] = _mm_sub_epi16(s[3], s[7]); - x[8] = _mm_packs_epi32(u[0], u[1]); - x[9] = _mm_packs_epi32(u[2], u[3]); - x[10] = _mm_packs_epi32(u[4], u[5]); - x[11] = _mm_packs_epi32(u[6], u[7]); - x[12] = _mm_packs_epi32(u[8], u[9]); - x[13] = _mm_packs_epi32(u[10], u[11]); - x[14] = _mm_packs_epi32(u[12], u[13]); - x[15] = _mm_packs_epi32(u[14], u[15]); - - // stage 3 - u[0] = _mm_unpacklo_epi16(x[4], x[5]); - u[1] = _mm_unpackhi_epi16(x[4], x[5]); - u[2] = _mm_unpacklo_epi16(x[6], x[7]); - u[3] = _mm_unpackhi_epi16(x[6], x[7]); - u[4] = _mm_unpacklo_epi16(x[12], x[13]); - u[5] = _mm_unpackhi_epi16(x[12], x[13]); - u[6] = _mm_unpacklo_epi16(x[14], x[15]); - u[7] = _mm_unpackhi_epi16(x[14], x[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); - v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); - v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); - v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); - v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); - v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); - v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); - v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); - v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); - v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); - v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08); - v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); - v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08); - v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); - v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); - v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24); - - u[0] = _mm_add_epi32(v[0], v[4]); - u[1] = _mm_add_epi32(v[1], v[5]); - u[2] = _mm_add_epi32(v[2], v[6]); - u[3] = _mm_add_epi32(v[3], v[7]); - u[4] = _mm_sub_epi32(v[0], v[4]); - u[5] = _mm_sub_epi32(v[1], v[5]); - u[6] = _mm_sub_epi32(v[2], v[6]); - u[7] = _mm_sub_epi32(v[3], v[7]); - u[8] = _mm_add_epi32(v[8], v[12]); - u[9] = _mm_add_epi32(v[9], v[13]); - u[10] = _mm_add_epi32(v[10], v[14]); - u[11] = _mm_add_epi32(v[11], v[15]); - u[12] = _mm_sub_epi32(v[8], v[12]); - u[13] = _mm_sub_epi32(v[9], v[13]); - u[14] = _mm_sub_epi32(v[10], v[14]); - u[15] = _mm_sub_epi32(v[11], v[15]); - - u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = 
_mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - s[0] = _mm_add_epi16(x[0], x[2]); - s[1] = _mm_add_epi16(x[1], x[3]); - s[2] = _mm_sub_epi16(x[0], x[2]); - s[3] = _mm_sub_epi16(x[1], x[3]); - s[4] = _mm_packs_epi32(v[0], v[1]); - s[5] = _mm_packs_epi32(v[2], v[3]); - s[6] = _mm_packs_epi32(v[4], v[5]); - s[7] = _mm_packs_epi32(v[6], v[7]); - s[8] = _mm_add_epi16(x[8], x[10]); - s[9] = _mm_add_epi16(x[9], x[11]); - s[10] = _mm_sub_epi16(x[8], x[10]); - s[11] = _mm_sub_epi16(x[9], x[11]); - s[12] = _mm_packs_epi32(v[8], v[9]); - s[13] = _mm_packs_epi32(v[10], v[11]); - s[14] = _mm_packs_epi32(v[12], v[13]); - s[15] = _mm_packs_epi32(v[14], v[15]); - - // stage 4 - u[0] = _mm_unpacklo_epi16(s[2], s[3]); - u[1] = _mm_unpackhi_epi16(s[2], s[3]); - u[2] = _mm_unpacklo_epi16(s[6], s[7]); - u[3] = _mm_unpackhi_epi16(s[6], s[7]); - u[4] = _mm_unpacklo_epi16(s[10], s[11]); - u[5] = _mm_unpackhi_epi16(s[10], s[11]); - u[6] = _mm_unpacklo_epi16(s[14], s[15]); - u[7] = _mm_unpackhi_epi16(s[14], s[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); - v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); - v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); - v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); - v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); - v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); - v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); - v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); - v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); - v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); - v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); - v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); - v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], 
DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - in[0] = s[0]; - in[1] = _mm_sub_epi16(kZero, s[8]); - in[2] = s[12]; - in[3] = _mm_sub_epi16(kZero, s[4]); - in[4] = _mm_packs_epi32(v[4], v[5]); - in[5] = _mm_packs_epi32(v[12], v[13]); - in[6] = _mm_packs_epi32(v[8], v[9]); - in[7] = _mm_packs_epi32(v[0], v[1]); - in[8] = _mm_packs_epi32(v[2], v[3]); - in[9] = _mm_packs_epi32(v[10], v[11]); - in[10] = _mm_packs_epi32(v[14], v[15]); - in[11] = _mm_packs_epi32(v[6], v[7]); - in[12] = s[5]; - in[13] = _mm_sub_epi16(kZero, s[13]); - in[14] = s[9]; - in[15] = _mm_sub_epi16(kZero, s[1]); -} - -void fdct16_avx2(__m128i *in0, __m128i *in1) { - fdct16_8col_avx2(in0); - fdct16_8col_avx2(in1); - array_transpose_16x16_avx2(in0, in1); -} - -void fadst16_avx2(__m128i *in0, __m128i *in1) { - fadst16_8col_avx2(in0); - fadst16_8col_avx2(in1); - array_transpose_16x16_avx2(in0, in1); -} - -void vp9_fht16x16_avx2(const int16_t *input, int16_t *output, - int stride, int tx_type) { - __m128i in0[16], in1[16]; - - switch (tx_type) { - case DCT_DCT: - vp9_fdct16x16_avx2(input, output, stride); - break; - case ADST_DCT: - load_buffer_16x16_avx2(input, in0, in1, stride); - fadst16_avx2(in0, in1); - right_shift_16x16_avx2(in0, in1); - fdct16_avx2(in0, in1); - write_buffer_16x16_avx2(output, in0, in1, 16); - break; - case DCT_ADST: - load_buffer_16x16_avx2(input, in0, in1, stride); - fdct16_avx2(in0, in1); - right_shift_16x16_avx2(in0, in1); - fadst16_avx2(in0, in1); - write_buffer_16x16_avx2(output, in0, in1, 16); - break; - case ADST_ADST: - load_buffer_16x16_avx2(input, in0, in1, stride); - fadst16_avx2(in0, in1); - right_shift_16x16_avx2(in0, in1); - fadst16_avx2(in0, in1); - write_buffer_16x16_avx2(output, in0, in1, 16); - break; - default: - assert(0); - break; - } -} #define FDCT32x32_2D_AVX2 vp9_fdct32x32_rd_avx2 #define FDCT32x32_HIGH_PRECISION 0 diff --git a/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c b/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c index f31b176e5..1feed6256 100644 --- a/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c +++ b/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c @@ -31,7 +31,7 @@ void vp9_sad32x32x4d_avx2(uint8_t *src, sum_ref3 = _mm256_set1_epi16(0); for (i = 0; i < 32 ; i++) { // load src and all refs - src_reg = _mm256_load_si256((__m256i *)(src)); + src_reg = _mm256_loadu_si256((__m256i *)(src)); ref0_reg = _mm256_loadu_si256((__m256i *) (ref0)); ref1_reg = _mm256_loadu_si256((__m256i *) (ref1)); ref2_reg = _mm256_loadu_si256((__m256i *) (ref2)); @@ -103,8 +103,8 @@ void vp9_sad64x64x4d_avx2(uint8_t *src, sum_ref3 = _mm256_set1_epi16(0); for (i = 0; i < 64 ; i++) { // load 64 bytes from src and all refs - src_reg = _mm256_load_si256((__m256i *)(src)); - srcnext_reg = _mm256_load_si256((__m256i *)(src + 32)); + src_reg = _mm256_loadu_si256((__m256i *)(src)); + srcnext_reg = _mm256_loadu_si256((__m256i *)(src + 32)); ref0_reg = _mm256_loadu_si256((__m256i *) (ref0)); ref0next_reg = _mm256_loadu_si256((__m256i *) (ref0 + 32)); ref1_reg = _mm256_loadu_si256((__m256i *) (ref1)); diff --git a/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c b/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c index 34ed1867f..a441cadaf 100644 --- a/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c +++ b/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c @@ -67,7 +67,7 @@ DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { #define LOAD_SRC_DST \ /* load source and destination */ \ src_reg = _mm256_loadu_si256((__m256i const *) (src)); \ - dst_reg = 
_mm256_load_si256((__m256i const *) (dst)); + dst_reg = _mm256_loadu_si256((__m256i const *) (dst)); #define AVG_NEXT_SRC(src_reg, size_stride) \ src_next_reg = _mm256_loadu_si256((__m256i const *) \ @@ -333,7 +333,7 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, if (y_offset == 0) { for (i = 0; i < height ; i++) { LOAD_SRC_DST - sec_reg = _mm256_load_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); src_reg = _mm256_avg_epu8(src_reg, sec_reg); sec+= sec_stride; // expend each byte to 2 bytes @@ -347,7 +347,7 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, for (i = 0; i < height ; i++) { LOAD_SRC_DST AVG_NEXT_SRC(src_reg, src_stride) - sec_reg = _mm256_load_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); src_reg = _mm256_avg_epu8(src_reg, sec_reg); sec+= sec_stride; // expend each byte to 2 bytes @@ -369,7 +369,7 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, MERGE_NEXT_SRC(src_reg, src_stride) FILTER_SRC(filter) src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_load_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); src_reg = _mm256_avg_epu8(src_reg, sec_reg); sec+= sec_stride; MERGE_WITH_SRC(src_reg, zero_reg) @@ -385,7 +385,7 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, for (i = 0; i < height ; i++) { LOAD_SRC_DST AVG_NEXT_SRC(src_reg, 1) - sec_reg = _mm256_load_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); src_reg = _mm256_avg_epu8(src_reg, sec_reg); sec+= sec_stride; // expand each byte to 2 bytes @@ -409,7 +409,7 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, AVG_NEXT_SRC(src_reg, 1) // average between previous average to current average src_avg = _mm256_avg_epu8(src_avg, src_reg); - sec_reg = _mm256_load_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); src_avg = _mm256_avg_epu8(src_avg, sec_reg); sec+= sec_stride; // expand each byte to 2 bytes @@ -437,7 +437,7 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, MERGE_WITH_SRC(src_avg, src_reg) FILTER_SRC(filter) src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_load_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); src_avg = _mm256_avg_epu8(src_avg, sec_reg); // expand each byte to 2 bytes MERGE_WITH_SRC(src_avg, zero_reg) @@ -459,7 +459,7 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, MERGE_NEXT_SRC(src_reg, 1) FILTER_SRC(filter) src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_load_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); src_reg = _mm256_avg_epu8(src_reg, sec_reg); MERGE_WITH_SRC(src_reg, zero_reg) sec+= sec_stride; @@ -487,7 +487,7 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); // average between previous pack to the current src_pack = _mm256_avg_epu8(src_pack, src_reg); - sec_reg = _mm256_load_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); src_pack = _mm256_avg_epu8(src_pack, sec_reg); sec+= sec_stride; MERGE_WITH_SRC(src_pack, zero_reg) @@ -524,7 +524,7 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, // filter the source FILTER_SRC(yfilter) src_pack = 
_mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_load_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); src_pack = _mm256_avg_epu8(src_pack, sec_reg); MERGE_WITH_SRC(src_pack, zero_reg) src_pack = src_reg; diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index ba9750032..86ad00501 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -67,7 +67,6 @@ VP9_COMMON_SRCS-yes += common/vp9_common_data.h VP9_COMMON_SRCS-yes += common/vp9_scan.c VP9_COMMON_SRCS-yes += common/vp9_scan.h -VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_postproc_x86.h VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_loopfilter_intrin_avx2.c diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 3d0e6382b..11b8205d9 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -39,6 +39,8 @@ struct vp9_extracfg { unsigned int frame_parallel_decoding_mode; AQ_MODE aq_mode; unsigned int frame_periodic_boost; + vpx_bit_depth_t bit_depth; + vp9e_tune_content content; }; struct extraconfig_map { @@ -68,6 +70,8 @@ static const struct extraconfig_map extracfg_map[] = { 0, // frame_parallel_decoding_mode NO_AQ, // aq_mode 0, // frame_periodic_delta_q + VPX_BITS_8, // Bit depth + VP9E_CONTENT_DEFAULT // content } } }; @@ -174,7 +178,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK(cfg, ss_number_layers, 1, VPX_SS_MAX_LAYERS); -#ifdef CONFIG_SPATIAL_SVC +#if CONFIG_SPATIAL_SVC if (cfg->ss_number_layers > 1) { unsigned int i, alt_ref_sum = 0; for (i = 0; i < cfg->ss_number_layers; ++i) { @@ -219,6 +223,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK(extra_cfg, cq_level, 0, 63); RANGE_CHECK(cfg, g_bit_depth, VPX_BITS_8, VPX_BITS_12); RANGE_CHECK(cfg, g_in_bit_depth, 8, 12); + RANGE_CHECK(extra_cfg, content, + VP9E_CONTENT_DEFAULT, VP9E_CONTENT_INVALID - 1); // TODO(yaowu): remove this when ssim tuning is implemented for vp9 if (extra_cfg->tuning == VP8_TUNE_SSIM) @@ -312,6 +318,7 @@ static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx, default: ERROR("Invalid image format. 
Only YV12, I420, I422, I444 images are " "supported."); + break; } if (img->d_w != ctx->cfg.g_w || img->d_h != ctx->cfg.g_h) @@ -320,6 +327,20 @@ static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } +static int get_image_bps(const vpx_image_t *img) { + switch (img->fmt) { + case VPX_IMG_FMT_YV12: + case VPX_IMG_FMT_I420: return 12; + case VPX_IMG_FMT_I422: return 16; + case VPX_IMG_FMT_I444: return 24; + case VPX_IMG_FMT_I42016: return 24; + case VPX_IMG_FMT_I42216: return 32; + case VPX_IMG_FMT_I44416: return 48; + default: assert(0 && "Invalid image format"); break; + } + return 0; +} + static vpx_codec_err_t set_encoder_config( VP9EncoderConfig *oxcf, const vpx_codec_enc_cfg_t *cfg, @@ -330,19 +351,22 @@ static vpx_codec_err_t set_encoder_config( oxcf->bit_depth = cfg->g_bit_depth; oxcf->in_bit_depth = cfg->g_in_bit_depth; // guess a frame rate if out of whack, use 30 - oxcf->framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num; - if (oxcf->framerate > 180) - oxcf->framerate = 30; + oxcf->init_framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num; + if (oxcf->init_framerate > 180) + oxcf->init_framerate = 30; switch (cfg->g_pass) { case VPX_RC_ONE_PASS: oxcf->mode = ONE_PASS_GOOD; + oxcf->pass = 0; break; case VPX_RC_FIRST_PASS: oxcf->mode = TWO_PASS_FIRST; + oxcf->pass = 1; break; case VPX_RC_LAST_PASS: oxcf->mode = TWO_PASS_SECOND_BEST; + oxcf->pass = 2; break; } @@ -401,6 +425,7 @@ static vpx_codec_err_t set_encoder_config( oxcf->arnr_type = extra_cfg->arnr_type; oxcf->tuning = extra_cfg->tuning; + oxcf->content = extra_cfg->content; oxcf->tile_columns = extra_cfg->tile_columns; oxcf->tile_rows = extra_cfg->tile_rows; @@ -418,7 +443,7 @@ static vpx_codec_err_t set_encoder_config( int i; for (i = 0; i < VPX_SS_MAX_LAYERS; ++i) { oxcf->ss_target_bitrate[i] = 1000 * cfg->ss_target_bitrate[i]; -#ifdef CONFIG_SPATIAL_SVC +#if CONFIG_SPATIAL_SVC oxcf->ss_play_alternate[i] = cfg->ss_enable_auto_alt_ref[i]; #endif } @@ -687,22 +712,6 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx, priv->extra_cfg = extracfg_map[i].cfg; priv->extra_cfg.pkt_list = &priv->pkt_list.head; - // Maximum buffer size approximated based on having multiple ARF. 
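  // (Worked sizing example for orientation only; the arithmetic is
  // illustrative, not part of the patch: the fixed allocation removed here
  // assumed 12 bits/pixel (3/2 bytes) and up to 8 ARFs. The lazy replacement
  // in encoder_encode() below sizes from the real format instead: a 1920x1080
  // I420 frame has get_image_bps() == 12, so 1920 * 1080 * 12 / 8 == 3110400
  // bytes per frame, scaled by 8 only when multiple alt-refs are allowed and
  // by 2 otherwise.)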
-    priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 8;
-
-#if CONFIG_VP9_HIGH
-    if (ctx->init_flags&VPX_CODEC_USE_HIGH) {
-      priv->cx_data_sz <<= 1;
-    }
-#endif
-
-    if (priv->cx_data_sz < 4096)
-      priv->cx_data_sz = 4096;
-
-    priv->cx_data = (unsigned char *)malloc(priv->cx_data_sz);
-    if (priv->cx_data == NULL)
-      return VPX_CODEC_MEM_ERROR;
-
     vp9_initialize_enc();
 
     res = validate_config(priv, &priv->cfg, &priv->extra_cfg);
@@ -823,6 +832,20 @@ static int write_superframe_index(vpx_codec_alg_priv_t *ctx) {
   return index_sz;
 }
 
+// vp9 uses 10,000,000 ticks/second as its timestamp unit
+#define TICKS_PER_SEC 10000000LL
+
+static int64_t timebase_units_to_ticks(const vpx_rational_t *timebase,
+                                       int64_t n) {
+  return n * TICKS_PER_SEC * timebase->num / timebase->den;
+}
+
+static int64_t ticks_to_timebase_units(const vpx_rational_t *timebase,
+                                       int64_t n) {
+  const int64_t round = TICKS_PER_SEC * timebase->num / 2 - 1;
+  return (n * timebase->den + round) / timebase->num / TICKS_PER_SEC;
+}
+
 static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
                                       const vpx_image_t *img,
                                       vpx_codec_pts_t pts,
@@ -830,9 +853,26 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
                                       vpx_enc_frame_flags_t flags,
                                       unsigned long deadline) {
   vpx_codec_err_t res = VPX_CODEC_OK;
+  const vpx_rational_t *const timebase = &ctx->cfg.g_timebase;
 
-  if (img)
+  if (img != NULL) {
     res = validate_img(ctx, img);
+    // TODO(jzern) the checks related to cpi's validity should be treated as a
+    // failure condition; encoder setup is currently done fully in init().
+    if (res == VPX_CODEC_OK && ctx->cpi != NULL && ctx->cx_data == NULL) {
+      // There's no codec control for multiple alt-refs, so check the encoder
+      // instance for its status to determine the compressed data size.
+      ctx->cx_data_sz = ctx->cfg.g_w * ctx->cfg.g_h *
+                        get_image_bps(img) / 8 *
+                        (ctx->cpi->multi_arf_allowed ?
8 : 2); + if (ctx->cx_data_sz < 4096) ctx->cx_data_sz = 4096; + + ctx->cx_data = (unsigned char *)malloc(ctx->cx_data_sz); + if (ctx->cx_data == NULL) { + return VPX_CODEC_MEM_ERROR; + } + } + } pick_quickcompress_mode(ctx, duration, deadline); vpx_codec_pkt_list_init(&ctx->pkt_list); @@ -859,7 +899,9 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, if (res == VPX_CODEC_OK && ctx->cpi != NULL) { unsigned int lib_flags = 0; YV12_BUFFER_CONFIG sd; - int64_t dst_time_stamp, dst_end_time_stamp; + int64_t dst_time_stamp = timebase_units_to_ticks(timebase, pts); + int64_t dst_end_time_stamp = + timebase_units_to_ticks(timebase, pts + duration); size_t size, cx_data_sz; unsigned char *cx_data; @@ -867,12 +909,6 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, if (ctx->base.init_flags & VPX_CODEC_USE_PSNR) ((VP9_COMP *)ctx->cpi)->b_calculate_psnr = 1; - /* vp9 use 10,000,000 ticks/second as time stamp */ - dst_time_stamp = (pts * 10000000 * ctx->cfg.g_timebase.num) - / ctx->cfg.g_timebase.den; - dst_end_time_stamp = (pts + duration) * 10000000 * ctx->cfg.g_timebase.num / - ctx->cfg.g_timebase.den; - if (img != NULL) { res = image2yuvconfig(img, &sd); @@ -909,19 +945,18 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, cx_data, &dst_time_stamp, &dst_end_time_stamp, !img)) { if (size) { - vpx_codec_pts_t round, delta; - vpx_codec_cx_pkt_t pkt; VP9_COMP *const cpi = (VP9_COMP *)ctx->cpi; + vpx_codec_cx_pkt_t pkt; -#ifdef CONFIG_SPATIAL_SVC - if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) +#if CONFIG_SPATIAL_SVC + if (is_spatial_svc(cpi)) cpi->svc.layer_context[cpi->svc.spatial_layer_id].layer_size += size; #endif // Pack invisible frames with the next visible frame if (cpi->common.show_frame == 0 -#ifdef CONFIG_SPATIAL_SVC - || (cpi->use_svc && cpi->svc.number_temporal_layers == 1 && +#if CONFIG_SPATIAL_SVC + || (is_spatial_svc(cpi) && cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1) #endif ) { @@ -936,20 +971,15 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, } // Add the frame packet to the list of returned packets. - round = (vpx_codec_pts_t)10000000 * ctx->cfg.g_timebase.num / 2 - 1; - delta = (dst_end_time_stamp - dst_time_stamp); pkt.kind = VPX_CODEC_CX_FRAME_PKT; - pkt.data.frame.pts = - (dst_time_stamp * ctx->cfg.g_timebase.den + round) - / ctx->cfg.g_timebase.num / 10000000; - pkt.data.frame.duration = (unsigned long) - ((delta * ctx->cfg.g_timebase.den + round) - / ctx->cfg.g_timebase.num / 10000000); + pkt.data.frame.pts = ticks_to_timebase_units(timebase, dst_time_stamp); + pkt.data.frame.duration = ticks_to_timebase_units( + timebase, dst_end_time_stamp - dst_time_stamp); pkt.data.frame.flags = lib_flags << 16; if (lib_flags & FRAMEFLAGS_KEY -#ifdef CONFIG_SPATIAL_SVC - || (cpi->use_svc && cpi->svc.number_temporal_layers == 1 && +#if CONFIG_SPATIAL_SVC + || (is_spatial_svc(cpi) && cpi->svc.layer_context[0].is_key_frame) #endif ) @@ -962,9 +992,8 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, // prior PTS so that if a decoder uses pts to schedule when // to do this, we start right after last frame was decoded. // Invisible frames have no duration. 
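          // (Worked example, illustration only: with a 1/30 timebase,
          // timebase_units_to_ticks() maps pts n to n * 10000000 / 30 ticks,
          // and ticks_to_timebase_units() rounds the division back so the
          // original pts survives the round trip; the + 1 below then places
          // the invisible frame one timebase unit after the last shown one.)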
- pkt.data.frame.pts = ((cpi->last_time_stamp_seen - * ctx->cfg.g_timebase.den + round) - / ctx->cfg.g_timebase.num / 10000000) + 1; + pkt.data.frame.pts = + ticks_to_timebase_units(timebase, cpi->last_time_stamp_seen) + 1; pkt.data.frame.duration = 0; } @@ -990,8 +1019,8 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt); cx_data += size; cx_data_sz -= size; -#ifdef CONFIG_SPATIAL_SVC - if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) { +#if CONFIG_SPATIAL_SVC + if (is_spatial_svc(cpi)) { vpx_codec_cx_pkt_t pkt = {0}; int i; pkt.kind = VPX_CODEC_SPATIAL_SVC_LAYER_SIZES; @@ -1051,9 +1080,9 @@ static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx, vp9_ref_frame_t *const frame = va_arg(args, vp9_ref_frame_t *); if (frame != NULL) { - YV12_BUFFER_CONFIG *fb; + YV12_BUFFER_CONFIG *fb = get_ref_frame(&ctx->cpi->common, frame->idx); + if (fb == NULL) return VPX_CODEC_ERROR; - vp9_get_reference_enc(ctx->cpi, frame->idx, &fb); yuvconfig2image(&frame->img, fb, NULL); return VPX_CODEC_OK; } else { @@ -1221,6 +1250,13 @@ static vpx_codec_err_t ctrl_set_svc_parameters(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_set_tune_content(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.content = CAST(VP9E_SET_TUNE_CONTENT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { {VP8_COPY_REFERENCE, ctrl_copy_reference}, {VP8E_UPD_ENTROPY, ctrl_update_entropy}, @@ -1253,6 +1289,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { {VP9E_SET_SVC, ctrl_set_svc}, {VP9E_SET_SVC_PARAMETERS, ctrl_set_svc_parameters}, {VP9E_SET_SVC_LAYER_ID, ctrl_set_svc_layer_id}, + {VP9E_SET_TUNE_CONTENT, ctrl_set_tune_content}, // Getters {VP8E_GET_LAST_QUANTIZER, ctrl_get_quantizer}, @@ -1314,9 +1351,7 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { 9999, // kf_max_dist VPX_SS_DEFAULT_LAYERS, // ss_number_layers -#ifdef CONFIG_SPATIAL_SVC {0}, -#endif {0}, // ss_target_bitrate 1, // ts_number_layers {0}, // ts_target_bitrate @@ -1328,7 +1363,6 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { #endif } }, - { -1, {NOT_IMPLEMENTED}} }; #ifndef VERSION_STRING @@ -1346,8 +1380,6 @@ CODEC_INTERFACE(vpx_codec_vp9_cx) = { encoder_init, // vpx_codec_init_fn_t encoder_destroy, // vpx_codec_destroy_fn_t encoder_ctrl_maps, // vpx_codec_ctrl_fn_map_t - NOT_IMPLEMENTED, // vpx_codec_get_mmap_fn_t - NOT_IMPLEMENTED, // vpx_codec_set_mmap_fn_t { // NOLINT NOT_IMPLEMENTED, // vpx_codec_peek_si_fn_t NOT_IMPLEMENTED, // vpx_codec_get_si_fn_t @@ -1356,6 +1388,7 @@ CODEC_INTERFACE(vpx_codec_vp9_cx) = { NOT_IMPLEMENTED // vpx_codec_set_fb_fn_t }, { // NOLINT + 1, // 1 cfg map encoder_usage_cfg_map, // vpx_codec_enc_cfg_map_t encoder_encode, // vpx_codec_encode_fn_t encoder_get_cxdata, // vpx_codec_get_cx_data_fn_t diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c index a2f77b11d..28475f39e 100644 --- a/vp9/vp9_dx_iface.c +++ b/vp9/vp9_dx_iface.c @@ -41,6 +41,7 @@ struct vpx_codec_alg_priv { void *decrypt_state; vpx_image_t img; int img_avail; + int flushed; int invert_tile_order; int frame_parallel_decode; // frame-based threading. 
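The decoder side of the cleanup follows, and the new `flushed` flag changes observable behaviour: draining the decoder by passing a NULL buffer of zero length now records the flush and succeeds instead of returning VPX_CODEC_INVALID_PARAM, as the decoder_decode() hunk below shows. A minimal caller-side sketch (illustrative only; `read_frame()` and `die()` are hypothetical helpers, and error handling is up to the caller):

    vpx_codec_ctx_t decoder;  // initialized elsewhere with vpx_codec_vp9_dx()
    const uint8_t *buf;
    unsigned int buf_sz;

    // Feed every compressed frame to the decoder as usual...
    while (read_frame(&buf, &buf_sz)) {
      if (vpx_codec_decode(&decoder, buf, buf_sz, NULL, 0))
        die();
    }
    // ...then signal end-of-stream; with this patch the flush call
    // returns VPX_CODEC_OK and latches the decoder's flushed flag.
    if (vpx_codec_decode(&decoder, NULL, 0, NULL, 0))
      die();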
@@ -70,6 +71,7 @@ static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx,
     ctx->priv->alg_priv = alg_priv;
     ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si);
     ctx->priv->init_flags = ctx->init_flags;
+    ctx->priv->alg_priv->flushed = 0;
     ctx->priv->alg_priv->frame_parallel_decode =
         (ctx->init_flags & VPX_CODEC_USE_FRAME_THREADING);
@@ -174,6 +176,7 @@ static vpx_codec_err_t decoder_peek_si_internal(const uint8_t *data,
       vp9_read_frame_size(&rb, (int *)&si->w, (int *)&si->h);
     } else {
       intra_only_flag = show_frame ? 0 : vp9_rb_read_bit(&rb);
+      rb.bit_offset += error_resilient ? 0 : 2;  // reset_frame_context
 
       if (intra_only_flag) {
@@ -413,8 +416,13 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
   uint32_t frame_sizes[8];
   int frame_count;
 
-  if (data == NULL || data_sz == 0)
-    return VPX_CODEC_INVALID_PARAM;
+  if (data == NULL && data_sz == 0) {
+    ctx->flushed = 1;
+    return VPX_CODEC_OK;
+  }
+
+  // Reset flushed when receiving a valid frame.
+  ctx->flushed = 0;
 
   res = parse_superframe_index(data, data_sz, frame_sizes, &frame_count,
                                ctx->decrypt_cb, ctx->decrypt_state);
@@ -578,9 +586,9 @@ static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx,
   vp9_ref_frame_t *data = va_arg(args, vp9_ref_frame_t *);
 
   if (data) {
-    YV12_BUFFER_CONFIG* fb;
+    YV12_BUFFER_CONFIG* fb = get_ref_frame(&ctx->pbi->common, data->idx);
+    if (fb == NULL) return VPX_CODEC_ERROR;
 
-    vp9_get_reference_dec(ctx->pbi, data->idx, &fb);
     yuvconfig2image(&data->img, fb, NULL);
     return VPX_CODEC_OK;
   } else {
@@ -634,11 +642,10 @@ static vpx_codec_err_t ctrl_get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
                                                 va_list args) {
   int *corrupted = va_arg(args, int *);
 
-  if (corrupted) {
-    if (ctx->pbi)
-      *corrupted = ctx->pbi->common.frame_to_show->corrupted;
-    else
-      return VPX_CODEC_ERROR;
+  if (corrupted != NULL && ctx->pbi != NULL) {
+    const YV12_BUFFER_CONFIG *const frame = ctx->pbi->common.frame_to_show;
+    if (frame == NULL) return VPX_CODEC_ERROR;
+    *corrupted = frame->corrupted;
     return VPX_CODEC_OK;
   } else {
     return VPX_CODEC_INVALID_PARAM;
@@ -729,8 +736,6 @@ CODEC_INTERFACE(vpx_codec_vp9_dx) = {
   decoder_init,       // vpx_codec_init_fn_t
   decoder_destroy,    // vpx_codec_destroy_fn_t
   decoder_ctrl_maps,  // vpx_codec_ctrl_fn_map_t
-  NOT_IMPLEMENTED,    // vpx_codec_get_mmap_fn_t
-  NOT_IMPLEMENTED,    // vpx_codec_set_mmap_fn_t
   { // NOLINT
     decoder_peek_si,    // vpx_codec_peek_si_fn_t
     decoder_get_si,     // vpx_codec_get_si_fn_t
@@ -739,6 +744,7 @@ CODEC_INTERFACE(vpx_codec_vp9_dx) = {
     decoder_set_fb_fn,  // vpx_codec_set_fb_fn_t
   },
   { // NOLINT
+    0,
     NOT_IMPLEMENTED,  // vpx_codec_enc_cfg_map_t
     NOT_IMPLEMENTED,  // vpx_codec_encode_fn_t
     NOT_IMPLEMENTED,  // vpx_codec_get_cx_data_fn_t
diff --git a/vp9/vp9_iface_common.h b/vp9/vp9_iface_common.h
index f21f00307..bb37201f6 100644
--- a/vp9/vp9_iface_common.h
+++ b/vp9/vp9_iface_common.h
@@ -41,11 +41,11 @@ static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12,
   img->planes[VPX_PLANE_Y] = yv12->y_buffer;
   img->planes[VPX_PLANE_U] = yv12->u_buffer;
   img->planes[VPX_PLANE_V] = yv12->v_buffer;
-  img->planes[VPX_PLANE_ALPHA] = yv12->alpha_buffer;
+  img->planes[VPX_PLANE_ALPHA] = NULL;
   img->stride[VPX_PLANE_Y] = yv12->y_stride;
   img->stride[VPX_PLANE_U] = yv12->uv_stride;
   img->stride[VPX_PLANE_V] = yv12->uv_stride;
-  img->stride[VPX_PLANE_ALPHA] = yv12->alpha_stride;
+  img->stride[VPX_PLANE_ALPHA] = yv12->y_stride;
 #if CONFIG_VP9_HIGH
   if (yv12->flags & YV12_FLAG_HIGH) {
     // VPX IMG uses byte strides and a pointer to the first byte of the image
@@ -57,12 +57,11 @@ static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12,
         CONVERT_TO_SHORTPTR(yv12->u_buffer);
     img->planes[VPX_PLANE_V] = (uint8_t*)
         CONVERT_TO_SHORTPTR(yv12->v_buffer);
-    img->planes[VPX_PLANE_ALPHA] = (uint8_t*)
-        CONVERT_TO_SHORTPTR(yv12->alpha_buffer);
+    img->planes[VPX_PLANE_ALPHA] = NULL;
     img->stride[VPX_PLANE_Y] = 2 * yv12->y_stride;
     img->stride[VPX_PLANE_U] = 2 * yv12->uv_stride;
     img->stride[VPX_PLANE_V] = 2 * yv12->uv_stride;
-    img->stride[VPX_PLANE_ALPHA] = 2 * yv12->alpha_stride;
+    img->stride[VPX_PLANE_ALPHA] = 2 * yv12->y_stride;
   }
 #endif
   img->bps = bps;
@@ -77,7 +76,6 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
   yv12->y_buffer = img->planes[VPX_PLANE_Y];
   yv12->u_buffer = img->planes[VPX_PLANE_U];
   yv12->v_buffer = img->planes[VPX_PLANE_V];
-  yv12->alpha_buffer = img->planes[VPX_PLANE_ALPHA];
 
   yv12->y_crop_width = img->d_w;
   yv12->y_crop_height = img->d_h;
@@ -89,12 +87,8 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
   yv12->uv_height = img->y_chroma_shift == 1 ? (1 + yv12->y_height) / 2
                                              : yv12->y_height;
 
-  yv12->alpha_width = yv12->alpha_buffer ? img->d_w : 0;
-  yv12->alpha_height = yv12->alpha_buffer ? img->d_h : 0;
-
   yv12->y_stride = img->stride[VPX_PLANE_Y];
   yv12->uv_stride = img->stride[VPX_PLANE_U];
-  yv12->alpha_stride = yv12->alpha_buffer ? img->stride[VPX_PLANE_ALPHA] : 0;
 
 #if CONFIG_VP9_HIGH
   if (img->fmt & VPX_IMG_FMT_HIGH) {
     // In vpx_image_t
@@ -114,7 +108,6 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
     yv12->v_buffer = CONVERT_TO_BYTEPTR(yv12->v_buffer);
     yv12->y_stride >>= 1;
     yv12->uv_stride >>= 1;
-    yv12->alpha_stride >>= 1;
     yv12->flags = YV12_FLAG_HIGH;
   } else {
     yv12->flags = 0;
@@ -122,13 +115,6 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
   yv12->border = (yv12->y_stride - img->w) / 2;
 #else
   yv12->border = (img->stride[VPX_PLANE_Y] - img->w) / 2;
-#endif
-#if CONFIG_ALPHA
-  // For development purposes, force alpha to hold the same data as Y for now.
- yv12->alpha_buffer = yv12->y_buffer; - yv12->alpha_width = yv12->y_width; - yv12->alpha_height = yv12->y_height; - yv12->alpha_stride = yv12->y_stride; #endif return VPX_CODEC_OK; } diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 3f2f5b9d9..dc46c4e35 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -23,8 +23,8 @@ VP9_CX_SRCS-yes += encoder/vp9_context_tree.h VP9_CX_SRCS-yes += encoder/vp9_cost.h VP9_CX_SRCS-yes += encoder/vp9_cost.c VP9_CX_SRCS-yes += encoder/vp9_dct.c -VP9_CX_SRCS-$(CONFIG_DENOISING) += encoder/vp9_denoiser.c -VP9_CX_SRCS-$(CONFIG_DENOISING) += encoder/vp9_denoiser.h +VP9_CX_SRCS-$(CONFIG_VP9_TEMPORAL_DENOISING) += encoder/vp9_denoiser.c +VP9_CX_SRCS-$(CONFIG_VP9_TEMPORAL_DENOISING) += encoder/vp9_denoiser.h VP9_CX_SRCS-yes += encoder/vp9_encodeframe.c VP9_CX_SRCS-yes += encoder/vp9_encodeframe.h VP9_CX_SRCS-yes += encoder/vp9_encodemb.c @@ -130,5 +130,9 @@ VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad_neon.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_subtract_neon.c VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes)) diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h index 82d2bc3c0..a7716d130 100644 --- a/vpx/internal/vpx_codec_internal.h +++ b/vpx/internal/vpx_codec_internal.h @@ -248,37 +248,6 @@ typedef vpx_codec_err_t (*vpx_codec_set_fb_fn_t)( vpx_get_frame_buffer_cb_fn_t cb_get, vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv); -/*\brief eXternal Memory Allocation memory map get iterator - * - * Iterates over a list of the memory maps requested by the decoder. The - * iterator storage should be initialized to NULL to start the iteration. - * Iteration is complete when this function returns NULL. - * - * \param[in out] iter Iterator storage, initialized to NULL - * - * \return Returns a pointer to an memory segment descriptor, or NULL to - * indicate end-of-list. - */ -typedef vpx_codec_err_t (*vpx_codec_get_mmap_fn_t)(const vpx_codec_ctx_t *ctx, - vpx_codec_mmap_t *mmap, - vpx_codec_iter_t *iter); - - -/*\brief eXternal Memory Allocation memory map set iterator - * - * Sets a memory descriptor inside the decoder instance. - * - * \param[in] ctx Pointer to this instance's context - * \param[in] mmap Memory map to store. - * - * \retval #VPX_CODEC_OK - * The memory map was accepted and stored. - * \retval #VPX_CODEC_MEM_ERROR - * The memory map was rejected. 
- */ -typedef vpx_codec_err_t (*vpx_codec_set_mmap_fn_t)(vpx_codec_ctx_t *ctx, - const vpx_codec_mmap_t *mmap); - typedef vpx_codec_err_t (*vpx_codec_encode_fn_t)(vpx_codec_alg_priv_t *ctx, const vpx_image_t *img, @@ -330,8 +299,6 @@ struct vpx_codec_iface { vpx_codec_init_fn_t init; /**< \copydoc ::vpx_codec_init_fn_t */ vpx_codec_destroy_fn_t destroy; /**< \copydoc ::vpx_codec_destroy_fn_t */ vpx_codec_ctrl_fn_map_t *ctrl_maps; /**< \copydoc ::vpx_codec_ctrl_fn_map_t */ - vpx_codec_get_mmap_fn_t get_mmap; /**< \copydoc ::vpx_codec_get_mmap_fn_t */ - vpx_codec_set_mmap_fn_t set_mmap; /**< \copydoc ::vpx_codec_set_mmap_fn_t */ struct vpx_codec_dec_iface { vpx_codec_peek_si_fn_t peek_si; /**< \copydoc ::vpx_codec_peek_si_fn_t */ vpx_codec_get_si_fn_t get_si; /**< \copydoc ::vpx_codec_get_si_fn_t */ @@ -340,6 +307,7 @@ struct vpx_codec_iface { vpx_codec_set_fb_fn_t set_fb_fn; /**< \copydoc ::vpx_codec_set_fb_fn_t */ } dec; struct vpx_codec_enc_iface { + int cfg_map_count; vpx_codec_enc_cfg_map_t *cfg_maps; /**< \copydoc ::vpx_codec_enc_cfg_map_t */ vpx_codec_encode_fn_t encode; /**< \copydoc ::vpx_codec_encode_fn_t */ vpx_codec_get_cx_data_fn_t get_cx_data; /**< \copydoc ::vpx_codec_get_cx_data_fn_t */ @@ -401,11 +369,11 @@ struct vpx_codec_priv_enc_mr_cfg #undef VPX_CTRL_USE_TYPE #define VPX_CTRL_USE_TYPE(id, typ) \ - static typ id##__value(va_list args) {return va_arg(args, typ);} \ + static VPX_INLINE typ id##__value(va_list args) {return va_arg(args, typ);} #undef VPX_CTRL_USE_TYPE_DEPRECATED #define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ) \ - static typ id##__value(va_list args) {return va_arg(args, typ);} \ + static VPX_INLINE typ id##__value(va_list args) {return va_arg(args, typ);} #define CAST(id, arg) id##__value(arg) @@ -463,54 +431,11 @@ struct vpx_internal_error_info { jmp_buf jmp; }; -static void vpx_internal_error(struct vpx_internal_error_info *info, - vpx_codec_err_t error, - const char *fmt, - ...) { - va_list ap; +void vpx_internal_error(struct vpx_internal_error_info *info, + vpx_codec_err_t error, + const char *fmt, + ...); - info->error_code = error; - info->has_detail = 0; - - if (fmt) { - size_t sz = sizeof(info->detail); - - info->has_detail = 1; - va_start(ap, fmt); - vsnprintf(info->detail, sz - 1, fmt, ap); - va_end(ap); - info->detail[sz - 1] = '\0'; - } - - if (info->setjmp) - longjmp(info->jmp, info->error_code); -} - -//------------------------------------------------------------------------------ -// mmap interface - -typedef struct { - unsigned int id; - unsigned long sz; - unsigned int align; - unsigned int flags; - unsigned long (*calc_sz)(const vpx_codec_dec_cfg_t *, vpx_codec_flags_t); -} mem_req_t; - -// Allocates mmap.priv and sets mmap.base based on mmap.sz/align/flags -// requirements. -// Returns #VPX_CODEC_OK on success, #VPX_CODEC_MEM_ERROR otherwise. -vpx_codec_err_t vpx_mmap_alloc(vpx_codec_mmap_t *mmap); - -// Frees mmap.base allocated by a call to vpx_mmap_alloc(). -void vpx_mmap_dtor(vpx_codec_mmap_t *mmap); - -// Checks each mmap has the size requirement specificied by mem_reqs. -// Returns #VPX_CODEC_OK on success, #VPX_CODEC_MEM_ERROR otherwise. 
-vpx_codec_err_t vpx_validate_mmaps(const vpx_codec_stream_info_t *si, - const vpx_codec_mmap_t *mmaps, - const mem_req_t *mem_reqs, int nreqs, - vpx_codec_flags_t init_flags); #ifdef __cplusplus } // extern "C" #endif diff --git a/vpx/src/svc_encodeframe.c b/vpx/src/svc_encodeframe.c index 13abad2e7..ecd88431d 100644 --- a/vpx/src/svc_encodeframe.c +++ b/vpx/src/svc_encodeframe.c @@ -517,8 +517,10 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, } } +#if CONFIG_SPATIAL_SVC for (i = 0; i < si->layers; ++i) enc_cfg->ss_enable_auto_alt_ref[i] = si->enable_auto_alt_ref[i]; +#endif // modify encoder configuration enc_cfg->ss_number_layers = si->layers; @@ -724,12 +726,14 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, si->rc_stats_buf_used += cx_pkt->data.twopass_stats.sz; break; } +#if CONFIG_SPATIAL_SVC case VPX_CODEC_SPATIAL_SVC_LAYER_SIZES: { int i; for (i = 0; i < si->layers; ++i) si->bytes_sum[i] += cx_pkt->data.layer_sizes[i]; break; } +#endif default: { break; } diff --git a/vpx/src/vpx_codec.c b/vpx/src/vpx_codec.c index 6fb8f522d..d175eae64 100644 --- a/vpx/src/vpx_codec.c +++ b/vpx/src/vpx_codec.c @@ -135,50 +135,25 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, return SAVE_STATUS(ctx, res); } -//------------------------------------------------------------------------------ -// mmap interface +void vpx_internal_error(struct vpx_internal_error_info *info, + vpx_codec_err_t error, + const char *fmt, + ...) { + va_list ap; -vpx_codec_err_t vpx_mmap_alloc(vpx_codec_mmap_t *mmap) { - unsigned int align = mmap->align ? mmap->align - 1 : 0; + info->error_code = error; + info->has_detail = 0; - if (mmap->flags & VPX_CODEC_MEM_ZERO) - mmap->priv = calloc(1, mmap->sz + align); - else - mmap->priv = malloc(mmap->sz + align); + if (fmt) { + size_t sz = sizeof(info->detail); - if (mmap->priv == NULL) return VPX_CODEC_MEM_ERROR; - mmap->base = (void *)((((uintptr_t)mmap->priv) + align) & ~(uintptr_t)align); - mmap->dtor = vpx_mmap_dtor; - return VPX_CODEC_OK; -} - -void vpx_mmap_dtor(vpx_codec_mmap_t *mmap) { - free(mmap->priv); -} - -vpx_codec_err_t vpx_validate_mmaps(const vpx_codec_stream_info_t *si, - const vpx_codec_mmap_t *mmaps, - const mem_req_t *mem_reqs, int nreqs, - vpx_codec_flags_t init_flags) { - int i; - - for (i = 0; i < nreqs - 1; ++i) { - /* Ensure the segment has been allocated */ - if (mmaps[i].base == NULL) { - return VPX_CODEC_MEM_ERROR; - } - - /* Verify variable size segment is big enough for the current si. 
*/ - if (mem_reqs[i].calc_sz != NULL) { - vpx_codec_dec_cfg_t cfg; - - cfg.w = si->w; - cfg.h = si->h; - - if (mmaps[i].sz < mem_reqs[i].calc_sz(&cfg, init_flags)) { - return VPX_CODEC_MEM_ERROR; - } - } + info->has_detail = 1; + va_start(ap, fmt); + vsnprintf(info->detail, sz - 1, fmt, ap); + va_end(ap); + info->detail[sz - 1] = '\0'; } - return VPX_CODEC_OK; + + if (info->setjmp) + longjmp(info->jmp, info->error_code); } diff --git a/vpx/src/vpx_decoder.c b/vpx/src/vpx_decoder.c index 63fdaf308..4d22a0847 100644 --- a/vpx/src/vpx_decoder.c +++ b/vpx/src/vpx_decoder.c @@ -31,8 +31,6 @@ vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx, res = VPX_CODEC_INVALID_PARAM; else if (iface->abi_version != VPX_CODEC_INTERNAL_ABI_VERSION) res = VPX_CODEC_ABI_MISMATCH; - else if ((flags & VPX_CODEC_USE_XMA) && !(iface->caps & VPX_CODEC_CAP_XMA)) - res = VPX_CODEC_INCAPABLE; else if ((flags & VPX_CODEC_USE_POSTPROC) && !(iface->caps & VPX_CODEC_CAP_POSTPROC)) res = VPX_CODEC_INCAPABLE; else if ((flags & VPX_CODEC_USE_ERROR_CONCEALMENT) && @@ -50,19 +48,15 @@ vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx, ctx->priv = NULL; ctx->init_flags = flags; ctx->config.dec = cfg; - res = VPX_CODEC_OK; - if (!(flags & VPX_CODEC_USE_XMA)) { - res = ctx->iface->init(ctx, NULL); - - if (res) { - ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL; - vpx_codec_destroy(ctx); - } - - if (ctx->priv) - ctx->priv->iface = ctx->iface; + res = ctx->iface->init(ctx, NULL); + if (res) { + ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL; + vpx_codec_destroy(ctx); } + + if (ctx->priv) + ctx->priv->iface = ctx->iface; } return SAVE_STATUS(ctx, res); @@ -119,7 +113,7 @@ vpx_codec_err_t vpx_codec_decode(vpx_codec_ctx_t *ctx, /* Sanity checks */ /* NULL data ptr allowed if data_sz is 0 too */ - if (!ctx || (!data && data_sz)) + if (!ctx || (!data && data_sz) || (data && !data_sz)) res = VPX_CODEC_INVALID_PARAM; else if (!ctx->iface || !ctx->priv) res = VPX_CODEC_ERROR; @@ -183,50 +177,6 @@ vpx_codec_err_t vpx_codec_register_put_slice_cb(vpx_codec_ctx_t *ctx return SAVE_STATUS(ctx, res); } - -vpx_codec_err_t vpx_codec_get_mem_map(vpx_codec_ctx_t *ctx, - vpx_codec_mmap_t *mmap, - vpx_codec_iter_t *iter) { - vpx_codec_err_t res = VPX_CODEC_OK; - - if (!ctx || !mmap || !iter || !ctx->iface) - res = VPX_CODEC_INVALID_PARAM; - else if (!(ctx->iface->caps & VPX_CODEC_CAP_XMA)) - res = VPX_CODEC_ERROR; - else - res = ctx->iface->get_mmap(ctx, mmap, iter); - - return SAVE_STATUS(ctx, res); -} - - -vpx_codec_err_t vpx_codec_set_mem_map(vpx_codec_ctx_t *ctx, - vpx_codec_mmap_t *mmap, - unsigned int num_maps) { - vpx_codec_err_t res = VPX_CODEC_MEM_ERROR; - - if (!ctx || !mmap || !ctx->iface) - res = VPX_CODEC_INVALID_PARAM; - else if (!(ctx->iface->caps & VPX_CODEC_CAP_XMA)) - res = VPX_CODEC_ERROR; - else { - unsigned int i; - - for (i = 0; i < num_maps; i++, mmap++) { - if (!mmap->base) - break; - - /* Everything look ok, set the mmap in the decoder */ - res = ctx->iface->set_mmap(ctx, mmap); - - if (res) - break; - } - } - - return SAVE_STATUS(ctx, res); -} - vpx_codec_err_t vpx_codec_set_frame_buffer_functions( vpx_codec_ctx_t *ctx, vpx_get_frame_buffer_cb_fn_t cb_get, vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) { diff --git a/vpx/src/vpx_encoder.c b/vpx/src/vpx_encoder.c index ea1b346c4..cd88a8480 100644 --- a/vpx/src/vpx_encoder.c +++ b/vpx/src/vpx_encoder.c @@ -35,8 +35,6 @@ vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, res = VPX_CODEC_ABI_MISMATCH; else if 
(!(iface->caps & VPX_CODEC_CAP_ENCODER)) res = VPX_CODEC_INCAPABLE; - else if ((flags & VPX_CODEC_USE_XMA) && !(iface->caps & VPX_CODEC_CAP_XMA)) - res = VPX_CODEC_INCAPABLE; else if ((flags & VPX_CODEC_USE_PSNR) && !(iface->caps & VPX_CODEC_CAP_PSNR)) res = VPX_CODEC_INCAPABLE; @@ -83,8 +81,6 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver(vpx_codec_ctx_t *ctx, res = VPX_CODEC_ABI_MISMATCH; else if (!(iface->caps & VPX_CODEC_CAP_ENCODER)) res = VPX_CODEC_INCAPABLE; - else if ((flags & VPX_CODEC_USE_XMA) && !(iface->caps & VPX_CODEC_CAP_XMA)) - res = VPX_CODEC_INCAPABLE; else if ((flags & VPX_CODEC_USE_PSNR) && !(iface->caps & VPX_CODEC_CAP_PSNR)) res = VPX_CODEC_INCAPABLE; @@ -165,6 +161,7 @@ vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface, unsigned int usage) { vpx_codec_err_t res; vpx_codec_enc_cfg_map_t *map; + int i; if (!iface || !cfg || usage > INT_MAX) res = VPX_CODEC_INVALID_PARAM; @@ -173,7 +170,8 @@ vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface, else { res = VPX_CODEC_INVALID_PARAM; - for (map = iface->enc.cfg_maps; map->usage >= 0; map++) { + for (i = 0; i < iface->enc.cfg_map_count; ++i) { + map = iface->enc.cfg_maps + i; if (map->usage == (int)usage) { *cfg = map->cfg; cfg->g_usage = usage; diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index 3995c8c70..796a7a1c2 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -206,6 +206,7 @@ enum vp8e_enc_control_id { * temporal layer. */ VP9E_SET_SVC_LAYER_ID, + VP9E_SET_TUNE_CONTENT }; /*!\brief vpx 1-D scaling mode @@ -277,6 +278,12 @@ typedef enum { VP8_EIGHT_TOKENPARTITION = 3 } vp8e_token_partitions; +/*!brief VP9 encoder content type */ +typedef enum { + VP9E_CONTENT_DEFAULT, + VP9E_CONTENT_SCREEN, + VP9E_CONTENT_INVALID +} vp9e_tune_content; /*!\brief VP8 model tuning parameters * @@ -370,6 +377,7 @@ VPX_CTRL_USE_TYPE(VP9E_SET_AQ_MODE, unsigned int) VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PERIODIC_BOOST, unsigned int) +VPX_CTRL_USE_TYPE(VP9E_SET_TUNE_CONTENT, int) /* vp9e_tune_content */ /*! @} - end defgroup vp8_encoder */ #ifdef __cplusplus } // extern "C" diff --git a/vpx/vpx_codec.h b/vpx/vpx_codec.h index 45e702354..07df72a78 100644 --- a/vpx/vpx_codec.h +++ b/vpx/vpx_codec.h @@ -153,7 +153,6 @@ extern "C" { typedef long vpx_codec_caps_t; #define VPX_CODEC_CAP_DECODER 0x1 /**< Is a decoder */ #define VPX_CODEC_CAP_ENCODER 0x2 /**< Is an encoder */ -#define VPX_CODEC_CAP_XMA 0x4 /**< Supports eXternal Memory Allocation */ /*! \brief Initialization-time Feature Enabling @@ -164,7 +163,6 @@ extern "C" { * The available flags are specified by VPX_CODEC_USE_* defines. */ typedef long vpx_codec_flags_t; -#define VPX_CODEC_USE_XMA 0x00000001 /**< Use eXternal Memory Allocation mode */ /*!\brief Codec interface structure. @@ -471,94 +469,6 @@ extern "C" { #endif - - /*!\defgroup cap_xma External Memory Allocation Functions - * - * The following functions are required to be implemented for all codecs - * that advertise the VPX_CODEC_CAP_XMA capability. Calling these functions - * for codecs that don't advertise this capability will result in an error - * code being returned, usually VPX_CODEC_INCAPABLE - * @{ - */ - - - /*!\brief Memory Map Entry - * - * This structure is used to contain the properties of a memory segment. It - * is populated by the codec in the request phase, and by the calling - * application once the requested allocation has been performed. 
- */ - typedef struct vpx_codec_mmap { - /* - * The following members are set by the codec when requesting a segment - */ - unsigned int id; /**< identifier for the segment's contents */ - unsigned long sz; /**< size of the segment, in bytes */ - unsigned int align; /**< required alignment of the segment, in bytes */ - unsigned int flags; /**< bitfield containing segment properties */ -#define VPX_CODEC_MEM_ZERO 0x1 /**< Segment must be zeroed by allocation */ -#define VPX_CODEC_MEM_WRONLY 0x2 /**< Segment need not be readable */ -#define VPX_CODEC_MEM_FAST 0x4 /**< Place in fast memory, if available */ - - /* The following members are to be filled in by the allocation function */ - void *base; /**< pointer to the allocated segment */ - void (*dtor)(struct vpx_codec_mmap *map); /**< destructor to call */ - void *priv; /**< allocator private storage */ - } vpx_codec_mmap_t; /**< alias for struct vpx_codec_mmap */ - - - /*!\brief Iterate over the list of segments to allocate. - * - * Iterates over a list of the segments to allocate. The iterator storage - * should be initialized to NULL to start the iteration. Iteration is complete - * when this function returns VPX_CODEC_LIST_END. The amount of memory needed to - * allocate is dependent upon the size of the encoded stream. In cases where the - * stream is not available at allocation time, a fixed size must be requested. - * The codec will not be able to operate on streams larger than the size used at - * allocation time. - * - * \param[in] ctx Pointer to this instance's context. - * \param[out] mmap Pointer to the memory map entry to populate. - * \param[in,out] iter Iterator storage, initialized to NULL - * - * \retval #VPX_CODEC_OK - * The memory map entry was populated. - * \retval #VPX_CODEC_ERROR - * Codec does not support XMA mode. - * \retval #VPX_CODEC_MEM_ERROR - * Unable to determine segment size from stream info. - */ - vpx_codec_err_t vpx_codec_get_mem_map(vpx_codec_ctx_t *ctx, - vpx_codec_mmap_t *mmap, - vpx_codec_iter_t *iter); - - - /*!\brief Identify allocated segments to codec instance - * - * Stores a list of allocated segments in the codec. Segments \ref MUST be - * passed in the order they are read from vpx_codec_get_mem_map(), but may be - * passed in groups of any size. Segments \ref MUST be set only once. The - * allocation function \ref MUST ensure that the vpx_codec_mmap_t::base member - * is non-NULL. If the segment requires cleanup handling (e.g., calling free() - * or close()) then the vpx_codec_mmap_t::dtor member \ref MUST be populated. - * - * \param[in] ctx Pointer to this instance's context. - * \param[in] mmaps Pointer to the first memory map entry in the list. - * \param[in] num_maps Number of entries being set at this time - * - * \retval #VPX_CODEC_OK - * The segment was stored in the codec context. - * \retval #VPX_CODEC_INCAPABLE - * Codec does not support XMA mode. - * \retval #VPX_CODEC_MEM_ERROR - * Segment base address was not set, or segment was already stored. - - */ - vpx_codec_err_t vpx_codec_set_mem_map(vpx_codec_ctx_t *ctx, - vpx_codec_mmap_t *mmaps, - unsigned int num_maps); - - /*!@} - end defgroup cap_xma*/ /*!@} - end defgroup codec*/ #ifdef __cplusplus } diff --git a/vpx/vpx_decoder.h b/vpx/vpx_decoder.h index ba183283e..10b89fa0f 100644 --- a/vpx/vpx_decoder.h +++ b/vpx/vpx_decoder.h @@ -122,10 +122,6 @@ extern "C" { * is not thread safe and should be guarded with a lock if being used * in a multithreaded context. 
* - * In XMA mode (activated by setting VPX_CODEC_USE_XMA in the flags - * parameter), the storage pointed to by the cfg parameter must be - * kept readable and stable until all memory maps have been set. - * * \param[in] ctx Pointer to this instance's context. * \param[in] iface Pointer to the algorithm interface to use. * \param[in] cfg Configuration to use, if known. May be NULL. diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index 123502b71..eec7f30f4 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -160,7 +160,7 @@ extern "C" { VPX_CODEC_STATS_PKT, /**< Two-pass statistics for this frame */ VPX_CODEC_FPMB_STATS_PKT, /**< first pass mb statistics for this frame */ VPX_CODEC_PSNR_PKT, /**< PSNR statistics for this frame */ -#ifdef CONFIG_SPATIAL_SVC +#if CONFIG_SPATIAL_SVC VPX_CODEC_SPATIAL_SVC_LAYER_SIZES, /**< Sizes for each layer in this frame*/ #endif VPX_CODEC_CUSTOM_PKT = 256 /**< Algorithm extensions */ @@ -199,7 +199,7 @@ extern "C" { double psnr[4]; /**< PSNR, total/y/u/v */ } psnr; /**< data for PSNR packet */ struct vpx_fixed_buf raw; /**< data for arbitrary packets */ -#ifdef CONFIG_SPATIAL_SVC +#if CONFIG_SPATIAL_SVC size_t layer_sizes[VPX_SS_MAX_LAYERS]; #endif @@ -656,14 +656,12 @@ extern "C" { */ unsigned int ss_number_layers; -#ifdef CONFIG_SPATIAL_SVC /*!\brief Enable auto alt reference flags for each spatial layer. * * These values specify if auto alt reference frame is enabled for each * spatial layer. */ int ss_enable_auto_alt_ref[VPX_SS_MAX_LAYERS]; -#endif /*!\brief Target bitrate for each spatial layer. * @@ -723,10 +721,6 @@ extern "C" { * is not thread safe and should be guarded with a lock if being used * in a multithreaded context. * - * In XMA mode (activated by setting VPX_CODEC_USE_XMA in the flags - * parameter), the storage pointed to by the cfg parameter must be - * kept readable and stable until all memory maps have been set. - * * \param[in] ctx Pointer to this instance's context. * \param[in] iface Pointer to the algorithm interface to use. * \param[in] cfg Configuration to use, if known. May be NULL. @@ -760,10 +754,6 @@ extern "C" { * instead of this function directly, to ensure that the ABI version number * parameter is properly initialized. * - * In XMA mode (activated by setting VPX_CODEC_USE_XMA in the flags - * parameter), the storage pointed to by the cfg parameter must be - * kept readable and stable until all memory maps have been set. - * * \param[in] ctx Pointer to this instance's context. * \param[in] iface Pointer to the algorithm interface to use. * \param[in] cfg Configuration to use, if known. May be NULL. diff --git a/vpx/vpx_integer.h b/vpx/vpx_integer.h index 258618bbd..ffeefb819 100644 --- a/vpx/vpx_integer.h +++ b/vpx/vpx_integer.h @@ -15,6 +15,15 @@ /* get ptrdiff_t, size_t, wchar_t, NULL */ #include <stddef.h> +#if defined(_MSC_VER) +#define VPX_FORCE_INLINE __forceinline +#define VPX_INLINE __inline +#else +#define VPX_FORCE_INLINE __inline__ __attribute__((always_inline)) +// TODO(jbb): Allow a way to force inline off for older compilers.
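The new vpx_integer.h macros give the tree one portable spelling for inline functions in headers: MSVC builds see __inline/__forceinline, everything else sees C99 inline (defined in the #else branch that follows) plus GCC's always_inline attribute. A minimal sketch of the intended use, with a hypothetical helper name; real call sites follow in the mem_ops changes below:

    #include "vpx/vpx_integer.h"

    /* Header-safe helper: VPX_INLINE expands to '__inline' under MSVC
     * and to plain 'inline' elsewhere, so no per-compiler #ifdefs are
     * needed at the definition site. */
    static VPX_INLINE int clamp_byte(int v) {
      return v < 0 ? 0 : (v > 255 ? 255 : v);
    }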
+#define VPX_INLINE inline +#endif + #if (defined(_MSC_VER) && (_MSC_VER < 1600)) || defined(VPX_EMULATE_INTTYPES) typedef signed char int8_t; typedef signed short int16_t; diff --git a/vpx_ports/mem_ops.h b/vpx_ports/mem_ops.h index 8c8b52618..d4a3d773f 100644 --- a/vpx_ports/mem_ops.h +++ b/vpx_ports/mem_ops.h @@ -133,7 +133,7 @@ static unsigned MEM_VALUE_T mem_get_le32(const void *vmem) { } #define mem_get_s_generic(end,sz) \ - static signed MEM_VALUE_T mem_get_s##end##sz(const void *vmem) {\ + static VPX_INLINE signed MEM_VALUE_T mem_get_s##end##sz(const void *vmem) {\ const MAU_T *mem = (const MAU_T*)vmem;\ signed MEM_VALUE_T val = mem_get_##end##sz(mem);\ return (val << (MEM_VALUE_T_SZ_BITS - sz)) >> (MEM_VALUE_T_SZ_BITS - sz);\ @@ -165,7 +165,7 @@ mem_get_s_generic(le, 32) #undef mem_put_be16 #define mem_put_be16 mem_ops_wrap_symbol(mem_put_be16) -static void mem_put_be16(void *vmem, MEM_VALUE_T val) { +static VPX_INLINE void mem_put_be16(void *vmem, MEM_VALUE_T val) { MAU_T *mem = (MAU_T *)vmem; mem[0] = (val >> 8) & 0xff; @@ -174,7 +174,7 @@ static void mem_put_be16(void *vmem, MEM_VALUE_T val) { #undef mem_put_be24 #define mem_put_be24 mem_ops_wrap_symbol(mem_put_be24) -static void mem_put_be24(void *vmem, MEM_VALUE_T val) { +static VPX_INLINE void mem_put_be24(void *vmem, MEM_VALUE_T val) { MAU_T *mem = (MAU_T *)vmem; mem[0] = (val >> 16) & 0xff; @@ -184,7 +184,7 @@ static void mem_put_be24(void *vmem, MEM_VALUE_T val) { #undef mem_put_be32 #define mem_put_be32 mem_ops_wrap_symbol(mem_put_be32) -static void mem_put_be32(void *vmem, MEM_VALUE_T val) { +static VPX_INLINE void mem_put_be32(void *vmem, MEM_VALUE_T val) { MAU_T *mem = (MAU_T *)vmem; mem[0] = (val >> 24) & 0xff; @@ -195,7 +195,7 @@ static void mem_put_be32(void *vmem, MEM_VALUE_T val) { #undef mem_put_le16 #define mem_put_le16 mem_ops_wrap_symbol(mem_put_le16) -static void mem_put_le16(void *vmem, MEM_VALUE_T val) { +static VPX_INLINE void mem_put_le16(void *vmem, MEM_VALUE_T val) { MAU_T *mem = (MAU_T *)vmem; mem[0] = (val >> 0) & 0xff; @@ -204,7 +204,7 @@ static void mem_put_le16(void *vmem, MEM_VALUE_T val) { #undef mem_put_le24 #define mem_put_le24 mem_ops_wrap_symbol(mem_put_le24) -static void mem_put_le24(void *vmem, MEM_VALUE_T val) { +static VPX_INLINE void mem_put_le24(void *vmem, MEM_VALUE_T val) { MAU_T *mem = (MAU_T *)vmem; mem[0] = (val >> 0) & 0xff; @@ -214,7 +214,7 @@ static void mem_put_le24(void *vmem, MEM_VALUE_T val) { #undef mem_put_le32 #define mem_put_le32 mem_ops_wrap_symbol(mem_put_le32) -static void mem_put_le32(void *vmem, MEM_VALUE_T val) { +static VPX_INLINE void mem_put_le32(void *vmem, MEM_VALUE_T val) { MAU_T *mem = (MAU_T *)vmem; mem[0] = (val >> 0) & 0xff; diff --git a/vpx_ports/mem_ops_aligned.h b/vpx_ports/mem_ops_aligned.h index 24743c8d6..c16111fec 100644 --- a/vpx_ports/mem_ops_aligned.h +++ b/vpx_ports/mem_ops_aligned.h @@ -44,19 +44,22 @@ #define swap_endian_32_se(val,raw) swap_endian_32(val,raw) #define mem_get_ne_aligned_generic(end,sz) \ - static unsigned MEM_VALUE_T mem_get_##end##sz##_aligned(const void *vmem) {\ + static VPX_INLINE unsigned MEM_VALUE_T \ + mem_get_##end##sz##_aligned(const void *vmem) {\ const uint##sz##_t *mem = (const uint##sz##_t *)vmem;\ return *mem;\ } #define mem_get_sne_aligned_generic(end,sz) \ - static signed MEM_VALUE_T mem_get_s##end##sz##_aligned(const void *vmem) {\ + static VPX_INLINE signed MEM_VALUE_T \ + mem_get_s##end##sz##_aligned(const void *vmem) {\ const int##sz##_t *mem = (const int##sz##_t *)vmem;\ return *mem;\ } #define 
mem_get_se_aligned_generic(end,sz) \ - static unsigned MEM_VALUE_T mem_get_##end##sz##_aligned(const void *vmem) {\ + static VPX_INLINE unsigned MEM_VALUE_T \ + mem_get_##end##sz##_aligned(const void *vmem) {\ const uint##sz##_t *mem = (const uint##sz##_t *)vmem;\ unsigned MEM_VALUE_T val, raw = *mem;\ swap_endian_##sz(val,raw);\ @@ -64,7 +67,8 @@ } #define mem_get_sse_aligned_generic(end,sz) \ - static signed MEM_VALUE_T mem_get_s##end##sz##_aligned(const void *vmem) {\ + static VPX_INLINE signed MEM_VALUE_T \ + mem_get_s##end##sz##_aligned(const void *vmem) {\ const int##sz##_t *mem = (const int##sz##_t *)vmem;\ unsigned MEM_VALUE_T val, raw = *mem;\ swap_endian_##sz##_se(val,raw);\ @@ -72,13 +76,15 @@ } #define mem_put_ne_aligned_generic(end,sz) \ - static void mem_put_##end##sz##_aligned(void *vmem, MEM_VALUE_T val) {\ + static VPX_INLINE void \ + mem_put_##end##sz##_aligned(void *vmem, MEM_VALUE_T val) {\ uint##sz##_t *mem = (uint##sz##_t *)vmem;\ *mem = (uint##sz##_t)val;\ } #define mem_put_se_aligned_generic(end,sz) \ - static void mem_put_##end##sz##_aligned(void *vmem, MEM_VALUE_T val) {\ + static VPX_INLINE void \ + mem_put_##end##sz##_aligned(void *vmem, MEM_VALUE_T val) {\ uint##sz##_t *mem = (uint##sz##_t *)vmem, raw;\ swap_endian_##sz(raw,val);\ *mem = (uint##sz##_t)raw;\ diff --git a/vpx_ports/vpx_once.h b/vpx_ports/vpx_once.h index 182892acf..8772c6eea 100644 --- a/vpx_ports/vpx_once.h +++ b/vpx_ports/vpx_once.h @@ -73,6 +73,31 @@ static void once(void (*func)(void)) } +#elif CONFIG_MULTITHREAD && defined(__OS2__) +#define INCL_DOS +#include <os2.h> +static void once(void (*func)(void)) { + static int done; + + /* If the initialization is complete, return early. */ + if (done) + return; + + /* Causes all other threads in the process to block themselves + * and give up their time slice. + */ + DosEnterCritSec(); + + if (!done) { + func(); + done = 1; + } + + /* Restores normal thread dispatching for the current process. */ + DosExitCritSec(); +} + + #elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H #include <pthread.h> static void once(void (*func)(void)) diff --git a/vpx_ports/vpx_timer.h b/vpx_ports/vpx_timer.h index 9e2015e62..870338b4f 100644 --- a/vpx_ports/vpx_timer.h +++ b/vpx_ports/vpx_timer.h @@ -53,7 +53,7 @@ struct vpx_usec_timer { }; -static void +static INLINE void vpx_usec_timer_start(struct vpx_usec_timer *t) { #if defined(_WIN32) QueryPerformanceCounter(&t->begin); @@ -63,7 +63,7 @@ vpx_usec_timer_start(struct vpx_usec_timer *t) { } -static void +static INLINE void vpx_usec_timer_mark(struct vpx_usec_timer *t) { #if defined(_WIN32) QueryPerformanceCounter(&t->end); @@ -73,7 +73,7 @@ vpx_usec_timer_mark(struct vpx_usec_timer *t) { } -static int64_t +static INLINE int64_t vpx_usec_timer_elapsed(struct vpx_usec_timer *t) { #if defined(_WIN32) LARGE_INTEGER freq, diff; @@ -101,13 +101,13 @@ struct vpx_usec_timer { void *dummy; }; -static void +static INLINE void vpx_usec_timer_start(struct vpx_usec_timer *t) { } -static void +static INLINE void vpx_usec_timer_mark(struct vpx_usec_timer *t) { } -static long +static INLINE int vpx_usec_timer_elapsed(struct vpx_usec_timer *t) { return 0; } diff --git a/vpx_ports/x86.h b/vpx_ports/x86.h index bc99f89d8..81c2b8b87 100644 --- a/vpx_ports/x86.h +++ b/vpx_ports/x86.h @@ -116,7 +116,7 @@ void __cpuid(int CPUInfo[4], int info_type); #define BIT(n) (1<oxcf.Width and -;cpi->oxcf.Height, which can be ANY numbers(NOT always multiples of 16 or 4).
- -;void vp8_yv12_copy_src_frame_func_neon(const YV12_BUFFER_CONFIG *src_ybc, -; YV12_BUFFER_CONFIG *dst_ybc); - -|vp8_yv12_copy_src_frame_func_neon| PROC - push {r4 - r11, lr} - vpush {d8 - d15} - - ;Copy Y plane - ldr r4, [r0, #yv12_buffer_config_y_height] - ldr r5, [r0, #yv12_buffer_config_y_width] - ldr r6, [r0, #yv12_buffer_config_y_stride] - ldr r7, [r1, #yv12_buffer_config_y_stride] - ldr r2, [r0, #yv12_buffer_config_y_buffer] ;srcptr1 - ldr r3, [r1, #yv12_buffer_config_y_buffer] ;dstptr1 - - add r10, r2, r6 ;second row src - add r11, r3, r7 ;second row dst - mov r6, r6, lsl #1 - mov r7, r7, lsl #1 - sub r6, r6, r5 ;adjust stride - sub r7, r7, r5 - - ; copy two rows at one time - mov lr, r4, lsr #1 - -cp_src_to_dst_height_loop - mov r12, r5 - -cp_width_128_loop - vld1.8 {q0, q1}, [r2]! - vld1.8 {q4, q5}, [r10]! - vld1.8 {q2, q3}, [r2]! - vld1.8 {q6, q7}, [r10]! - vld1.8 {q8, q9}, [r2]! - vld1.8 {q12, q13}, [r10]! - vld1.8 {q10, q11}, [r2]! - vld1.8 {q14, q15}, [r10]! - sub r12, r12, #128 - cmp r12, #128 - vst1.8 {q0, q1}, [r3]! - vst1.8 {q4, q5}, [r11]! - vst1.8 {q2, q3}, [r3]! - vst1.8 {q6, q7}, [r11]! - vst1.8 {q8, q9}, [r3]! - vst1.8 {q12, q13}, [r11]! - vst1.8 {q10, q11}, [r3]! - vst1.8 {q14, q15}, [r11]! - bhs cp_width_128_loop - - cmp r12, #0 - beq cp_width_done - -cp_width_8_loop - vld1.8 {d0}, [r2]! - vld1.8 {d1}, [r10]! - sub r12, r12, #8 - cmp r12, #8 - vst1.8 {d0}, [r3]! - vst1.8 {d1}, [r11]! - bhs cp_width_8_loop - - cmp r12, #0 - beq cp_width_done - -cp_width_1_loop - ldrb r8, [r2], #1 - subs r12, r12, #1 - strb r8, [r3], #1 - ldrb r8, [r10], #1 - strb r8, [r11], #1 - bne cp_width_1_loop - -cp_width_done - subs lr, lr, #1 - add r2, r2, r6 - add r3, r3, r7 - add r10, r10, r6 - add r11, r11, r7 - bne cp_src_to_dst_height_loop - -;copy last line for Y if y_height is odd - tst r4, #1 - beq cp_width_done_1 - mov r12, r5 - -cp_width_128_loop_1 - vld1.8 {q0, q1}, [r2]! - vld1.8 {q2, q3}, [r2]! - vld1.8 {q8, q9}, [r2]! - vld1.8 {q10, q11}, [r2]! - sub r12, r12, #128 - cmp r12, #128 - vst1.8 {q0, q1}, [r3]! - vst1.8 {q2, q3}, [r3]! - vst1.8 {q8, q9}, [r3]! - vst1.8 {q10, q11}, [r3]! - bhs cp_width_128_loop_1 - - cmp r12, #0 - beq cp_width_done_1 - -cp_width_8_loop_1 - vld1.8 {d0}, [r2]! - sub r12, r12, #8 - cmp r12, #8 - vst1.8 {d0}, [r3]! - bhs cp_width_8_loop_1 - - cmp r12, #0 - beq cp_width_done_1 - -cp_width_1_loop_1 - ldrb r8, [r2], #1 - subs r12, r12, #1 - strb r8, [r3], #1 - bne cp_width_1_loop_1 -cp_width_done_1 - -;Copy U & V planes - ldr r4, [r0, #yv12_buffer_config_uv_height] - ldr r5, [r0, #yv12_buffer_config_uv_width] - ldr r6, [r0, #yv12_buffer_config_uv_stride] - ldr r7, [r1, #yv12_buffer_config_uv_stride] - ldr r2, [r0, #yv12_buffer_config_u_buffer] ;srcptr1 - ldr r3, [r1, #yv12_buffer_config_u_buffer] ;dstptr1 - - add r10, r2, r6 ;second row src - add r11, r3, r7 ;second row dst - mov r6, r6, lsl #1 - mov r7, r7, lsl #1 - sub r6, r6, r5 ;adjust stride - sub r7, r7, r5 - - mov r9, #2 - -cp_uv_loop - ;copy two rows at one time - mov lr, r4, lsr #1 - -cp_src_to_dst_height_uv_loop - mov r12, r5 - -cp_width_uv_64_loop - vld1.8 {q0, q1}, [r2]! - vld1.8 {q4, q5}, [r10]! - vld1.8 {q2, q3}, [r2]! - vld1.8 {q6, q7}, [r10]! - sub r12, r12, #64 - cmp r12, #64 - vst1.8 {q0, q1}, [r3]! - vst1.8 {q4, q5}, [r11]! - vst1.8 {q2, q3}, [r3]! - vst1.8 {q6, q7}, [r11]! - bhs cp_width_uv_64_loop - - cmp r12, #0 - beq cp_width_uv_done - -cp_width_uv_8_loop - vld1.8 {d0}, [r2]! - vld1.8 {d1}, [r10]! - sub r12, r12, #8 - cmp r12, #8 - vst1.8 {d0}, [r3]! - vst1.8 {d1}, [r11]! 
- bhs cp_width_uv_8_loop - - cmp r12, #0 - beq cp_width_uv_done - -cp_width_uv_1_loop - ldrb r8, [r2], #1 - subs r12, r12, #1 - strb r8, [r3], #1 - ldrb r8, [r10], #1 - strb r8, [r11], #1 - bne cp_width_uv_1_loop - -cp_width_uv_done - subs lr, lr, #1 - add r2, r2, r6 - add r3, r3, r7 - add r10, r10, r6 - add r11, r11, r7 - bne cp_src_to_dst_height_uv_loop - -;copy last line for U & V if uv_height is odd - tst r4, #1 - beq cp_width_uv_done_1 - mov r12, r5 - -cp_width_uv_64_loop_1 - vld1.8 {q0, q1}, [r2]! - vld1.8 {q2, q3}, [r2]! - sub r12, r12, #64 - cmp r12, #64 - vst1.8 {q0, q1}, [r3]! - vst1.8 {q2, q3}, [r3]! - bhs cp_width_uv_64_loop_1 - - cmp r12, #0 - beq cp_width_uv_done_1 - -cp_width_uv_8_loop_1 - vld1.8 {d0}, [r2]! - sub r12, r12, #8 - cmp r12, #8 - vst1.8 {d0}, [r3]! - bhs cp_width_uv_8_loop_1 - - cmp r12, #0 - beq cp_width_uv_done_1 - -cp_width_uv_1_loop_1 - ldrb r8, [r2], #1 - subs r12, r12, #1 - strb r8, [r3], #1 - bne cp_width_uv_1_loop_1 -cp_width_uv_done_1 - - subs r9, r9, #1 - ldrne r2, [r0, #yv12_buffer_config_v_buffer] ;srcptr1 - ldrne r3, [r1, #yv12_buffer_config_v_buffer] ;dstptr1 - ldrne r10, [r0, #yv12_buffer_config_uv_stride] - ldrne r11, [r1, #yv12_buffer_config_uv_stride] - - addne r10, r2, r10 ;second row src - addne r11, r3, r11 ;second row dst - - bne cp_uv_loop - - vpop {d8 - d15} - pop {r4 - r11, pc} - - ENDP - END diff --git a/vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm b/vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm deleted file mode 100644 index b2eb9eb0f..000000000 --- a/vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm +++ /dev/null @@ -1,308 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp8_yv12_extend_frame_borders_neon| - ARM - REQUIRE8 - PRESERVE8 - - INCLUDE vpx_scale_asm_offsets.asm - - AREA ||.text||, CODE, READONLY, ALIGN=2 -;void vp8_yv12_extend_frame_borders_neon (YV12_BUFFER_CONFIG *ybf); -; we depend on VP8BORDERINPIXELS being 32 - -|vp8_yv12_extend_frame_borders_neon| PROC - push {r4 - r10, lr} - vpush {d8 - d15} - - ; Border = 32 - ldr r3, [r0, #yv12_buffer_config_y_width] ; plane_width - ldr r1, [r0, #yv12_buffer_config_y_buffer] ; src_ptr1 - ldr r4, [r0, #yv12_buffer_config_y_height] ; plane_height - ldr lr, [r0, #yv12_buffer_config_y_stride] ; plane_stride - -; Border copy for Y plane -; copy the left and right most columns out - add r6, r1, r3 ; dest_ptr2 = src_ptr2 + 1 (src_ptr1 + plane_width) - sub r2, r6, #1 ; src_ptr2 = src_ptr1 + plane_width - 1 - sub r5, r1, #32 ; dest_ptr1 = src_ptr1 - Border - - mov r12, r4, lsr #2 ; plane_height / 4 - -copy_left_right_y - vld1.8 {d0[], d1[]}, [r1], lr - vld1.8 {d4[], d5[]}, [r2], lr - vld1.8 {d8[], d9[]}, [r1], lr - vld1.8 {d12[], d13[]}, [r2], lr - vld1.8 {d16[], d17[]}, [r1], lr - vld1.8 {d20[], d21[]}, [r2], lr - vld1.8 {d24[], d25[]}, [r1], lr - vld1.8 {d28[], d29[]}, [r2], lr - - vmov q1, q0 - vmov q3, q2 - vmov q5, q4 - vmov q7, q6 - vmov q9, q8 - vmov q11, q10 - vmov q13, q12 - vmov q15, q14 - - subs r12, r12, #1 - - vst1.8 {q0, q1}, [r5], lr - vst1.8 {q2, q3}, [r6], lr - vst1.8 {q4, q5}, [r5], lr - vst1.8 {q6, q7}, [r6], lr - vst1.8 {q8, q9}, [r5], lr - vst1.8 {q10, q11}, [r6], lr - vst1.8 {q12, q13}, [r5], lr - vst1.8 {q14, q15}, [r6], lr - - bne copy_left_right_y - -;Now copy the top and bottom source lines into each line of the respective borders - ldr r1, [r0, #yv12_buffer_config_y_buffer] ; y_buffer - mul r8, r4, lr ; plane_height * plane_stride - - ; copy width is plane_stride - movs r12, lr, lsr #7 ; plane_stride / 128 - - sub r1, r1, #32 ; src_ptr1 = y_buffer - Border - add r6, r1, r8 ; dest_ptr2 = src_ptr2 - plane_stride (src_ptr1 + (plane_height * plane_stride)) - sub r2, r6, lr ; src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride - sub r5, r1, lr, asl #5 ; dest_ptr1 = src_ptr1 - (Border * plane_stride) - ble extra_y_copy_needed ; plane stride < 128 - -copy_top_bottom_y - vld1.8 {q0, q1}, [r1]! - vld1.8 {q8, q9}, [r2]! - vld1.8 {q2, q3}, [r1]! - vld1.8 {q10, q11}, [r2]! - vld1.8 {q4, q5}, [r1]! - vld1.8 {q12, q13}, [r2]! - vld1.8 {q6, q7}, [r1]! - vld1.8 {q14, q15}, [r2]! - - mov r7, #32 ; Border - -top_bottom_32 - subs r7, r7, #1 - - vst1.8 {q0, q1}, [r5]! - vst1.8 {q8, q9}, [r6]! - vst1.8 {q2, q3}, [r5]! - vst1.8 {q10, q11}, [r6]! - vst1.8 {q4, q5}, [r5]! - vst1.8 {q12, q13}, [r6]! - vst1.8 {q6, q7}, [r5]! - vst1.8 {q14, q15}, [r6]! 
- - add r5, r5, lr ; dest_ptr1 += plane_stride - sub r5, r5, #128 ; dest_ptr1 -= 128 - add r6, r6, lr ; dest_ptr2 += plane_stride - sub r6, r6, #128 ; dest_ptr2 -= 128 - - bne top_bottom_32 - - sub r5, r1, lr, asl #5 ; src_ptr1 - (Border* plane_stride) - add r6, r2, lr ; src_ptr2 + plane_stride - - subs r12, r12, #1 - bne copy_top_bottom_y - -extra_y_copy_needed - mov r7, lr, lsr #4 ; check to see if extra copy is needed - ands r7, r7, #0x7 - bne extra_top_bottom_y -end_of_border_copy_y - -;Border copy for U, V planes -; Border = 16 - ldr r7, [r0, #yv12_buffer_config_u_buffer] ; src_ptr1 - ldr lr, [r0, #yv12_buffer_config_uv_stride] ; plane_stride - ldr r3, [r0, #yv12_buffer_config_uv_width] ; plane_width - ldr r4, [r0, #yv12_buffer_config_uv_height] ; plane_height - - mov r10, #2 - -;copy the left and right most columns out -border_copy_uv - mov r1, r7 ; src_ptr1 needs to be saved for second half of loop - sub r5, r1, #16 ; dest_ptr1 = src_ptr1 - Border - add r6, r1, r3 ; dest_ptr2 = src_ptr2 + 1 (src_ptr1 + plane_width) - sub r2, r6, #1 ; src_ptr2 = src_ptr1 + plane_width - 1 - - mov r12, r4, lsr #3 ; plane_height / 8 - -copy_left_right_uv - vld1.8 {d0[], d1[]}, [r1], lr - vld1.8 {d2[], d3[]}, [r2], lr - vld1.8 {d4[], d5[]}, [r1], lr - vld1.8 {d6[], d7[]}, [r2], lr - vld1.8 {d8[], d9[]}, [r1], lr - vld1.8 {d10[], d11[]}, [r2], lr - vld1.8 {d12[], d13[]}, [r1], lr - vld1.8 {d14[], d15[]}, [r2], lr - vld1.8 {d16[], d17[]}, [r1], lr - vld1.8 {d18[], d19[]}, [r2], lr - vld1.8 {d20[], d21[]}, [r1], lr - vld1.8 {d22[], d23[]}, [r2], lr - vld1.8 {d24[], d25[]}, [r1], lr - vld1.8 {d26[], d27[]}, [r2], lr - vld1.8 {d28[], d29[]}, [r1], lr - vld1.8 {d30[], d31[]}, [r2], lr - - subs r12, r12, #1 - - vst1.8 {q0}, [r5], lr - vst1.8 {q1}, [r6], lr - vst1.8 {q2}, [r5], lr - vst1.8 {q3}, [r6], lr - vst1.8 {q4}, [r5], lr - vst1.8 {q5}, [r6], lr - vst1.8 {q6}, [r5], lr - vst1.8 {q7}, [r6], lr - vst1.8 {q8}, [r5], lr - vst1.8 {q9}, [r6], lr - vst1.8 {q10}, [r5], lr - vst1.8 {q11}, [r6], lr - vst1.8 {q12}, [r5], lr - vst1.8 {q13}, [r6], lr - vst1.8 {q14}, [r5], lr - vst1.8 {q15}, [r6], lr - - bne copy_left_right_uv - -;Now copy the top and bottom source lines into each line of the respective borders - mov r1, r7 - mul r8, r4, lr ; plane_height * plane_stride - movs r12, lr, lsr #6 ; plane_stride / 64 - - sub r1, r1, #16 ; src_ptr1 = u_buffer - Border - add r6, r1, r8 ; dest_ptr2 = src_ptr2 + plane_stride (src_ptr1 + (plane_height * plane_stride) - sub r2, r6, lr ; src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride - sub r5, r1, lr, asl #4 ; dest_ptr1 = src_ptr1 - (Border * plane_stride) - ble extra_uv_copy_needed ; plane_stride < 64 - -copy_top_bottom_uv - vld1.8 {q0, q1}, [r1]! - vld1.8 {q8, q9}, [r2]! - vld1.8 {q2, q3}, [r1]! - vld1.8 {q10, q11}, [r2]! - - mov r7, #16 ; Border - -top_bottom_16 - subs r7, r7, #1 - - vst1.8 {q0, q1}, [r5]! - vst1.8 {q8, q9}, [r6]! - vst1.8 {q2, q3}, [r5]! - vst1.8 {q10, q11}, [r6]! 
- - add r5, r5, lr ; dest_ptr1 += plane_stride - sub r5, r5, #64 - add r6, r6, lr ; dest_ptr2 += plane_stride - sub r6, r6, #64 - - bne top_bottom_16 - - sub r5, r1, lr, asl #4 ; dest_ptr1 = src_ptr1 - (Border * plane_stride) - add r6, r2, lr ; dest_ptr2 = src_ptr2 + plane_stride - - subs r12, r12, #1 - bne copy_top_bottom_uv -extra_uv_copy_needed - mov r7, lr, lsr #3 ; check to see if extra copy is needed - ands r7, r7, #0x7 - bne extra_top_bottom_uv - -end_of_border_copy_uv - subs r10, r10, #1 - ldrne r7, [r0, #yv12_buffer_config_v_buffer] ; src_ptr1 - bne border_copy_uv - - vpop {d8 - d15} - pop {r4 - r10, pc} - -;;;;;;;;;;;;;;;;;;;;;; -extra_top_bottom_y - vld1.8 {q0}, [r1]! - vld1.8 {q2}, [r2]! - - mov r9, #4 ; 32 >> 3 - -extra_top_bottom_32 - subs r9, r9, #1 - - vst1.8 {q0}, [r5], lr - vst1.8 {q2}, [r6], lr - vst1.8 {q0}, [r5], lr - vst1.8 {q2}, [r6], lr - vst1.8 {q0}, [r5], lr - vst1.8 {q2}, [r6], lr - vst1.8 {q0}, [r5], lr - vst1.8 {q2}, [r6], lr - vst1.8 {q0}, [r5], lr - vst1.8 {q2}, [r6], lr - vst1.8 {q0}, [r5], lr - vst1.8 {q2}, [r6], lr - vst1.8 {q0}, [r5], lr - vst1.8 {q2}, [r6], lr - vst1.8 {q0}, [r5], lr - vst1.8 {q2}, [r6], lr - bne extra_top_bottom_32 - - sub r5, r1, lr, asl #5 ; src_ptr1 - (Border * plane_stride) - add r6, r2, lr ; src_ptr2 + plane_stride - subs r7, r7, #1 - bne extra_top_bottom_y - - b end_of_border_copy_y - -extra_top_bottom_uv - vld1.8 {d0}, [r1]! - vld1.8 {d8}, [r2]! - - mov r9, #2 ; 16 >> 3 - -extra_top_bottom_16 - subs r9, r9, #1 - - vst1.8 {d0}, [r5], lr - vst1.8 {d8}, [r6], lr - vst1.8 {d0}, [r5], lr - vst1.8 {d8}, [r6], lr - vst1.8 {d0}, [r5], lr - vst1.8 {d8}, [r6], lr - vst1.8 {d0}, [r5], lr - vst1.8 {d8}, [r6], lr - vst1.8 {d0}, [r5], lr - vst1.8 {d8}, [r6], lr - vst1.8 {d0}, [r5], lr - vst1.8 {d8}, [r6], lr - vst1.8 {d0}, [r5], lr - vst1.8 {d8}, [r6], lr - vst1.8 {d0}, [r5], lr - vst1.8 {d8}, [r6], lr - bne extra_top_bottom_16 - - sub r5, r1, lr, asl #4 ; src_ptr1 - (Border * plane_stride) - add r6, r2, lr ; src_ptr2 + plane_stride - subs r7, r7, #1 - bne extra_top_bottom_uv - - b end_of_border_copy_uv - - ENDP - END diff --git a/vpx_scale/arm/neon/yv12extend_arm.c b/vpx_scale/arm/neon/yv12extend_arm.c deleted file mode 100644 index fac7bbc1b..000000000 --- a/vpx_scale/arm/neon/yv12extend_arm.c +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "./vpx_scale_rtcd.h" - -extern void vp8_yv12_copy_frame_func_neon( - const struct yv12_buffer_config *src_ybc, - struct yv12_buffer_config *dst_ybc); - -void vp8_yv12_copy_frame_neon(const struct yv12_buffer_config *src_ybc, - struct yv12_buffer_config *dst_ybc) { - vp8_yv12_copy_frame_func_neon(src_ybc, dst_ybc); - vp8_yv12_extend_frame_borders_neon(dst_ybc); -} diff --git a/vpx_scale/generic/yv12config.c b/vpx_scale/generic/yv12config.c index 92f8b85ad..de6ba1f44 100644 --- a/vpx_scale/generic/yv12config.c +++ b/vpx_scale/generic/yv12config.c @@ -84,6 +84,8 @@ int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, ybf->y_height = aligned_height; ybf->y_stride = y_stride; + ybf->uv_crop_width = (width + 1) / 2; + ybf->uv_crop_height = (height + 1) / 2; ybf->uv_width = uv_width; ybf->uv_height = uv_height; ybf->uv_stride = uv_stride; @@ -216,7 +218,7 @@ int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, if (!ybf->buffer_alloc) return -1; - ybf->buffer_alloc_sz = frame_size; + ybf->buffer_alloc_sz = (int)frame_size; // This memset is needed for fixing valgrind error from C loop filter // due to access uninitialized memory in frame border. It could be @@ -245,7 +247,7 @@ int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, ybf->uv_stride = uv_stride; ybf->border = border; - ybf->frame_size = frame_size; + ybf->frame_size = (int)frame_size; #if CONFIG_VP9_HIGH if (use_high) { // Store uint16 addresses when using 16bit framebuffers diff --git a/vpx_scale/generic/yv12extend.c b/vpx_scale/generic/yv12extend.c index 1f8b5b7dc..76515fd12 100644 --- a/vpx_scale/generic/yv12extend.c +++ b/vpx_scale/generic/yv12extend.c @@ -103,6 +103,9 @@ static void extend_plane_high(uint8_t *const src8, int src_stride, #endif void vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) { + const int uv_border = ybf->border / 2; + + assert(ybf->border % 2 == 0); assert(ybf->y_height - ybf->y_crop_height < 16); assert(ybf->y_width - ybf->y_crop_width < 16); assert(ybf->y_height - ybf->y_crop_height >= 0); @@ -137,16 +140,16 @@ void vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) { ybf->border + ybf->y_width - ybf->y_crop_width); extend_plane(ybf->u_buffer, ybf->uv_stride, - (ybf->y_crop_width + 1) / 2, (ybf->y_crop_height + 1) / 2, - ybf->border / 2, ybf->border / 2, - (ybf->border + ybf->y_height - ybf->y_crop_height + 1) / 2, - (ybf->border + ybf->y_width - ybf->y_crop_width + 1) / 2); + ybf->uv_crop_width, ybf->uv_crop_height, + uv_border, uv_border, + uv_border + ybf->uv_height - ybf->uv_crop_height, + uv_border + ybf->uv_width - ybf->uv_crop_width); extend_plane(ybf->v_buffer, ybf->uv_stride, - (ybf->y_crop_width + 1) / 2, (ybf->y_crop_height + 1) / 2, - ybf->border / 2, ybf->border / 2, - (ybf->border + ybf->y_height - ybf->y_crop_height + 1) / 2, - (ybf->border + ybf->y_width - ybf->y_crop_width + 1) / 2); + ybf->uv_crop_width, ybf->uv_crop_height, + uv_border, uv_border, + uv_border + ybf->uv_height - ybf->uv_crop_height, + uv_border + ybf->uv_width - ybf->uv_crop_width); } #if CONFIG_VP9 diff --git a/vpx_scale/vpx_scale.mk b/vpx_scale/vpx_scale.mk index 95e7483e6..0a1594bd8 100644 --- a/vpx_scale/vpx_scale.mk +++ b/vpx_scale/vpx_scale.mk @@ -9,12 +9,6 @@ SCALE_SRCS-yes += vpx_scale_asm_offsets.c SCALE_SRCS-yes += vpx_scale_rtcd.c SCALE_SRCS-yes += vpx_scale_rtcd.pl -#neon -SCALE_SRCS-$(HAVE_NEON_ASM) += arm/neon/vp8_vpxyv12_copyframe_func_neon$(ASM) -SCALE_SRCS-$(HAVE_NEON_ASM) += arm/neon/vp8_vpxyv12_copysrcframe_func_neon$(ASM) -SCALE_SRCS-$(HAVE_NEON_ASM) += 
arm/neon/vp8_vpxyv12_extendframeborders_neon$(ASM) -SCALE_SRCS-$(HAVE_NEON_ASM) += arm/neon/yv12extend_arm.c - #mips(dspr2) SCALE_SRCS-$(HAVE_DSPR2) += mips/dspr2/yv12extend_dspr2.c diff --git a/vpx_scale/vpx_scale_rtcd.pl b/vpx_scale/vpx_scale_rtcd.pl index 2e3f1ffbe..d4a2b81a5 100644 --- a/vpx_scale/vpx_scale_rtcd.pl +++ b/vpx_scale/vpx_scale_rtcd.pl @@ -17,12 +17,8 @@ if (vpx_config("CONFIG_SPATIAL_RESAMPLING") eq "yes") { } add_proto qw/void vp8_yv12_extend_frame_borders/, "struct yv12_buffer_config *ybf"; -specialize qw/vp8_yv12_extend_frame_borders neon_asm/; -$vp8_yv12_extend_frame_borders_neon_asm=vp8_yv12_extend_frame_borders_neon; add_proto qw/void vp8_yv12_copy_frame/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc"; -specialize qw/vp8_yv12_copy_frame neon_asm/; -$vp8_yv12_copy_frame_neon_asm=vp8_yv12_copy_frame_neon; add_proto qw/void vpx_yv12_copy_y/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc"; diff --git a/vpxdec.c b/vpxdec.c index 6a4685045..231b3468d 100644 --- a/vpxdec.c +++ b/vpxdec.c @@ -131,8 +131,8 @@ static const arg_def_t *vp8_pp_args[] = { }; #endif -static int vpx_image_scale(vpx_image_t *src, vpx_image_t *dst, - FilterModeEnum mode) { +static INLINE int vpx_image_scale(vpx_image_t *src, vpx_image_t *dst, + FilterModeEnum mode) { #if CONFIG_VP9_HIGH if (src->fmt == VPX_IMG_FMT_I42016) { assert(dst->fmt == VPX_IMG_FMT_I42016); @@ -458,6 +458,7 @@ void generate_filename(const char *pattern, char *out, size_t q_len, break; default: die("Unrecognized pattern %%%c\n", p[1]); + break; } pat_len = strlen(q); @@ -716,7 +717,7 @@ int main_loop(int argc, const char **argv_) { int use_y4m = 1; int opt_yv12 = 0; int opt_i420 = 0; - vpx_codec_dec_cfg_t cfg = {0}; + vpx_codec_dec_cfg_t cfg = {0, 0, 0}; #if CONFIG_VP9_HIGH int out_bit_depth = 0; #endif @@ -736,7 +737,7 @@ int main_loop(int argc, const char **argv_) { #endif int frame_avail, got_data; int num_external_frame_buffers = 0; - struct ExternalFrameBufferList ext_fb_list = {0}; + struct ExternalFrameBufferList ext_fb_list = {0, NULL}; const char *outfile_pattern = NULL; char outfile_name[PATH_MAX] = {0}; @@ -745,8 +746,8 @@ int main_loop(int argc, const char **argv_) { MD5Context md5_ctx; unsigned char md5_digest[16]; - struct VpxDecInputContext input = {0}; - struct VpxInputContext vpx_input_ctx = {0}; + struct VpxDecInputContext input = {NULL, NULL}; + struct VpxInputContext vpx_input_ctx; #if CONFIG_WEBM_IO struct WebmInputContext webm_ctx = {0}; input.webm_ctx = &webm_ctx; diff --git a/vpxenc.c b/vpxenc.c index 2d1e41d61..675874e6a 100644 --- a/vpxenc.c +++ b/vpxenc.c @@ -408,11 +408,20 @@ static const arg_def_t inbitdeptharg = ARG_DEF(NULL, "input-bit-depth", 1, "Bit depth of input"); #endif +static const struct arg_enum_list tune_content_enum[] = { + {"default", VP9E_CONTENT_DEFAULT}, + {"screen", VP9E_CONTENT_SCREEN}, + {NULL, 0} +}; + +static const arg_def_t tune_content = ARG_DEF_ENUM( + NULL, "tune-content", 1, "Tune content type", tune_content_enum); + static const arg_def_t *vp9_args[] = { &cpu_used, &auto_altref, &noise_sens, &sharpness, &static_thresh, &tile_cols, &tile_rows, &arnr_maxframes, &arnr_strength, &arnr_type, &tune_ssim, &cq_level, &max_intra_rate_pct, &lossless, - &frame_parallel_decoding, &aq_mode, &frame_periodic_boost, + &frame_parallel_decoding, &aq_mode, &frame_periodic_boost, &tune_content, #if CONFIG_VP9_HIGH &bitdeptharg, &inbitdeptharg, #endif @@ -425,7 +434,7 @@ static const int vp9_arg_ctrl_map[] = { 
VP8E_SET_ARNR_MAXFRAMES, VP8E_SET_ARNR_STRENGTH, VP8E_SET_ARNR_TYPE, VP8E_SET_TUNING, VP8E_SET_CQ_LEVEL, VP8E_SET_MAX_INTRA_BITRATE_PCT, VP9E_SET_LOSSLESS, VP9E_SET_FRAME_PARALLEL_DECODING, VP9E_SET_AQ_MODE, - VP9E_SET_FRAME_PERIODIC_BOOST, + VP9E_SET_FRAME_PERIODIC_BOOST, VP9E_SET_TUNE_CONTENT, 0 }; #endif diff --git a/y4menc.c b/y4menc.c index 9211452a4..b647e8dcc 100644 --- a/y4menc.c +++ b/y4menc.c @@ -48,6 +48,7 @@ int y4m_write_file_header(char *buf, size_t len, int width, int height, "C420p16 XYSCSS=420P16\n"; break; default: + color = NULL; assert(0); } return snprintf(buf, len, "YUV4MPEG2 W%u H%u F%u:%u I%c %s", width, height,
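The vp8cx.h and vpxenc.c pieces above wire the new content-type tuning end to end: the VP9E_SET_TUNE_CONTENT control ID and the vp9e_tune_content enum define the API, VPX_CTRL_USE_TYPE declares the control's argument as int, and vp9_arg_ctrl_map routes the new --tune-content argument to it. A hedged sketch of the programmatic path (encoder initialization elided, error handling omitted, and the function name hypothetical):

    #include "vpx/vpx_encoder.h"
    #include "vpx/vp8cx.h"

    /* 'encoder' is assumed to be an initialized VP9 encoder context.
     * This reaches the same control the command line drives via
     * 'vpxenc --tune-content=screen'. */
    static void tune_for_screen_capture(vpx_codec_ctx_t *encoder) {
      vpx_codec_control(encoder, VP9E_SET_TUNE_CONTENT,
                        (int)VP9E_CONTENT_SCREEN);
    }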